|
|
DataMuseum.dkPresents historical artifacts from the history of: DKUUG/EUUG Conference tapes |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about DKUUG/EUUG Conference tapes Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - metrics - downloadIndex: T m
Length: 12797 (0x31fd)
Types: TextFile
Names: »mksp.c«
└─⟦a0efdde77⟧ Bits:30001252 EUUGD11 Tape, 1987 Spring Conference Helsinki
└─⟦this⟧ »EUUGD11/euug-87hel/sec1/sp/mksp.c«
/* vi: set tabstop=4 : */
/*
* mksp - make soundex dictionary
* Version 1.3, December 1986
*
* If <soundexfile.{dir, pag}> do not exist, try to create them
* If they do exist and are not empty, then words will be added
* from the standard input
* Only when words are being added to an existing database are duplicate words
* ignored
* Valid words (words beginning with an alphabetic) are stored as given but
* comparisons for duplicates is case independent.
* Non-alphabetic characters are ignored in computing the soundex
*
* Permission is given to copy or distribute this program provided you
* do not remove this header or make money off of the program.
*
* Please send comments and suggestions to:
* Barry Brachman
* Dept. of Computer Science
* Univ. of British Columbia
* Vancouver, B.C. V6T 1W5
*
* .. {ihnp4!alberta, uw-beaver}!ubc-vision!ubc-cs!brachman
* brachman@cs.ubc.cdn
* brachman%ubc.csnet@csnet-relay.arpa
* brachman@ubc.csnet
*/
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <stdio.h>
#ifdef NEWDBM
#include <ndbm.h>
#include <sys/file.h>
#else !NEWDBM
#include <dbm.h>
#endif NEWDBM
#include "sp.h"
#define NEW_DICT 0
#define OLD_DICT 1
#define VFREQ 1000 /* frequency for verbose messages */
#define streq(X, Y) (!strcmp(X, Y))
#define strneq(X, Y, N) (!strncmp(X, Y, N))
#define USAGE "Usage: mksp -tad [-v[#]] <soundexfile>"
int map[26][667];
datum FETCH(), FIRSTKEY(), NEXTKEY();
key_t keyvec[KEYSIZE];
key_t *key = keyvec;
/*
* Soundex codes
* The program depends upon the numbers zero through six being used
* but this can easily be changed
*/
char soundex_code_map[26] = {
/*** A B C D E F G H I J K L M N O P ***/
0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1,
/*** Q R S T U V W X Y Z ***/
2, 6, 2, 3, 0, 1, 0, 2, 0, 2
};
int digit_part;
int aflag, dflag, tflag, vflag;
char *mk_word();
main(argc, argv)
int argc;
char **argv;
{
register int i;
char *file;
if (argc != 3 && argc != 4) {
fprintf(stderr, "%s\n", USAGE);
exit(1);
}
aflag = dflag = tflag = vflag = 0;
file = (char *) NULL;
for (i = 1; i < argc; i++) {
if (streq(argv[i], "-a"))
aflag = 1;
else if (streq(argv[i], "-d"))
dflag = 1;
else if (streq(argv[i], "-t"))
tflag = 1;
else if (strneq(argv[i], "-v", 2)) {
if (isdigit(argv[i][2]))
vflag = atoi(&argv[i][2]);
else
vflag = 1;
}
else if (file == (char *) NULL)
file = argv[i];
else {
fprintf(stderr, "%s\n", USAGE);
exit(1);
}
}
if (file == (char *) NULL || (tflag + aflag + dflag) != 1) {
fprintf(stderr, "%s\n", USAGE);
exit(1);
}
if (aflag) {
addwords(file);
if (vflag > 1) {
register int j, total;
int m, max;
fprintf(stderr, "Counters:\n");
for (i = 0; i < 26; i++) {
total = max = map[i][0];
for (j = 1; j < 667; j++) {
total += (m = map[i][j]);
if (m > max)
max = m;
}
if (max > 0)
fprintf(stderr, "%c: max %d total %d\n", 'a'+i, max, total);
}
}
}
else if (dflag)
deletewords(file);
else if (tflag)
prcontents(file);
exit(0);
}
/*
* Add words read from stdin to the database
* The key is the 3 digit soundex code for the word plus a disambiguating
* counter. Different counter values are used for words with the same soundex
* code. The maximum counter value is MAXCOUNT. If the counter overflows then
* we lose, but given at least 8 bits this seems unlikely.
*/
addwords(name)
char *name;
{
register int c, count, delete, duplicate, i, len;
register int *p;
int ch, s, status;
char wcopy[MAXWORDLEN + 2], word[MAXWORDLEN + 2];
datum dbm_key, dbm_content;
status = setup(name);
if (DBMINIT(name, O_RDWR) == -1) {
fprintf(stderr, "mksp: Can't initialize\n");
exit(1);
}
for (i = 0; i < 26; i++)
for (c = 0; c < 667; c++)
map[i][c] = 0;
dbm_key.dptr = (char *) key;
dbm_key.dsize = KEYSIZE;
count = 0;
while (fgets(word, sizeof(word), stdin) != (char *) NULL) {
len = strlen(word);
if (word[len - 1] != '\n') {
fprintf(stderr, "mksp: Word too long: %s", word);
while ((ch = getchar()) != '\n') /* flush rest of line */
putc(ch, stderr);
putc('\n', stderr);
continue;
}
word[--len] = '\0';
if (len > MAXWORDLEN) {
fprintf(stderr, "mksp: Word too long: %s\n", word);
continue;
}
if ((s = soundex(word, 3)) == BAD_WORD) {
if (vflag)
fprintf(stderr, "Ignoring bad word: %s\n", word);
continue;
}
ch = (isupper(word[0]) ? tolower(word[0]) : word[0]) - 'a';
p = &(map[ch][digit_part]);
/*
* If words are being added to an old dictionary,
* check for duplication and watch for a deleted entry
* The reason for only checking for duplicates in old dictionaries is
* that usually when you're creating a new dictionary the words are
* already sorted and unique and the creation of a large dictionary is
* slow enough already.
*/
duplicate = 0;
delete = -1; /* an 'impossible' counter */
if (status == OLD_DICT) {
c = 0;
while (1) {
mk_key(key, s, c);
dbm_content = FETCH(dbm_key);
if (dbm_content.dptr == 0)
break;
if (!IS_DELETED(dbm_content)) {
char *str;
str = mk_word(dbm_content.dptr, dbm_content.dsize, s);
if (strmatch(word, str) == 0) {
duplicate = 1;
if (vflag)
fprintf(stderr, "duplicate: %s\n", word);
break;
}
}
else if (delete < 0) /* choose delete nearest front */
delete = c;
if (++c > MAXCOUNT) {
fprintf(stderr, "mksp: Counter overflow\n");
fprintf(stderr, "soundex: %c%d\n", ch+'a', b10(digit_part));
exit(1);
}
}
if (duplicate)
continue;
*p = c;
}
if (*p > MAXCOUNT) {
fprintf(stderr, "mksp: Counter overflow\n");
fprintf(stderr, "soundex: %c%d\n", ch+'a', b10(digit_part));
exit(1);
}
mk_key(key, s, *p);
*p = *p + 1;
strcpy(wcopy, word);
if (len == 1) {
if (isupper(wcopy[0]))
wcopy[0] |= UPPER_CHAR;
wcopy[0] |= SINGLE_CHAR;
dbm_content.dptr = wcopy;
}
else {
if (isupper(wcopy[1]))
wcopy[1] = wcopy[1] - 'A' + 26;
else if (islower(wcopy[1]))
wcopy[1] = wcopy[1] - 'a';
else if ((wcopy[1] = tospchar(wcopy[1])) == '\0') {
fprintf(stderr, "Bogus second char: can't happen!\n");
exit(1);
}
if (isupper(wcopy[0]))
wcopy[1] = wcopy[1] | UPPER_CHAR;
dbm_content.dptr = wcopy + 1;
len--;
}
dbm_content.dsize = len; /* null not stored */
if (delete < 0) {
if (STORE(dbm_key, dbm_content) == -1) {
fprintf(stderr, "mksp: Can't store\n");
exit(1);
}
}
else {
if (vflag)
fprintf(stderr, "reusing: %s\n", word);
mk_key(key, s, delete);
if (REPLACE(dbm_key, dbm_content) == -1) {
fprintf(stderr, "mksp: Can't replace\n");
exit(1);
}
}
count++;
if (vflag > 1)
fprintf(stderr, "%5d: %s(%d)\n", count, word, ex_count(key));
if (vflag && (count % VFREQ) == 0)
fprintf(stderr, "%5d: %s\n", count, word);
}
if (vflag)
fprintf(stderr, "%d words\n", count);
DBMCLOSE();
}
/*
* Print out everything
*/
prcontents(name)
char *name;
{
register int s;
datum dbm_key, dbm_content;
if (DBMINIT(name, O_RDONLY) == -1)
exit(1);
dbm_key = FIRSTKEY();
while (dbm_key.dptr != NULL) {
dbm_content = FETCH(dbm_key);
if (dbm_content.dptr == 0)
break; /* ??? */
if (vflag)
printf("%3d. ", ex_count((key_t *) dbm_key.dptr));
if (IS_DELETED(dbm_content)) {
if (vflag)
printf("(deleted)\n");
}
else {
s = ex_soundex((key_t *) dbm_key.dptr);
printf("%s\n", mk_word(dbm_content.dptr, dbm_content.dsize, s));
}
dbm_key = NEXTKEY(dbm_key);
}
DBMCLOSE();
}
/*
* When words are deleted they must be marked as such rather than deleted
* using DELETE(). This is because the sequence of counters must remain
* continuous. If DELETE() is used then any entries with the same soundex
* but with a larger counter value would not be accessible. This approach
* does cost some extra space but if an addition is made to the chain then
* a deleted counter slot will be reused. Also, the storage used by the word
* should be made available to dbm. This could be improved somewhat
* by actually using DELETE() on the last entry of the chain.
*/
deletewords(name)
char *name;
{
register int c, ch, len, s;
register char *p;
char word[MAXWORDLEN + 2];
datum dbm_key, dbm_content;
if (DBMINIT(name, O_RDWR) == -1)
exit(1);
while (fgets(word, sizeof(word), stdin) != (char *) NULL) {
len = strlen(word);
if (word[len - 1] != '\n') {
fprintf(stderr, "mksp: Word too long: %s", word);
while ((ch = getchar()) != '\n') /* flush rest of line */
putc(ch, stderr);
putc('\n', stderr);
continue;
}
word[--len] = '\0';
if (len > MAXWORDLEN) {
fprintf(stderr, "mksp: Word too long: %s\n", word);
continue;
}
if ((s = soundex(word, 3)) == BAD_WORD) {
if (vflag)
fprintf(stderr, "Bad word: %s\n", word);
continue;
}
c = 0;
while (1) {
dbm_key.dptr = (char *) key;
dbm_key.dsize = KEYSIZE;
mk_key(key, s, c);
dbm_content = FETCH(dbm_key);
if (dbm_content.dptr == NULL) {
if (vflag)
fprintf(stderr, "Not found: %s\n", word);
break;
}
if (!IS_DELETED(dbm_content)) {
p = mk_word(dbm_content.dptr, dbm_content.dsize, s);
if (strmatch(word, p) == 0) {
/*
* Aside:
* Since dptr points to static storage it must be reset
* if we want to retain the old content (content.dptr=word)
* This took a while to determine...
* Anyhow, since there is no need to store the old word
* we free up the space
*/
dbm_content.dptr = "";
dbm_content.dsize = 0;
if (REPLACE(dbm_key, dbm_content) == -1)
fprintf(stderr, "mksp: delete of '%s' failed\n", word);
else if (vflag) {
if (vflag > 1)
fprintf(stderr, "%d. %s ", c, p);
fprintf(stderr, "deleted\n");
}
break;
}
else if (vflag > 1)
fprintf(stderr, "%d. %s\n", c, p);
}
else if (vflag > 1)
fprintf(stderr, "%d. (deleted)\n", c);
if (++c > MAXCOUNT) {
ch = isupper(word[0]) ? tolower(word[0]) : word[0];
fprintf(stderr, "mksp: Counter overflow\n");
fprintf(stderr, "soundex: %c%d\n", ch, b10(digit_part));
exit(1);
}
}
}
DBMCLOSE();
}
/*
* Setup the dictionary files if necessary
*/
setup(name)
char *name;
{
register int s1, s2;
s1 = check_dict(name, ".dir");
s2 = check_dict(name, ".pag");
if (s1 == NEW_DICT && s2 == NEW_DICT)
return(NEW_DICT);
return(OLD_DICT);
}
/*
* Check if a dictionary file exists:
* - if not, try to create it
* - if so, see if it is empty
* Return NEW_DICT if an empty file exists,
* OLD_DICT if a non-empty file exists
* Default mode for new files is 0666
*/
check_dict(name, ext)
char *name, *ext;
{
register int len, s;
char *filename;
struct stat statbuf;
char *malloc();
extern int errno;
len = strlen(name) + strlen(ext) + 1;
filename = (char *) malloc((unsigned) len);
if (filename == (char *) NULL) {
fprintf(stderr, "mksp: Can't malloc '%s.%s'\n", name, ext);
exit(1);
}
sprintf(filename, "%s%s", name, ext);
if (stat(filename, &statbuf) == -1) {
if (errno != ENOENT) {
perror("mksp");
exit(1);
}
if (creat(filename, 0666) == -1) {
perror("mksp");
exit(1);
}
s = NEW_DICT;
}
else {
if (statbuf.st_size == 0)
s = NEW_DICT;
else
s = OLD_DICT;
}
return(s);
}
/*
* Compute an 'n' digit Soundex code for 'word'
* As a side effect, leave the digit part of the soundex in digit_part
*
* Since the soundex can be considered a base 7 number, if 'n' is:
* 3 require 9 (10 if base 10) bits for digits
* 4 require 12 (13) bits
* 5 require 15 (17) bits
* 6 require 17 (20) bits
*
* The three slightly different versions of this routine should be coalesced.
*/
soundex(word, n)
register char *word;
int n;
{
register int c, soundex_length, previous_code;
register char *p, *w;
char wcopy[MAXWORDLEN + 2];
if (!IS_VALID(word))
return(-1);
strcpy(wcopy, word);
p = w = wcopy;
while (*p != '\0') {
if (isupper(*p))
*p = tolower(*p);
p++;
}
digit_part = 0;
soundex_length = 0;
previous_code = soundex_code_map[*w - 'a'];
for (p = w + 1; *p != '\0' && soundex_length < n; p++) {
if (!isalpha(*p))
continue;
c = soundex_code_map[*p - 'a'];
if (c == 0 || previous_code == c) {
previous_code = c;
continue;
}
digit_part = digit_part * 7 + c;
previous_code = c;
soundex_length++;
}
while (soundex_length++ < n)
digit_part *= 7;
return((digit_part << 5) + *w - 'a');
}
b10(n)
int n;
{
register int b10, s;
for (b10 = 0, s = 1; n != 0; n /= 7) {
b10 += (n % 7) * s;
s *= 10;
}
return(b10);
}