|
DataMuseum.dkPresents historical artifacts from the history of: Commodore CBM-900 |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about Commodore CBM-900 Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - metrics - download
Length: 14686 (0x395e) Types: TextFile Notes: UNIX file Names: »typo.c«
└─⟦f27320a65⟧ Bits:30001972 Commodore 900 hard disk image with partial source code └─⟦f4b8d8c84⟧ UNIX Filesystem └─⟦this⟧ »cmd/typo.c«
/* * Detect typographical errors. */ #include <stdio.h> #include <ctype.h> #include <sys/stat.h> #define oc(c) ((c)+'`') #define NSBRK 512 /* Amount to sbrk at once in bytes */ #define NWORD 200 /* Longest word (ridiculously big) */ #define NHASH 64 /* Number of hash buckets (power of 2) */ char dictfile[] = "/usr/dict/dict"; char digfile[] = "/usr/dict/digrams"; char trifile[] = "/usr/dict/trigrams"; char tmp[] = "/tmp/typoXXXXXX"; char sort[] = "sort"; char *tmpfile; char notrimem[] = "Out of memory for trigrams"; char nosort[] = "Cannot locate sort"; char word[NWORD]; FILE *ifp; FILE *pfp; char ibuf[BUFSIZ]; char obuf[BUFSIZ]; char xbuf[BUFSIZ]; unsigned hashval; int rflag; /* Raw (no removal of nroff things) input */ int nflag; /* No pre-defined statistics or exceptions */ int sflag; /* Write out statistics */ typedef struct EXCEPT { unsigned e_hval; struct EXCEPT *e_next; char e_word[]; } EXCEPT; EXCEPT *except[NHASH]; /* * Table of digrams, indexed by first * and then second character. [1-26] * represent [a-z] inclusive and 0 * represents the beginning or ending of word. */ unsigned digrams[27][27]; /* * Table of trigrams, indexed by first and * second letter and allocated on the third letter. * After all of the input data has been * sampled, the t_freq entry changes from * frequency to an index of peculiarity. */ typedef struct TRIGRAMS { struct TRIGRAMS *t_next; char t_char; unsigned char t_freq; } TRIGRAMS; TRIGRAMS *trigrams[27][27]; /* * Table of logarithms scaled by 1000. * Table from 0-99 inclusive. */ int log1[] = { 0000, 0693, 1099, 1386, 1609, 1792, 1946, 2080, 2197, /* 1-9 */ 2303, 2398, 2485, 2565, 2639, 2708, 2772, 2833, 2890, /* 10-18 */ 2944, 2996, 3045, 3091, 3136, 3178, 3219, 3258, 3296, /* 19-27 */ 3332, 3367, 3401, 3434, 3466, 3497, 3526, 3555, 3584, /* 28-36 */ 3611, 3638, 3664, 3689, 3714, 3738, 3761, 3784, 3807, /* 37-45 */ 3829, 3850, 3871, 3892, 3912, 3932, 3951, 3970, 3989, /* 46-54 */ 4007, 4025, 4043, 4060, 4078, 4094, 4111, 4127, 4143, /* 55-63 */ 4159, 4174, 4190, 4205, 4220, 4234, 4249, 4263, 4277, /* 64-72 */ 4291, 4304, 4318, 4331, 4344, 4357, 4369, 4382, 4394, /* 73-81 */ 4407, 4419, 4431, 4443, 4454, 4466, 4477, 4489, 4500, /* 82-90 */ 4511, 4521, 4533, 4543, 4554, 4564, 4575, 4585, 4595, /* 91-99 */ }; /* * Table of logarithms for each multiple * of ten from 100 - 990 inclusive * (scaled as above). */ int log2[] = { 4605, 4701, 4788, 4868, 4942, 5011, 4075, 5136, 5193, /* 100-180 */ 5247, 5298, 5347, 5394, 5438, 5481, 5522, 5561, 5598, /* 190-270 */ 5635, 5670, 5704, 5737, 5768, 5799, 5829, 5858, 5886, /* 280-360 */ 5914, 5940, 5966, 5992, 6016, 6040, 6064, 6087, 6109, /* 370-450 */ 6131, 6153, 6174, 6194, 6215, 6234, 6254, 6273, 6292, /* 460-540 */ 6310, 6328, 6346, 6663, 6380, 6397, 6414, 6430, 6446, /* 550-630 */ 6462, 6477, 6492, 6507, 6522, 6537, 6551, 6565, 6579, /* 640-720 */ 6593, 6607, 6620, 6633, 6646, 6659, 6672, 6684, 6697, /* 730-810 */ 6709, 6721, 6733, 6745, 6757, 6769, 6780, 6791, 6802, /* 820-900 */ 6813, 6824, 6835, 6846, 6857, 6867, 6877, 6888, 6898, /* 910-990 */ }; /* * Table of logarithms of multiples * of 100 from 1000-9900 inclusive * (scaled as before). */ int log3[] = { 6908, 7003, 7090, 7170, 7244, 7313, 7378, 7438, 7496, /* 1000-1800 */ 7550, 7601, 7650, 7696, 7741, 7783, 7824, 7863, 7901, /* 1900-2700 */ 7937, 7973, 8006, 8039, 8071, 8102, 8132, 8161, 8189, /* 2800-3600 */ 8216, 8243, 8269, 8294, 8319, 8343, 8366, 8389, 8412, /* 3700-4500 */ 8434, 8455, 8476, 8497, 8517, 8537, 8556, 8576, 8594, /* 4600-5400 */ 8613, 8631, 8648, 8666, 8683, 8700, 8716, 8732, 8748, /* 5500-6300 */ 8764, 8780, 8795, 8810, 8825, 8839, 8854, 8868, 8882, /* 6400-7200 */ 8896, 8909, 8923, 8936, 8949, 8962, 8975, 8987, 9000, /* 7300-8100 */ 9012, 9024, 9036, 9048, 9060, 9071, 9083, 9094, 9105, /* 8200-9000 */ 9116, 9127, 9138, 9149, 9159, 9170, 9180, 9190, 9200, /* 9100-9900 */ }; char *malloc(); char *mktemp(); char *sbrk(); char *getword(); EXCEPT *lookup(); FILE *pinit(); main(argc, argv) char *argv[]; { register char *ap; register int i; FILE *fp; /* * Because we have our own allocator. */ setbuf(stdout, obuf); setbuf(stderr, NULL); while (argc>1 && *argv[1]=='-') { for (ap = &argv[1][1]; *ap != '\0'; ap++) switch (*ap) { case 'n': nflag = 1; break; case 'r': rflag = 1; break; case 's': sflag = 1; break; default: usage(); } argc--; argv++; } if (sflag == 0) { readdicts(); pfp = pinit(); } if (argc > 1) for (i=1; i<argc; i++) { if ((fp = fopen(argv[i], "r")) == NULL) tyerr("Cannot open input `%s'", argv[i]); typo(fp); } else typo(stdin); if (!sflag) { pterm(pfp); precompute(); /* * Re-read the sorted word * list. */ reread(); } else outstats(); rmexit(0); } /* * Called for each input word * to set up file pointer, read * of individual words and enter * frequency statistics into the * tables. */ typo(fp) register FILE *fp; { ifp = fp; setbuf(fp, ibuf); while (getword() != NULL) { stats(word); if (!sflag && lookup(word)==NULL) fprintf(pfp, "%s\n", word); } if (fp != stdin) fclose(fp); } /* * Initialise the trigram and digram * tables if the user requests help. * Read in the exception dictionary. * Put each word in a hash table. */ readdicts() { FILE *fp; if (nflag) return; if ((fp = fopen(dictfile, "r")) != NULL) { register int l; register char *cp; register EXCEPT *ep; setbuf(fp, xbuf); while (fgets(cp = word, sizeof word, fp) != NULL) { l = strlen(cp); cp[l-1] = '\0'; if ((ep = (EXCEPT *)malloc(l + sizeof *ep)) == NULL) tyerr("Out of memory for dictionary"); hashval = 0; while (*cp) hashval += *cp++; l = hashval%NHASH; ep->e_next = except[l]; except[l] = ep; ep->e_hval = hashval; strcpy(ep->e_word, word); } fclose(fp); } if ((fp = fopen(digfile, "r")) != NULL) { register int t1, t2; setbuf(fp, xbuf); while (fgets(word, NWORD, fp) != NULL) { t1 = cton(word[0]); t2 = cton(word[1]); digrams[t1][t2] = atoi(&word[2]); } fclose(fp); } if ((fp = fopen(trifile, "r")) != NULL) { register TRIGRAMS *tp; register int t1, t2, t3; setbuf(fp, xbuf); while (fgets(word, NWORD, fp) != NULL) { t1 = cton(word[0]); t2 = cton(word[1]); t3 = cton(word[2]); for (tp = trigrams[t1][t2]; tp!=NULL; tp = tp->t_next) { if (tp->t_char == t3) { tp->t_freq += atoi(&word[3]); break; } } if (tp == NULL) { if ((tp = (TRIGRAMS *)malloc(sizeof *tp))==NULL) tyerr(notrimem); tp->t_freq = atoi(&word[3]); tp->t_char = t3; tp->t_next = trigrams[t1][t2]; trigrams[t1][t2] = tp; } } fclose(fp); } } /* * Get a character from a digram or * trigram file and check it. * Convert character to index number * for tables. */ cton(c) register unsigned c; { if ((c -= '`') >= 27) tyerr("Invalid digram/trigram file format"); return (c); } /* * Get the next word from the input. */ char * getword() { register char *cp; register int c; while (!isalpha(c = tgetc())) if (c == EOF) return (NULL); cp = word; for (;;) { if (isupper(c)) c = tolower(c); *cp++ = c; if (!isalpha(c = tgetc())) break; } *cp = '\0'; return (word); } /* * Get a character. This also checks * for roff stuff (if -r is not specified) * and hyphenated words. */ tgetc() { static int nlflag = 1; register int c; while (nlflag) { if ((c = getc(ifp)) == '.') while ((c = getc(ifp))!='\n' && c!=EOF) ; else { ungetc(c, ifp); nlflag = 0; break; } } again: if ((c = getc(ifp)) == '\n') nlflag = 1; else if (c == '-') { if ((c = getc(ifp)) == '\n') goto again; } else if (c == '\'') goto again; else if (c == '\\') { /* Fonts and sizes */ switch (c = getc(ifp)) { case 'f': c = getc(ifp); goto again; case 's': while ((c = getc(ifp))>='0' && c<='9') ; } } return (c); } /* * Lookup an input word * in the exception list. */ EXCEPT * lookup(wp) char *wp; { register char *cp = wp; register EXCEPT *ep; hashval = 0; while (*cp != '\0') hashval += *cp++; for (ep = except[hashval%NHASH]; ep != NULL; ep = ep->e_next) if (ep->e_hval==hashval && strcmp(ep->e_word, word)==0) return (ep); return (NULL); } /* * Compute the trigram and digram statistics * on this word. */ stats(wp) register char *wp; { register int t1, t2, t3; register TRIGRAMS *tp; t1 = 0; while (*wp != '\0') { t2 = *wp++; t3 = *wp; if (t2>='a' && t2<='z') t2 -= 'a'-1; if (t3>='a' && t3<='z') t3 -= 'a'-1; digrams[t1][t2]++; for (tp = trigrams[t1][t2]; tp != NULL; tp = tp->t_next) if (tp->t_char == t3) { tp->t_freq++; break; } if (tp == NULL) { if ((tp = (TRIGRAMS *)malloc(sizeof *tp)) != NULL) { tp->t_freq = 1; tp->t_char = t3; tp->t_next = trigrams[t1][t2]; trigrams[t1][t2] = tp; } } t1 = t2; } digrams[t2][t3]++; } /* * Output the digram and trigram statistics * onto file `digram' and `trigram'. */ outstats() { char x1buf[BUFSIZ]; register TRIGRAMS *tp; register int t1, t2; register int freq; FILE *dfp, *tfp; if ((dfp = fopen("digrams", "w")) == NULL) tyerr("Cannot create digrams file"); if ((tfp = fopen("trigrams", "w")) == NULL) tyerr("Cannot create trigrams file"); setbuf(dfp, xbuf); setbuf(tfp, x1buf); for (t1 = 0; t1 < 27; t1++) for (t2 = 0; t2 < 27; t2++) { if ((freq = digrams[t1][t2]) != 0) fprintf(dfp, "%c%c%d\n", oc(t1), oc(t2), freq); for (tp = trigrams[t1][t2]; tp!=NULL; tp = tp->t_next) fprintf(tfp, "%c%c%c%d\n", oc(t1), oc(t2), oc(tp->t_char), tp->t_freq); } fclose(dfp); fclose(tfp); } /* * Compute index of peculiarity on a * word. This is the RMS mean of the * index for each trigram (say XYZ). * This index is 1/2(log (xy) + log(yz)) - log(xyz) * If there is only one trigram, the RMS * mean is taken to be the one sample. */ pindex(wp) register char *wp; { register int t1, t2, t3; register TRIGRAMS *tp; register long sumsq; register long sum; register unsigned count; register int ntri; ntri = 0; t1 = 0; sum = sumsq = 0; while (*wp != '\0') { t2 = *wp++; t3 = *wp; if (t2>='a' && t2<='z') t2 -= 'a'-1; if (t3>='a' && t3<='z') t3 -= 'a'-1; for (tp = trigrams[t1][t2]; tp != NULL; tp = tp->t_next) if (tp->t_char == t3) { count = tp->t_freq; break; } if (tp == NULL) tyerr("Missing trigram"); sum += count; sumsq += (long)count*count; ntri++; t1 = t2; } if (ntri > 1) { sum *= sum; sum /= ntri; sum = sumsq-sum; } count = (qsqrt((int)sum) + 5) / 10; return (count); } /* * Pre-compute the trigram statistics. * This essentially transforms the * trigram tables from a table * of frequencies to a table of * indices of peculiarity for that * particular trigram. */ precompute() { register int i, j; register TRIGRAMS *tp; register int logij; for (i=0; i<27; i++) for (j=0; j<27; j++) { logij = qlog(digrams[i][j]-1); for (tp = trigrams[i][j]; tp != NULL; tp = tp->t_next) { tp->t_freq = ((logij + qlog(digrams[j][tp->t_char]-1))/2 - qlog(tp->t_freq-1)) / 100; } } } /* * Evaluate a natural logarithm * quickly by table lookup. * The resulting logarithm * is multiplied by 1000. * By definition, the log of 0 (or less) is * -10. */ qlog(n) register unsigned n; { if (n <= 0) return (-10*1000); if (n < 100) return (log1[n-1]); if (n < 1000) return (log2[(n-95)/10]); if (n < 10000) return (log3[(n-950)/100]); return (10*1000); } /* * Quick version of sqare root. * Uses Newton's method. */ qsqrt(x) register unsigned x; { register int maxterm = 50; register unsigned old, new; if ((old = x/10) == 0) { old = 1; if (x == 0) return (0); } do { new = (x/old+old)/2; if (old == new) break; old = new; } while (--maxterm); return (new); } /* * Build pipe to a sort routine. */ FILE * pinit() { int pfd[2]; register int pid; register int fd; FILE *fp; tmpfile = mktemp(tmp); if (pipe(pfd) < 0) tyerr("Cannot pipe to sort"); if ((pid = fork()) < 0) tyerr("Cannot fork for sort"); if (pid) { close(pfd[0]); if ((fp = fdopen(pfd[1], "w")) != NULL) setbuf(fp, xbuf); return (fp); } else { dup2(pfd[0], 0); close(pfd[0]); close(pfd[1]); if ((fd = creat(tmp, 0666)) < 0) tyerr("Cannot create temporary file"); dup2(fd, 1); close(fd); execlp(sort, sort, "-u", NULL); tyerr(nosort); } } /* * Close off the pipe and wait * for the sort command to * complete. */ pterm(fp) FILE *fp; { int status; fclose(fp); while (wait(&status) >= 0) if (status) tyerr("Sort failed"); } /* * Re-read the sorted word-list * from the temp file. Because * sort -u is not implemented, * we have to guarantee uniqueness * ourselves. * The output is piped into a * `sort -nr' command. */ reread() { FILE *fp; register int ind; int pid, status; int pfd[2]; if ((fp = fopen(tmpfile, "r")) == NULL) tyerr("Cannot re-open temp-file"); setbuf(fp, xbuf); if (pipe(pfd) < 0) tyerr("Cannot create pipe to sort output"); if ((pid = fork()) < 0) fprintf(stderr, "Cannot fork -- output is not sorted\n"); else if (pid) { dup2(pfd[1], 1); close(pfd[0]); close(pfd[1]); } else { dup2(pfd[0], 0); close(pfd[0]); close(pfd[1]); execlp(sort, sort, "-nr", NULL); tyerr(nosort); } while (fgets(word, NWORD, fp) != NULL) { word[strlen(word)-1] = '\0'; ind = pindex(word); printf("%2d %s\n", ind, word); } fclose(fp); fclose(stdout); while (wait(&status) >= 0) ; } /* * A simple malloc for which there * is no free. It just uses sbrk and * returns a value. * `size' is in bytes. */ char * malloc(size) register unsigned size; { static int *rp, *ep; register int *cp; if ((size = (size+sizeof(int)-1)/sizeof(int)) == 0) return (NULL); if (rp==NULL || rp+size>=ep) { if ((ep = (int *)sbrk(NSBRK)) == NULL) return (NULL); if (rp == NULL) rp = ep; ep += NSBRK/sizeof(int); } cp = rp; rp += size; return (cp); } /* * Dummy free. */ free(p) char *p; { tyerr("Free not allowed"); } /* * Dummy realloc */ char * realloc(p) char *p; { tyerr("realloc not allowed"); } /* * Exit, removing the temp-file * if it is found. */ rmexit(s) { if (tmpfile != NULL) unlink(tmpfile); exit(s); } /* * Error and usage messages */ /* VARARGS */ tyerr(x) { fprintf(stderr, "typo: %r", &x); putc('\n', stderr); rmexit(1); } usage() { fprintf(stderr, "Usage: typo [-nrs] file\n"); rmexit(1); }