|
DataMuseum.dk Presents historical artifacts from the history of: Commodore CBM-900 |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about Commodore CBM-900 Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - metrics - download
Length: 3897 (0xf39) Types: TextFile Notes: UNIX file Names: »prep.c«
└─⟦f27320a65⟧ Bits:30001972 Commodore 900 hard disk image with partial source code └─⟦f4b8d8c84⟧ UNIX Filesystem └─⟦this⟧ »cmd/prep.c«
/*
 * Prepare text for statistical processing
 * by breaking it into words (and possibly
 * also punctuation marks) and discarding
 * certain words if this is desired.
 *
 * Usage: prep [-dfp] [-i file] [-o file] [file ...]
 *	-d	prefix each printed word with its input word number
 *	-f	fold upper case into lower case
 *	-p	also print punctuation characters, one per line
 *	-i file	do not print words listed in `file'
 *	-o file	print only words listed in `file'
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <ctype.h>

#define	NHASH	64		/* Hash buckets for ignore and only */
#define	NWORD	400		/* Longest word */

/*
 * One entry of the ignore/only hash table.
 * The word itself is stored directly after the link.
 */
typedef	struct	WORDS {
	struct	WORDS	*w_next;
	char	w_name[];
}	WORDS;

WORDS	*words[NHASH];

int	pflag;			/* Print punctuation as well */
int	dflag;			/* Print (input) word numbers */
int	fflag;			/* Fold upper into lower case */
int	nignore;		/* Number of ignored words */
int	nonly;			/* Number of only words */
long	wordno;			/* Input word number */
char	wordbuf[NWORD];

char	missing[] = "Missing `%s' file argument";
char	onlyone[] = "Only one of `-i' or `-o' may be given";

int	prep(FILE *fp);
void	print(char *word);
int	enter(char *fn);
int	lookup(char *word);
void	preperr(const char *fmt, ...);
void	usage(void);

/*
 * Parse the option arguments, load the ignore or only
 * list if one is given, then run prep over every named
 * file (or over the standard input when none is named).
 * The exit status is the OR of the prep results.
 */
int
main(int argc, char *argv[])
{
	char *ap;
	int i;
	int estat = 0;
	FILE *fp;

	while (argc > 1 && *argv[1] == '-') {
		for (ap = &argv[1][1]; *ap != '\0'; ap++)
			switch (*ap) {
			case 'd':
				dflag = 1;
				break;

			case 'f':
				fflag = 1;
				break;

			case 'p':
				pflag = 1;
				break;

			case 'i':
				if (nonly)
					preperr(onlyone);
				if (argc < 3)
					preperr(missing, "ignore");
				/*
				 * Consume the file name argument; `ap'
				 * still points into the option string.
				 */
				argv++;
				argc--;
				nignore += enter(argv[1]);
				break;

			case 'o':
				if (nignore)
					preperr(onlyone);
				if (argc < 3)
					preperr(missing, "only");
				argv++;
				argc--;
				nonly += enter(argv[1]);
				break;

			default:
				usage();
			}
		argv++;
		argc--;
	}
	if (argc > 1) {
		for (i = 1; i < argc; i++) {
			if ((fp = fopen(argv[i], "r")) == NULL)
				preperr("Cannot open `%s'", argv[i]);
			estat |= prep(fp);
			fclose(fp);
		}
	} else
		estat = prep(stdin);
	return (estat);
}

/*
 * Run prep on one input stream.
 * A word is a run of letters and apostrophes.  A hyphen
 * immediately followed by a newline inside a word is
 * dropped so that the two halves are joined.  Bytes
 * outside the ASCII range are treated as NUL and thus
 * act as delimiters.  Words longer than NWORD-1
 * characters are truncated.
 * Returns 0 on success, 1 if a read error occurred.
 */
int
prep(FILE *fp)
{
	char *cp = wordbuf;
	int c;
	int inword = 0;

	while ((c = getc(fp)) != EOF) {
		/* getc yields 0..UCHAR_MAX, so this is the non-ASCII test */
		if (c > 0x7F)
			c = '\0';
		if (fflag && isupper(c))
			c = tolower(c);
		if (inword) {
			if (isalpha(c) || c == '\'') {
				/* Leave room for the terminating NUL */
				if (cp < &wordbuf[NWORD-1])
					*cp++ = c;
				continue;
			}
			if (c == '-') {
				if ((c = getc(fp)) == '\n')
					continue;
				/* ungetc(EOF, fp) is a harmless no-op */
				ungetc(c, fp);
				c = '-';
			}
			*cp = '\0';
			inword = 0;
			wordno++;
			print(wordbuf);
		}
		if (isalpha(c) || c == '\'') {
			inword = 1;
			cp = wordbuf;
			*cp++ = c;
		} else if (pflag && ispunct(c)) {
			putchar(c);
			putchar('\n');
		}
	}
	/* Flush a word that runs right up to end of file */
	if (inword) {
		*cp = '\0';
		wordno++;
		print(wordbuf);
	}
	return (ferror(fp) ? 1 : 0);
}

/*
 * Print out a word, unless it is on the ignore
 * list or missing from the only list.  With `-d'
 * the word is preceded by its input word number.
 */
void
print(char *word)
{
	if ((nignore && lookup(word)) || (nonly && !lookup(word)))
		return;
	if (dflag)
		printf("%ld\t", wordno);
	printf("%s\n", word);
}

/*
 * Enter words from the given file
 * into the hash table.  The file holds one
 * word per line; each is folded to lower case
 * before being stored.  Returns the number of
 * words entered.  Does not return if the file
 * cannot be opened or memory runs out.
 */
int
enter(char *fn)
{
	char *cp;
	WORDS *wp;
	int c;
	unsigned hash;
	size_t len;
	int nword = 0;
	FILE *fp;

	if ((fp = fopen(fn, "r")) == NULL)
		preperr("Cannot open `%s'", fn);
	while (fgets(wordbuf, NWORD, fp) != NULL) {
		hash = 0;
		cp = wordbuf;
		while ((c = (unsigned char)*cp++) != '\0') {
			if (c == '\n') {
				cp[-1] = '\0';
				break;
			}
			/*
			 * `cp' is already past the character just
			 * read, so the folded value goes in cp[-1].
			 */
			if (isupper(c))
				cp[-1] = c = tolower(c);
			hash += c;
		}
		/* `cp' is one past the NUL: length including terminator */
		len = cp - wordbuf;
		if ((wp = malloc(sizeof(WORDS) + len)) == NULL)
			preperr("Out of memory for words from `%s'", fn);
		memcpy(wp->w_name, wordbuf, len);
		wp->w_next = words[hash %= NHASH];
		words[hash] = wp;
		nword++;
	}
	fclose(fp);
	return (nword);
}

/*
 * Compare a table entry (already lower case)
 * with a word, ignoring the case of the word.
 * Returns non-zero when they are the same.
 */
static int
sameword(const char *entry, const char *word)
{
	while (*entry != '\0' || *word != '\0') {
		if ((unsigned char)*entry != tolower((unsigned char)*word))
			return (0);
		entry++;
		word++;
	}
	return (1);
}

/*
 * Lookup a word in either the only
 * or exception list.  Returns 1 if found,
 * 0 otherwise.  Both the hash and the
 * comparison ignore case, because enter()
 * stores every word in lower case.
 */
int
lookup(char *word)
{
	WORDS *wp;
	char *cp;
	unsigned hash = 0;

	for (cp = word; *cp != '\0'; cp++)
		hash += tolower((unsigned char)*cp);
	for (wp = words[hash%NHASH]; wp != NULL; wp = wp->w_next)
		if (sameword(wp->w_name, word))
			return (1);
	return (0);
}

/*
 * Print a printf-style diagnostic, prefixed by the
 * program name, on the standard error and exit with
 * status 1.
 */
void
preperr(const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "prep: ");
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fprintf(stderr, "\n");
	exit(1);
}

/*
 * Print the usage message and exit with status 1.
 */
void
usage(void)
{
	fprintf(stderr, "Usage: prep [-dfp] [-i file] [-o file] [file ...]\n");
	exit(1);
}