⟦f4102919b⟧

TextFile

/*
 * autopun - Phrase reparser.
 *
 * Usage:
 *	autopun [-e English_dict] [-p Phone_dict]
 * or
 *	autopun [-e English_dict] -c -p Phone_dict
 *
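 * Given an English phrase on stdin, autopun prints to stdout a table
 * that can be used to create phonemically similar phrases.  E.g.,
 * "Happy Birthday" can be recast as "Hub pip Earth tee".
 *
 * With -p alone, the phonemic dictionary is read in place of the English
 * one; with -c, the English dictionary is translated and the result is
 * also written to Phone_dict.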
 */

#include <stdio.h>
#include "phoneme.h"
#define TRUE 1
#define FALSE 0

char *cmd;				/* basename of argv[0], for messages	*/
int phread = FALSE;			/* TRUE: read the phone dict (-p only)	*/
int phcreate = FALSE;			/* TRUE: create the phone dict (-c)	*/
char *engldict = "/usr/dict/words";	/* English dictionary file (-e)		*/
char *phonedict;			/* phonemic dictionary file (-p)	*/
char *indictname;			/* name of the dict being read		*/
FILE *indict;				/* the open dict being read		*/
FILE *outdict;				/* the open dict being written;		*/
					/* it is always opened on phonedict	*/

/*
 * struct amatch - representation of the phonemic match of a word with
 * some part of the phrase.  All matches that start at the same phoneme
 * position are chained through `next' from that position's posinfo entry.
 */

struct amatch {
    struct amatch *next;	/* next word matching at this position	*/
    char *text;			/* English text of the matching word	*/
    short nextpos;		/* position of phoneme after this word	*/
				/* (start position + # of phonemes)	*/
};

/*
 * struct posinfo - information about one phonemic position in the phrase.
 */

struct posinfo {
    struct amatch *wlist;	/* words that match starting at this point, */
				/* kept by recmatch() in non-increasing	    */
				/* order of nextpos			    */
    unsigned num_parents;	/* # of words leading directly to this point */
				/* (recomputed from scratch by prune())	    */
};

/*
 * posinfo[] - indexed by phoneme position in the phrase; entry i describes
 * the words whose phonemes match the phrase starting at target[i].
 */

#define MAX_PHONES	300	/* Size of the phoneme buffers; a phrase   */
				/* holds at most MAX_PHONES - 1 phonemes   */
				/* plus the P_end terminator		   */
struct posinfo posinfo[MAX_PHONES];

short target[MAX_PHONES];	/* mapped phonemes of the phrase to reparse, */
				/* terminated by P_end			     */
int targlen;			/* # of phonemes in target[] (less P_end) */

/*
 * phmap[] - phoneme map.  Maps a set of similar-sounding phonemes into one.
 *  Used in preprocessing strings before comparison.
 *
 * The table is indexed by phoneme code and yields the representative code
 * for that phoneme's group (e.g. both P_p and P_b yield P_p).  A phoneme
 * whose entry is P_end (P_PAS below) is deleted outright by mapphrase().
 *
 * The comment on each row names the phoneme being mapped.  The rows are
 * presumably in the same order as the codes in phoneme.h -- confirm
 * against that header before adding or reordering entries.
 */

short phmap[P_NUM] = {
    P_end,	/* P_end */
    P_IY,	/* P_IY */
    P_IY,	/* P_IH */
    P_EY,	/* P_EY */
    P_EY,	/* P_EH */
    P_AE,	/* P_AE */
    P_AE,	/* P_AA */
    P_AE,	/* P_AO */
    P_AE,	/* P_OW */
    P_AE,	/* P_UH */
    P_UW,	/* P_UW */
    P_UW,	/* P_ER */
    P_AE,	/* P_AX */
    P_AE,	/* P_AH */
    P_AY,	/* P_AY */
    P_AE,	/* P_AW */
    P_OY,	/* P_OY */
    P_p,	/* P_p */
    P_p,	/* P_b */
    P_t,	/* P_t */
    P_t,	/* P_d */
    P_k,	/* P_k */
    P_g,	/* P_g */
    P_f,	/* P_f */
    P_f,	/* P_v */
    P_TH,	/* P_TH */
    P_f,	/* P_DH */
    P_s,	/* P_s */
    P_s,	/* P_z */
    P_s,	/* P_SH */
    P_s,	/* P_ZH */
    P_HH,	/* P_HH */
    P_m,	/* P_m */
    P_n,	/* P_n */
    P_n,	/* P_NG */
    P_l,	/* P_l */
    P_l,	/* P_w */
    P_y,	/* P_y */
    P_r,	/* P_r */
    P_CH,	/* P_CH */
    P_CH,	/* P_j */
    P_WH,	/* P_WH */
    P_end	/* P_PAS */
};

/*
 * main() - parse the command line, open the dictionaries, then read one
 * phrase from stdin and reparse it.
 *
 * Switches (each must be a separate argument):
 *	-c	 create the phonemic dictionary (requires -p)
 *	-p file	 phonemic dictionary; written when -c is given,
 *		 otherwise read in place of the English dictionary
 *	-e file	 English dictionary to translate
 *
 * Exits with status 0 on success and 1 on any usage or I/O error.
 */

main(argc, argv)
int argc;
char **argv;
{
    char *cp;
    char line[300];
    int len;			/* length of the phrase just read	*/
    char *strrchr();

    /*
     * get the basename of the command name,
     * for use in error messages.
     */

    if ((cmd = strrchr(argv[0], '/'))) {
        ++cmd;
    } else {
        cmd = argv[0];
    }

    while (++argv, --argc > 0) {
        cp = *argv;
        if (*cp == '-' && *(cp + 1) != '\0') {
            switch(*++cp) {
            case 'c':	/* "create the phonemic dictionary" */
                        /* (requires -p flag)		    */
                phcreate = TRUE;
                break;
            case 'p':	/* "filename of phonemic dictionary" */
                /* the filename is the next argument; another switch */
                /* in its place means the filename was left out      */
                ++argv;
                if (--argc <= 0 ||
                  (**argv == '-' && *(*argv + 1) != '\0')) {
                    bomb("missing -p filename");
                }
                phonedict = *argv;
                break;
            case 'e':	/* "filename of English dictionary" */
                ++argv;
                if (--argc <= 0) {
                    bomb("missing -e filename");
                }
                engldict = *argv;
                break;
            default:
                bomb("unknown switch `%c'", *cp);
            }
        } else {
            bomb("extra filename `%s'", *argv);
        }
    }
    if (phcreate && !phonedict) {
        bomb("missing -p flag");
    }
    phread = (phonedict && !phcreate);

    /*
     * open up the necessary files:
     * Input is either an English or phonemic dictionary;
     * Output is a phonemic dictionary (if requested).
     */

    if (phread) {
        indictname = phonedict;
    } else {
        indictname = engldict;
    }
    if (!(indict = fopen(indictname, "r"))) {
        fprintf(stderr, "%s: can't open \"%s\" -- ", cmd, indictname);
        perror("");
        exit(1);
    }

    if (phcreate) {
        /* opening the input for writing would truncate it */
        if (strcmp(indictname, phonedict) == 0) {
            bomb("can't overwrite `%s'", phonedict);
        }
        if (!(outdict = fopen(phonedict, "w"))) {
            fprintf(stderr, "%s: can't create \"%s\" -- ", cmd, phonedict);
            perror("");
            exit(1);
        }
    }

    /*
     * Grab a phrase and process it.
     */

    if (isatty(fileno(stdin))) {
        fputs("Enter English text: ", stderr);
        fflush(stderr);
    }
    if (fgets(line, sizeof line, stdin)) {
        /*
         * Remove the terminating \n, but only if there is one:
         * input that ends at EOF or that overflows line[] has none,
         * and its last character must be kept.
         */
        len = strlen(line);
        if (len > 0 && line[len - 1] == '\n') {
            line[len - 1] = '\0';
        }
        reparse(line);
    }

    /*
     * Close up shop, making sure that any I/O errors are reported
     */

    if (ferror(indict)) {
        fprintf(stderr, "%s: problem reading \"%s\" -- ", cmd, indictname);
        perror("");
        exit(1);
    }
    (void) fclose(indict);
    if (phcreate) {
        /* fclose() flushes, so a late write error shows up here */
        if (fclose(outdict) != 0) {
            fprintf(stderr, "%s: problem writing \"%s\" -- ", cmd, phonedict);
            perror("");
            exit(1);
        }
    }
    exit(0);
}

/*
 * reparse() - given a line of English text, 
 * Find and print the info necessary to reparse that line's phonemes
 * into other English phrases.
 */

reparse(text)
char *text;
{
    char dictword[MAX_PHONES];	/* an English word from the dictionary	      */
    char *textcopy;		/* dynamic copy of the text		      */
    short phrase[MAX_PHONES];	/* the mapped, phonemic version of the phrase */
    short testword[MAX_PHONES];	/* ditto for a word from the dictionary	*/
    int twordlen;		/* # of phonemes in testword[] (less P_end)   */
    register short *sp, *dp;	/* temp source and dest phoneme pointers      */
    int idx;			/* index where a match started		      */
    short *xlate_line();
    short *mapphrase();
    char *strsave();

    /*
     * Translate the input phrase and copy it to a safe place.
     */

    sp = xlate_line(text);
    (void) mapphrase(sp);

    dp = target;
    while (*sp != P_end) {
	*dp++ = *sp++;
    }
    *dp = P_end;
    targlen = dp - &target[0];

    /*
     * For each word in the dictionary,
     *   Convert that word into phonemic codes;
     *   Write the converted codes to the phonemic dictionary (if necessary);
     *   Record where that word would fit into the input phrase.
     */

    while (fgets(dictword, MAX_PHONES, indict)) {
	dictword[strlen(dictword) - 1] = '\0';

	if (phread) {
	    twordlen = encphones(dictword, testword);
	} else {
	    sp = xlate_line(dictword);
	    twordlen = mapphrase(sp) - sp;
	    if (twordlen == 0) {
		continue;			/* (loop leap) */
	    }
	    dp = testword;
	    while (*sp != P_end) {
		*dp++ = *sp++;
	    }
	    *dp = P_end;

	    if (phcreate) {
		writephones(dictword, testword);
	    }
	}

	/*
	 * Search for and record matches until
	 * one can't possibly exist (too few phonemes left).
	 */

	sp = target;
	dp = &target[targlen];
	textcopy = (char *) 0;
	while (dp - sp >= twordlen &&
	  (idx = wordidx(sp, testword)) != -1) {
	    sp += idx + 1;
	    if (!textcopy) {
		textcopy = strsave(dictword);
	    }
	    recmatch((sp - 1) - target, twordlen, textcopy);
	}
    }

    prune();
    saymatches(text);
}

/*
 * prune() - prune away useless matches:
 * remove potential matches that lead to unmatchable parts of the phrase;
 * remove unreachable match lists.
 */

prune()
{
    int pos;
    struct amatch *prevm;
    struct amatch *curm;

    /*
     * note and remove all the matches that lead to an unmatchable point.
     */

    for (pos = 0; pos < MAX_PHONES; ++pos) {
	posinfo[pos].num_parents = 0;
    }
    for (pos = MAX_PHONES - 1; pos >= 0; --pos) {
	prevm = (struct amatch *) 0;
	curm = posinfo[pos].wlist;
	while (curm) {

	    /*
	     * If this word leads us to the end, everything is o.k.
	     * If this word leads us to a matchable point,
	     * note that we can reach that point.
	     * Otherwise, this word is a dead-end -- remove it.
	     */

	    if (curm->nextpos >= targlen) {
		prevm = curm;
	    } else if (posinfo[curm->nextpos].wlist) {
		++posinfo[curm->nextpos].num_parents;
		prevm = curm;
	    } else {
		if (!prevm) {
		    posinfo[pos].wlist = curm->next;
		} else {
		    prevm->next = curm->next;
		}
		/* (we should free curm here if we are reclaiming space) */
	    }
	    curm = curm->next;
	}
    }

    /*
     * Find and remove each unreachable point in the phrase
     * (except the first one).
     * This traversal cascades forward.
     */

    for (pos = 1; pos < MAX_PHONES; ++pos) {
	if (posinfo[pos].num_parents > 0) continue;
	for (curm = posinfo[pos].wlist; curm; curm = curm->next) {
	    if (curm->nextpos >= targlen) continue;

	    --posinfo[curm->nextpos].num_parents;
	}
	posinfo[pos].wlist = (struct amatch *) 0;
	/* (if we were reclaiming space, here's where we'd do it */
    }
}

/*
 * saymatches() - print the phrase match information table.
 *
 * The original text comes first.  Each position that still has matches
 * then gets a "NN:" heading followed by its words as "word:NN", where NN
 * is the position the word leads to, or "word:$ " when the word reaches
 * the end of the phrase.  Rows are wrapped before column 70.
 */

saymatches(text)
char *text;		/* the original text	*/
{
    int pos;
    int col;		/* columns used so far on the current row  */
    int width;		/* columns the next entry is counted as    */
    struct amatch *m;

    printf("%s\n", text);
    for (pos = 0; pos < MAX_PHONES; ++pos) {
        m = posinfo[pos].wlist;
        if (m) {
            printf("%02d:\n", pos);
            col = 0;
            for (; m; m = m->next) {
                /* blank + word + colon + two-character target */
                width = strlen(m->text) + 4;
                if (col + width >= 70) {
                    printf("\n");
                    col = 0;
                }
                if (col == 0) {
                    /* every row starts with one extra blank */
                    printf(" ");
                    col = 1;
                }
                printf(" %s:", m->text);
                if (m->nextpos < targlen) {
                    printf("%02d", m->nextpos);
                } else {
                    printf("$ ");
                }
                col += width;
            }
            printf("\n");
        }
    }
}

/*
 * mapphrase() - replace each phoneme of a P_end-terminated word or phrase
 * by its phmap[] representative, in place.  Phonemes that map to P_end
 * are dropped, so the result may be shorter than the input.
 * Returns a pointer to the new P_end terminator.
 */

short *
mapphrase(pp)
short *pp;	/* a P_end-terminated word/phrase to map	*/
{
    short *out;		/* where the next kept phoneme goes	*/
    short mapped;	/* phmap[] image of the current phoneme	*/

    for (out = pp; *pp != P_end; ++pp) {
        mapped = phmap[*pp];
        if (mapped != P_end) {
            *out++ = mapped;
        }
    }
    *out = P_end;
    return(out);
}

/*
 * wordidx() - find the first occurrence of a word within a phrase.
 * Returns the index in the phrase where the word starts, or -1 if the
 * word does not occur.
 *
 * Both arguments are compared as they stand, so both are expected to
 * have been mapped by phmap[] already.
 */

int
wordidx(phrase, word)
short *phrase;		/* a P_end-terminated list of phonemes	*/
short *word;		/* ditto				*/
{
    int start;		/* candidate starting index in phrase	*/
    int i;		/* # of word phonemes matched so far	*/

    for (start = 0; phrase[start] != P_end; ++start) {
        /*
         * The phrase's own P_end never equals a word phoneme,
         * so the scan cannot run off the end of the phrase.
         */
        for (i = 0; word[i] != P_end && phrase[start + i] == word[i]; ++i) {
            /* (empty body) */
        }
        if (word[i] == P_end) {
            return(start);
        }
    }
    return(-1);
}

/*
 * recmatch() - record a match: allocate a match element for `text' and
 * insert it in the word list of position `pos'.  The list is kept in
 * non-increasing order of nextpos; a new element goes in front of the
 * existing elements that have the same nextpos.
 */

recmatch(pos, phlen, text)
int pos;		/* position of the match within the phrase	*/
int phlen;		/* # of phonemes matched			*/
char *text;		/* text that matched				*/
{
    struct amatch **linkp;	/* the link the new element will hang on */
    struct amatch *entry;
    struct amatch *matchalloc();

    entry = matchalloc();
    entry->text = text;
    entry->nextpos = pos + phlen;

    /* step over every element that reaches further than the new one */
    linkp = &posinfo[pos].wlist;
    while (*linkp && entry->nextpos < (*linkp)->nextpos) {
        linkp = &(*linkp)->next;
    }

    entry->next = *linkp;
    *linkp = entry;
}

/*
 * writephones() - append one entry to the phonemic dictionary `outdict':
 * the English text, a space, one character per phoneme (its code plus
 * '!'), and a newline.  A write error is reported and ends the program
 * with status 1.
 */

writephones(ep, sp)
char *ep;		/* english text 		*/
short *sp;		/* corresponding phonemes	*/
{
    short *code;

    fputs(ep, outdict);
    putc(' ', outdict);
    for (code = sp; *code != P_end; ++code) {
        putc((int) *code + (int) '!', outdict);
    }
    putc('\n', outdict);

    if (ferror(outdict)) {
        fprintf(stderr, "Error: problem writing \"%s\" -- ", phonedict);
        perror("");
        exit(1);
    }
}

/*
 * encphones() - encode ascii from a phoneme file into phonetic codes.
 */

int		/* returns # of phonemes in word[]		*/
encphones(text, word)
register char *text;	/* encoded phonemes (less the newline) */
short *word;		/* where to put the phonemes		*/
{
    register short *sp;
    char *strchr();

    /*
     * separate the English text from its encoded form
     */

    text = strchr(text, ' ');
    *text++ = '\0';

    sp = word;
    while (*text) {
	*sp = (short) (*text - '!');
	++text, ++sp;
    }
    *sp = P_end;
    return(sp - word);
}

/*
 * matchalloc() - hand out the next unused element of a fixed, static
 * pool of match elements.  Elements are never returned to the pool.
 * When the pool is used up, an error is printed and the program exits
 * with status 1.
 */

struct amatch *
matchalloc()
{
#define MAX_MATCHES 1000	/* max # of matches in a phrase		*/
    static struct amatch matchpool[MAX_MATCHES];
    static int used = 0;	/* # of elements handed out so far	*/

    if (used >= MAX_MATCHES) {
        fprintf(stderr, "Error: too many matches (over %d)\n", MAX_MATCHES);
        exit(1);
    }
    return (&matchpool[used++]);
}

/*
 * strsave() - return a freshly malloc'ed copy of the given string.
 * If no memory is available, an error is printed and the program exits
 * with status 1, so the result is never a null pointer.
 */

char *
strsave(s)
char *s;
{
    char *malloc();
    char *copy = malloc(strlen(s) + 1);	/* + 1 for the terminating '\0' */

    if (copy == (char *) 0) {
        fprintf(stderr, "Error: out of memory saving \"%s\"\n", s);
        exit(1);
    }
    (void) strcpy(copy, s);
    return(copy);
}

/*
 * bomb() - report a fatal usage error and exit with status 1.
 * Prints "<cmd>: <message>" followed by a usage summary on stderr.
 * `str' is a printf-style format; up to three arguments may follow.
 *
 * The extra arguments are declared as pointers, not ints, so that a
 * string passed for %s reaches fprintf() untruncated where pointers are
 * wider than int.  This is still the pre-ANSI VARARGS convention (callers
 * pass fewer and differently typed arguments than are declared); a fully
 * portable version needs <stdarg.h> and a prototype visible to callers.
 */

/* VARARGS 1 */
bomb(str, a1, a2, a3)
char *str;
char *a1, *a2, *a3;
{
    fprintf(stderr, "%s: ", cmd);
    fprintf(stderr, str, a1, a2, a3);
    fprintf(stderr, "\n");
    fprintf(stderr, "Usage:\n %s [-c] [-e English_dict] [-p Phone_dict]\n",
      cmd);
    exit(1);
}
DataMuseum.dk

DKUUG/EUUG Conference tapes

⟦f4102919b⟧ TextFile

Derivation

TextFile