|
|
DataMuseum.dkPresents historical artifacts from the history of: DKUUG/EUUG Conference tapes |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about DKUUG/EUUG Conference tapes Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - metrics - downloadIndex: T m
Length: 10592 (0x2960)
Types: TextFile
Names: »markov3.l«
└─⟦b20c6495f⟧ Bits:30007238 EUUGD18: Wien-båndet, efterår 1987
└─⟦this⟧ »EUUGD18/General/Article/markov3.l«
%{
/*
* Copyright (c) 1986 by Joe Buck
*
* Permission is granted to use, redistribute, or modify this program,
* as long as you don't pretend you wrote it. Send improvements or
* bug reports to {ihnp4,hplabs,sun}!oliveb!epimass!jbuck.
*
* The program generates simulated Usenet articles, given Usenet articles
* as input.
*
* This program constructs a table of frequencies for a token appearing,
* given the two preceding tokens. A "token" is a sequence of non-blank
* characters. An entirely blank line is also treated as a token, as is
* the beginning and end of an article.
*
* The program is designed to process news articles, rejecting text from
* the header, signature, and included text, together with cruft inserted
* by rn.
*
* After the table is built (and it can be big), articles are generated
* on the standard output.
*/
int in_included_text = 0;
%}
%Start HDR BODY SIG
%%
<HDR>^[^ \t]+:.*\n ; /* Header line, e.g. "From: foo@bar.UUCP" */
<HDR>^[ \t]+[^ \t].*\n ; /* Continuation of header line */
<HDR>^[ \t]*$ BEGIN BODY;
<BODY>^[><].*\n { if (!in_included_text) {
in_included_text = 1;
process_token ("\n> ...\n\n");
}
}
<BODY>^"In article".*\n ; /* Reject rn crud */
<BODY>^"-- "$ BEGIN SIG;
<BODY>[ \t]+ ; /* Skip white space */
<BODY>\n[ \t\n]*\n { process_token ("\n"); /* Paragraph break */}
<BODY>[^ \t\n]+ { in_included_text = 0; process_token (yytext); }
<HDR>. ; /* Eat anything that escaped */
<HDR>\n ;
<BODY>\n ;
<SIG>. ;
<SIG>\n ;
%%
void perror(), exit();
char *strcpy(), *malloc();
extern int optind;
extern char *optarg;
/*
* hashtab is a hash table storing all the tokens we encounter.
*/
struct htentry {
char *htext;
struct htentry *hlink;
};
#define HSIZE 3557 /* Should be prime */
#define Fprintf (void)fprintf
#define Printf (void)printf
struct htentry hashtab[HSIZE];
/*
* node and succnode are portions of the big structure we're going to build.
* node represents something like ("was", "a") in a binary tree.
* a linked list of succnodes contain tokens that may follow ("was", "a")
*/
struct node {
char *text;
char *text2;
int ocount;
struct node *lc, *rc;
struct succnode *succ;
};
struct succnode {
struct node *scnod;
int count;
struct succnode *link;
};
struct node *prev_code = NULL;
char *prev_token = NULL, **Argv;
int init_state = HDR;
int verbose = 0;
struct node *root = NULL, *tknptr;
struct succnode *start = NULL;
int n_pairs = 0, n_tokens = 0, n_files = 0, n_total = 0;
struct node *insert_token();
char *savetoken();
process_token (txt)
char *txt;
{
struct node *code;
char *token = savetoken (txt);
/* We have a new token. Say the previous two tokens were "one" "way"
* and the current token is "to". Then prev_code points to a node
* for ("one", "way") and token is "to". This function adds ("way", "to") as a
* successor to ("one","way") and makes prev_code point to ("way","to").
*/
code = insert_token (prev_token, token);
insert_pair (prev_code, code);
prev_code = code;
prev_token = token;
return;
}
/*
* here it is.
*/
main (argc, argv)
int argc;
char **argv;
{
int i, c, n_articles = 10;
char *dumpfile = NULL;
extern int optind;
extern char *optarg;
while ((c = getopt (argc, argv, "pvn:d:s:")) != EOF) {
switch (c) {
case 'v':
verbose = 1;
break;
case 'p': /* Input is plain text, not Usenet stuff */
init_state = BODY;
break;
case 'n': /* # articles to generate */
n_articles = atoi (optarg);
break;
case 'd': /* where to dump the data structure */
dumpfile = optarg;
break;
case 's': /* Set the seed for rand */
srand (atoi (optarg));
break;
default:
Fprintf (stderr, "Usage: markov3 [-p] [-n n_art] [-d dump] files");
exit (1);
}
}
BEGIN init_state; /* initial state of lexical analyzer */
/* Note: if optind == argc, there are no file arguments. yyin is left
* initialized to stdin.
*/
if (optind < argc) {
/* yyin is lex input stream. Point to first file. */
if ((yyin = fopen (argv[optind], "r")) == NULL) {
perror (argv[optind]);
exit (1);
}
}
Argv = argv; /* make it global so yywrap can access it */
n_files = 1;
/* yylex puts all the input files through the lexical analyzer and builds
* the database.
*/
(void) yylex ();
if (dumpfile)
dump_database (dumpfile);
if (verbose)
Fprintf (stderr,
"Total of %d tokens (%d different), %d different pairs, %d files\n",
n_total, n_tokens, n_pairs, n_files);
/* Generate the articles, separated by form feeds */
for (i = 0; i < n_articles; i++) {
if (i > 0) output_word ("\n\f\n");
generate_article ();
}
return 0;
}
/*
* Lex calls this when EOF is reached. It opens the next file if there
* is one. Lex interprets a return value of 1 to mean "all done" and 0
* to mean "keep going".
*/
yywrap () {
(void) fclose (yyin);
insert_pair (prev_code, (struct node *)0);
prev_code = NULL;
if (Argv[optind] == NULL) return 1;
else if ((yyin = fopen (Argv[optind], "r")) == NULL) {
perror (Argv[optind]);
exit (1);
}
optind++;
in_included_text = 0;
if (verbose && n_files % 10 == 0)
Fprintf (stderr, "%d files\n", n_files);
n_files++;
BEGIN init_state;
return 0;
}
/*
* This function saves a token in the hash table (if it isn't there
* already) and returns a pointer to the stored copy.
*/
char *
savetoken (txt)
char *txt;
{
int h;
char *p;
struct htentry *hp;
n_total++;
for (p = txt, h = 0; *p; h += *p++);
hp = hashtab + (h % HSIZE);
while (hp->hlink) {
if (strcmp (hp->htext, txt) == 0) {
return hp->htext;
}
hp = hp->hlink;
}
/* OK, it's a new token. Make hp->hlink point to a new,
* null block and make hp->htext point to the text.
*/
hp->hlink = (struct htentry *) malloc (sizeof *hp);
hp->htext = malloc ((unsigned)(strlen (txt) + 1));
(void) strcpy (hp->htext, txt);
hp->hlink->hlink = NULL;
hp->hlink->htext = NULL;
n_tokens++;
return hp->htext;
}
/*
* This recursive function inserts a token pair into the tree.
*/
struct node *
insert_in_tree (p, txt, txt2)
struct node *p;
char *txt, *txt2;
{
int cmp;
if (p == NULL) {
/* Create a new node. */
p = (struct node *) malloc (sizeof *p);
p->text = txt;
p->text2 = txt2;
p->lc = p->rc = NULL;
p->succ = NULL;
p->ocount = 1;
tknptr = p;
n_pairs++;
if (verbose && n_pairs % 1000 == 0)
Fprintf (stderr, "%d pairs\n", n_pairs);
return p;
}
cmp = my_strcmp (p->text, txt);
if (cmp == 0) cmp = my_strcmp (p->text2, txt2);
if (cmp == 0) {
/* It's a match. Increment the count. */
tknptr = p;
p->ocount += 1;
}
/* Look in the subtrees. */
else if (cmp < 0) p->lc = insert_in_tree (p->lc, txt, txt2);
else p->rc = insert_in_tree (p->rc, txt, txt2);
return p;
}
/*
* This just calls insert_in_tree starting at the root
*/
struct node *
insert_token (txt, txt2)
char *txt,*txt2;
{
root = insert_in_tree (root, txt, txt2);
return tknptr;
}
/*
* This function adds a successor.
*/
struct succnode *
insert_in_succ_chain (sp, np)
struct succnode *sp;
struct node *np;
{
if (sp == NULL) {
sp = (struct succnode *) malloc (sizeof *sp);
sp->scnod = np;
sp->count = 1;
sp->link = NULL;
}
else if (sp->scnod == np)
sp->count += 1;
else sp->link = insert_in_succ_chain (sp->link, np);
return sp;
}
/*
* This calls insert_in_succ_chain starting at the right place.
*/
insert_pair (p1, p2)
struct node *p1, *p2;
{
if (p1) p1->succ = insert_in_succ_chain (p1->succ, p2);
else start = insert_in_succ_chain (start, p2);
}
/*
* This function dumps the stored data structure onto a file.
* Now if only I had a function to read it back in.
*/
char *
pr_token (txt)
char *txt;
{
if (txt[0] != '\n')
return txt;
return txt[1] ? "<INCL>" : "<LF>";
}
treedump (tree, fp)
struct node *tree;
FILE *fp;
{
if (tree) {
treedump (tree->rc, fp);
Fprintf (fp, "( %s %s ) %d", pr_token (tree->text),
pr_token (tree->text2), tree->ocount);
chaindump (tree->succ, fp);
treedump (tree->lc, fp);
}
}
/*
* Subroutine of treedump; it does one row.
*/
chaindump (p, fp)
struct succnode *p;
FILE *fp;
{
char *text;
while (p) {
if (p->scnod == NULL)
text = "<EOF>";
else text = pr_token (p->scnod->text2);
Fprintf (fp, " %s %d", text, p->count);
p = p->link;
}
putc ('\n', fp);
}
/*
* This routine generates the dump file (-d option)
*/
dump_database (file)
char *file;
{
FILE *fp = fopen (file, "w");
if (fp == NULL) {
Fprintf (stderr, "markov: can't open ");
perror (file);
exit (1);
}
Fprintf (fp, "START:");
chaindump (start, fp);
treedump (root, fp);
}
/* roll (n) generates a uniformly distributed rv between 0 and n-1.
* This may need to be changed for your system.
*/
roll (n) {
return ((((rand()>>8)&0x7fff)*n)/0x7fff);
}
/*
* Here it is! This function generates an article by traversing the
* structure we've built.
*/
generate_article () {
struct succnode *p = start;
int ncounts = n_files;
int n, accum;
char *tp;
while (1) {
/* Roll the dice to find out the next token. The code below selects the
* next token, and the new state, with a probability corresponding to the
* frequency in the input.
*/
n = roll (ncounts);
accum = p->count;
while (accum <= n && p->link) {
p = p->link;
accum += p->count;
}
tp = p->scnod->text2;
/* Check for "end of story" */
if (p->scnod == (struct node *) NULL || tp == (char *) NULL)
break;
output_word (tp);
ncounts = p->scnod->ocount;
p = p->scnod->succ;
}
putchar ('\n');
return;
}
/*
* This version handles null strings *
*/
my_strcmp (a, b)
register char *a, *b;
{
if (a == NULL) return b ? -1 : 0;
if (b == NULL) return 1;
return strcmp (a, b);
}
#define LEN 75
output_word (word)
char *word;
{
static char line[LEN+1];
static int room = LEN;
int l = strlen (word);
/* If word won't fit, or starts with \n, dump the current line */
if ((l >= room || word[0] == '\n') && line[0]) {
Printf ("%s\n", line);
line[0] = 0;
room = LEN;
}
/* If word won't fit in the buffer or starts with \n, print it now */
if (l >= LEN)
Printf ("%s\n", word);
else if (word[0] == '\n')
Printf ("%s", word);
/* Otherwise fill it in */
else {
(void)strcat (line, word);
(void)strcat (line, " ");
room -= (l + 1);
}
return;
}