|
DataMuseum.dkPresents historical artifacts from the history of: DKUUG/EUUG Conference tapes |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about DKUUG/EUUG Conference tapes Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - metrics - downloadIndex: T k
Length: 11145 (0x2b89) Types: TextFile Names: »kmp.cc«
└─⟦a05ed705a⟧ Bits:30007078 DKUUG GNU 2/12/89 └─⟦cc8755de2⟧ »./libg++-1.36.1.tar.Z« └─⟦23757c458⟧ └─⟦this⟧ »libg++/etc/kmp.cc«
//From: "Douglas C. Schmidt" <schmidt@zola.ICS.UCI.EDU> //Date: Fri, 28 Jul 89 11:47:11 -0700 /* Nifty little program that illustrates an implementation of the Knuth, Morris, Pratt string matching algorithm. This program has a user interface similar to fgrep, i.e., when you provide it with a fixed pattern it prints out all the lines from the standard input (or one user-specified input file) that contain at least one substring that matches the provided pattern. Relevant options are: -i: Ignore case when performing comparisons. -n: Print out lines numbers along with the matching lines. -v: Print out lines that *don't* match the pattern. The code below is extensively commented. If these comments distract you, run the code through g++ -E first ;-). */ #include <stdio.h> #include <std.h> /* This array is designed for mapping upper and lower case letter together for a case independent comparison. The mappings are based upon the ASCII character sequences. */ char charmap[] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347', '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337', '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', }; /* The list of available program options. */ enum options { DEBUG, FOLD_CASE, LINE_NUMBERS, INVERSE, OPTION_SIZE }; /* Vector storing the enabled options (all initially disabled). */ char option[OPTION_SIZE]; /* Data and function members necessary to implement the KMP algorithm. */ class kmp { private: #ifdef GNUG const int RESET_PATTERN_FLAG = -1; #else #define RESET_PATTERN_FLAG -1 #endif int *fail_trans_table; /* Stores the generated nfa used when matching. */ char *pattern; /* Stores the user-supplied pattern. */ int pattern_len; /* Pattern length. */ void print_fail_trans_table (void); public: kmp (char *pattern, int pattern_len, int debug = 0); ~kmp () { delete fail_trans_table; } int operator ()(char *text, int text_len); }; /* Provide a debugging dump of the failure function. Nothing fancy... */ void kmp::print_fail_trans_table (void) { int i; for (i = 0; i < pattern_len; i++) printf ("%3d,", i); putchar ('\n'); for (i = 0; i < pattern_len; i++) printf ("%3d,", fail_trans_table [i]); putchar ('\n'); } /* This constructor builds the transition table that encodes the failure function automaton. The table stores the PATTERN index we go to in the event of a mismatch with the input text string. This generation process runs in time linear to the pattern size. The terms SUFFIX and PREFIX used below are defined in terms of each other. That is, at each state of the failure function we seek to determine the largest pattern prefix that matches with the largest suffix in the actual input text. This information informs us how far we can legitimately shift the pattern to the right when a mismatch occurs during the string matching process. Stated more formally, this means that for all i (0 <= i <= (PATTERN_LEN - 1)) FAIL_TRANS_TABLE (i) is the largest j < i, such that PATTERN[1..j - 1] is a suffix of PATTERN[1..i - 1] and PATTERN[i] != PATTERN[j]. */ kmp::kmp (char *pat, int len, int debug): pattern (pat), fail_trans_table (new int[len]), pattern_len (len) { /* Points 1 beyond the rightmost potential *suffix* in the pattern (which is actually simulating the behavior of an actual input text string. */ int suffix_end = 0; /* Points 1 beyond the rightmost potential *prefix* in the pattern. */ int prefix_end = fail_trans_table[suffix_end] = RESET_PATTERN_FLAG; do { /* This WHILE loop uses the precomputed failure function to look further left into the pattern trying to find an index location where the pattern prefix matches the pattern suffix. However, if/when we locate the RESET_PATTERN_FLAG this means that we can just skip ahead to the next character in the input text. */ while (prefix_end != RESET_PATTERN_FLAG && pattern[suffix_end] != pattern[prefix_end]) prefix_end = fail_trans_table[prefix_end]; /* Once SUFFIX_END and PREFIX_END are pre-incremented below we know that the first PREFIX_END characters of PATTERN match the characters in positions PATTERN[SUFFIX_END - PREFIX_END .. SUFFIX_END - 1], i.e., the last PREFIX_END characters in the rightmost part of the first SUFFIX_END - 1 characters in PATTERN. If the character at location PATTERN[SUFFIX_END] matches that at PATTERN[PREFIX_END] it is silly to have the failure transition jump to that pattern location (since it would immediately fail to match, of course!). Instead, in that case we just ``borrow'' the previously computed transition stored at FAIL_TRANS_TABLE[PREFIX_END] and use it. */ if (pattern[++suffix_end] == pattern[++prefix_end]) fail_trans_table[suffix_end] = fail_trans_table[prefix_end]; else fail_trans_table[suffix_end] = prefix_end; } /* Adding the extra 1 here is necessary since C strings are indexed from 0 to pattern_len - 1... */ while (suffix_end + 1 < pattern_len); if (debug) print_fail_trans_table (); } /* Actually perform the KMP matching algorithm using the generated determinisitic pattern matching match encoded in the failure transition table. This version is optimized on the assumption that there will be more characters in the text input that *don't* match the pattern than ones that do match it. Therefore, we make a special effort to keep looping through the failure function as long as the text and pattern don't match. */ int kmp::operator ()(char *text, int text_len) { int suffix_end = RESET_PATTERN_FLAG; int prefix_end = RESET_PATTERN_FLAG; /* If TEXT length is shorted than PATTERN we'll bail out now... */ if (text_len < pattern_len) return 0; /* Split the following two cases so that we don't pay any unnecessary overhead when case folding is not required. */ if (option[FOLD_CASE]) /* Note how this main loop is almost identical with the `failure transition table building' algorithm used above. */ do { /* Ignore case when matching and keep applying the failure function until we get a match or are forced to restart the pattern (starting with the *following* input text character, that is). */ while (prefix_end != RESET_PATTERN_FLAG && charmap[text[suffix_end]] != charmap[pattern[prefix_end]]) prefix_end = fail_trans_table[prefix_end]; if (prefix_end + 1 >= pattern_len) return 1; else ++suffix_end, ++prefix_end; } /* This conditional expression is used to terminate the search when it becomes clear that we can't match the PATTERN since the TEXT has fewer unexamined characters than the PATTERN length. */ while (text_len - suffix_end >= pattern_len - prefix_end); else /* This loop is identical with the preceding one, except that we don't bother to fold case... */ do { while (prefix_end != RESET_PATTERN_FLAG && text[suffix_end] != pattern[prefix_end]) prefix_end = fail_trans_table[prefix_end]; if (prefix_end + 1 >= pattern_len) return 1; else ++suffix_end, ++prefix_end; } while (text_len - suffix_end >= pattern_len - prefix_end); return 0; } /* The name sez it all! */ void print_usage_and_die (char *prog_name) { fprintf (stderr, "usage: %s [-inv] pattern [file]\n", prog_name); exit (1); } /* Main driver program. Emulates certain useful features of fgrep. */ int main (int argc, char **argv) { if (argc == 1) print_usage_and_die (argv[0]); /* Required for getopt. */ extern int optind; /* A rather arbitrary limit... */ const int MAX_LINE = 200; char text[MAX_LINE]; int c; /* Initialize any user-specified options. */ while ((c = getopt (argc, argv, "dinv")) != EOF) switch (c) { case 'd': option[DEBUG] = 1; break; case 'i': option[FOLD_CASE] = 1; break; case 'n': option[LINE_NUMBERS] = 1; break; case 'v': option[INVERSE] = 1; break; default: print_usage_and_die (argv[0]); } /* Call the constructor to build the failure function table. */ kmp match (argv[optind], strlen (argv[optind]), option[DEBUG]); /* Handle a user-specified input file. */ if (argv[++optind] && !freopen (argv[optind], "r", stdin)) { perror (argv[0]); return 1; } /* Split the following into two separate cases to avoid overhead when line numbers are not required. */ if (option[LINE_NUMBERS]) { int line; for (line = 1; gets (text); line++) if (match (text, strlen (text)) != option[INVERSE]) printf ("%d:%s\n", line, text); } else { while (gets (text)) if (match (text, strlen (text)) != option[INVERSE]) puts (text); } return 0; }