|
DataMuseum.dk Presents historical artifacts from the history of: DKUUG/EUUG Conference tapes |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about DKUUG/EUUG Conference tapes Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - downloadIndex: ┃ T m ┃
Length: 6218 (0x184a) Types: TextFile Names: »munchlist.sh«
└─⟦a0efdde77⟧ Bits:30001252 EUUGD11 Tape, 1987 Spring Conference Helsinki └─ ⟦this⟧ »EUUGD11/euug-87hel/sec1/ispell/munchlist.sh«
: Use /bin/sh
#
#       Given a list of words for ispell, generate a reduced list
#       in which all possible suffixes have been collapsed.  The reduced
#       list will match the same set of words as the original.
#
#       Usage:
#
#       munchlist [ -d hashfile ] [ -e ] [ -w chars ] [ file ] ...
#
#       Options:
#
#       -d hashfile
#               Remove any words that are covered by 'hashfile'.  The
#               default is the default ispell dictionary.  The words
#               will be removed only if all suffixes are covered by
#               the hash file.  A hashfile of /dev/null should be
#               specified when the main dictionary is being munched.
#       -e      Economical algorithm.  This will use much less temporary
#               disk space, at the expense of time.  Useful with large files
#               (such as complete dictionaries).
#       -w      Passed on to ispell (specify chars that are part of a word)
#
#       The given input files are merged, then processed by 'ispell -c'
#       to generate possible suffix lists;  these are then combined
#       and reduced.  The final result is written to standard output.
#
#       For portability to older systems, I have avoided getopt.
#
#       Geoff Kuenning
#       2/28/87
#

# Installation-specific settings.
LIBDIR=/tmp2/lib                # Must match config.h
DEFDICT=dict.191                # Must match config.h (not referenced below)
EXPAND1=${LIBDIR}/expand1.sed   # first of the two suffix-expansion sed scripts
EXPAND2=${LIBDIR}/expand2.sed   # second of the two suffix-expansion sed scripts

# Every temporary file is named ${TMP}<letter>, so the glob ${TMP}* covers
# all of them at cleanup time.
# NOTE(review): the name is predictable (directory + PID); if TDIR can be a
# shared, world-writable directory, consider a private temp directory.
TDIR=${TMPDIR:-/usr/tmp}
TMP=${TDIR}/munch$$

USAGE='Usage: munchlist [ -d hashfile ] [ -e ] [ -w chars ] [ file ] ...'

#
#       Parse the options by hand (getopt is deliberately avoided for
#       portability).  Parsing stops at the first argument that is not
#       -d, -e, or -w; whatever remains in "$@" is the list of input files.
#
cheap=no        # becomes "yes" when -e is given
dictopt=        # "", "NONE" (for -d /dev/null), or "-d hashfile"
wchars=         # "" or "-w chars"; passed to ispell unquoted on purpose
while [ $# -ne 0 ]
do
    case "$1" in
        -d)
            # Without this check the second shift below would run with
            # nothing left to shift, which aborts some shells.
            if [ $# -lt 2 ]
            then
                echo "munchlist: -d requires a hashfile argument" 1>&2
                echo "$USAGE" 1>&2
                exit 1
            fi
            case "$2" in
                /dev/null)
                    dictopt=NONE
                    ;;
                *)
                    dictopt="-d $2"
                    ;;
            esac
            shift
            ;;
        -e)
            cheap=yes
            ;;
        -w)
            if [ $# -lt 2 ]
            then
                echo "munchlist: -w requires a character list" 1>&2
                echo "$USAGE" 1>&2
                exit 1
            fi
            wchars="-w $2"
            shift
            ;;
        *)
            break
            ;;
    esac
    shift
done

#
#       Awk program to combine suffixes onto one line.  Consecutive input
#       lines that share the same first field are merged into a single
#       "root/suffix/suffix..." line.  (The later stages run it with -F/.)
#
AWKMUNCH='
{
    if ($1 != old1 && old1 != "") {
        print old1 suffixes
        suffixes = ""
    }
    old1 = $1
    for (i = 2; i <= NF; i++)
        suffixes = suffixes "/" $i
}
END {
    if (old1 != "")
        print old1 suffixes
}'

#
#       Awk program to break suffixes up into one per line.  For each input
#       line it prints the bare root, then one "root/suffix" line per suffix.
#
AWKUNMUNCH='
{
    print $1
    for (i = 2; i <= NF; i++)
        print $1 "/" $i
}'

# Remove the temporary files if we are interrupted (signals 1, 2 and 15).
# ${TMP} is expanded now, while the trap string is being built.
trap "/bin/rm -f ${TMP}*; exit 1" 1 2 15

#
#       Collect all the input (cat), convert to uppercase (tr), expand all
#       the suffix options (two sed's), and preserve (sorted) for later
#       joining.  Unless an explicitly null dictionary was specified, remove
#       all expanded words that are covered by the dictionary (ispell).
#
if [ "X$dictopt" = "XNONE" ]
then
    cat "$@" | tr '[a-z]' '[A-Z]' \
      | sed -f "$EXPAND1" | sed -f "$EXPAND2" | sort -u > "${TMP}a"
else
    # $dictopt is left unquoted so that "-d hashfile" splits into two words
    # (and vanishes entirely when no -d option was given).
    cat "$@" | tr '[a-z]' '[A-Z]' \
      | sed -f "$EXPAND1" | sed -f "$EXPAND2" | sort -u \
      | ispell -l $dictopt -p /dev/null > "${TMP}a"
fi

#
#       Munch the input to generate roots and suffixes (ispell -c).  We are
#       only interested in words that have at least one suffix (grep for /);
#       the next step will pick up the rest.  Some of the roots are illegal.
#       We use join to restrict the output to those root words that are found
#       in the original dictionary.
#
#       Note:  one disadvantage of this pipeline is that for a large file,
#       the join and awk may be sitting around for a long time while ispell
#       and sort run.  You can get rid of this by splitting the pipe, at
#       the expense of more temp file space.
#
# Stage 2: build ${TMP}b, the list of root/suffix pairs whose root appears
# in ${TMP}a.  With -e (cheap=yes) the suffixes of each root are folded onto
# one line by AWKMUNCH; otherwise the list keeps one pair per line.
# The sort keys "-k1,1 -k2" are the modern spelling of the obsolete
# "+0 -1 +1" (first field, then the rest of the line).
#
if [ "$cheap" = yes ]
then
    ispell $wchars -c -d /dev/null -p /dev/null < "${TMP}a" \
      | grep / | sort -u -t/ -k1,1 -k2 \
      | join -t/ - "${TMP}a" | awk -F/ "$AWKMUNCH" > "${TMP}b"
else
    ispell $wchars -c -d /dev/null -p /dev/null < "${TMP}a" \
      | grep / | sort -u -t/ -k1,1 -k2 \
      | join -t/ - "${TMP}a" > "${TMP}b"
fi

#
#       There is now one slight problem:  the suffix flags X, J, and Z
#       are simply the addition of an "S" to the suffixes N, G, and R,
#       respectively.  This produces redundant entries in the output file;
#       for example, ABBREVIATE/N/X and ABBREVIATION/S.  We must get rid
#       of the unnecessary duplicates.  The candidates are those words that
#       have only an "S" flag (grep).  We strip off the "S" (sed), and
#       generate a list of roots that might have made these words (ispell -c).
#       Of these roots, we select those that have the N, G, or R flags,
#       replacing each with the plural equivalent X, J, or Z (sed -n).
#       Using join once again, we select those that have legal roots
#       and put them in ${TMP}c.
#
# NOTE(review): unlike stage 2, this ispell call is not given $wchars;
# confirm whether that is intentional.
#
if [ "$cheap" = yes ]
then
    grep '^[^/]*/S$' "${TMP}b" | sed 's@/S$@@' \
      | ispell -c -d /dev/null -p /dev/null \
      | sed -n -e '/\/N/s/N$/X/p' -e '/\/G/s/G$/J/p' -e '/\/R/s/R$/Z/p' \
      | sort -u -t/ -k1,1 -k2 \
      | join -t/ - "${TMP}a" \
      | awk -F/ "$AWKMUNCH" > "${TMP}c"
else
    grep '^[^/]*/S$' "${TMP}b" | sed 's@/S$@@' \
      | ispell -c -d /dev/null -p /dev/null \
      | sed -n -e '/\/N/s/N$/X/p' -e '/\/G/s/G$/J/p' -e '/\/R/s/R$/Z/p' \
      | sort -u -t/ -k1,1 -k2 \
      | join -t/ - "${TMP}a" > "${TMP}c"
fi

#
#       Now we have to eliminate the stuff covered by ${TMP}c from ${TMP}b.
#       First, we re-expand the suffixes we just made (sed -f pair), and let
#       ispell re-create the /S version (ispell -c).  We select the /S versions
#       only (grep), sort them (sort) for comm, and use comm to delete these
#       from ${TMP}b.
#       The output of comm (i.e., the trimmed version of
#       ${TMP}b) is combined with our special-suffixes file ${TMP}c (sort,
#       with preceding awk, if $cheap) and reduced in size (AWKMUNCH) to
#       produce a final list of all words that have at least one suffix.
#
# The sort keys "-k1,1 -k2" are the modern spelling of the obsolete
# "+0 -1 +1" (first field, then the rest of the line).
#
if [ "$cheap" = yes ]
then
    # In cheap mode ${TMP}b and ${TMP}c hold several suffixes per line, so
    # AWKUNMUNCH splits them back to one per line before the merge sort.
    sed -f "$EXPAND1" < "${TMP}c" | sed -f "$EXPAND2" \
      | ispell -c -d /dev/null -p /dev/null \
      | grep '/S$' | sort -u -t/ -k1,1 -k2 | comm -13 - "${TMP}b" \
      | awk -F/ "$AWKUNMUNCH" - "${TMP}c" \
      | sort -u -t/ -k1,1 -k2 - \
      | awk -F/ "$AWKMUNCH" > "${TMP}d"
else
    sed -f "$EXPAND1" < "${TMP}c" | sed -f "$EXPAND2" \
      | ispell -c -d /dev/null -p /dev/null \
      | grep '/S$' | sort -u -t/ -k1,1 -k2 | comm -13 - "${TMP}b" \
      | sort -u -t/ -k1,1 -k2 - "${TMP}c" \
      | awk -F/ "$AWKMUNCH" > "${TMP}d"
fi
# The intermediate lists are no longer needed; only ${TMP}a and ${TMP}d remain.
/bin/rm -f "${TMP}"[bc]

#
#       Now a slick trick.  Use ispell to select those (root) words from the
#       original list (${TMP}a) that are not covered by the suffix list
#       (${TMP}d).  Then we merge these with the suffix list and sort it to
#       produce the final output.
#
# tr strips any carriage returns (octal 015) from ispell's output.
ispell $wchars -d /dev/null -p "${TMP}d" -l < "${TMP}a" | tr -d '\015' \
  | sort -u -t/ -k1,1 -k2 - "${TMP}d"
/bin/rm -f "${TMP}"*