⟦d8eb2dc41⟧

TextFile

/*
        Tibdex - makes an inverted index for tib
                 a slight modification of Invert of bib

    input:  records of lines, separated by blank lines
    output: key:file1 start/length ... start/length:file2 start/length ...

Note to non-unix users:
    This program uses a unix command "sort" which sorts records in a file 
    "tmp_file" with the following sort key: 
          ASCII sort on the first field (fields are separated by spaces),
          ASCII sort on the second field,
          numeric sort on the third record,
          numeric sort on the fourth record,
    removing duplicates, and then writes to the same file. The use of "sort"
    is isolated by the "#define SORT_IT" statement just below.  If "sort" 
    is unavailable, replace the sort in the definition of "SORT_IT" by some 
    equivalent sort.  If the format of "SORT_IT" is unacceptable, modify 
    appropriately the one place further below where "SORT_IT" is used.
    It may not be necessary to remove duplicates.
     
*/

#define SORT_IT \
  sprintf(sortcmd,"sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s",tmp_file,tmp_file);\
    system(sortcmd);


#include "stdio.h"
#include "tib.h"
#define isnull(x)  (*(x) == NULL)
#define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c)

char    headerline[240] = " ";/* header line -- list of files            */
int     max_kcnt = 100;     /*  max number of keys                      */
int     max_klen =   6;     /*  max length of keys                      */
char    *ignore =           /*  string of line starts to ignore         */
            "CNOPVcnopv\\\%";
char    *common = COMFILE;  /*  name of file of common words            */
char    *INDEX = INDXFILE;  /*  name of output file                     */
char    tmp_file[120];       /*  name of temporary file                  */
char    dirsp2[]=DIRSEP;    /*  directory separator character           */
char    optch2[]=OPTCH;     /*  option character on call                */
int	silent = 0;	    /*  0 => statistics printed			*/
			    /*  1 => no statisitics printed		*/

long int nextrecord(), recsize(), nextline();
char sortcmd[MAXSTR];

int     argc;
char    **argv;

main(argcount,arglist)
int argcount;
char **arglist;
{   char            filename[MAXSTR], *pcom, *getenv(), *ptemp;
    FILE            *input, *output;
    long int        start,length;
    char            word[MAXSTR];
    int             kcnt;
    char            tag_line[MAXSTR];
    char            argchk[3];

    long int	    records = 0;  /*  number of records read           */
    long int	    keys    = 0;  /*  number of keys read (occurences) */
    long int	    distinct;     /*  number of distinct keys          */
    long int	    shorten();
   int i, strcmp();

   /* header */
   for (i = 1; i < argcount; i++) {
      strcpy(argchk,OPTCH);
      strcat(argchk,"z");
      if (strcmp(arglist[i],argchk) == 0)
         silent = true;
   }
   if (silent == false)
      fprintf (stderr, "Tibdex -- version %s, released %s.\n", VERSION, RDATE);

    /* get file names from environment */
    pcom = getenv("COMFILE");
    if (pcom != NULL)
       strcpy(common,pcom);
    ptemp = getenv("TMPDIR");
    if (ptemp != NULL) {
       strcpy(tmp_file,ptemp);
       strcat(tmp_file,dirsp2);
       strcat(tmp_file,"tibdXXXXXX");
    }
    else
       strcpy(tmp_file,INVTEMPFILE);

    argc= argcount-1;
    argv= arglist+1;
    mktemp(tmp_file);
    output= fopen(tmp_file,"w");
    if (output == NULL) {
       fprintf(stderr, "tibdex: can't open temporary output file\n");
       exit(1);
    }

    for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
    {   /* open input file              */
         strcpy(filename, *argv);
         input = fopen(filename, "r");
         if (input == NULL) {
            strcat(filename, ".ref");
            input=fopen(filename, "r");
            if (input == NULL) {
               fprintf(stderr, "can't open %s or %s\n", *argv, filename);
               exit(1);
               }
            }
            strcat(headerline, " ");
            strcat(headerline, filename);
            start=      0L;
            length=     0L;

        for(;;) /* each record  */
        {   /* find start of next record (exit if none)     */
                start= nextrecord(input,start+length);
                if (start==EOF)   break;
            records++;
	    kcnt= 0;
            length= recsize(input,start);
            sprintf(tag_line, " %s %D %D\n", filename, start, length);

            while (ftell(input) < start+length && kcnt < max_kcnt)
            {   getword(input,word,ignore);
                makekey(word,max_klen,common);
                if (!isnull(word))
                {   fputs(word,output); fputs(tag_line,output);
                    kcnt++; keys++;
                }
            }
        }
        fclose(input);
    }
    fclose(output);

    SORT_IT

    distinct = shorten(tmp_file,INDEX);
    if( silent == 0 )
	fprintf(stderr,
	    "%D documents   %D distinct keys  %D key occurrences\n",
	    records, distinct, keys);
}



/*  Flag    Meaning                             Default
    -ki     Keys per record                     100
    -li     max Length of keys                  6
    -%str   ignore lines that begin with %x     CNOPVXcnopv
            where x is in str
            str is a seq of chars
    -cfile  file contains Common words          /??????/common
            do not use common words as keys
    -pfile  name of output file                 INDEX
    -s	    do not print statistics		statistics printed
*/

#define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)

flags()
{   for (; argc>0 && *argv[0]==optch2[0];  argc--,argv++)
    {   switch ((*argv)[1])
        {   case 'k':   max_kcnt= atoi(operand);
                        break;
            case 'l':   max_klen= atoi(operand);
                        break;
            case 'c':   common=  operand;
                        break;
            case '%':   ignore=  *argv+2;
                        break;
            case 'p':   INDEX=  operand;
                        break;
	    case 'z':	silent= 1;
			break;
            default:    fprintf(stderr, "unknown flag '%s'\n", *argv);
        }
    }
}


/*  shorten(inf,outf): file "inf" consists of lines of the form:
        key file start length
    sorted by key and file.  replace lines with the same key
    with one line of the form:
        key:file1 start/length ... start/length:file2 start/length ...
    rename as file "outf"
    returns number of lines in output
*/
long shorten(inf,outf)
char *inf, *outf;
{   FILE *in, *out;
    char line[MAXSTR];
    char key[MAXSTR],  newkey[MAXSTR],
         file[MAXSTR], newfile[MAXSTR];
    long int start, length;
    long int lines = 0;

    strcpy(file,"");
    strcpy(key,"");
    in=  fopen(inf, "r");
    out= fopen(outf, "w");
    if (in==NULL || out==NULL)
    {   fprintf(stderr, "tibdex: error in opening file for compression\n");
        return(1);
    }

    fputs(headerline,out);
    fprintf(out, "\n");
    getline(in,line);
    sscanf(line,"%s%s%ld%ld", key, file, &start, &length);
    fprintf(out, "%s :%s %D/%D", key, file, start, length);
    for ( getline(in, line) ; !feof(in);  getline(in, line))
    {   sscanf(line,"%s%s%ld%ld", newkey, newfile, &start, &length);
        if (strcmp(key,newkey)!=0)
        {   strcpy(key, newkey);
            strcpy(file, newfile);
            fprintf(out, "\n%s :%s %D/%D",  key, file, start, length);
	    lines++;
        }
        else if (strcmp(file,newfile)!=0)
        {   strcpy(file,newfile);
            fprintf(out, ":%s %D/%D", file, start, length);
        }
        else
            fprintf(out, " %D/%D", start, length);
    }
    fprintf(out, "\n");
    lines++;

    fclose(in); fclose(out);
    unlink(inf);
    return (lines);
}
DataMuseum.dk

DKUUG/EUUG Conference tapes

⟦d8eb2dc41⟧ TextFile

Derivation

TextFile