DataMuseum.dk

Presents historical artifacts from the history of:

DKUUG/EUUG Conference tapes

This is an automatic "excavation" of a thematic subset of
artifacts from Datamuseum.dk's BitArchive.

See our Wiki for more about DKUUG/EUUG Conference tapes

Excavated with: AutoArchaeologist - Free & Open Source Software.


top - download
Index: ┃ T a

⟦220077e93⟧ TextFile

    Length: 20510 (0x501e)
    Types: TextFile
    Names: »anova.c«

Derivation

└─⟦87ddcff64⟧ Bits:30001253 CPHDIST85 Tape, 1985 Autumn Conference Copenhagen
    └─ ⟦this⟧ »cph85dist/stat/src/anova.c« 

TextFile

#include "unixstat.h"
PGM(anova,Analysis of Variance,5.1,1985)
/* Copyright (c) 1982 Gary Perlman (see Copyright file) */

#ifndef MSDOS /* no signals on MSDOS */
#include <signal.h>
#endif

/*
	This program does a general analysis of variance.
	It was written by Gary Perlman at UCSD April 1980.
		STRUCTURE OF ANOVA
	main
		getlevels
			number
			sstrings
		readdata
			sstrings
			offset
		cellmeans
			offset
			setsource
			nextlevel
			printmean
		anova
			setsource
			pof
		ALPHABETICAL ANNOTATION
	anova ()
		prints the design summary
		prints F table with each systematic source
		being tested against its own error term
	cellmeans ()
		allocates space for sums of squares of sums
		computes brackets == sums of squares of sums
		determines whether type of factors is BETWEEN
		prints the cell means, counts, and standard deviations
		checks the integrity of design
		frees data array space
	getlevels (argc, argv)
		reads from the standard input
		checks for arguments and stored them in factorname[i]
		opens datafile to store copy of input data
		sets nfactors, nlevels[factor], and levelnames
		checks whether data is numerical
	nextlevel (level, source, sourceflag)
		increments the indicies in level to loop through
		all nlevels of each factor.  If sourceflag is true,
		the source[factors] are incremented, otherwise the
		non-source[factors] are.  returns false if there
		is no nextlevel.
	number (string)
		returns true if its string argument is a number
		used to check the validity of input data
	offset (level)
		for its argument int array level, returns array index
		makes data array seem like a multidimensional array
	pof (F, df1, df2)
		computes the probability of F ratio
	printmean (count, sum, sumsq)
		prints N, MEAN, and SD if appropriate
	readdata ()
		allocates space for data and number of replications
		reads from temporary file containing copy of data
		stores the data by calling offset with level numbers
		averages the data over replications
	sstrings (line, matrix, maxstrings, maxchars)
		reads from line argument into matrix argument columns
		each column will hold at most maxchars-1 characters
		returns the number of columns read in (at most maxstrings)
	setsource (sourcenum, source)
		for its argument sourcenum, returns a binary
		representation of it in its argument array, source
*/

#define MAXFACT 	 10	/* the maximum number of factors              */
#define MAXLEV		500	/* maximum number of levels per factor	      */
#define MAXSTRING 	 16	/* the maximum length of an input string      */
#define RANDOM	 	  0	/* zeroeth column is RANDOM factor	      */
typedef int boolean;		/* no boolean type in C                       */
#define true  		  1
#define false 		  0

int	nfactors;		/* total number of factors                    */
unsigned nsources = 1;		/* number of sources                          */
int	nlevels[MAXFACT];	/* number of levels of each factor            */
boolean	between[MAXFACT];	/* true if factor is between                  */
char	*factorname[MAXFACT+1]   /* default names for factors		      */
	= {"RANDOM", "A", "B", "C", "D", "E", "F", "G", "H", "I"};
char	*levelname[MAXFACT][MAXLEV];	/* level names			      */
float	*datax;			/* will hold all the data                     */
short	*nreplics;		/* number of replications in each cell        */
double	*bracket;		/* will hold bracket values                   */
FILE	*datafile;		/* input file                                 */
char	tmpname[20]="anova.tmp";/* temporary file where data are stored       */
boolean	errorflag = false;	/* set to true in case of fatal error	      */

main (argc, argv) int argc; char *argv[];
	{
#ifndef MSDOS /* no signals on MSDOS */
	extern	onintr ();
	VOID signal (SIGINT, onintr); /* should really check if ignored */
#endif
	ARGV0;
	checkstdin (Argv0);
	getlevels (argc, argv);
	readdata ();
	cellmeans ();
	anova ();
	exit (0);
	}

#ifndef MSDOS /* no signals on MSDOS */
onintr () { VOID signal (SIGINT, SIG_IGN); unlink (tmpname); exit (1); }
#endif

/* getlevels finds the number of levels of each factor.
   For each line, it reads in the levels of each factor.
   It assumes that the number of levels equals the maximum levelnumber.
   The data is read from the stdin but is copied for further use in datafile.
*/
getlevels (argc, argv) int argc; char **argv;
    {
    register int factor;			/* looping variable	      */
    register int level;				/* looping variable	      */
    char	line[BUFSIZ];			/* each data line read in here*/
    char	*ptr;
    char	column[MAXFACT+2][MAXSTRING];	/* data line separated in cols*/
    int 	ncols;				/* number of columns in line  */
    boolean	newlevel;			/* T if new level name	      */

    /* if there are arguments, STORE their names */
    for (factor = 0; factor < argc-1 && factor <= MAXFACT; factor++)
	    factorname[factor] = strdup (argv[factor+1]);

    /* OPEN datafile to store copy of data */
#ifndef MSDOS /* MSDOS uses default anova.tmp */
    VOID sprintf (tmpname, "/tmp/anova.%d", getpid ());
#endif
    if ((datafile = fopen (tmpname, "w")) == NULL)
	ERROPEN ("temporary file")

    /* SET nfactors and INITIALIZE levels */
firstline:
    if (!fgets (line, BUFSIZ, stdin))
	ERRDATA
    ptr = line;
    while (isspace (*ptr)) ptr++;
    if (*ptr == NULL) goto firstline;
    nfactors = sstrings (line, column, MAXFACT+2, MAXSTRING) - 1;
    if (nfactors == 0)
		ERRMSG0 (Input data format: factor-levels data)
    else if (nfactors > MAXFACT)
		ERRMANY (columns, MAXFACT)
    if (argc < nfactors+2) factorname[nfactors] = "DATA";

    /* SET nlevels[factor] and levelname[factor] */
    do  {
	ptr = line;
	while (isspace (*ptr)) ptr++;
	if (*ptr == '\0') continue;
	fputs (line, datafile);
	ncols = sstrings (line, column, MAXFACT+1, MAXSTRING);
	if (ncols != nfactors+1)
	    {
	    if (ncols == 0) continue;
		ERRRAGGED
	    }

	/* see if there are any new levels */
	for (factor = 0; factor < nfactors; factor++)
	    {
	    newlevel = true;
	    for (level = 0; level < nlevels[factor]; level++)
		if (!strcmp (levelname[factor][level], column[factor]))
			{newlevel = false; break;}
	    if (newlevel)
		{
		if (nlevels[factor] == MAXLEV)
			ERRMANY (levels, MAXLEV)
		levelname[factor][nlevels[factor]] = strdup (column[factor]);
		nlevels[factor]++;
		}
	    }

	/* CHECK to make sure that data is numerical */
	if (!number (column[nfactors])) /* data column */
	    ERRNUM (column[nfactors],data value)
	} while (fgets (line, BUFSIZ, stdin));
    }

/* readdata reads in the data from datafile and stores it in the data array.
   Space is allocated for the data array and the number of replics per cell.
   For each line, it reads the levels of each factor and finds the location
   where the data is to be stored in data by calling offset with the level
   numbers stored in the array called level.  Any space not used in data
   (because of nested design, for example) has nreplics == 0.
   Finally, it removes the temporary data file.
*/
readdata ()
    {
    register int address;		/* where data will be added	      */
    register int factor;		/* looping variable		      */
    int	level[MAXFACT];			/* level of each factor		      */
    char	line[BUFSIZ];		/* each data input line read in here  */
    char	column[MAXFACT+1][MAXSTRING]; /* data line in columns	      */
    unsigned ncells = 1;		/* product of all levels	      */
    char	*calloc ();

    /* ALLOCATE space for data assuming worst case */
    for (factor = 0; factor < nfactors; factor++)
	    ncells *= nlevels[factor];
    if ((datax = (float *) calloc (ncells, sizeof (float))) == NULL)
	    ERRSPACE (data)
    if ((nreplics = (short *) calloc (ncells, sizeof (*nreplics))) == NULL)
	    ERRSPACE (data)

    /* STORE data in data array */
#ifdef MSDOS /* freopen does not work on MSDOS */
	fclose (datafile); 
	datafile = fopen (tmpname, "r");
#else
    datafile = freopen (tmpname, "r", datafile);
#endif
    while (fgets (line, BUFSIZ, datafile))
	{
	sstrings (line, column, MAXFACT+1, MAXSTRING);
	for (factor = 0; factor < nfactors; factor++)
	    {
	    level[factor] = 0;
	    while (strcmp (column[factor], levelname[factor][level[factor]]))
		level[factor]++;
	    }
	address = offset (level);
	nreplics[address]++;
	datax[address] += atof (column[nfactors]);
	}

    /* AVERAGE data over all replications */
    for (address = 0; address < ncells; address++)
	if (nreplics[address] > 1) datax[address] /= nreplics[address];
    unlink (tmpname); /* done with datafile */
    }

/* offset returns a unique index for each combination of levels of factors
   it gets passed in level[factor]. level[factor] >= 0 for all factors.
*/
int
offset (level) int level[MAXFACT];
	{
	register int factor;		/* looping variable		      */
	int	aindex;			/* level of each factor read in here  */
	int	coeff = 1;		/* aindex multiplied by coeff	      */

	aindex = level[nfactors-1];
	for (factor = nfactors-2; factor >= 0; factor--)
		{
		coeff *= nlevels[factor+1];
		aindex += coeff * (level[factor]);
		}
	return (aindex);
	}

/* cellmeans does the heavy computation involved in computing sums of squares
   as well as printing cell means.  The input to cellmeans is a set
   of arrays holding design information, and the data array which has to be
   in a particular format as defined by the program, offset.
   cellmeans also determines the type of each factor.
   The algorithm procedes as follows:
	for each source (from 0 to 2**nfactors), the sums and sums of squares
	for each level of that source are computed by looping through all
	the non-sources.  From this information, the mean and sd are
	printed for each level, and a term is added into bracket[sourcenum],
	an array of sums of squares of sums.  This array will be used by anova.
*/
cellmeans ()
    {
    register int factor;		/* looping variable                   */
    int	level[MAXFACT];			/* level of each factor               */
    int	sourcenum;			/* source number                      */
    boolean	source[MAXFACT];	/* boolean powerset of factors        */
    int	nterms;				/* number of terms in source          */
    double	datum;			/* each datum read into here          */
    int	address;			/* result from ofset                  */
    double	sum;			/* sum for each level of each factor  */
    double	sumsq;			/* for each level of each factor      */
    int	count;				/* count used along with sum          */
    int	sumcount[MAXFACT];		/* sum of counts of all levels of fact*/
    int	withprod = 1;			/* product of nlevels[WITHIN]         */
    boolean	sources, nonsources;	/* while loop variables               */
    char	*calloc ();

    /* INITIALIZE sumcount. Used to check integrity of design */
    for (factor = 0; factor < nfactors; factor++) sumcount[factor] = 0;
    
    /* ALLOCATE space for sums of squares of sums */
    nsources = 1 << nfactors;
    if ((bracket = (double *) calloc (nsources, sizeof (double))) == NULL)
	ERRSPACE (computations)

    /* begin huge loop */
    for (sourcenum = 0; sourcenum < nsources; sourcenum++)
	{
	nterms = setsource (sourcenum, source);

	/* PRINT names of sources */
	if (!source[RANDOM])
	    {
	    printf ("SOURCE: ");
	    if (sourcenum == 0) printf ("grand mean");
	    else for (factor = 1; factor < nfactors; factor++)
		if (source[factor]) printf ("%s ", factorname[factor]);
	    putchar ('\n');

	    /* PRINT header for cellmeans table */
	    for (factor = 1; factor < nfactors; factor++)
		printf ("%-5.5s  ", factorname[factor]);
	    printf ("   N       MEAN         SD         SE\n");
	    }

	/* COMPUTE sums and brackets */
	for (factor = 0; factor < nfactors; factor++) level[factor] = 0;
	sources = true;		/* starts up sources loop */
	while (sources)
	    {
	    sum =   0.0;
	    sumsq = 0.0;
	    count = 0;
	    nonsources = true;	/* starts up nonsources loop */
	    while (nonsources)
		{
		address = offset (level);
		if (nreplics[address])
		    {
		    datum = datax[address];
		    sum += datum;
		    sumsq += datum*datum;
		    count++;
		    }

		/* UPDATE levels of nonsource factors */
		nonsources = nextlevel (level, source, false);
		}

	    /* PRINT cell means */
	    if (!source[RANDOM])
		{
		for (factor = 1; factor < nfactors; factor++)
		    if (source[factor])
			{
			printf ("%-5.5s  ", levelname[factor][level[factor]]);
			if (nterms == 1) sumcount[factor] += count;
			}
		    else printf ("       ");
		printmean (count, sum, sumsq);
		}

	    /* UPDATE bracket or type of factor */
	    if (count) bracket[sourcenum] += sum*sum/count;
	    else if (nterms == 2 && source[RANDOM])
		for (factor = 1; factor < nfactors; factor++)
		    if (source[factor]) {between[factor] = true; break;}

	    /* UPDATE levels of source factors */
	    sources = nextlevel (level, source, true);
	    }
	if (!source[RANDOM]) putchar ('\n');
	}

    /* CHECK integrity of the design */
    for (factor = 0; factor < nfactors; factor++)
	if (!between[factor]) withprod *= nlevels[factor];
    for (factor = 1; factor < nfactors; factor++)
	if (sumcount[factor] != withprod)
	    {
	    WARNING (Unbalanced factor)
	    errorflag = true;
	    }

    /* FREE stored data */
#ifndef MSDOS /* can't free allocated space with our MSDOS C compiler */
    cfree ((char *) datax);
    cfree ((char *) nreplics);
#endif
    }

boolean
nextlevel (level, source, sourceflag)
	int level[MAXFACT];
	boolean source[MAXFACT], sourceflag;
	{
	register int factor;

	/* if sourceflag is true, increment a source factor,
	   otherwise, increment a non-source factor */
	for (factor = nfactors-1; factor >= 0; factor--)
		if (sourceflag == source[factor])
			if (++level[factor] < nlevels[factor]) break;
			else level[factor] = 0;
	return (factor >= 0);
	}

printmean (count, sum, sumsq) double sum, sumsq;
    {
    if (count)
	{
	printf ("%4d %10.4f ", count, sum/count);
	if (count > 1) /* ok to compute sd */
	    {
	    double sd = sqrt ((sumsq-sum*sum/count)/(count-1));
	    double se = sd / sqrt ((double) count);
	    printf ("%10.4f %10.4f", sd, se);
	    }
	}
    else
	{
	printf ("   Empty cells are not allowed!");
	errorflag = true;
	}
    putchar ('\n');
    }

/* setsource takes sourcenum and turns it into a binary representation in
   source[factor]. It returns the number of factors in the source array.  */
int
setsource (sourcenum, source) int sourcenum; boolean *source;
	{
	register int factor;		/* looping variable		      */
	int nterms = 0;			/* number of terms in source	      */

	for (factor = 0; factor < nfactors; factor++)
		{
		if (sourcenum % 2)
			{
			source[factor] = true;
			nterms++;
			}
		else source[factor] = false;
		sourcenum /= 2;
		}
	return (nterms);
	}

/* anova assumes that the array bracket has been allocated and set up with
   bracket values as described in Keppel.  anova's task is to compute from
   these bracket values the ss for each source, and then, with the array
   between (which holds the type of each factor (WITHIN or BETWEEN)) gets
   the error term for each source.  anova prints for each testable
   source, a mini-F-table with its particular error term.
*/
anova ()
    {
    register int factor;		/* looping variable		      */
    int	sourcenum;			/* source number		      */
    boolean	source[MAXFACT];	/* flag for each factor in source     */
    int	nterms;				/* number of terms in source	      */
    register int subsource;
    boolean	tmpsource[MAXFACT];	/* flag for each factor in subsource  */
    int	nsubs;				/* number of terms in subsource       */
    boolean	error[MAXFACT];		/* will hold factors in error term    */
    int	nerror;				/* number of terms in error source    */
    int	betprod = 1;			/* product nlevels of between factors */
    int	withprod = 1;			/* product nlevels within subj facts  */
    double	sseffect, sserror;	/* sum of squares		      */
    int	dfeffect, dferror;		/* degrees of freedom		      */
    double	mseffect, mserror;	/* mean square			      */
    double	F, p;			/* F ratio and probability	      */
    double	pof ();			/* probability of F ratio	      */


    /* COMPUTE product of nlevels of between/within subjects factors */
    for (factor = 0; factor < nfactors; factor++)
	if (nlevels[factor] <= 1)
	    ERRMSG1 (Too few levels of factor %s, factorname[factor])
	else if (between[factor]) betprod *= nlevels[factor];
	else withprod *= nlevels[factor];

    /* PRINT design summary */
    printf ("FACTOR: ");
    for (factor = 0; factor <= nfactors; factor++)
	printf ("%10.10s ", factorname[factor]);
    putchar ('\n');
    printf ("LEVELS: ");
    for (factor=0; factor < nfactors; factor++)
	printf ("%10d ", nlevels[factor]);
    printf ("%10d\n", withprod);
    printf ("TYPE  :     RANDOM ");
    for (factor = 1; factor < nfactors; factor++)
	if (between[factor]) printf ("   BETWEEN ");
	else printf ("    WITHIN ");
    printf ("      DATA\n");
    
    /* EXIT if errorflag is set */
    if (errorflag) ERRMSG0 (No F table due to previous fatal error)

    /* PRINT table header */
    putchar ('\n');
    printf ("SOURCE                SS     df             MS         F      p\n");
    printf ("===============================================================\n");

    /* if sourcenum is odd, then RANDOM will be in its source,
       so we only want even sourcenums for the main effects */
    for (sourcenum = 0; sourcenum < nsources; sourcenum += 2)
	{
	nterms = setsource (sourcenum, source);

	/* PRINT name of source */
	if (sourcenum == 0) printf ("mean");
	else for (factor = 1; factor < nfactors; factor++)
	    if (source[factor])
		if (nterms == 1)
		    {
		    printf ("%-7.7s", factorname[factor]);
		    break;
		    }
		else	printf ("%c", factorname[factor][0]);
	printf ("\t");

	/* COMPUTE sseffect by adding brackets, alternating signs */
	sseffect = 0.0;
	for (subsource = 0; subsource <= sourcenum; subsource += 2)
	    {
	    nsubs = setsource (subsource, tmpsource);
	    if (subset (tmpsource, source))
		sseffect += ((nterms-nsubs)%2 ? -1.0 : 1.0) * bracket[subsource];
	    }
	if (sseffect < FZERO) sseffect = 0.0;

	/* COMPUTE df for effect */
	dfeffect = 1;
	for (factor = 1; factor < nfactors; factor++)
	    if (source[factor]) dfeffect *= nlevels[factor] - 1;

	/* COMPUTE value of error term */
	/* the error term for a source factor is WxS/B,
	   where W is the set of all within subjects factors
	   IN THE SOURCE, and B is the set of ALL between
	   subject factors in the WHOLE design. S is the
	   subjects or RANDOM factor.  A bracket term is used
	   int the error term if it includes all between subject
	   factors, and if the only other factors it includes are
	   within subject factors or RANDOM.
	*/

	/* set up error source */
	error[RANDOM] = true;
	nerror = 1;
	for (factor = 1; factor < nfactors; factor++)
	    if (between[factor] || (source[factor]))
		{
		error[factor] = true;
		nerror++;
		}
	    else error[factor] = false;

	/* COMPUTE sserror by adding up appropriate brackets */
	sserror = 0.0;
	for (subsource = 0; subsource < nsources; subsource++)
	    {
	    nsubs = setsource (subsource, tmpsource);
	    if (subset (tmpsource, error))
		if (subset (between, tmpsource))
		    sserror += ((nerror-nsubs)%2 ? -1.0 : 1.0) *
				    bracket[subsource];
	    }
	if (sserror < FZERO) sserror = 0.0;

	/* COMPUTE df for error term */
	dferror = nlevels[RANDOM] - betprod;
	for (factor = 1; factor < nfactors; factor++)
	    if (source[factor] && !between[factor])
		dferror *= nlevels[factor] - 1;

	/* COMPUTE mean squares and F */
	mseffect = sseffect/dfeffect;
	mserror = sserror/dferror;
	if (fzero (mserror))
	    {
	    F = 0.0;
	    p = 0.0;
	    }
	else
	    {
	    F = mseffect/mserror;
	    p = pof (F, dfeffect, dferror);
	    }

	/* PRINT effect part */
	printf ("%16.4f %6d %14.4f %9.3f %6.3f ",
		sseffect, dfeffect, mseffect, F, p);
	if (p <= 0.05) putchar ('*');
	if (p <= 0.01) putchar ('*');
	if (p <= 0.001) putchar ('*');
	putchar ('\n');

	/* PRINT error part */
	for (factor = 1; factor < nfactors; factor++)
		if (error[factor] && !between[factor])
			printf ("%c", factorname[factor][0]);
	printf ("%c/", factorname[RANDOM][0]);
	for (factor = 1; factor < nfactors; factor++)
		if (error[factor] && between[factor])
			printf ("%c", factorname[factor][0]);
	printf ("\t");
	printf ("%16.4f %6d %14.4f\n\n", sserror, dferror, mserror);
	}
    }

boolean
subset (set1, set2) boolean *set1, *set2;
	{
	register int i;
	for (i = 0; i < nfactors; i++)
		if (set1[i] > set2[i]) return (false);
	return (true);
	}