|
|
DataMuseum.dkPresents historical artifacts from the history of: DKUUG/EUUG Conference tapes |
This is an automatic "excavation" of a thematic subset of
See our Wiki for more about DKUUG/EUUG Conference tapes Excavated with: AutoArchaeologist - Free & Open Source Software. |
top - metrics - downloadIndex: T a
Length: 20510 (0x501e)
Types: TextFile
Names: »anova.c«
└─⟦87ddcff64⟧ Bits:30001253 CPHDIST85 Tape, 1985 Autumn Conference Copenhagen
└─⟦this⟧ »cph85dist/stat/src/anova.c«
#include "unixstat.h"
PGM(anova,Analysis of Variance,5.1,1985)
/* Copyright (c) 1982 Gary Perlman (see Copyright file) */
#ifndef MSDOS /* no signals on MSDOS */
#include <signal.h>
#endif
/*
This program does a general analysis of variance.
It was written by Gary Perlman at UCSD April 1980.
STRUCTURE OF ANOVA
main
getlevels
number
sstrings
readdata
sstrings
offset
cellmeans
offset
setsource
nextlevel
printmean
anova
setsource
pof
ALPHABETICAL ANNOTATION
anova ()
prints the design summary
prints F table with each systematic source
being tested against its own error term
cellmeans ()
allocates space for sums of squares of sums
computes brackets == sums of squares of sums
determines whether type of factors is BETWEEN
prints the cell means, counts, and standard deviations
checks the integrity of design
frees data array space
getlevels (argc, argv)
reads from the standard input
checks for arguments and stored them in factorname[i]
opens datafile to store copy of input data
sets nfactors, nlevels[factor], and levelnames
checks whether data is numerical
nextlevel (level, source, sourceflag)
increments the indicies in level to loop through
all nlevels of each factor. If sourceflag is true,
the source[factors] are incremented, otherwise the
non-source[factors] are. returns false if there
is no nextlevel.
number (string)
returns true if its string argument is a number
used to check the validity of input data
offset (level)
for its argument int array level, returns array index
makes data array seem like a multidimensional array
pof (F, df1, df2)
computes the probability of F ratio
printmean (count, sum, sumsq)
prints N, MEAN, and SD if appropriate
readdata ()
allocates space for data and number of replications
reads from temporary file containing copy of data
stores the data by calling offset with level numbers
averages the data over replications
sstrings (line, matrix, maxstrings, maxchars)
reads from line argument into matrix argument columns
each column will hold at most maxchars-1 characters
returns the number of columns read in (at most maxstrings)
setsource (sourcenum, source)
for its argument sourcenum, returns a binary
representation of it in its argument array, source
*/
#define MAXFACT 10 /* the maximum number of factors */
#define MAXLEV 500 /* maximum number of levels per factor */
#define MAXSTRING 16 /* the maximum length of an input string */
#define RANDOM 0 /* zeroeth column is RANDOM factor */
typedef int boolean; /* no boolean type in C */
#define true 1
#define false 0
int nfactors; /* total number of factors */
unsigned nsources = 1; /* number of sources */
int nlevels[MAXFACT]; /* number of levels of each factor */
boolean between[MAXFACT]; /* true if factor is between */
char *factorname[MAXFACT+1] /* default names for factors */
= {"RANDOM", "A", "B", "C", "D", "E", "F", "G", "H", "I"};
char *levelname[MAXFACT][MAXLEV]; /* level names */
float *datax; /* will hold all the data */
short *nreplics; /* number of replications in each cell */
double *bracket; /* will hold bracket values */
FILE *datafile; /* input file */
char tmpname[20]="anova.tmp";/* temporary file where data are stored */
boolean errorflag = false; /* set to true in case of fatal error */
main (argc, argv) int argc; char *argv[];
{
#ifndef MSDOS /* no signals on MSDOS */
extern onintr ();
VOID signal (SIGINT, onintr); /* should really check if ignored */
#endif
ARGV0;
checkstdin (Argv0);
getlevels (argc, argv);
readdata ();
cellmeans ();
anova ();
exit (0);
}
#ifndef MSDOS /* no signals on MSDOS */
onintr () { VOID signal (SIGINT, SIG_IGN); unlink (tmpname); exit (1); }
#endif
/* getlevels finds the number of levels of each factor.
For each line, it reads in the levels of each factor.
It assumes that the number of levels equals the maximum levelnumber.
The data is read from the stdin but is copied for further use in datafile.
*/
getlevels (argc, argv) int argc; char **argv;
{
register int factor; /* looping variable */
register int level; /* looping variable */
char line[BUFSIZ]; /* each data line read in here*/
char *ptr;
char column[MAXFACT+2][MAXSTRING]; /* data line separated in cols*/
int ncols; /* number of columns in line */
boolean newlevel; /* T if new level name */
/* if there are arguments, STORE their names */
for (factor = 0; factor < argc-1 && factor <= MAXFACT; factor++)
factorname[factor] = strdup (argv[factor+1]);
/* OPEN datafile to store copy of data */
#ifndef MSDOS /* MSDOS uses default anova.tmp */
VOID sprintf (tmpname, "/tmp/anova.%d", getpid ());
#endif
if ((datafile = fopen (tmpname, "w")) == NULL)
ERROPEN ("temporary file")
/* SET nfactors and INITIALIZE levels */
firstline:
if (!fgets (line, BUFSIZ, stdin))
ERRDATA
ptr = line;
while (isspace (*ptr)) ptr++;
if (*ptr == NULL) goto firstline;
nfactors = sstrings (line, column, MAXFACT+2, MAXSTRING) - 1;
if (nfactors == 0)
ERRMSG0 (Input data format: factor-levels data)
else if (nfactors > MAXFACT)
ERRMANY (columns, MAXFACT)
if (argc < nfactors+2) factorname[nfactors] = "DATA";
/* SET nlevels[factor] and levelname[factor] */
do {
ptr = line;
while (isspace (*ptr)) ptr++;
if (*ptr == '\0') continue;
fputs (line, datafile);
ncols = sstrings (line, column, MAXFACT+1, MAXSTRING);
if (ncols != nfactors+1)
{
if (ncols == 0) continue;
ERRRAGGED
}
/* see if there are any new levels */
for (factor = 0; factor < nfactors; factor++)
{
newlevel = true;
for (level = 0; level < nlevels[factor]; level++)
if (!strcmp (levelname[factor][level], column[factor]))
{newlevel = false; break;}
if (newlevel)
{
if (nlevels[factor] == MAXLEV)
ERRMANY (levels, MAXLEV)
levelname[factor][nlevels[factor]] = strdup (column[factor]);
nlevels[factor]++;
}
}
/* CHECK to make sure that data is numerical */
if (!number (column[nfactors])) /* data column */
ERRNUM (column[nfactors],data value)
} while (fgets (line, BUFSIZ, stdin));
}
/* readdata reads in the data from datafile and stores it in the data array.
Space is allocated for the data array and the number of replics per cell.
For each line, it reads the levels of each factor and finds the location
where the data is to be stored in data by calling offset with the level
numbers stored in the array called level. Any space not used in data
(because of nested design, for example) has nreplics == 0.
Finally, it removes the temporary data file.
*/
readdata ()
{
register int address; /* where data will be added */
register int factor; /* looping variable */
int level[MAXFACT]; /* level of each factor */
char line[BUFSIZ]; /* each data input line read in here */
char column[MAXFACT+1][MAXSTRING]; /* data line in columns */
unsigned ncells = 1; /* product of all levels */
char *calloc ();
/* ALLOCATE space for data assuming worst case */
for (factor = 0; factor < nfactors; factor++)
ncells *= nlevels[factor];
if ((datax = (float *) calloc (ncells, sizeof (float))) == NULL)
ERRSPACE (data)
if ((nreplics = (short *) calloc (ncells, sizeof (*nreplics))) == NULL)
ERRSPACE (data)
/* STORE data in data array */
#ifdef MSDOS /* freopen does not work on MSDOS */
fclose (datafile);
datafile = fopen (tmpname, "r");
#else
datafile = freopen (tmpname, "r", datafile);
#endif
while (fgets (line, BUFSIZ, datafile))
{
sstrings (line, column, MAXFACT+1, MAXSTRING);
for (factor = 0; factor < nfactors; factor++)
{
level[factor] = 0;
while (strcmp (column[factor], levelname[factor][level[factor]]))
level[factor]++;
}
address = offset (level);
nreplics[address]++;
datax[address] += atof (column[nfactors]);
}
/* AVERAGE data over all replications */
for (address = 0; address < ncells; address++)
if (nreplics[address] > 1) datax[address] /= nreplics[address];
unlink (tmpname); /* done with datafile */
}
/* offset returns a unique index for each combination of levels of factors
it gets passed in level[factor]. level[factor] >= 0 for all factors.
*/
int
offset (level) int level[MAXFACT];
{
register int factor; /* looping variable */
int aindex; /* level of each factor read in here */
int coeff = 1; /* aindex multiplied by coeff */
aindex = level[nfactors-1];
for (factor = nfactors-2; factor >= 0; factor--)
{
coeff *= nlevels[factor+1];
aindex += coeff * (level[factor]);
}
return (aindex);
}
/* cellmeans does the heavy computation involved in computing sums of squares
as well as printing cell means. The input to cellmeans is a set
of arrays holding design information, and the data array which has to be
in a particular format as defined by the program, offset.
cellmeans also determines the type of each factor.
The algorithm procedes as follows:
for each source (from 0 to 2**nfactors), the sums and sums of squares
for each level of that source are computed by looping through all
the non-sources. From this information, the mean and sd are
printed for each level, and a term is added into bracket[sourcenum],
an array of sums of squares of sums. This array will be used by anova.
*/
cellmeans ()
{
register int factor; /* looping variable */
int level[MAXFACT]; /* level of each factor */
int sourcenum; /* source number */
boolean source[MAXFACT]; /* boolean powerset of factors */
int nterms; /* number of terms in source */
double datum; /* each datum read into here */
int address; /* result from ofset */
double sum; /* sum for each level of each factor */
double sumsq; /* for each level of each factor */
int count; /* count used along with sum */
int sumcount[MAXFACT]; /* sum of counts of all levels of fact*/
int withprod = 1; /* product of nlevels[WITHIN] */
boolean sources, nonsources; /* while loop variables */
char *calloc ();
/* INITIALIZE sumcount. Used to check integrity of design */
for (factor = 0; factor < nfactors; factor++) sumcount[factor] = 0;
/* ALLOCATE space for sums of squares of sums */
nsources = 1 << nfactors;
if ((bracket = (double *) calloc (nsources, sizeof (double))) == NULL)
ERRSPACE (computations)
/* begin huge loop */
for (sourcenum = 0; sourcenum < nsources; sourcenum++)
{
nterms = setsource (sourcenum, source);
/* PRINT names of sources */
if (!source[RANDOM])
{
printf ("SOURCE: ");
if (sourcenum == 0) printf ("grand mean");
else for (factor = 1; factor < nfactors; factor++)
if (source[factor]) printf ("%s ", factorname[factor]);
putchar ('\n');
/* PRINT header for cellmeans table */
for (factor = 1; factor < nfactors; factor++)
printf ("%-5.5s ", factorname[factor]);
printf (" N MEAN SD SE\n");
}
/* COMPUTE sums and brackets */
for (factor = 0; factor < nfactors; factor++) level[factor] = 0;
sources = true; /* starts up sources loop */
while (sources)
{
sum = 0.0;
sumsq = 0.0;
count = 0;
nonsources = true; /* starts up nonsources loop */
while (nonsources)
{
address = offset (level);
if (nreplics[address])
{
datum = datax[address];
sum += datum;
sumsq += datum*datum;
count++;
}
/* UPDATE levels of nonsource factors */
nonsources = nextlevel (level, source, false);
}
/* PRINT cell means */
if (!source[RANDOM])
{
for (factor = 1; factor < nfactors; factor++)
if (source[factor])
{
printf ("%-5.5s ", levelname[factor][level[factor]]);
if (nterms == 1) sumcount[factor] += count;
}
else printf (" ");
printmean (count, sum, sumsq);
}
/* UPDATE bracket or type of factor */
if (count) bracket[sourcenum] += sum*sum/count;
else if (nterms == 2 && source[RANDOM])
for (factor = 1; factor < nfactors; factor++)
if (source[factor]) {between[factor] = true; break;}
/* UPDATE levels of source factors */
sources = nextlevel (level, source, true);
}
if (!source[RANDOM]) putchar ('\n');
}
/* CHECK integrity of the design */
for (factor = 0; factor < nfactors; factor++)
if (!between[factor]) withprod *= nlevels[factor];
for (factor = 1; factor < nfactors; factor++)
if (sumcount[factor] != withprod)
{
WARNING (Unbalanced factor)
errorflag = true;
}
/* FREE stored data */
#ifndef MSDOS /* can't free allocated space with our MSDOS C compiler */
cfree ((char *) datax);
cfree ((char *) nreplics);
#endif
}
boolean
nextlevel (level, source, sourceflag)
int level[MAXFACT];
boolean source[MAXFACT], sourceflag;
{
register int factor;
/* if sourceflag is true, increment a source factor,
otherwise, increment a non-source factor */
for (factor = nfactors-1; factor >= 0; factor--)
if (sourceflag == source[factor])
if (++level[factor] < nlevels[factor]) break;
else level[factor] = 0;
return (factor >= 0);
}
printmean (count, sum, sumsq) double sum, sumsq;
{
if (count)
{
printf ("%4d %10.4f ", count, sum/count);
if (count > 1) /* ok to compute sd */
{
double sd = sqrt ((sumsq-sum*sum/count)/(count-1));
double se = sd / sqrt ((double) count);
printf ("%10.4f %10.4f", sd, se);
}
}
else
{
printf (" Empty cells are not allowed!");
errorflag = true;
}
putchar ('\n');
}
/* setsource takes sourcenum and turns it into a binary representation in
source[factor]. It returns the number of factors in the source array. */
int
setsource (sourcenum, source) int sourcenum; boolean *source;
{
register int factor; /* looping variable */
int nterms = 0; /* number of terms in source */
for (factor = 0; factor < nfactors; factor++)
{
if (sourcenum % 2)
{
source[factor] = true;
nterms++;
}
else source[factor] = false;
sourcenum /= 2;
}
return (nterms);
}
/* anova assumes that the array bracket has been allocated and set up with
bracket values as described in Keppel. anova's task is to compute from
these bracket values the ss for each source, and then, with the array
between (which holds the type of each factor (WITHIN or BETWEEN)) gets
the error term for each source. anova prints for each testable
source, a mini-F-table with its particular error term.
*/
anova ()
{
register int factor; /* looping variable */
int sourcenum; /* source number */
boolean source[MAXFACT]; /* flag for each factor in source */
int nterms; /* number of terms in source */
register int subsource;
boolean tmpsource[MAXFACT]; /* flag for each factor in subsource */
int nsubs; /* number of terms in subsource */
boolean error[MAXFACT]; /* will hold factors in error term */
int nerror; /* number of terms in error source */
int betprod = 1; /* product nlevels of between factors */
int withprod = 1; /* product nlevels within subj facts */
double sseffect, sserror; /* sum of squares */
int dfeffect, dferror; /* degrees of freedom */
double mseffect, mserror; /* mean square */
double F, p; /* F ratio and probability */
double pof (); /* probability of F ratio */
/* COMPUTE product of nlevels of between/within subjects factors */
for (factor = 0; factor < nfactors; factor++)
if (nlevels[factor] <= 1)
ERRMSG1 (Too few levels of factor %s, factorname[factor])
else if (between[factor]) betprod *= nlevels[factor];
else withprod *= nlevels[factor];
/* PRINT design summary */
printf ("FACTOR: ");
for (factor = 0; factor <= nfactors; factor++)
printf ("%10.10s ", factorname[factor]);
putchar ('\n');
printf ("LEVELS: ");
for (factor=0; factor < nfactors; factor++)
printf ("%10d ", nlevels[factor]);
printf ("%10d\n", withprod);
printf ("TYPE : RANDOM ");
for (factor = 1; factor < nfactors; factor++)
if (between[factor]) printf (" BETWEEN ");
else printf (" WITHIN ");
printf (" DATA\n");
/* EXIT if errorflag is set */
if (errorflag) ERRMSG0 (No F table due to previous fatal error)
/* PRINT table header */
putchar ('\n');
printf ("SOURCE SS df MS F p\n");
printf ("===============================================================\n");
/* if sourcenum is odd, then RANDOM will be in its source,
so we only want even sourcenums for the main effects */
for (sourcenum = 0; sourcenum < nsources; sourcenum += 2)
{
nterms = setsource (sourcenum, source);
/* PRINT name of source */
if (sourcenum == 0) printf ("mean");
else for (factor = 1; factor < nfactors; factor++)
if (source[factor])
if (nterms == 1)
{
printf ("%-7.7s", factorname[factor]);
break;
}
else printf ("%c", factorname[factor][0]);
printf ("\t");
/* COMPUTE sseffect by adding brackets, alternating signs */
sseffect = 0.0;
for (subsource = 0; subsource <= sourcenum; subsource += 2)
{
nsubs = setsource (subsource, tmpsource);
if (subset (tmpsource, source))
sseffect += ((nterms-nsubs)%2 ? -1.0 : 1.0) * bracket[subsource];
}
if (sseffect < FZERO) sseffect = 0.0;
/* COMPUTE df for effect */
dfeffect = 1;
for (factor = 1; factor < nfactors; factor++)
if (source[factor]) dfeffect *= nlevels[factor] - 1;
/* COMPUTE value of error term */
/* the error term for a source factor is WxS/B,
where W is the set of all within subjects factors
IN THE SOURCE, and B is the set of ALL between
subject factors in the WHOLE design. S is the
subjects or RANDOM factor. A bracket term is used
int the error term if it includes all between subject
factors, and if the only other factors it includes are
within subject factors or RANDOM.
*/
/* set up error source */
error[RANDOM] = true;
nerror = 1;
for (factor = 1; factor < nfactors; factor++)
if (between[factor] || (source[factor]))
{
error[factor] = true;
nerror++;
}
else error[factor] = false;
/* COMPUTE sserror by adding up appropriate brackets */
sserror = 0.0;
for (subsource = 0; subsource < nsources; subsource++)
{
nsubs = setsource (subsource, tmpsource);
if (subset (tmpsource, error))
if (subset (between, tmpsource))
sserror += ((nerror-nsubs)%2 ? -1.0 : 1.0) *
bracket[subsource];
}
if (sserror < FZERO) sserror = 0.0;
/* COMPUTE df for error term */
dferror = nlevels[RANDOM] - betprod;
for (factor = 1; factor < nfactors; factor++)
if (source[factor] && !between[factor])
dferror *= nlevels[factor] - 1;
/* COMPUTE mean squares and F */
mseffect = sseffect/dfeffect;
mserror = sserror/dferror;
if (fzero (mserror))
{
F = 0.0;
p = 0.0;
}
else
{
F = mseffect/mserror;
p = pof (F, dfeffect, dferror);
}
/* PRINT effect part */
printf ("%16.4f %6d %14.4f %9.3f %6.3f ",
sseffect, dfeffect, mseffect, F, p);
if (p <= 0.05) putchar ('*');
if (p <= 0.01) putchar ('*');
if (p <= 0.001) putchar ('*');
putchar ('\n');
/* PRINT error part */
for (factor = 1; factor < nfactors; factor++)
if (error[factor] && !between[factor])
printf ("%c", factorname[factor][0]);
printf ("%c/", factorname[RANDOM][0]);
for (factor = 1; factor < nfactors; factor++)
if (error[factor] && between[factor])
printf ("%c", factorname[factor][0]);
printf ("\t");
printf ("%16.4f %6d %14.4f\n\n", sserror, dferror, mserror);
}
}
boolean
subset (set1, set2) boolean *set1, *set2;
{
register int i;
for (i = 0; i < nfactors; i++)
if (set1[i] > set2[i]) return (false);
return (true);
}