/* eutest -- exact U test * * eutest [-u] [-m [n]] [-t [n]] [files] * * eutest compares two datasets for a difference in position by an * extension of the Mann-Whitney U test (also known as the Wilcoxon * rank sum test). The measurements in the two datasets are ranked from * 1 to N, where N = n1 + n2 is the total number of measurements in the * two datasets, then the ranks of the measurements in one dataset are * summed. (When there are equal measurements, the average rank is used * for each). Statistical significance is then determined by * reassorting the measurements between the two datasets, and * determining what portion of reassortments give rank sums less than or * equal to the actual value, and what proportion gives rank sums * greater than or equal to the actual value. * * The input consists of a series of lines, each containing one dataset. * A line consists of a title followed by numerical values. The title * and numerical values are separated from each other by whitespace * (blanks or tabs). At present there is no way to include whitespace * in a title. Input is read from the file named in the command line if * there is one, or from standard input if no file is named. If more * than one file is named, they are read in order and logically * concatenated. An example of an eutest input file is: * * I5- 8 10 9 8.5 9 8 9 * intact 12 9 11 10 11 8 10 * M3- 11 12 12 10 * * Every possible pairwise comparison of the datasets is done. (In this * case there are three: I5- vs intact, I5- vs M3-, and intact vs M3-.) * The ouput contains one line for each comparison. Each line begins * with the titles of the two datasets compared, followed by the * percentage of assortments that give a rank sum less than or equal to * the actual, followed by the percetage that give a greater than or * equal rank sum. The first dataset is significantly less than the * second at level alpha in a one-tailed test if the first percentage is * less than alpha. The first dataset is significantly greater than the * second in a one-tailed test if the second percentage is less than * alpha. The two datasets are different in a two-tailed test if either * percentage is less than alpha/2. * * eutest has two modes: exact and Monte Carlo. Exact mode is the * default. In this mode all possible reassortments of the data are * examined to determine exact significance levels. The number of such * assortments is N!/(n1! n2!), so exact mode becomes impractically slow * as the datasets become large. Monte Carlo mode, specified by the -m * option, can be used for large datasets. In this mode reassortments * are generated at random. If a number is given following the -m * option, it is the number of random reassortments to check. If no * number is given it defaults to 10000, which is usually sufficient for * distinguishing significant, highly significant, and non-significant. * The -t option is used to set a time limit on each comparison. If * more than the specified time elapses, the particular test is aborted * and eutest goes on the the next. This option is useful when some of * the pairwise tests are too time-consuming to be practical. The * default (if -t is specified without a number) is 60 sec. * * The -u option tells eutest not to calculate probabilities, but * instead to write, for each pair of datasets, numbers n1, n2, and U * that can be looked up in a U statistic table. */ #include #include #include #include "common.h" #include "readline.h" #include "logs.h" #ifdef TOPT /* if we want -t option */ #include #include #endif #define DEFNP (10000L) /* default npicks */ #define DEFTIM (60) /* default nsecs */ typedef struct ds { /* dataset structure */ struct ds *d_next; /* next dataset in list */ char *d_name; /* name of this dataset */ int d_n; /* number of data */ double *d_data; /* data points */ } DS; typedef struct point { /* one data point */ int id; /* which dataset belongs to */ double x; /* value */ int r; /* rank */ } POINT; void *ealloc(); char *realloc(); double *points(); DS *readds(); char *wdfield(); int strlen(); char *estrdup(); int picks(); int sr_picks(); void tsrandom(); void rpick(); long atol(); int atoi(); int readline(); long lpicks(); long gpicks(); long epicks(); double llpicks(); double lgpicks(); double lepicks(); double logpick(); #ifdef TOPT unsigned alarm(); void sigabort(); #endif char *PROGNAME; int mflag = FALSE; /* Monte Carlo pick generation */ long npicks = DEFNP; /* # picks to generate */ #ifdef TOPT int tflag = FALSE; /* limit time for tests */ int nsecs = DEFTIM; /* time to allow for test */ int aborted = FALSE; /* whether test was aborted */ /* * This must be local (i.e., static) on some systems, or it becomes * inaccessible to longjmp. */ static jmp_buf abort_buf; /* environment save for abort */ #endif int uflag = FALSE; /* just calculate U stats */ int cflag = FALSE; /* check picks */ int dflag = FALSE; /* print debug info */ int indent = 0; /* indent for debugging output */ FNL *inf = NULL; /* input file list */ FILE *cif = NULL; /* current input file */ DS *list = NULL; /* root of list of datasets */ int sr; /* sum of ranks in 1st dataset */ long lsr; /* # picks < sr */ long esr; /* # picks = sr */ long gsr; /* # picks > sr */ double llsr; /* logs of lsr, esr, gsr */ double lesr; double lgsr; POINT *p; /* points being compared */ main(argc, argv) int argc; char *argv[]; { DS *dp; STARTUP; options(argc, argv); files(argc, argv); /* * read the data */ for( list = NULL; NULL != (dp = readds()); ) { dp->d_next = list; list = dp; } if (list == NULL) EXITOK; /* * check every pairwise combination */ tsrandom(); while(list->d_next != NULL) { for(dp = list->d_next; NULL != dp; dp = dp->d_next) { euwrite(list, dp); } dp = list; list = list->d_next; dsfree(dp); } dsfree(list); EXITOK; } /* options -- process options for eutest */ options(argc, argv) int argc; char *argv[]; { register char *s, *n; register int c; while(argc-- > 0) { if ( (NULL == (s = *argv++)) || ('-' != *s++) ) continue; argv[-1] = NULL; while('\0' != (c = *s++)) { switch(c) { /* * -d: print debugging info */ case 'd': if (dflag) usage(); dflag = TRUE; break; /* * -c check picks */ case 'c': if (cflag) usage(); cflag = TRUE; break; /* * -u calculate U stats */ case 'u': if (uflag) usage(); uflag = TRUE; break; #ifdef TOPT /* * -t: limit time for each test */ case 't': if (tflag) usage(); tflag = TRUE; if (isdigit(*s)) { n = s; while(isdigit(*s)) s++; } else if ( (argc > 0) && (NULL != (n = *argv)) && isdigit(*n) ) { argc--; *argv++ = NULL; } else break; nsecs = atoi(n); while(isdigit(c = *n++)); if ('\0' != c) usage(); break; #endif /* * -m: do Monte Carlo pick generation */ case 'm': if (mflag) usage(); mflag = TRUE; if (isdigit(*s)) { n = s; while(isdigit(*s)) s++; } else if ( (argc > 0) && (NULL != (n = *argv)) && isdigit(*n) ) { argc--; *argv++ = NULL; } else break; npicks = atol(n); while(isdigit(c = *n++)); if ('\0' != c) usage(); break; /* * something else: error */ default: usage(); } } } } /* files -- get list of input files */ files(argc, argv) int argc; char *argv[]; { register FNL **fp = &inf; for(; argc-- > 0; argv++) { if (NULL == *argv) continue; *fp = (FNL *) ealloc(sizeof(FNL)); (*fp)->f_n = estrdup(*argv); (*fp)->f_nxt = NULL; fp = &(*fp)->f_nxt; } if (NULL == inf) { /* if no files, use stdin */ cif = stdin; inf = (FNL *) ealloc(sizeof(FNL)); inf->f_nxt = NULL; inf->f_n = estrdup("Standard Input"); } } /* dsfree -- free a dataset */ dsfree(dp) register DS *dp; { free(dp->d_name); free(dp->d_data); free(dp); } /* readds -- read a dataset */ DS * readds() { char *lbuf; register DS *dp; register char *lp, *fp; /* read a line */ if (EOF == readline(&lbuf, &cif, &inf)) return(NULL); dp = (DS *) ealloc(sizeof(DS)); /* allocate a dataset for it */ dp->d_next = NULL; for(lp=lbuf; isspace(*lp); lp++); /* get name */ fp = lp; lp = wdfield(lp); dp->d_name = estrdup(fp); points(lp, dp); /* get data */ free(lbuf); return(dp); } #define CHUNK 10 /* points in a chunk of data */ /* points -- turn an ASCII list into an array of doubles */ double * points(lbuf, dp) char *lbuf; DS *dp; { char *lp; int space; int a, c; double dd; dp->d_n = 0; dp->d_data = (double *) ealloc(CHUNK * sizeof(double)); space = CHUNK; while(*lbuf != '\0') { /* pull a field */ lp = lbuf; lbuf = wdfield(lbuf); c = '1'; /* decode it */ a = sscanf(lp, "%lf%c", &dd, &c); if ((c == '\0') ? (a != 2) : (a != 1)) error1("%s is not a number", lp); if (space <= dp->d_n) { /* if no room for it... */ space += CHUNK; /* ...make some */ dp->d_data = (double *) realloc(dp->d_data, space * sizeof(double)); if (NULL == dp->d_data) error("no memory"); } dp->d_data[dp->d_n++] = dd; /* shove it in */ } /* discard excess space */ dp->d_data = (double *) realloc(dp->d_data, dp->d_n * sizeof(double)); } /* wdfield -- pull out a whitespace-delimited field * * char *np, *fp; * fp = np; np = wdfield(fp); * * On call np points to a character string beginning with * a non-white char. wdfield scans forward for the next * whitespace and places a null there. A pointer to the * next non-whitespace or the end of the string is returned * in np. */ char * wdfield(fp) register char *fp; { register int c; while(!isspace(c = *fp++) && (c != '\0')); *--fp = '\0'; if (c == '\0') return(fp); for(fp++; isspace(*fp); fp++); return(fp); } /* euwrite -- do an exact U test on two datasets and write results */ euwrite(dp1, dp2) DS *dp1, *dp2; { register int i, j, k; int tn, rr; int *pick = NULL; int *rank = NULL; double tp; register long ltp; int sw; DS *dp = NULL; int cmp(); double llesr, lgesr; /* * compute the sum of ranks sr for the smaller dataset */ if (sw = (dp1->d_n > dp2->d_n)) { dp = dp1; dp1 = dp2; dp2 = dp; } tn = dp1->d_n + dp2->d_n; p = (POINT *) ealloc(tn * sizeof(POINT)); rank = (int *) ealloc(tn * sizeof(int)); for(i=0; id_n; i++) { p[i].x = dp1->d_data[i]; p[i].id = 0; } j = i; for(i=0; id_n; i++, j++) { p[j].x = dp2->d_data[i]; p[j].id = 1; } qsort(p, tn, sizeof(POINT), cmp); sr = 0; for(i=0; id_n; long sn1 = dp2->d_n; double U1 = sr/2.0 - sn2*(sn2-1)/2.0; double U2 = sn1*sn2 + sn2*(sn2-1)/2.0 - sr/2.0; double U = (U1 > U2) ? U1 : U2; printf("%s\t%s\tn1=%ld n2=%ld U=%.1f\n", dp1->d_name, dp2->d_name, sn1, sn2, U ); goto quit; } pick = (int *) ealloc(dp1->d_n * sizeof(int)); lsr = esr = gsr = 0; #ifdef TOPT /* * set up alarm if time limit in effect */ if (tflag) { /* there is a time limit */ if (aborted = setjmp(abort_buf)) { if (sw) { dp = dp1; dp1 = dp2; dp2 = dp; } printf("%s\t%s\ttime limit expired\n", dp1->d_name, dp2->d_name ); goto quit; } if ( (EOF == (int) signal(SIGALRM, sigabort)) || (EOF == alarm(nsecs)) ) error("unable to set alarm"); } #endif /* * compare sums of ranks for random picks: Monte Carlo test */ if (mflag) { for(ltp=0; ltpd_n, tn, pick); sr_picks(dp1->d_n, tn, pick); } if (sw) { ltp = lsr; lsr = gsr; gsr = ltp; dp = dp1; dp1 = dp2; dp2 = dp; } tp = lsr + esr + gsr; printf("%s\t%s\t%g%% (%ld/%ld)\t%g%% (%ld/%ld)\n", dp1->d_name, dp2->d_name, 100.0 * (lsr + esr) / tp, lsr + esr, npicks, 100.0 * (gsr + esr) / tp, gsr + esr, npicks ); } /* * compare sums of ranks for all picks: exact test */ else if (LOGMAXL > tn * LOG2) { /* if (i = picks(dp1->d_n, tn, pick, sr_picks)) error1("can't happen %d", i); */ lsr = lpicks(dp1->d_n, tn, sr, rank); esr = epicks(dp1->d_n, tn, sr, rank); gsr = gpicks(dp1->d_n, tn, sr, rank); if (sw) { ltp = lsr; lsr = gsr; gsr = ltp; dp = dp1; dp1 = dp2; dp2 = dp; } tp = lsr + esr + gsr; printf("%s\t%s\t%g%%\t%g%%\n", dp1->d_name, dp2->d_name, 100.0 * (lsr + esr) / tp, 100.0 * (gsr + esr) / tp ); } /* * exact test, but need to use logs */ else { llsr = llpicks(dp1->d_n, tn, sr, rank); lesr = lepicks(dp1->d_n, tn, sr, rank); lgsr = lgpicks(dp1->d_n, tn, sr, rank); if (sw) { tp = llsr; llsr = lgsr; lgsr = tp; dp = dp1; dp1 = dp2; dp2 = dp; } llesr = logsum(llsr, lesr); lgesr = logsum(lgsr, lesr); tp = logsum(llesr, lgsr); printf("%s\t%s\t%g%%\t%g%%\n", dp1->d_name, dp2->d_name, (llesr < LOG1) ? 0 : 100.0 * exp(llesr - tp), (lgesr < LOG1) ? 0 : 100.0 * exp(lgesr - tp) ); } quit: #ifdef TOPT alarm(0); /* turn off alarm */ signal(SIGALRM, SIG_IGN); /* ignore any alarms */ #endif free(pick); free(p); free(rank); } int cmp(p1, p2) POINT *p1, *p2; { return( (p1->x < p2->x) ? -1 : ( (p1->x > p2->x) ? 1 : 0 ) ); } #ifdef TOPT /* sigabort -- respond to SIGALRM by aborting test */ void sigabort(sig, code, scp, addr) int sig, code; struct sigcontext *scp; char *addr; { alarm(0); /* make sure alarm is off */ signal(SIGALRM, SIG_IGN); /* ignore further alarms */ longjmp(abort_buf, TRUE); /* return true to abort test */ } #endif /* sr_picks -- compare sums of ranks for picks to sr */ int sr_picks(n, N, pick) register int n; int N; register int pick[]; { register int csr = 0; while(n-- > 0) { csr += p[*pick++].r; } if (csr < sr) { lsr++; } else if (csr == sr) { esr++; } else { gsr++; } return(0); } /* usage -- complain about usage errors */ usage() { #ifdef TOPT error("usage: eutest [-m [n]] [-t [n]] [files]"); #else error("usage: eutest [-m [n]] [files]"); #endif }