/* utest -- Mann-Whitney U test * * utest [-t time] [-m picks] n1 n2 U * * utest has two modes: exact and Monte Carlo. Exact mode is the * default. In this mode all possible reassortments of the data are * examined to determine exact significance levels. The number of such * assortments is N!/(n1! n2!), so exact mode becomes impractically slow * as the datasets become large. Monte Carlo mode, specified by the -m * option, can be used for large datasets. In this mode reassortments * are generated at random. If a number is given following the -m * option, it is the number of random reassortments to check. If no * number is given it defaults to 10000, which is usually sufficient for * distinguishing significant, highly significant, and non-significant. * * The -t option is used to set a time limit. If * more than the specified time elapses, the test is aborted. The * default (if -t is specified without a number) is 60 sec. */ #include #include #include #include #include "common.h" #include "readline.h" #include "logs.h" #ifdef TOPT /* if we want -t option */ #include #include #endif #define DEFNP (10000L) /* default npicks */ #define DEFTIM (60) /* default nsecs */ typedef struct point { /* one data point */ int id; /* which dataset belongs to */ double x; /* value */ int r; /* rank */ } POINT; void *ealloc(); char *realloc(); double *points(); char *wdfield(); int strlen(); char *estrdup(); int picks(); int sr_picks(); void tsrandom(); void rpick(); /* long atol(); */ /* int atoi(); */ int readline(); long lpicks(int, int, int, int*); long gpicks(int, int, int, int*); long epicks(int, int, int, int*); double llpicks(int, int, int, int*); double lgpicks(int, int, int, int*); double lepicks(int, int, int, int*); double logpick(int, int); #ifdef TOPT unsigned alarm(); void sigabort(); #endif #ifdef NOPT double ndtr(double x); #endif char *PROGNAME; int mflag = FALSE; /* Monte Carlo pick generation */ long npicks = DEFNP; /* # picks to generate */ #ifdef TOPT int tflag = FALSE; /* limit time for tests */ int nsecs = DEFTIM; /* time to allow for test */ int aborted = FALSE; /* whether test was aborted */ /* * This must be local (i.e., static) on some systems, or it becomes * inaccessible to longjmp. */ static jmp_buf abort_buf; /* environment save for abort */ #endif #ifdef NOPT int nflag = FALSE; /* normal approximation */ #endif int dflag = FALSE; /* print debug info */ int cflag = FALSE; int indent = 0; /* indent for debugging output */ double U = 0.0; /* U value */ int n1 = 0; int n2 = 0; double sr; /* sum of ranks in 1st dataset */ long lsr; /* # picks < sr */ long esr; /* # picks = sr */ long gsr; /* # picks > sr */ double llsr; /* logs of lsr, esr, gsr */ double lesr; double lgsr; POINT *p; /* points being compared */ main(argc, argv) int argc; char *argv[]; { STARTUP; options(argc, argv); vals(argc, argv); tsrandom(); uwrite(n1, n2, U); EXITOK; } /* options -- process options for utest */ options(argc, argv) int argc; char *argv[]; { register char *s, *n; register int c; while(argc-- > 0) { if ( (NULL == (s = *argv++)) || ('-' != *s++) ) continue; argv[-1] = NULL; while('\0' != (c = *s++)) { switch(c) { /* * -d: print debugging info */ case 'd': if (dflag) usage(); dflag = TRUE; break; /* * -c: check picks */ case 'c': if (cflag) usage(); cflag = TRUE; break; #ifdef NOPT /* * -n: Normal approximation */ case 'n': if (nflag) usage(); nflag = TRUE; break; #endif #ifdef TOPT /* * -t: limit time for each test */ case 't': if (tflag) usage(); tflag = TRUE; if (isdigit(*s)) { n = s; while(isdigit(*s)) s++; } else if ( (argc > 0) && (NULL != (n = *argv)) && isdigit(*n) ) { argc--; *argv++ = NULL; } else break; nsecs = atoi(n); while(isdigit(c = *n++)); if ('\0' != c) usage(); break; #endif /* * -m: do Monte Carlo pick generation */ case 'm': if (mflag) usage(); mflag = TRUE; if (isdigit(*s)) { n = s; while(isdigit(*s)) s++; } else if ( (argc > 0) && (NULL != (n = *argv)) && isdigit(*n) ) { argc--; *argv++ = NULL; } else break; npicks = atol(n); while(isdigit(c = *n++)); if ('\0' != c) usage(); break; /* * something else: error */ default: usage(); } } } } /* vals -- get input values */ vals(argc, argv) int argc; char *argv[]; { char *s; for(; argc-- > 0; argv++) { if (NULL == *argv) continue; if ('\0' == **argv) usage(); n1 = strtol(*argv, &s, 0); if ('\0' != *s) usage(); *argv = NULL; break; } for(argv++; argc-- > 0; argv++) { if (NULL == *argv) continue; if ('\0' == **argv) usage(); n2 = strtol(*argv, &s, 0); if ('\0' != *s) usage(); *argv = NULL; break; } for(argv++; argc-- > 0; argv++) { if (NULL == *argv) continue; if ('\0' == **argv) usage(); U = strtod(*argv, &s); if ('\0' != *s) usage(); *argv = NULL; break; } } /* uwrite -- calculate a U probability and write results */ uwrite(n1, n2, U) int n1, n2; double U; { register int i, j, k; int sn1, sn2; int tn, rr; int *pick = NULL; int *rank = NULL; double tp; register long ltp; int sw; int cmp(); double llesr, lgesr; int fsr; /* floor(sr) */ /* * compute the sum of ranks sr for the smaller dataset */ if (sw = (n1 < n2)) { sn1 = n2; sn2 = n1; } else { sn1 = n1; sn2 = n2; } tn = sn1 + sn2; p = (POINT *) ealloc(tn * sizeof(POINT)); rank = (int *) ealloc(tn * sizeof(int)); for(i=0; i tn * LOG2) { /* if (i = picks(n2, tn, pick, sr_picks)) error1("can't happen %d", i); */ fsr = floor(sr); lsr = lpicks(sn2, tn, fsr, rank); esr = epicks(sn2, tn, fsr, rank); gsr = gpicks(sn2, tn, fsr, rank); if (fsr < sr) { lsr += esr; esr = 0; } if (sw) { ltp = lsr; lsr = gsr; gsr = ltp; } tp = lsr + esr + gsr; printf("%g%%\t%g%%\n", 100.0 * (lsr + esr) / tp, 100.0 * (gsr + esr) / tp ); } /* * exact test, but need to use logs */ else { fsr = floor(sr); llsr = llpicks(sn2, tn, fsr, rank); lesr = lepicks(sn2, tn, fsr, rank); lgsr = lgpicks(sn2, tn, fsr, rank); if (fsr < sr) { llsr = logsum(llsr, lesr); lesr = LOG0; } if (sw) { tp = llsr; llsr = lgsr; lgsr = tp; } llesr = logsum(llsr, lesr); lgesr = logsum(lgsr, lesr); tp = logsum(llesr, lgsr); printf("%g%%\t%g%%\n", (llesr < LOG1) ? 0 : 100.0 * exp(llesr - tp), (lgesr < LOG1) ? 0 : 100.0 * exp(lgesr - tp) ); } quit: #ifdef TOPT alarm(0); /* turn off alarm */ signal(SIGALRM, SIG_IGN); /* ignore any alarms */ #endif #ifdef NOPT if (nflag) { double p1, p2, z, Pu, Pl; p1 = n1; p1 *= n2; p1 /= 2.0; p2 = n1; p2 *= n2; p2 *= (n1 + n2 + 1); p2 /= 12.0; z = (U - p1) / sqrt(p2); Pl = ndtr(z); Pu = ndtr(-z); printf("%g%%\t%g%%\t(z = %g)\n", 100.0 * Pu, 100.0 * Pl, z); } #endif free(pick); free(p); free(rank); } #ifdef TOPT /* sigabort -- respond to SIGALRM by aborting test */ void sigabort(sig, code, scp, addr) int sig, code; struct sigcontext *scp; char *addr; { alarm(0); /* make sure alarm is off */ signal(SIGALRM, SIG_IGN); /* ignore further alarms */ longjmp(abort_buf, TRUE); /* return true to abort test */ } #endif /* sr_picks -- compare sums of ranks for picks to sr */ int sr_picks(n, N, pick) register int n; int N; register int pick[]; { register int csr = 0; while(n-- > 0) { csr += p[*pick++].r; } if (csr < sr) { lsr++; } else if (csr == sr) { esr++; } else { gsr++; } return(0); } /* usage -- complain about usage errors */ usage() { #ifdef TOPT error("usage: eutest [-m [n]] [-t [n]] n1 n2 U"); #else error("usage: eutest [-m [n]] n1 n2 U"); #endif }