diff --git a/README.md b/README.md index 88080a8..8fd20a9 100644 --- a/README.md +++ b/README.md @@ -384,13 +384,22 @@ wiggletools max test/fixedStep.bw test/variableStep.bw * Welch's t-test -Computes the two-tailed p-value of Welch's t-test comparing to sets of numbers, each assumed to have a normal distribution: +Computes the two-tailed p-value of Welch's t-test comparing two sets of numbers, each assumed to have a normal distribution: ``` wiggletools ttest test/fixedStep.bw test/variableStep.bw test/fixedStep.wig \ : test/fixedStep.wig test/variableStep.bw test/fixedStep.wig ``` +* F-test + +Computes the p-value of the F-test comparing sets of numbers, each assumed to have a normal distribution: + +``` +wiggletools ftest test/fixedStep.bw test/variableStep.bw test/fixedStep.wig \ + : test/fixedStep.wig test/variableStep.bw test/fixedStep.wig +``` + * Wilcoxon's sum rank test Non-parametric equivalent of the above: diff --git a/src/commandParser.c b/src/commandParser.c index 76d06bd..88b18ac 100644 --- a/src/commandParser.c +++ b/src/commandParser.c @@ -42,16 +42,17 @@ puts("\twiggletools program"); puts(""); puts("Program grammar:"); puts("\tprogram = (iterator) | do (iterator) | (extraction)"); -puts("\titerator = (filename) | (unary_operator) (iterator) | (binary_operator) (iterator) (iterator) | (reducer) (multiplex) | (setComparison) (multiplex) (multiplex) | print (statistic)"); +puts("\titerator = (in_filename) | (unary_operator) (iterator) | (binary_operator) (iterator) (iterator) | (reducer) (multiplex) | (setComparison) (multiplex_list) | print (statistic)"); puts("\tunary_operator = unit | write (output) | write_bg (ouput) | smooth (int) | exp | ln | log (float) | pow (float) | offset (float) | scale (float) | gt (float) | default (float) | isZero | (statistic)"); -puts("\toutput = (filename) | -"); -puts("\tfilename = *.wig | *.bw | *.bed | *.bb | *.bg | *.bam | *.vcf | *.bcf"); +puts("\toutput = (out_filename) | -"); +puts("\tin_filename = *.wig | *.bw | *.bed | *.bb | *.bg | *.bam | *.vcf | *.bcf"); puts("\tstatistic = AUC | meanI | varI | minI | maxI | stddevI | CVI | pearson (iterator)"); puts("\tbinary_operator = diff | ratio | overlaps | apply (statistic) | fillIn"); puts("\treducer = cat | sum | product | mean | var | stddev | entropy | CV | median | min | max"); +puts("\tsetComparison = ttest | ftest | wilcoxon"); +puts("\tmultiplex_list = (multiplex) | (multiplex) : (multiplex_list)"); puts("\tmultiplex = (iterator_list) | map (unary_operator) (multiplex) | strict (multiplex)"); -puts("\titerator_list = (iterator) : | (iterator) (iterator_list)"); -puts("\tsetComparison = ttest | wilcoxon"); +puts("\titerator_list = (iterator) | (iterator) : (iterator_list)"); puts("\textraction = profile (output) (int) (iterator) (iterator) | profiles (output) (int) (iterator) (iterator) | histogram (output) (width) (iterator_list) | mwrite (output) (multiplex) | mwrite_bg (output) (multiplex)"); puts("\t\t| apply_paste (out_filename) (statistic) (bed_file) (iterator)"); @@ -302,6 +303,28 @@ static Multiplexer * readMultiplexer() { return readMultiplexerToken(token); } +static Multiplexer ** readMultiplexerList(int * count) { + size_t buffer_size = 8; + char * token; + Multiplexer ** multis = (Multiplexer **) calloc(buffer_size, sizeof(Multiplexer*)); + *count = 0; + + for (token = nextToken(0,0); token != NULL && strcmp(token, ":"); token = nextToken(0,0)) { + if (*count == buffer_size) { + buffer_size *= 2; + multis = (Multiplexer **) realloc(multis, buffer_size * sizeof(Multiplexer*)); + } + multis[(*count)++] = readMultiplexerToken(token); + } + return multis; +} + +static Multiset * readMultiset() { + int count = 0; + Multiplexer ** multis = readMultiplexerList(&count); + return newMultiset(multis, count); +} + static WiggleIterator * readTee() { FILE * file = readOutputFilename(); return TeeWiggleIterator(readIterator(), file, false, holdFire); @@ -496,6 +519,10 @@ static WiggleIterator * readTTest() { return TTestReduction(newMultiset(multis, 2)); } +static WiggleIterator * readFTest() { + return FTestReduction(readMultiset()); +} + static WiggleIterator * readMWUTest() { Multiplexer ** multis = calloc(2, sizeof(Multiplexer *)); multis[0] = readMultiplexer(); @@ -600,6 +627,8 @@ static WiggleIterator * readIteratorToken(char * token) { return readOverlap(); if (strcmp(token, "ttest") == 0) return readTTest(); + if (strcmp(token, "ftest") == 0) + return readFTest(); if (strcmp(token, "wilcoxon") == 0) return readMWUTest(); if (strcmp(token, "AUC") == 0) @@ -623,7 +652,7 @@ static WiggleIterator * readIteratorToken(char * token) { if (strcmp(token, "apply") == 0) return SelectReduction(readApply(), 0); - return SmartReader(token); + return SmartReader(token, holdFire); } @@ -701,7 +730,7 @@ static Multiplexer * readApplyPaste() { exit(1); } - return PasteMultiplexer(ApplyMultiplexer(SmartReader(infilename), statistics, count, readLastIterator(), strict), infile, outfile, false); + return PasteMultiplexer(ApplyMultiplexer(SmartReader(infilename, holdFire), statistics, count, readLastIterator(), strict), infile, outfile, false); } void readPrint() { diff --git a/src/setComparisons.c b/src/setComparisons.c index 78d40a3..fb5ea77 100644 --- a/src/setComparisons.c +++ b/src/setComparisons.c @@ -130,6 +130,105 @@ WiggleIterator * TTestReduction(Multiset * multi) { return newWiggleIterator(data, &TTestReductionPop, &SetComparisonSeek, NAN); } + +//////////////////////////////////////////////////////// +// F-test +//////////////////////////////////////////////////////// + +typedef struct ftestData_st { + Multiset * multi; + int * counts; + double * means; + int total_count; +} FTestData; + +void FTestSeek(WiggleIterator * iter, const char * chrom, int start, int finish) { + FTestData * data = (FTestData* ) iter->data; + seekMultiset(data->multi, chrom, start, finish); + pop(iter); +} + + +void FTestReductionPop(WiggleIterator * wi) { + if (wi->done) + return; + + FTestData * data = (FTestData *) wi->data; + Multiset * multi = data->multi; + + if (multi->done) { + wi->done = true; + return; + } + + // Go to first position where both of the sets have at least one value + while (!multi->inplay[0] || !multi->inplay[1]) { + popMultiset(multi); + if (multi->done) { + wi->done = true; + return; + } + } + wi->chrom = multi->chrom; + wi->start = multi->start; + wi->finish = multi->finish; + + // Compute means + double mean = 0; + int groups = multi->count; + int index, index2; + for (index = 0; index < groups; index++) { + Multiplexer * mplx = multi->multis[index]; + data->means[index] = 0; + for (index2 = 0; index < mplx->count; index++) { + if (mplx->inplay[index2]) + data->means[index] += mplx->values[index2]; + else + data->means[index] += mplx->iters[index2]->default_value; + } + mean += data->means[index]; + data->means[index] /= data->counts[index]; + } + mean /= data->total_count; + + double inter = 0; + double intra = 0; + for (index = 0; index < groups; index++) { + Multiplexer * mplx = multi->multis[index]; + inter += mplx->count * (data->means[index] - mean) * (data->means[index] - mean); + for (index2 = 0; index < mplx->count; index++) { + if (mplx->inplay[index2]) + intra += (mplx->values[index2] - data->means[index]) * (mplx->values[index2] - data->means[index]); + else + intra += (mplx->iters[index2]->default_value - data->means[index]) * (mplx->iters[index2]->default_value - data->means[index]); + } + } + + // F-statistic + inter /= groups - 1; + intra /= data->total_count - groups; + double f = inter / intra; + + // P-value + wi->value = 2 * gsl_cdf_fdist_Q(f, multi->count - 1, data->total_count - multi->count); + + // Update inputs + popMultiset(multi); +} + +WiggleIterator * FTestReduction(Multiset * multi) { + FTestData * data = (FTestData *) calloc(1, sizeof(FTestData)); + data->multi = multi; + data->means = calloc(multi->count, sizeof(double)); + data->counts = calloc(multi->count, sizeof(int)); + int index; + for (index = 0; index < multi->count; index++) { + data->counts[index] = multi->multis[index]->count; + data->total_count += data->counts[index]; + } + return newWiggleIterator(data, &FTestReductionPop, &FTestSeek, NAN); +} + //////////////////////////////////////////////////////// // Mann-Whitney U (Wilcoxon rank-sum test) //////////////////////////////////////////////////////// diff --git a/src/wiggletools.h b/src/wiggletools.h index dc82ee3..027d37d 100644 --- a/src/wiggletools.h +++ b/src/wiggletools.h @@ -89,6 +89,7 @@ Multiset * newMultiset(Multiplexer **, int); // Reduction operators on sets of sets: WiggleIterator * TTestReduction(Multiset *); +WiggleIterator * FTestReduction(Multiset *); WiggleIterator * MWUReduction(Multiset *); // Output