-
Notifications
You must be signed in to change notification settings - Fork 1
/
sanxot.py
executable file
·1507 lines (1237 loc) · 53.7 KB
/
sanxot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import pdb
import sys
import getopt
import stats
import operator
import os
import random
from scipy.stats import norm
from numpy import *
from scipy.optimize import leastsq
from time import strftime
from datetime import datetime
import pprint
pp = pprint.PrettyPrinter(indent=4)
#######################################################
class higherResult:
    """Container for one higher-level element produced by an integration.

    Holds the identifier of the higher-level element together with its
    weighted-average ratio and its accumulated weight.
    """
    def __init__(self, id2 = None, Xj = None, Vj = None):
        self.Vj = Vj    # higher level weight (sum of lower-level weights)
        self.Xj = Xj    # higher level log2 ratio (weighted average)
        self.id2 = id2  # higher level identifier
#------------------------------------------------------
class lowerResult:
    """Container for one lower-level element after centring on its higher level.

    Stores the lower-level identifier, the centred ratio Xi - Xj, and both
    weights (with and without the integration variance).
    """
    def __init__(self, id1 = None, XiXj = None, Wij = None, Vi = None):
        self.Vi = Vi      # lower level weight without the added variance
        self.Wij = Wij    # lower level weight including the variance
        self.XiXj = XiXj  # centred ratio: lower X minus higher X
        self.id1 = id1    # lower level identifier
#------------------------------------------------------
class statsResult:
    """One row of the per-relation statistics table ("outStats" file).

    Links a lower-level element to the higher-level element it was
    integrated into, carrying the values of both plus the derived
    statistics (Z distance and FDR).
    """
    def __init__(self, id2 = None, Xj = None, Vj = None, id1 = None, Xi = None, Vi = None, Wij = None, nij = None, Zij = None, FDRij = None, tags = ""):
        # higher level side of the relation
        self.id2 = id2        # higher level identifier
        self.Xj = Xj          # higher level X (weighted average)
        self.Vj = Vj          # higher level V (weight)
        # lower level side of the relation
        self.id1 = id1        # lower level identifier
        self.Xi = Xi          # lower level X (log2 ratio)
        self.Vi = Vi          # lower level weight without adding variance
        self.Wij = Wij        # lower level weight including variance
        # derived statistics
        self.nij = nij        # number of lower elements within the higher element
        self.Zij = Zij        # distance in sigmas
        self.absZij = None    # |Zij|; filled in later, used only for sorting
        self.FDRij = FDRij    # false discovery rate
        self.tags = tags      # tags copied from the relations file, if any
#------------------------------------------------------
def varDiff(params,
            inputRawData,
            inputRelations,
            verbose = False,
            forHtml = False,
            logResults = None,
            acceptedError = 0.001,
            removeDuplicateUpper = False):
    """Residual function minimised by the Levenberg-Marquardt fit in integrate().

    params[0] is the candidate variance seed. The data are integrated with
    that seed, and a robust (MAD-based) estimate of the dispersion of the
    weighted residuals Fi = gs * W * (Xi - Xj)^2 is computed. When the seed
    equals the true inter-level variance this estimate is 1, so the function
    returns |estimate - 1| as the residual to drive to zero.

    NOTE(review): this sorts inputRawData and inputRelations in place, and
    appends a progress line to logResults (mutates its arguments).
    """
    MADconstant = 1.48260221850560 # *** 1 / DISTR.NORM.ESTAND.INV(3 / 4) get exact number
    varianceSeed = params[0]
    inputRawData.sort()
    inputRelations.sort()
    # integrate the lower level with the candidate variance; also get the
    # merged (higher id - lower id - X - V - W) rows to compute residuals
    nextIdXData = getNextIdX_sanxot(inputRawData, inputRelations, variance = varianceSeed, giveMergedData = True, removeDuplicateUpper = removeDuplicateUpper)
    nextIdX = nextIdXData[0]
    mergedData = nextIdXData[1]
    newlist = []
    for orow in nextIdX:
        sequence = orow[0]
        # if sequence == "R.VNQAIALLTIGAR.E":
        # it is important to avoid sorting to keep it fast
        # so in next line do not foget sort = False
        # this should be sorted in the beginnig
        scanListWithSequence = stats.filterByElement(mergedData, sequence, sort = False)
        if len(scanListWithSequence) > 1: # otherwise Xi = Xj --> Xi - Xj = 0 --> does not work
            for scanRow in scanListWithSequence:
                newrow = []
                weight = scanRow[4] # the W (weight taking into account the variance)
                degreesOfFreedom = len(scanListWithSequence)
                # gs: small-sample correction n / (n - 1)
                gs = float(degreesOfFreedom) / (float(degreesOfFreedom) - 1.0)
                XiXj = scanRow[2] - orow[1]
                Fi = gs * weight * (XiXj ** 2)
                newrow.append(sequence) # sequence --> 0
                newrow.append(scanRow[1]) # scan number --> 1
                newrow.append(XiXj) # Xi - Xj --> 2
                newrow.append(weight) # weight --> 3
                newrow.append(len(scanListWithSequence)) # degrees of freedom --> 4
                newrow.append(Fi) # Fi distribution --> 5
                newlist.append(newrow)
    # robust dispersion estimate: squared MAD constant times the median Fi
    variance = (MADconstant ** 2) * stats.medianByIndex(newlist, index = 5)
    printLine = "seed = " + str(varianceSeed) + ", sigma2 = " + str(variance)
    logResults.append([printLine])
    if verbose:
        print printLine
        if forHtml: print "</br>"
    # leastsq expects an array of residuals; at the solution this is ~0
    return asarray([fabs(variance - 1.0)])
#------------------------------------------------------
def integrate(data = None,
relations = None,
dataFile = "",
relationsFile = "",
varianceSeed = 0.001,
maxIterations = 0,
verbose = False,
showGraph = False,
forceParameters = False,
forceEvenNegativeVariance = False,
giveAllResults = True,
forHtml = False,
randomiseRelfile = False,
confluenceRelfile = False,
randomisedFileName = "",
confluencedFileName = "",
acceptedError = 0.001,
minVarianceSeed = 1e-3,
removeDuplicateUpper = False,
forceEmergencyVariance = False,
emergencySweep = False,
sweepDecimals = 3,
includeOrphans = False,
tags = "",
logicOperatorsAsWords = False):
logResults = []
outStats = []
outStatsExcluded = []
lowerNormExcluded = []
higherOrphanList = []
filteredRelations = None
relationsExcluded = None
varianceSeedOriginal = varianceSeed
if data == None:
if len(dataFile) == 0:
print "Error: no input data"
if forHtml: print "</br>"
sys.exit()
else:
data = stats.loadInputDataFile(dataFile)
if relations == None:
if len(relationsFile) == 0:
if not confluenceRelfile:
print "Error: no relations file"
if forHtml: print "</br>"
sys.exit()
else: # then means a confluence is made from the data file, getting ids from column 0 (first)
idsForConfluence = stats.extractColumns(data, 0)
relations = stats.getConfluenceList(idsForConfluence, deleteDuplicates = True)
if len(confluencedFileName) > 0:
stats.saveFile(confluencedFileName, relations, "idsup\tidinf")
else:
relations = stats.loadRelationsFile(relationsFile)
if confluenceRelfile: # this means a confluence is made from the relations file, getting ids from column 1 (second)
idsForConfluence = stats.extractColumns(relations, 1)
relations = stats.getConfluenceList(idsForConfluence, deleteDuplicates = True)
if len(confluencedFileName) > 0:
stats.saveFile(confluencedFileName, relations, "idsup\tidinf")
if len(tags) > 0:
filteredRelations, relationsExcluded = stats.filterRelations(relations, tags, logicOperatorsAsWords)
else:
filteredRelations = relations
relationsExcluded = []
if randomiseRelfile:
filteredRelations = stats.getRandomList(filteredRelations)
if len(randomisedFileName) > 0:
stats.saveFile(randomisedFileName, filteredRelations, "idsup\tidinf")
if filteredRelations == None and len(filteredRelations) == 0:
print "Error: no relations data"
if forHtml: print "</br>"
sys.exit()
success = True
if forceParameters:
if varianceSeed < 0 and not forceEvenNegativeVariance:
# informing the used about forced negative variance being corrected to zero
message = "Since variance is negative, it will be reset to zero.\nOriginal forced variance was %s." % str(varianceSeed)
logResults.extend([[], [message], []])
print
if forHtml: print "</br>"
print message
if forHtml: print "</br>"
print
if forHtml: print "</br>"
varianceSeed = 0
variance = varianceSeed
else:
success = False
stopAnyway = False
iterationCounter = 0
tries = 0
while not (success or stopAnyway):
tries += 1
bestSol, cov, infodict, mesg, ier = leastsq(varDiff, varianceSeed, args = (data, filteredRelations, verbose, forHtml, logResults, acceptedError, removeDuplicateUpper), maxfev = maxIterations, full_output = True)
variance = bestSol[0]
LMError = fabs(infodict["fvec"][0])
numberOfCalls = infodict["nfev"]
iterationCounter += numberOfCalls
if LMError < acceptedError:
success = True
else:
if maxIterations > 0 and iterationCounter >= maxIterations and not success:
stopAnyway = True
message = "After %i iterations, the Levenberg-Marquardt algorithm was unable to find a solution" % iterationCounter
logResults.extend([[], [message], []])
print
if forHtml: print "</br>"
print message
if forHtml: print "</br>"
print
if forHtml: print "</br>"
if not stopAnyway:
firstCalc = True
while firstCalc or varianceSeed < -0.03:
# -0.03 is a value that has proven not to give problems when seed is above it
# seed below (about) -0.03 have shown to make the LM algorithm collapse,
# giving values incrementally big in each iteration, instead of approaching the solution
# it has to be tested further, anyway
firstCalc = False
if abs(varianceSeed) <= minVarianceSeed or str(varianceSeed) == "nan":
varianceSeed = random.random() - 0.5
else:
varianceSeed = varianceSeed * 4 * (random.random() - 0.5)
message = "Solution did not converge, retrying with variance seed = %f" % varianceSeed
logResults.extend([[], [message], []])
print
if forHtml: print "</br>"
print message
if forHtml: print "</br>"
print
if forHtml: print "</br>"
if success:
message = "Solution found with variance seed = %f, after %i iterations and %i tries" % (varianceSeed, iterationCounter, tries)
logResults.extend([[], [message], []])
print
if forHtml: print "</br>"
print message
if forHtml: print "</br>"
print
if forHtml: print "</br>"
else:
message = "SOLUTION *NOT* FOUND, after %i iterations and %i tries" % (iterationCounter, tries)
logResults.extend([[], [message], []])
print
if forHtml: print "</br>"
print message
if forHtml: print "</br>"
print
if forHtml: print "</br>"
if emergencySweep:
stepsTakenEachSide = 10
bestVariance = 0.0
leastDif = sys.float_info.max
message = "Performing sweep to get best variance upto %i decimals" % sweepDecimals
logResults.extend([[], [message], []])
print
print message
print
for decimal in range(sweepDecimals + 1):
startVariance = bestVariance
for sweepTry in range(stepsTakenEachSide * 2):
varianceTry = startVariance + (sweepTry - stepsTakenEachSide) * 10.0**(-decimal)
params = [varianceTry]
varianceDifference = varDiff(params, data, filteredRelations, verbose = False, logResults = logResults)
message = "Variance difference when sweeping with variance %f -> %f." % (varianceTry, varianceDifference)
logResults.extend([[message]])
if verbose:
print message
if varianceDifference < leastDif:
leastDif = varianceDifference
bestVariance = varianceTry
variance = bestVariance
message = "After sweeping, the best variance found was: %f.\nForcing this variance, as requested by user." % variance
logResults.extend([[], [message], []])
print
print message
print
success = True
if forceEmergencyVariance and not emergencySweep:
message = "Forcing seed variance %f as emergency variance, as requested by user" % varianceSeed
logResults.extend([[], [message], []])
print
print message
print
variance = varianceSeedOriginal
success = True
if success and not forceEvenNegativeVariance and variance < 0:
message = "Calculated variance was %s.\nSince variance is negative, it will be reset to zero." % str(variance)
logResults.extend([[], [message], []])
print message
print
variance = 0
if giveAllResults:
mergedData = stats.getIdXW(data, filteredRelations, variance, giveMergedData = True, removeDuplicateUpper = removeDuplicateUpper)[1]
higherLevel, outStats, lowerNorm = makeStats(variance = variance, input = mergedData)
if len(relationsExcluded) > 0:
outStatsExcluded, lowerNormExcluded, higherOrphanList = calculateExcluded(relationsExcluded, outStats, data, variance, includeOrphans)
message = "Total elements excluded using tags: %i (of %i)" % (len(relationsExcluded), len(relationsExcluded) + len(outStats))
logResults.extend([[message], []])
outStats += outStatsExcluded
lowerNorm += lowerNormExcluded
higherLevel += higherOrphanList
return variance, higherLevel, outStats, lowerNorm, logResults, success
else:
return variance, logResults, success
#------------------------------------------------------
# def sortByZij(list):
# newList = []
# for element in list:
# if str(element.Zij).lower() == "nan":
# # trick used, because the sort method has a bug with nans
# element.FDRij = sys.float_info.max
# else:
# element.FDRij = fabs(element.Zij)
# outputStats = stats.sortByInstance(outputStats, "FDRij")
# for element in list:
# if
# return newList
#------------------------------------------------------
def calculateExcluded(relsExcludedOriginal,
                      statsOriginal,
                      dataOriginal,
                      variance,
                      includeOrphans = False,
                      orphanTag = "orphan"):
    """Build stats and lower-level rows for relations excluded by tag filtering.

    For each excluded relation whose lower element is present in the data,
    a statsResult/lowerResult pair is produced. Three cases:
      * the higher element still has included lower elements -> copy its
        Xj/Vj from the included stats and compute a real Z,
      * the higher element lost all its lower elements and includeOrphans is
        False -> mark the row as "excluded",
      * includeOrphans is True -> integrate the orphan group on the fly and
        report it as an extra higher-level element (tagged orphanTag).

    Returns (outStatsExcluded, lowerNormExcluded, higherOrphanList).
    NOTE(review): indentation reconstructed from a whitespace-stripped source;
    structure inferred from the per-higher-element reset of selectionStats.
    """
    outStatsExcluded = []
    lowerNormExcluded = []
    # work on copies so the caller's lists are not reordered
    statsIncluded = statsOriginal[:]
    relsExcluded = relsExcludedOriginal[:]
    txtExcluded = "excluded"
    txtPending = "pending"
    statsIncluded = stats.sortByInstance(statsIncluded, "id2")
    dataOriginal = stats.sortByIndex(dataOriginal, 0)[:]
    # sort by higher id then lower id so rows of one higher element are contiguous
    relsExcluded = stats.sortByIndex(relsExcluded, 0, 1)[:]
    relIdSup = ""
    selectionStats = []
    higherOrphanList = []
    higherOrphanAdded = False
    for relExcluded in relsExcluded:
        # rows of the data file for this lower-level id (may be empty)
        selectionData = stats.filterByElement(dataOriginal, relExcluded[1], sort = False)
        if len(selectionData) > 0:
            if relExcluded[0] != relIdSup:
                # entering a new higher-level element: refresh its included stats
                relIdSup = relExcluded[0]
                selectionStats = [x for x in statsIncluded if x.id2 == relIdSup]
                higherOrphan = []
                higherOrphanAdded = False
            if len(selectionStats) == 0:
                # whole inferior level of a superior element excluded using tags
                if not includeOrphans:
                    relInOutStats = statsResult(
                        id2 = relExcluded[0], # idsup
                        Xj = txtExcluded, # Xsup
                        Vj = txtExcluded, # Vsup
                        id1 = relExcluded[1], # idinf
                        Xi = selectionData[0][1], # Xinf
                        Vi = selectionData[0][2], # Vinf
                        nij = 0, # n
                        Zij = txtExcluded, # Z calculation few lines below
                        FDRij = txtExcluded) # FDR not calculable
                    relInLowerNorm = lowerResult(
                        id1 = relExcluded[1], # idinf
                        XiXj = txtExcluded, # X'inf = Xinf - Xsup
                        Vi = relInOutStats.Vi, # Vinf
                        Wij = txtExcluded) # Winf
                else:
                    # include orphans
                    # then, the stats are not taken from included data (statsIncluded),
                    # but from anything pointing to the higher level element (relsExcluded)
                    relExcluded = stats.addTagToRelations([relExcluded], [relExcluded], orphanTag)[0]
                    relsOrphan = [x for x in relsExcluded if x[0] == relIdSup]
                    # integrate dataOriginal using relsOrphan
                    if not higherOrphan and not higherOrphanAdded:
                        # integrate the orphan group once per higher element
                        mergedOrphans = stats.getIdXW(dataOriginal, relsOrphan, variance, giveMergedData = True, removeDuplicateUpper = True)[1]
                        higherOrphan, outStatsOrphan, lowerNormOrphan = makeStats(variance = variance, input = mergedOrphans)
                    relInOutStats = statsResult(
                        id2 = relExcluded[0], # idsup
                        Xj = higherOrphan[0].Xj, # Xsup
                        Vj = higherOrphan[0].Vj, # Vsup
                        id1 = relExcluded[1], # idinf
                        Xi = selectionData[0][1], # Xinf
                        Vi = selectionData[0][2], # Vinf
                        nij = 0, # n
                        Zij = txtExcluded, # Z calculation few lines below
                        FDRij = txtExcluded) # FDR not calculable
                    relInLowerNorm = lowerResult(
                        id1 = relExcluded[1], # idinf
                        XiXj = relInOutStats.Xi - relInOutStats.Xj, # X'inf = Xinf - Xsup
                        Vi = relInOutStats.Vi, # Vinf
                        Wij = getW_sanxot(relInOutStats.Vi, variance)) # Winf
                    higherOrphanForList = higherResult(
                        id2 = higherOrphan[0].id2, # idsup
                        Xj = higherOrphan[0].Xj, # Xsup
                        Vj = higherOrphan[0].Vj) #Vsup
            else:
                # higher element still exists in the included stats: copy its values
                relInOutStats = statsResult(
                    id2 = relExcluded[0], # idsup
                    Xj = selectionStats[0].Xj, # Xsup
                    Vj = selectionStats[0].Vj, # Vsup
                    id1 = relExcluded[1], # idinf
                    Xi = selectionData[0][1], # Xinf
                    Vi = selectionData[0][2], # Vinf
                    nij = selectionStats[0].nij, # n
                    Zij = txtPending, # Z calculation few lines below
                    FDRij = txtExcluded) # FDR not calculable
                relInLowerNorm = lowerResult(
                    id1 = relExcluded[1], # idinf
                    XiXj = relInOutStats.Xi - relInOutStats.Xj, # X'inf = Xinf - Xsup
                    Vi = relInOutStats.Vi, # Vinf
                    Wij = getW_sanxot(relInOutStats.Vi, variance)) # Winf
            # NOTE(review): with nij == 0 (orphan branches) getZ returns nan,
            # overwriting the "excluded" placeholder -- confirm this is intended
            relInOutStats.Zij = getZ(relInLowerNorm.XiXj, relInLowerNorm.Wij, relInOutStats.nij)
            if len(relExcluded) > 2:
                relInOutStats.tags = relExcluded[2] # tags, otherwise, tags = ""
            outStatsExcluded.append(relInOutStats)
            lowerNormExcluded.append(relInLowerNorm)
            if higherOrphan and not higherOrphanAdded:
                # report the orphan higher-level element exactly once
                higherOrphanList.append(higherOrphanForList)
                higherOrphanAdded = True
    return outStatsExcluded, lowerNormExcluded, higherOrphanList
#------------------------------------------------------
def printErrorFileMissing(file = "file", argument = None):
    """Print a missing-file-name error (optionally naming the CLI flag) and exit.

    file     -- human-readable description of the missing file.
    argument -- the command line switch the user should have supplied, or None.
    Never returns: always terminates via sys.exit().
    """
    print
    print "Error: file name for " + file + " is missing."
    if argument != None:
        print "Use " + argument + " for this parameter (using the short path)."
    print "Use -h for help."
    sys.exit()
#------------------------------------------------------
def getNextIdX_sanxot(idXVall, relations, variance = 0.0, giveMergedData = False, removeDuplicateUpper = False):
    """Compute the higher-level X for every higher-level id, given a variance.

    idXVall   -- lower-level rows [id, X, V].
    relations -- [higher id, lower id] pairs.
    Returns [ [higher id, X], ... ], or [idX, mergedData] when giveMergedData
    is True (mergedData rows are [id2, id1, X, V, W]).
    Exits the program if the merge of data and relations is empty.
    """
    idX = []
    # no need to sort: mergeInput will do it anyway
    # idXVall.sort()
    # relations.sort()
    idXVWall = addW_sanxot(idXVall, variance)
    mergedData = stats.mergeInput(idXVWall, relations, removeDuplicateUpper = removeDuplicateUpper)
    position = 0
    id2old = ""
    XWlist = []
    if len(mergedData) == 0:
        print "Error, merged data list is empty. Please check the provided files do exist and are not corrupt."
        sys.exit()
    # group-flush loop: accumulate [X, W] pairs while the higher id repeats,
    # flush the weighted average when it changes.
    # NOTE(review): indentation reconstructed -- position advances only in the
    # else branch, so the row that triggers a flush is re-examined on the next
    # pass (after id2old is updated) and is not lost. Confirm against upstream.
    while position < len(mergedData):
        id2 = mergedData[position][0]
        if id2 != id2old and len(XWlist) > 0:
            x2 = stats.getNextX(XWlist)
            idX.append([id2old, x2])
            XWlist = []
        else:
            XWspecific = [mergedData[position][2], mergedData[position][4]]
            XWlist.append(XWspecific)
            position += 1
        id2old = id2
    # the last one has not been added, so...
    x2 = stats.getNextX(XWlist)
    idX.append([id2old, x2])
    if giveMergedData:
        return [idX, mergedData]
    else:
        return idX
#------------------------------------------------------
def makeStats(variance = 0.0, inputFile = "", input = None):
# input here is in the format id2-id1-XV
outputHigher = []
outputLower = []
outputStats = []
element = []
# pdb.set_trace()
# ***
if not input:
if len(inputFile) == 0:
print 'Error: no input file'
sys.exit()
input = loadFile(inputFile)
for row in input:
id2 = row[0]
if len(element) == 0 or id2 == list(element[0])[0]:
# while the id2 is the same, the id1's keep adding up to element
element.append(row)
else:
# when the list of id1's within an id2 is complete, then...
statsResults, higherResult, lowerResults = getAverage(element, variance)
for i in xrange(len(statsResults)):
statsResults[i].nij = len(element) # degrees of freedom
statsResults[i].Zij = getZ(lowerResults[i].XiXj, lowerResults[i].Wij, statsResults[i].nij)
outputHigher.append(higherResult)
outputLower.extend(lowerResults)
outputStats.extend(statsResults)
element = []
element.append(row)
####################
# calculate the last row after exiting the for
statsResults, higherResult, lowerResults = getAverage(element, variance)
for i in xrange(len(statsResults)):
statsResults[i].nij = len(element) # degrees of freedom
statsResults[i].Zij = getZ(lowerResults[i].XiXj, lowerResults[i].Wij, statsResults[i].nij)
####################
outputHigher.append(higherResult)
outputLower.extend(lowerResults)
outputStats.extend(statsResults)
outputStats = addFDR(outputStats)
# outputHigher.sort()
sorted(outputHigher, key=lambda higherResult: higherResult.id2)
# outputHigher.sort(key=operator.attrgetter('id2'))
return outputHigher, outputStats, outputLower
#------------------------------------------------------
def addFDR(statsList):
    """Fill in FDRij for every statsResult, Benjamini-Hochberg style.

    Rows are ranked by |Zij| descending (nan Zij rows get absZij = None so
    they sort predictably and receive FDRij = nan); each ranked row gets
    FDR = two-sided normal p-value / (rank / totN).
    Returns the list, reordered by the sort.
    """
    # calculate FDRij
    # order by abs(Zij)
    for row in statsList:
        if str(row.Zij).lower() == "nan":
            # trick used, because the sort method has a bug with nans
            row.absZij = None
        else:
            row.absZij = fabs(row.Zij)
    statsList = stats.sortByInstance(statsList, "absZij", isDescendent = True)
    # totN: number of rows with a usable Z
    totN = 0
    for sRes in statsList: # renamed from "statsResult", which shadowed the class
        if sRes.absZij != None: totN += 1
    rank = 1
    # FIX: dropped the pointless "[x for x in xrange(...)]" wrapper around the range
    for i in xrange(len(statsList)):
        if statsList[i].absZij != None:
            # two-sided tail probability corrected by the rank ratio
            statsList[i].FDRij = 2 * (1 - norm.cdf(statsList[i].absZij)) / (float(rank) / float(totN))
            rank += 1
        else:
            statsList[i].FDRij = float("nan")
    return statsList
#------------------------------------------------------
def getAverage(chunk, variance1 = 0.0):
    """Compute the weighted average of one higher-level group of lower rows.

    chunk     -- contiguous rows [id2, id1, X, V, (tags)] sharing the same id2.
    variance1 -- inter-level variance folded into each weight via getW_sanxot.
    Returns (statsResults, higherElement, lowerResults): three parallel
    per-row lists plus the single higherResult with Xj = sum(XW)/sum(W) and
    Vj = sum(W). When the group has a single row, its centred XiXj is set to
    the string "nan" (n = 1 carries no spread information).
    """
    # chunk should be in the form id2-id1-XV
    statsResults = []
    lowerResults = []
    id2 = chunk[0][0]
    higherElement = higherResult(id2 = id2)
    sumXW = 0
    sumW = 0
    # higherElement.append(chunk[0][0]) # 0
    # this copies the unificating field,
    # for example, the sequences, if we are integrating
    # spectra into peptides
    for bit in chunk:
        statsElement = statsResult(id2 = id2, id1 = bit[1])
        lowerElement = lowerResult(id1 = bit[1])
        x1 = bit[2]
        v1 = bit[3]
        w1 = getW_sanxot(v1, variance1)
        # accumulate the weighted average numerator and denominator
        sumXW = sumXW + x1 * w1
        sumW = sumW + w1
        statsElement.Xi = x1
        statsElement.Vi = v1
        if len(bit) > 4: statsElement.tags = bit[4]
        lowerElement.Wij = w1
        lowerElement.XiXj = x1 # fixed a few lines ahead
        lowerElement.Vi = v1
        statsResults.append(statsElement)
        lowerResults.append(lowerElement)
    x2 = sumXW / sumW
    totW = sumW
    v2 = totW
    higherElement.Xj = x2 # 1
    higherElement.Vj = v2 # 2
    # propagate the higher-level values to every stats row of the group
    for statsRes in statsResults:
        statsRes.Xj = higherElement.Xj
        statsRes.Vj = higherElement.Vj
    # begin: jmrc
    # for element in lowerResults:
    # element.XiXj -= higherElement.Xj # I told you it was going to be fixed
    if len(lowerResults) == 1: # disable cases when n=1 with X'=nan
        lowerResults[0].XiXj = "nan"
    else:
        for element in lowerResults:
            element.XiXj -= higherElement.Xj # I told you it was going to be fixed
    # end: jmrc
    return statsResults, higherElement, lowerResults
#------------------------------------------------------
def addW_sanxot(idXVlist, variance = 0.0):
    """Return a copy of [id, X, V] rows extended with the weight W.

    W is the variance-corrected weight computed by getW_sanxot; the input
    list is left untouched.
    """
    return [[row[0], row[1], row[2], getW_sanxot(row[2], variance)]
            for row in idXVlist]
#------------------------------------------------------
def getW_sanxot(v, variance):
    """Return the variance-corrected weight: 1 / (1/v + variance)."""
    return 1 / ((1 / v) + variance)
#------------------------------------------------------
def getZ(x, w, n):
    """Return the distance in sigmas: x * sqrt(w * n/(n-1)), or nan for n <= 1.

    The n/(n-1) factor is the small-sample correction; with a single element
    no spread can be estimated, hence nan.
    """
    if n > 1:
        correction = float(n) / (float(n - 1.0))
        return x * sqrt(w * correction)
    return float("nan")
#------------------------------------------------------
def randomData(data = None, variance = 0):
    """Return a simulated copy of [id, X, V] rows with X drawn at random.

    For each row the effective weight w = 1 / (variance + 1/v) is converted
    to a standard deviation, and a new X is sampled from N(0, 1/w) via the
    inverse normal CDF. Ids and weights are preserved.
    """
    newData = []
    for row in data:
        rowId = row[0] # renamed from "id", which shadowed the builtin
        v = row[2]
        w = 1 / (variance + 1 / v)
        # BUG FIX: was math.sqrt(w), but "math" is never imported in this file;
        # sqrt is already in scope from the top-level "from numpy import *"
        standardDeviation = 1 / sqrt(w) # sigma of the simulated measure (1/sqrt(w))
        newX = standardDeviation * norm.ppf(random.random())
        newData.append([rowId, newX, v])
    return newData
#------------------------------------------------------
def getVarConf(variance,
dataFile,
relationsFile,
totSimulations,
varConfLimit = 0.05,
verbose = False,
minVarianceSeed = 1e-3,
maxIterations = 0,
removeDuplicateUpper = False,
acceptedError = 0.001,
tags = "",
logicOperatorsAsWords = False):
# fix this, so it is loaded once only
data = stats.loadInputDataFile(dataFile)
relations = stats.loadRelationsFile(relationsFile)
indexUpper = int(min(math.ceil(totSimulations * (1 - varConfLimit)) - 1, totSimulations - 1))
indexLower = int(max(math.floor(totSimulations * varConfLimit) - 1, 0))
# use norm.ppf or similar
# fix next
varianceArray = []
for simulation in xrange(totSimulations):
print "Generating simulation #%i..." % simulation
dataRnd = randomData(data, variance)
print "Calculating variance for simulation #%i..." % simulation
newVariance, higherLevel, outStats, lowerNorm, logResults, success = \
integrate(data = dataRnd,
relations = relations,
varianceSeed = variance,
maxIterations = maxIterations,
verbose = verbose,
forceParameters = False,
acceptedError = acceptedError,
minVarianceSeed = minVarianceSeed,
removeDuplicateUpper = removeDuplicateUpper,
tags = tags,
logicOperatorsAsWords = logicOperatorsAsWords)
print "New variance: %f" % newVariance
print
varianceArray.append(newVariance)
varianceArray.sort()
varMedian = stats.median(varianceArray)
varUpper = varianceArray[indexUpper]
varLower = varianceArray[indexLower]
return varMedian, varLower, varUpper
#------------------------------------------------------
def printHelp(version = None, advanced = False):
print """
SanXoT %s is a program made in the Jesus Vazquez Cardiovascular Proteomics
Lab at Centro Nacional de Investigaciones Cardiovasculares, used to perform
integration of experimental data to a higher level (such as integration from
peptide data to protein data), while determining the variance between them.
SanXoT needs two input files:
* the lower level input data file, a tab separated text file containing
three columns: the first one with the unique identifiers of each lower
level element (such as "RawFile05.raw-scan19289-charge2" for a scan, or
"CGLAGCGLLK" for a peptide sequence, or "P01308" for the Uniprot accession
number of a protein), the Xi which corresponds to the log2(A/B), and the
Vi which corresponds to the weight of the measure). This data have to be
pre-calibrated with a certain weight (see help of the Klibrate program).
* the relations file, a tab separated text file containing a first column
with the higher level identifiers (such as the peptide sequence, a Uniprot
accession number, or a Gene Ontology category) and the lower level
identifiers within the abovementioned input data file.
NOTE: you must include a first line header in all your files.
And delivers six output file:
* the output data file for the higher level, which has the same format as
the lower level data file, but containing the ids of the higher level in
the first column, the ratio Xj in the second column, and the weight Vj in
the third column. By default, this file is suffixed as "_higherLevel".
* two lower level output files, containing three columns each: in both,
the first column contains with the identifiers of the lower level, the
second column contains the Xinf - Xsup (i.e. the ratios of the lower
level, but centered for each element they belong to), and the third column
is either the new weight Winf (contanining the variance of the
integration) or the former untouched Vinf weight. For example, integrating
from scan to peptide, these files would contain firstly the scan
identifiers, secondly the Xscan - Xpep (the ratios of each scan compared
to the peptide they are identifying) and either Wscan (the weight of the
scan, taking into account the variance of the scan distribution) or Vscan.
By default, these files are suffixed "_lowerNormW" and "_lowerNormV".
* a file useful for statistics, containing all the relations of the higher
and lower level element present in the data file, with a copy of their
ratios X and weights V, followed by the number of lower elements contained
in the upper element (for example, the number of scans that identify the
same peptide), the Z (which is the distance in sigmas of the lower level
ratio X to the higher level weighted average), and the FDR (the false
discovery rate, important to keep track of changes or outliers). By
default, this file is suffixed "_outStats".
* an info file, containing a log of the performed integrations. Its last
line is always in the form of "Variance = [double]". This file can be used
as input in place of the variance (see -v and -V arguments). By default,
this file is suffixed "_infoFile".
* a graph file, depicting the sigmoid of the Z column which appears in the
stats file, compared to the theoretical normal distribution. By default,
this file is suffixed "_outGraph".
Usage: sanxot.py -d[data file] -r[relations file] [OPTIONS]""" % version
if advanced:
print """
-h, --help Display basic help and exit.
-H, --advanced-help Display this help and exit.
-A, --outrandom=filename
To use a non-default name for the randomised relations
file (only applicable when -R is in use).
-a, --analysis=string
Use a prefix for the output files. If this is not
provided, then the prefix will be garnered from the data
file.
-b, --no-verbose Do not print result summary after executing.
-C, --confluence A modified version of the relations file is used, where
all the destination higher level elements are "1". If no
relations file is provided, the program gets the lower
level elements from the first column of the data file.
-d, --datafile=filename
Data file with identificators of the lowel level in the
first column, measured values (x) in the second column,
and weights (v) in the third column.
-D, --removeduplicateupper
When merging data with relations table, remove duplicate
higher level elements (not removed by default).
-f, --forceparameters
Use the parameters as provided, without using the
Levenberg-Marquardt algorithm. Negative variances will
be reset to zero (see -F if you do not wish this).
-F, --forcenegativevariance
Though the indirect calculation of variance may lead to
a negative value, this has no mathematical meaning and
may cause a number of artefacts; hence, by default,
negative variances are automatically reset to zero.
However, for some analyses, it might be important seeing
the effect of original variance; for these cases, use
this option to override resetting negative variances to
zero.
-g, --no-graph Do not show the Zij vs rank / N graph.
-G, --outgraph=filename
To use a non-default name for the graph file.
-J, --includeorphans
In the case all the lower elements pointing to a higher
level element are excluded, the default behaviour is
removing the higher level element altogether. Adding
this option, the lower level elements will be integrated
in any case.
-l, --graphlimits=integer
To set the +- limits of the Zij graph (default is 6). If
you want the limits to be between the minimum and
maximum values, you can use -l.
-L, --infofile=filename
To use a non-default name for the info file.
-m, --maxiterations=integer
Maximum number of iterations performed by the Levenberg-
Marquardt algorithm to calculate the variance. If
unused, then the default value of the algorithm is
taken.
-M, --minseed=float To use a non-default minimum seed. Default is 1e-3.
-o, --higherlevel=filename
To use a non-default higher level output file name.
-p, --place, --folder=foldername
To use a different common folder for the output files.
If this is not provided, the the folder used will be the
same as the input folder.
-r, --relfile, --relationsfile=filename
Relations file, with identificators of the higher level
in the first column, and identificators of the lower
level in the second column.
-R, --randomise, --randomize
A modified version of the relations file is used, where
the higher level elements (first column) are replaced by
numbers and randomly written in the first column. The
numbers range from 1 to the total number of elements.
The second column (containing the lowel level elements)
remains unchanged.
-s, --no-steps Do not print result summary and the steps of every
Levenberg-Marquardt iteration.
-t, --graphtitle=string
The graph title (default is
"Zij graph for sigma^2 = [variance]").
-T, --minimalgraphticks
It will only show the x secondary line for x = 0, and
none for the Y axis (useful for publishing).
-u, --lowernormw=filename
To use a non-default lower level output file name,
setting W as weight (default suffix is _lowerNormW).
-U, --lowernormv=filename
To use a non-default lower level output file name,
setting V as weight (default suffix is _lowerNormV).
-v, --var, --varianceseed=double
Seed used to start calculating the variance.
Default is 0.001.
-V, --varfile=filename
Get the variance value from a text file. It must contain
a line (not more than once) with the text
"Variance = [double]". This suits the info file from
another integration (see -L).
-W, --graphlinewidth=float
Use a non-default value for the sigmoid line width.
Default is 1.0.
-w, --varconf=integer
Get the confidence limits of the variance using n
by performimg n simultaions.
-y, --varconfpercent=float
Get the higher and lower limits to calculate the limits
of the variance (see -w). Default is 0.05.
-z, --outstats=filename
To use a non-default stats file name.
--emergencysweep Use a sweep method instead of the Levenberg-Marquardt
algorithm if the number of tries (see -m) is reached.