Skip to content

Commit e652e38

Browse files
author
Stephan Schiffels
committed
added outgroup option and outgroup F3 statistics
1 parent 1f54d08 commit e652e38

File tree

2 files changed

+33
-26
lines changed

2 files changed

+33
-26
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__pycache__/

RASCalculator.py

+32-26
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77

88
########## MAIN ############
99

10-
parser = argparse.ArgumentParser(description="Extract the frequency of shared rare variants between each left population and all right populations from a freqsum file. Also preforms error estimation using jackknifing, using the number of observed sites for normalisation.")
10+
parser = argparse.ArgumentParser(description="Compute rare allele sharing statistics between two populations with respect to an outgroup, as well as outgroup F3 statistics. Also preforms error estimation using jackknifing, using the number of observed sites for normalisation.")
1111
parser.add_argument("-I", "--Input", metavar="<INPUT FILE>", type=argparse.FileType('r'), help="The input freqsum file. Omit to read from stdin.", required=False)
1212
parser.add_argument("-O", "--Output", metavar="<OUTPUT FILE>", type=argparse.FileType('w'), help="The output file. Omit to print in stdout.")
13-
# parser.add_argument("-F", "--FocalPop", metavar="POP", type=str, help="The population to polarise all alleles with. Only consider non-variable positions in this population that are variable in other populations. By default the human reference is used.", required=False)
13+
parser.add_argument("-o", "--outgroup", metavar="POP", type=str, help="The outgroup population to polarise all alleles with. By default the human reference is used (not recommended, only for backwards compatibility). Note that this normally should be a population within the Right Populations", required=False)
1414
parser.add_argument("-M", "--maxAF", metavar="<MAX ALLELE COUNT>", type=int, default=10, help="The maximum number of alleles (total) in the reference populations. The default maximum allele value is 10.", required=False)
1515
parser.add_argument("-m", "--minAF", metavar="<MIN ALLELE COUNT>", type=int, default=2, help="The minimum number of alleles (total) in the reference populations. The default minimum allele count is 2.", required=False)
1616
parser.add_argument("-L", "--LeftPops", type=str, metavar="POP1,POP2,...", required=True, help="Set the Test populations/individuals. RAS will be calculated between the Test and all Right populations.")
@@ -53,10 +53,6 @@
5353
for x in RightPops:
5454
assert (x in freqSumParser.popNames), "Population '{}' not found in FreqSum".format(x)
5555

56-
RAS = [[[[0 for i in range(NumBins)] for j in range(maxAF+1)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
57-
# The normalization only records a total for all allele frequencies.
58-
mj = [[ [0 for i in range(NumBins)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
59-
6056
def getMissingness(afDict):
6157
missing=0
6258
for x in RightPops:
@@ -68,13 +64,24 @@ def isTransition(ref, alt):
6864
Transitions = {"A":"G", "G":"A","C":"T","T":"C"}
6965
return (ref in Transitions and alt == Transitions[ref])
7066

71-
def getTotalAF(afDict):
67+
def getTotalMinorAF(afDict):
7268
#Calculate AfSum for each position
73-
AfSum = 0
69+
NonRefAfSum = 0
70+
TotalCount = 0
7471
for pop in RightPops:
7572
if afDict[pop] > 0:
76-
AfSum+=afDict[pop]
77-
return AfSum
73+
NonRefAfSum += afDict[pop]
74+
TotalCount += freqSumParser.sizes[pop]
75+
outgroupFreq = 0.0 if args.outgroup == None else afDict[args.outgroup] / freqSumParser.sizes[args.outgroup]
76+
minorAfSum = NonRefAfSum if outgroupFreq < 0.5 else TotalCount - NonRefAfSum
77+
return minorAfSum
78+
79+
# Bin minAf - 1: Total rare allele sharing
80+
# Bins minAf -> maxAf: Rare Allele sharing per allele count
81+
# Bin maxAf + 1: Outgroup F3 stats
82+
RAS = [[[[0 for i in range(NumBins)] for j in range(maxAF+2)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
83+
# The normalization only records a total for all allele frequencies.
84+
mj = [[ [0 for i in range(NumBins)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
7885

7986
for (Chrom, Pos, Ref, Alt, afDict) in freqSumParser:
8087

@@ -86,7 +93,7 @@ def getTotalAF(afDict):
8693
if args.NoTransitions and isTransition(Ref, Alt):
8794
continue
8895

89-
AfSum = getTotalAF(afDict)
96+
AfSum = getTotalMinorAF(afDict)
9097

9198
for Lftidx, leftPop in enumerate(LeftPops):
9299
for Rgtidx, rightPop in enumerate(RightPops):
@@ -98,26 +105,23 @@ def getTotalAF(afDict):
98105

99106
if afDict[leftPop] >= 0 and afDict[rightPop] >= 0:
100107
mj[Lftidx][Rgtidx][Chrom] += 1
108+
xLeft = afDict[leftPop] / leftSize
109+
xRight = afDict[rightPop] / rightSize
110+
xOutgroup = 0.0 if args.outgroup == None else afDict[args.outgroup] / freqSumParser.sizes[args.outgroup]
111+
add = (xLeft - xOutgroup) * (xRight - xOutgroup)
112+
RAS[Lftidx][Rgtidx][maxAf+1][Chrom] += add # For Outgroup F3 Stats
101113
if AfSum >= minAF and AfSum <= maxAF and (not args.Private or isPrivate):
102-
#Only consider sites with ascertained AF between the provided ranges.
103-
if leftPop != rightPop:
104-
#Case where right and left pops are different
105-
add = (afDict[leftPop] * afDict[rightPop]) / (leftSize * rightSize)
106-
RAS[Lftidx][Rgtidx][AfSum][Chrom] += add
107-
#within "minAF-1" we store total RAS and observed sites, for Jackknife estimations on the totals.
108-
RAS[Lftidx][Rgtidx][minAF-1][Chrom] += add
109-
else:
110-
#Case where left pop is also right pop (within population)
111-
add = (afDict[leftPop] * (afDict[leftPop]-1)) / (leftSize * (leftSize-1))
112-
RAS[Lftidx][Rgtidx][AfSum][Chrom] += add
113-
RAS[Lftidx][Rgtidx][minAF-1][Chrom] += add
114+
#Only consider sites with ascertained minor AF between the provided ranges.
115+
RAS[Lftidx][Rgtidx][AfSum][Chrom] += add
116+
#within "minAF-1" we store total Rare allele sharing.
117+
RAS[Lftidx][Rgtidx][minAF-1][Chrom] += add
114118

115119
#Jackknifing
116-
ThetaJ=[[[0 for j in range(maxAF+1)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
117-
Sigma2=[[[0 for j in range(maxAF+1)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
120+
ThetaJ=[[[0 for j in range(maxAF+2)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
121+
Sigma2=[[[0 for j in range(maxAF+2)] for k in range(len(RightPops))] for x in range(len(LeftPops))]
118122
for x in range(len(LeftPops)):
119123
for j in range(len(RightPops)):
120-
for i in range(minAF-1,maxAF+1):
124+
for i in range(minAF-1,maxAF+2):
121125
thetaJ,sigma2=ras.getJackknife(RAS[x][j][i],mj[x][j])
122126
ThetaJ[x][j][i]=thetaJ
123127
Sigma2[x][j][i]=sigma2
@@ -134,6 +138,8 @@ def getTotalAF(afDict):
134138
print (rightPop, leftPop, "{:.5}".format(float(sum(RAS[leftidx][rightidx][m]))), "{:.15e}".format(sum(mj[leftidx][rightidx])), "{:.15e}".format(ThetaJ[leftidx][rightidx][m]), "{:.15e}".format(sqrt(Sigma2[leftidx][rightidx][m])),m, sep="\t", file=args.Output)
135139
m=minAF-1
136140
print (rightPop, leftPop, "{:.5}".format(float(sum(RAS[leftidx][rightidx][m]))), "{:.15e}".format(sum(mj[leftidx][rightidx])), "{:.15e}".format(ThetaJ[leftidx][rightidx][m]), "{:.15e}".format(sqrt(Sigma2[leftidx][rightidx][m])),"Total [{},{}]".format(minAF,maxAF), sep="\t", file=args.Output)
141+
m=maxAF+1
142+
print (rightPop, leftPop, "{:.5}".format(float(sum(RAS[leftidx][rightidx][m]))), "{:.15e}".format(sum(mj[leftidx][rightidx])), "{:.15e}".format(ThetaJ[leftidx][rightidx][m]), "{:.15e}".format(sqrt(Sigma2[leftidx][rightidx][m])),"Outgroup F3", sep="\t", file=args.Output)
137143
#print ("", file=args.Output)
138144

139145
print ("Program finished running at:", strftime("%D %H:%M:%S"), file=sys.stderr)

0 commit comments

Comments
 (0)