-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathContaminateGenotypes.py
executable file
·85 lines (72 loc) · 4.75 KB
/
ContaminateGenotypes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/projects1/tools/anaconda3/4.0.0/bin/python3
import sys, argparse, sh, random
import Utils as util
def Contaminate(Genos, SampleList, Contaminant, rates, nrReps, Index, overlapOnly):
Contaminated=""
for Sample in SampleList:
if Genos[Index[Sample]]!="9":
if overlapOnly and Genos[Index[Contaminant]] == "9":
Contaminated+="9"*nrReps*len(rates)
else:
for rate in rates:
for Rep in range (nrReps):
if random.uniform(0,1) <= rate:
if Genos[Index[Contaminant]]=="1":
if random.uniform(0,1) <= 0.5:
Contaminated+="2"
else:
Contaminated+="0"
else:
Contaminated+=Genos[Index[Contaminant]]
else:
Contaminated+=Genos[Index[Sample]]
else:
Contaminated+="9"*nrReps*len(rates)
return (Genos+Contaminated);
##MAIN##
parser = argparse.ArgumentParser(usage="%(prog)s (-i <INPUT FILE PREFIX>) (-o <OUTPUT FILE PREFIX>) (-s <SAMPLE>) (-c <CONTAMINANT>) (-r <RATE1,RATE2,RATE3,...>) (-n <nrReps>)" , description="A tool artificially contaminate the genotypes of multiple sample individuals with genotypes from a contaminant individual, at different rates of contamination.")
parser._optionals.title = "Available options"
parser.add_argument("-i", "--Input", type=str, metavar="<INPUT FILES PREFIX>", required=True, help="The desired input file prefix. Input files are assumed to be '<INPUT PREFIX>.geno', '<INPUT PREFIX>.snp' and '<INPUT PREFIX>.ind'.")
parser.add_argument("-o", "--Output", type=str, metavar="<OUTPUT FILES PREFIX>", required=False, help="The desired output file prefix. Three output files are created, '<OUTPUT FILES PREFIX>.geno', '<OUTPUT FILES PREFIX>.snp' and '<OUTPUT FILES PREFIX>.ind'.")
parser.add_argument("-s", "--Samples", type=str, metavar="<SAMPLE1,SAMPLE2,SAMPLE3,...>", required=True, help="The sample individual(s), whose genotypes will be contaminated.")
parser.add_argument("-c", "--Contaminant", type=str, metavar="<CONTAMINANT>", required=True, help="The contaminant individual, which will be used to contaminate the genotypes of each <SAMPLE> at the specified rate(s).")
parser.add_argument("-r", "--rates", type=str, metavar="<RATE1,RATE2,RATE3,...>", required=True, help="A comma separated list of contamination rates.")
parser.add_argument("-n", "--nrReps", type=int, metavar="<nrReps>", required=False, default=1, help="An integer value specifying the number of replicate contaminated genotypes to be created per contamination rate [1].")
parser.add_argument("-v", "--overlapOnly", action="store_true", required=False, help="When provided, only SNPs that are present in both Contaminant and Sample will be contaminated, while all other SNPs will be set to missing. This ensures conaminant genotypes will not be diluted due to lack of coverage overlap in the Sample and Contaminant.")
args = parser.parse_args()
GenoFile = open(args.Input+".geno", "r")
SnpFile = open(args.Input+".snp", "r")
IndFile = open(args.Input+".ind", "r")
OutGenoFile = open(args.Output+".geno", "w")
OutSnpFile = args.Output+".snp"
OutIndFile = args.Output+".ind"
SampleList=[x for x in args.Samples.split(',')]
rates=[float(r) for r in args.rates.split(',')]
##Check for errors in input files
util.CheckInputFiles(args.Input)
## Index Individual in database.
(Index,Sex,Pop)=util.Indexing(args.Input, SampleList, args.Contaminant)
## Print Logfile.
with sys.stderr as o:
print('#Input files:', args.Input+".geno", args.Input+".snp", args.Input+".ind", sep="\n\t", file=o)
print('#Output files:', args.Output+".geno", OutSnpFile, OutIndFile, sep="\n\t", file=o)
print('#Samples:', end="\t", file=o)
print(*SampleList, sep=",", file=o)
print('#Contamination Rates:', end="\t", file=o)
print(*rates, sep=",", file=o)
print('#NrReps:',args.nrReps, sep="\t",file=o)
if args.overlapOnly:
print('#Overlap Only option detected. All other sites will be set to missing.', file=o)
##Contaminate Genotypes
for line in GenoFile:
Genos=line.strip()
print (Contaminate(Genos, SampleList, args.Contaminant, rates, args.nrReps, Index, args.overlapOnly), file=OutGenoFile)
##Copy original .snp and .ind files
sh.cp(args.Input+".snp", OutSnpFile)
sh.cp(args.Input+".ind", OutIndFile)
##Append conaminated replicates of Samples at the end of the .ind file.
with open(OutIndFile, "a") as f:
for Sample in SampleList:
for rate in rates:
for rep in range(args.nrReps):
print (Sample+"_{}_{}".format(rate,rep+1), Sex[Sample], Pop[Sample]+"_{}".format(rate), file=f)