-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsimulate_SNVs.py
More file actions
145 lines (116 loc) · 5.38 KB
/
simulate_SNVs.py
File metadata and controls
145 lines (116 loc) · 5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import copy
import random
import os
import pysam
import itertools
import numpy as np
from numpy.random import choice
VALID_CHROMS = set(['{}'.format(c) for c in range(1,23)]+['X']+['contig1','contig2','contig3'])
# estimate prior probability of genotypes using strategy described here:
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2694485/
# "prior probability of each genotype"
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# CAUTION!
# Selects a variant genotype, assuming the site already is a SNV.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
def create_phased_genotype_selector():
alleles = ['A','C','G','T']
genotypes = list(itertools.combinations_with_replacement(alleles,2))
hom_snp_rate = 0.0005
het_snp_rate = 0.001
diploid_genotype_priors = dict()
haploid_genotype_priors = dict()
transition = {'A':'G','G':'A','T':'C','C':'T'}
for allele in alleles:
# priors on haploid alleles
haploid_genotype_priors[allele] = dict()
haploid_genotype_priors[allele][allele] = 1 - het_snp_rate
haploid_genotype_priors[allele][transition[allele]] = het_snp_rate / 6 * 4
for transversion in alleles:
if transversion in haploid_genotype_priors[allele]:
continue
haploid_genotype_priors[allele][transversion] = het_snp_rate / 6
diploid_genotype_priors[allele] = []
for G in genotypes:
g1,g2 = G
# probability of homozygous reference is the probability of neither het or hom SNP
if g1 == g2 and g1 == allele:
diploid_genotype_priors[allele].append(0)
elif g1 == g2 and g1 != allele:
# transitions are 4 times as likely as transversions
if g1 == transition[allele]:
diploid_genotype_priors[allele].append(hom_snp_rate / 6 * 4)
else:
diploid_genotype_priors[allele].append(hom_snp_rate / 6)
else: # else it's the product of the haploid priors
diploid_genotype_priors[allele].append(haploid_genotype_priors[allele][g1] * haploid_genotype_priors[allele][g2])
# remove the option of selecting homozygous reference
total = sum(diploid_genotype_priors[allele])
for i in range(len(genotypes)):
diploid_genotype_priors[allele][i] /= total
# make sure everything sums to 1
diploid_genotype_priors[allele][-1] = 1.0 - sum(diploid_genotype_priors[allele][:-1])
g_ixs = list(range(len(genotypes)))
def phased_genotype_selector(ref_allele):
g_ix = choice(g_ixs, 1, p=diploid_genotype_priors[ref_allele])[0]
g = genotypes[g_ix]
if random.random() > 0.5:
return g
else:
return (g[1],g[0])
return phased_genotype_selector
def simulate_SNV_VCF(hg19_fasta, output_vcf, min_pos=None, max_pos=None):
phased_genotype_selector = create_phased_genotype_selector()
with pysam.FastaFile(hg19_fasta) as fasta, open(output_vcf,'w') as outv:
header = '''##fileformat=VCFv4.2
##source=simulate_SNVs.py
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE'''
print(header ,file=outv)
for chrom in fasta.references:
size = fasta.get_reference_length(chrom)
pos = 0
while pos < size:
pos += np.random.geometric(0.0015, size=None)
ref_allele = fasta.fetch(chrom,pos,pos+1).upper()
if ref_allele in ['A','C','G','T']:
genotype = phased_genotype_selector(ref_allele)
else:
continue
#hap1_seq.append(genotype[0])
#hap2_seq.append(genotype[1])
if genotype[0] == genotype[1] and genotype[0] != ref_allele:
var_str = genotype[0]
genotype_str = '1|1'
elif genotype[1] == ref_allele and genotype[0] != ref_allele:
var_str = genotype[0]
genotype_str = '1|0'
elif genotype[0] == ref_allele and genotype[1] != ref_allele:
var_str = genotype[1]
genotype_str = '0|1'
elif genotype[0] != ref_allele and genotype[1] != ref_allele and genotype[0] != genotype[1]:
# triallelic
var_str = genotype[0] + ',' + genotype[1]
genotype_str = '1|2'
else:
print("INVALID GENOTYPE ENCOUNTERED")
exit(1)
if(chrom not in VALID_CHROMS):
continue
if (min_pos != None and pos < min_pos) or (max_pos != None and pos > max_pos):
continue
if genotype != (ref_allele,ref_allele) and 'N' not in genotype:
el = [None]*10
el[0] = chrom
el[1] = pos + 1
el[2] = '.'
el[3] = ref_allele
el[4] = var_str
el[5] = 100
el[6] = 'PASS'
el[7] = '.'
el[8] = 'GT'
el[9] = genotype_str
line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(*el)
print(line,file=outv)
if __name__ == '__main__':
generate_fasta()