-
Notifications
You must be signed in to change notification settings - Fork 2
/
add_EGIgeno_from_vcf.py
44 lines (35 loc) · 1.34 KB
/
add_EGIgeno_from_vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 8 22:03:42 2020
@author: YudongCai
@Email: [email protected]
"""
import click
from cyvcf2 import VCF
@click.command()
@click.option('--vcffile')
@click.option('--genofile')
@click.option('--snpfile')
@click.option('--samples', help='query sample list file, 和vcf中个体顺序一致')
@click.option('--outgeno')
def main(vcffile, genofile, snpfile, samples, outgeno):
querysamples = [x.strip() for x in open(samples)]
nsamples = len(querysamples)
vcf_query = VCF(vcffile, gts012=True, samples=querysamples)
with open(genofile) as f_g, open(snpfile) as f_s, open(outgeno, 'w') as f_o:
for lgeno, lsnp in zip(f_g, f_s):
loc, chrom, gdist, pdist = lsnp.split()[:4]
loc = f'{chrom}:{pdist}-{pdist}'
try:
record = next(vcf_query(loc))
gts = record.gt_types # 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN
gts[gts==3] = 9
gts[gts==0] = -1
gts[gts==2] = 0
gts[gts==-1] = 2
outgeno = lgeno.strip() + ''.join([str(x) for x in gts]) + '\n'
except StopIteration:
outgeno = lgeno.strip() + '9'*nsamples + '\n'
f_o.write(outgeno)
if __name__ == '__main__':
main()