-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwrite_vcf.py
106 lines (104 loc) · 6.4 KB
/
write_vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pysam
from itertools import count
from classes.Structural_variant import StructuralVariant
def write_to_vcf(sv_list, in_vcf, outfile):
"""
write vcf of simplified SVs
:param sv_list: if a list of sv objects one for each sv
:param in_vcf: the original vcf file
:param outfile: text name of output file
uses pysam
"""
# make new header from original vcf file
samp_vcf = pysam.VariantFile(in_vcf)
new_header = pysam.VariantHeader()
# write contigs / chromosomes
for x in samp_vcf.header.records:
if (x.type == "CONTIG"):
new_header.add_record(x)
# get sample name
sample_name = list(samp_vcf.header.samples)[0]
split_string = sample_name.split("_", 1)
newname = split_string[0]
new_header.add_sample(newname)
# add new meta data to header: filter, genotype, info
new_header.add_meta('FILTER', items=[('ID', 'PASS'),
('Description', 'All filters passed')])
new_header.add_meta('FORMAT', items=[('ID', "GT"),
('Number', 1), ('Type', 'String'),
('Description', 'Genotype')])
new_header.add_meta('VAF', items=[('ID', "VAF"),
('Number', 1), ('Type', 'Integer'),
('Description', 'variant allele fraction')])
new_header.add_meta('INFO', items=[('ID', 'SVTYPE'),
('Number', 1), ('Type', 'String'),
('Description', "Type of structural variant")])
new_header.add_meta('INFO', items=[('ID', 'SVLEN'),
('Number', '.'), ('Type', 'Integer'),
('Description', "difference in length between REF and ALT alleles")])
new_header.add_meta('INFO', items=[('ID', 'END'),
('Number', 1), ('Type', 'Integer'),
('Description', 'End position of the variant described in this record')])
new_header.add_meta('INFO', items=[('ID', 'IMPRECISE'),
('Number', '0'), ('Type', 'Flag'),
('Description', 'Imprecise structural variation')])
new_header.add_meta('INFO', items=[('ID', 'STRANDS'),
('Number', '.'), ('Type', 'String'),
('Description', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)')])
new_header.add_meta('INFO', items=[('ID', 'CIEND'),
('Number', 2), ('Type', 'Integer'), ('Description', 'Confidence interval around END for imprecise variants')])
new_header.add_meta('INFO', items=[('ID', 'CIPOS'),
('Number', 2), ('Type', 'Integer'), ('Description', 'Confidence interval around remote breakend POS for imprecise variants')])
new_header.add_meta('INFO', items=[('ID', 'GRIDSSEVENT'),
('Number', "."), ('Type', 'String'), ('Description', 'GRIDSS id for event associated to breakend')])
new_header.add_meta('INFO', items=[('ID', 'INSCHROM'),
('Number', '.'), ('Type', 'String'), ('Description', 'Breakpoint pair chromosome of interchromosome breakpoint')])
new_header.add_meta('INFO', items=[('ID', 'INSPOS'),
('Number', '.'), ('Type', 'Integer'), ('Description', 'insertion coordinate on interchromosome breakpoint')])
new_header.add_meta('INFO', items=[('ID', 'CNVRCTX'),
('Number', '0'), ('Type', 'Flag'),
('Description', 'interchromosome breakpoint in CNVR')])
new_header.add_meta('INFO', items=[('ID', 'CNVRINV'),
('Number', '0'), ('Type', 'Flag'),
('Description', 'inversion breakpoint in CNVR')])
new_header.add_meta("INFO", items=[('ID', 'CNVRTYPE'),
('Number', '.'), ('Type', 'String'),
('Description', 'structural variant types associated with CNVR')])
new_header.add_meta("INFO", items=[('ID', 'VAF'),
('Number', "."), ('Type', 'String'),
('Description', 'variant allele fraction of gridss breakends supporting this SV')])
new_header.add_meta("INFO", items=[('ID', 'GTBNDS'),
('Number', "."), ('Type', 'String'),
('Description', 'genotype from all gridss breakend supporting this SV ')])
sites_identifiers = count(1)
# open file for writing
newfile = pysam.VariantFile(outfile, 'w', header=new_header)
for sv in sv_list:
# fix formatting of the VAF and GT list
pair_gt = ",".join([str(gt) for gt in sv.pair_gt])
pair_vaf = ",".join([str(vaf) for vaf in sv.pair_vaf])
rec = newfile.new_record(contig=sv.chromosome,
start=(sv.start - 1),
stop=sv.stop,
alleles=sv.alleles,
qual=sv.qual,
id=str(next(sites_identifiers)),
filter=sv.filter,
info={'SVTYPE': sv.SVTYPE,
'SVLEN': sv.SVLEN,
'END': sv.stop,
'IMPRECISE': sv.imprecise,
'STRANDS': sv.strand,
'CIEND': sv.ciend,
'CIPOS': sv.cipos,
'GRIDSSEVENT': sv.gridss_id,
'INSCHROM': sv.inschrom,
'INSPOS': sv.inspos,
'CNVRINV': sv.cnvr_inv,
'CNVRCTX': sv.cnvr_ctx,
'CNVRTYPE': sv.cnvr_types,
'GTBNDS': pair_gt,
'VAF': pair_vaf})
rec.samples[newname]['GT'] = sv.genotype
newfile.write(rec)
newfile.close()