Skip to content

Commit 69b6d10

Browse files
committed
Initial commit
0 parents  commit 69b6d10

File tree

5 files changed

+546
-0
lines changed

5 files changed

+546
-0
lines changed

.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/__pycache__/
2+
*.pyc
3+
*.pyo
4+
*.vcf
5+
*.vcf.gz
6+
*.vcf.gz.tbi
7+
/*.rp
8+
/*.rpdb
9+
/.*/
10+
/*.sh

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
## svimmer - SV merging tool
2+
3+
Merges similar SVs from multiple single sample VCF files. The tool was written for merging SVs discovered using Manta calls, but should support (almost) any SV VCFs. The output is a VCF file containing all merged SV sites (with no calls). The output can be given as input into [GraphTyper](https://github.com/DecodeGenetics/graphtyper) to genotype the sites.
4+
5+
### Requirements
6+
7+
* Python 3.4+
8+
* pysam
9+
10+
### Usage
11+
12+
```sh
13+
python3 svimmer input_vcfs chrA chrB chrC ...
14+
```
15+
16+
where input is a list of tabix indexed+bgzipped VCF files and chromosomes are the chromosomes to merge. For further details see the help page:
17+
18+
```
19+
python3 svimmer -h
20+
```
21+
22+
## License
23+
GNU GPLv3

sv.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
#!/usr/bin/env python
2+
3+
import sys
4+
from utilities import calculate_overlap, calculate_stddev, get_most_common_item
5+
6+
SV_TYPES = ['UNK', 'BND', 'DEL', 'INS', 'INV']
7+
8+
def make_info_dictionary(info):
9+
spl_info = info.split(";")
10+
info_dict = {}
11+
12+
for key_val in spl_info:
13+
if "=" in key_val:
14+
key, val = key_val.split("=")
15+
info_dict[key] = val
16+
else:
17+
info_dict[key_val] = ""
18+
19+
return info_dict
20+
21+
22+
class SV(object):
23+
"""
24+
Constructs a structural variant (SV) candidate
25+
"""
26+
def __init__(self, vcf_record, check_type = True, join_mode = False, output_ids = False):
27+
self.is_outputting_ids = output_ids
28+
self.join_mode = join_mode
29+
30+
spl_line = vcf_record.rstrip().split("\t")[:8]
31+
self.check_type = check_type
32+
self.chromosome = spl_line[0]
33+
self.begin = int(spl_line[1])
34+
self.end = self.begin
35+
36+
if self.is_outputting_ids:
37+
self.ids = [spl_line[2]]
38+
39+
info_dict = make_info_dictionary(spl_line[7])
40+
41+
# Join related SV types
42+
if "SVTYPE" in info_dict:
43+
if info_dict["SVTYPE"] == "DEL_ALU" or info_dict["SVTYPE"] == "DEL_LINE1":
44+
info_dict["SVTYPE"] = "DEL"
45+
elif info_dict["SVTYPE"] == "ALU" or info_dict["SVTYPE"] == "LINE1" or info_dict["SVTYPE"] == "SVA" or \
46+
info_dict["SVTYPE"] == "DUP" or info_dict["SVTYPE"] == "CNV" or info_dict["SVTYPE"] == "INVDUP" or \
47+
info_dict["SVTYPE"] == "INV":
48+
info_dict["SVTYPE"] = "INS"
49+
elif info_dict["SVTYPE"] == "TRA":
50+
info_dict["SVTYPE"] = "BND"
51+
52+
# Remove old values
53+
if not join_mode and "NUM_MERGED_SVS" in info_dict:
54+
spl_line[7] = spl_line[7].replace("NUM_MERGED_SVS=%s;" % info_dict["NUM_MERGED_SVS"], "")
55+
56+
if "STDDEV_POS" in info_dict:
57+
spl_line[7] = spl_line[7].replace("STDDEV_POS=%s;" % info_dict["STDDEV_POS"], "")
58+
59+
#self.strand = 0 # -1 == minus strand, 0 == both strands, 1 plus strand
60+
61+
if "SVTYPE" in info_dict and (info_dict["SVTYPE"] == "DEL" ):
62+
if "END" in info_dict:
63+
self.end = int(info_dict["END"])
64+
else:
65+
if "SVSIZE" in info_dict:
66+
self.end = self.begin + abs(int(info_dict["SVSIZE"]))
67+
elif "SVLEN" in info_dict:
68+
self.end = self.begin + abs(int(info_dict["SVLEN"]))
69+
70+
if check_type:
71+
if "SVTYPE" in info_dict and info_dict["SVTYPE"] in SV_TYPES:
72+
self.type = SV_TYPES.index(info_dict["SVTYPE"])
73+
else:
74+
self.type = 0 # Unknown type
75+
assert False
76+
77+
#if info_dict["SVTYPE"] == "INV":
78+
# if "INV3" in info_dict:
79+
# self.strand = 1
80+
# elif "INV5" in info_dict:
81+
# self.strand = -1
82+
#elif info_dict["SVTYPE"] == "BND":
83+
# if "[" in spl_line[4]:
84+
# self.strand = 1
85+
# elif "]" in spl_line[4]:
86+
# self.strand = -1
87+
else:
88+
self.type = 0
89+
90+
# Get all begins and ends
91+
self.min_begin = self.begin
92+
self.max_begin = self.begin
93+
self.begins = [self.begin]
94+
self.ends = [self.end]
95+
self.infos = [spl_line[7]]
96+
self.unique_begins_and_ends = set()
97+
self.unique_begins_and_ends.add((int(self.begin), int(self.end)))
98+
self.refs = [spl_line[3]]
99+
self.alts = [spl_line[4]]
100+
101+
102+
"""
103+
Converts the SV to a string
104+
"""
105+
def __str__(self):
106+
info = self.infos[0]
107+
if len(info) > 0:
108+
info += ";"
109+
110+
num_text = "JOINED" if self.join_mode else "MERGED"
111+
112+
if self.is_outputting_ids:
113+
info += "MERGED_IDS=%s;NUM_%s_SVS=%d;STDDEV_POS=%.2f,%.2f" % (",".join(self.ids),
114+
num_text,
115+
len(self.begins),
116+
calculate_stddev(self.begins),
117+
calculate_stddev(self.ends)
118+
)
119+
else:
120+
info += "NUM_%s_SVS=%d;STDDEV_POS=%.2f,%.2f" % (num_text,
121+
len(self.begins),
122+
calculate_stddev(self.begins),
123+
calculate_stddev(self.ends)
124+
)
125+
126+
return "%s\t%d\t%s\t%s\t%s\t%d\t%s\t%s\n" % (
127+
self.chromosome,
128+
self.begin,
129+
".", # variant ID
130+
self.refs[0],
131+
self.alts[0],
132+
0, # quality
133+
".", # filter
134+
info
135+
)
136+
137+
138+
"""
139+
Finalize SV before printing
140+
"""
141+
def finalize(self):
142+
# Get the most common begin and end combo
143+
begins_and_ends = list(zip(self.begins, self.ends))
144+
self.begin, self.end = get_most_common_item(begins_and_ends)
145+
146+
for i, (begin, end) in enumerate(begins_and_ends):
147+
if begin == self.begin and end == self.end:
148+
if i > 0:
149+
self.infos[0] = self.infos[i]
150+
self.refs[0] = self.refs[i]
151+
self.alts[0] = self.alts[i]
152+
return
153+
154+
"""
155+
Defines how to represent the SV (in printing)
156+
"""
157+
def __repr__(self):
158+
return self.__str__()
159+
160+
"""
161+
Check if some SV should merge with this one. Merging should happen when both SVs are the same type with and overlap
162+
highly with each other.
163+
164+
:parameter otherSV The other SV to check.
165+
:returns True if the two SVs should merge.
166+
"""
167+
def should_merge(self, other_sv, max_sv_distance, max_size_difference):
168+
assert isinstance(other_sv, SV)
169+
assert len(self.begins) == len(self.ends)
170+
171+
if other_sv.type != self.type:
172+
return False # SVs of different types or chromosome should not merge
173+
174+
# For sanity
175+
if abs(other_sv.max_begin - self.min_begin) > 10000 or\
176+
abs(other_sv.min_begin - self.max_begin) > 10000:
177+
return False
178+
179+
for begin, end in self.unique_begins_and_ends:
180+
# For other SVs, calculate their overlap
181+
if abs(begin - other_sv.begin) <= max_sv_distance and\
182+
abs(end - other_sv.end) <= max_sv_distance and\
183+
abs((other_sv.end - other_sv.begin) - (end - begin)) <= max_size_difference:
184+
return True # Overlap is enough to merge
185+
186+
# We could not find any interval in this SV that did not enough, we should therefore merge these two SVs
187+
return False
188+
189+
"""
190+
Merges two SVs. Assumes that they should be merged, which can be checked with the 'should merge' function
191+
192+
:parameter otherSV The other SV to merge with.
193+
:returns Nothing
194+
"""
195+
def merge(self, other_sv):
196+
assert isinstance(other_sv, SV)
197+
assert self.type == other_sv.type
198+
199+
self.min_begin = min(self.min_begin, other_sv.min_begin)
200+
self.max_begin = max(self.max_begin, other_sv.max_begin)
201+
self.begins += other_sv.begins
202+
self.ends += other_sv.ends
203+
self.infos += other_sv.infos
204+
self.refs += other_sv.refs
205+
self.alts += other_sv.alts
206+
self.unique_begins_and_ends = self.unique_begins_and_ends.union(other_sv.unique_begins_and_ends)
207+
208+
if self.is_outputting_ids:
209+
self.ids += other_sv.ids

0 commit comments

Comments
 (0)