|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +import sys |
| 4 | +from utilities import calculate_overlap, calculate_stddev, get_most_common_item |
| 5 | + |
| 6 | +SV_TYPES = ['UNK', 'BND', 'DEL', 'INS', 'INV'] |
| 7 | + |
| 8 | +def make_info_dictionary(info): |
| 9 | + spl_info = info.split(";") |
| 10 | + info_dict = {} |
| 11 | + |
| 12 | + for key_val in spl_info: |
| 13 | + if "=" in key_val: |
| 14 | + key, val = key_val.split("=") |
| 15 | + info_dict[key] = val |
| 16 | + else: |
| 17 | + info_dict[key_val] = "" |
| 18 | + |
| 19 | + return info_dict |
| 20 | + |
| 21 | + |
| 22 | +class SV(object): |
| 23 | + """ |
| 24 | + Constructs a structural variant (SV) candidate |
| 25 | + """ |
| 26 | + def __init__(self, vcf_record, check_type = True, join_mode = False, output_ids = False): |
| 27 | + self.is_outputting_ids = output_ids |
| 28 | + self.join_mode = join_mode |
| 29 | + |
| 30 | + spl_line = vcf_record.rstrip().split("\t")[:8] |
| 31 | + self.check_type = check_type |
| 32 | + self.chromosome = spl_line[0] |
| 33 | + self.begin = int(spl_line[1]) |
| 34 | + self.end = self.begin |
| 35 | + |
| 36 | + if self.is_outputting_ids: |
| 37 | + self.ids = [spl_line[2]] |
| 38 | + |
| 39 | + info_dict = make_info_dictionary(spl_line[7]) |
| 40 | + |
| 41 | + # Join related SV types |
| 42 | + if "SVTYPE" in info_dict: |
| 43 | + if info_dict["SVTYPE"] == "DEL_ALU" or info_dict["SVTYPE"] == "DEL_LINE1": |
| 44 | + info_dict["SVTYPE"] = "DEL" |
| 45 | + elif info_dict["SVTYPE"] == "ALU" or info_dict["SVTYPE"] == "LINE1" or info_dict["SVTYPE"] == "SVA" or \ |
| 46 | + info_dict["SVTYPE"] == "DUP" or info_dict["SVTYPE"] == "CNV" or info_dict["SVTYPE"] == "INVDUP" or \ |
| 47 | + info_dict["SVTYPE"] == "INV": |
| 48 | + info_dict["SVTYPE"] = "INS" |
| 49 | + elif info_dict["SVTYPE"] == "TRA": |
| 50 | + info_dict["SVTYPE"] = "BND" |
| 51 | + |
| 52 | + # Remove old values |
| 53 | + if not join_mode and "NUM_MERGED_SVS" in info_dict: |
| 54 | + spl_line[7] = spl_line[7].replace("NUM_MERGED_SVS=%s;" % info_dict["NUM_MERGED_SVS"], "") |
| 55 | + |
| 56 | + if "STDDEV_POS" in info_dict: |
| 57 | + spl_line[7] = spl_line[7].replace("STDDEV_POS=%s;" % info_dict["STDDEV_POS"], "") |
| 58 | + |
| 59 | + #self.strand = 0 # -1 == minus strand, 0 == both strands, 1 plus strand |
| 60 | + |
| 61 | + if "SVTYPE" in info_dict and (info_dict["SVTYPE"] == "DEL" ): |
| 62 | + if "END" in info_dict: |
| 63 | + self.end = int(info_dict["END"]) |
| 64 | + else: |
| 65 | + if "SVSIZE" in info_dict: |
| 66 | + self.end = self.begin + abs(int(info_dict["SVSIZE"])) |
| 67 | + elif "SVLEN" in info_dict: |
| 68 | + self.end = self.begin + abs(int(info_dict["SVLEN"])) |
| 69 | + |
| 70 | + if check_type: |
| 71 | + if "SVTYPE" in info_dict and info_dict["SVTYPE"] in SV_TYPES: |
| 72 | + self.type = SV_TYPES.index(info_dict["SVTYPE"]) |
| 73 | + else: |
| 74 | + self.type = 0 # Unknown type |
| 75 | + assert False |
| 76 | + |
| 77 | + #if info_dict["SVTYPE"] == "INV": |
| 78 | + # if "INV3" in info_dict: |
| 79 | + # self.strand = 1 |
| 80 | + # elif "INV5" in info_dict: |
| 81 | + # self.strand = -1 |
| 82 | + #elif info_dict["SVTYPE"] == "BND": |
| 83 | + # if "[" in spl_line[4]: |
| 84 | + # self.strand = 1 |
| 85 | + # elif "]" in spl_line[4]: |
| 86 | + # self.strand = -1 |
| 87 | + else: |
| 88 | + self.type = 0 |
| 89 | + |
| 90 | + # Get all begins and ends |
| 91 | + self.min_begin = self.begin |
| 92 | + self.max_begin = self.begin |
| 93 | + self.begins = [self.begin] |
| 94 | + self.ends = [self.end] |
| 95 | + self.infos = [spl_line[7]] |
| 96 | + self.unique_begins_and_ends = set() |
| 97 | + self.unique_begins_and_ends.add((int(self.begin), int(self.end))) |
| 98 | + self.refs = [spl_line[3]] |
| 99 | + self.alts = [spl_line[4]] |
| 100 | + |
| 101 | + |
| 102 | + """ |
| 103 | + Converts the SV to a string |
| 104 | + """ |
| 105 | + def __str__(self): |
| 106 | + info = self.infos[0] |
| 107 | + if len(info) > 0: |
| 108 | + info += ";" |
| 109 | + |
| 110 | + num_text = "JOINED" if self.join_mode else "MERGED" |
| 111 | + |
| 112 | + if self.is_outputting_ids: |
| 113 | + info += "MERGED_IDS=%s;NUM_%s_SVS=%d;STDDEV_POS=%.2f,%.2f" % (",".join(self.ids), |
| 114 | + num_text, |
| 115 | + len(self.begins), |
| 116 | + calculate_stddev(self.begins), |
| 117 | + calculate_stddev(self.ends) |
| 118 | + ) |
| 119 | + else: |
| 120 | + info += "NUM_%s_SVS=%d;STDDEV_POS=%.2f,%.2f" % (num_text, |
| 121 | + len(self.begins), |
| 122 | + calculate_stddev(self.begins), |
| 123 | + calculate_stddev(self.ends) |
| 124 | + ) |
| 125 | + |
| 126 | + return "%s\t%d\t%s\t%s\t%s\t%d\t%s\t%s\n" % ( |
| 127 | + self.chromosome, |
| 128 | + self.begin, |
| 129 | + ".", # variant ID |
| 130 | + self.refs[0], |
| 131 | + self.alts[0], |
| 132 | + 0, # quality |
| 133 | + ".", # filter |
| 134 | + info |
| 135 | + ) |
| 136 | + |
| 137 | + |
| 138 | + """ |
| 139 | + Finalize SV before printing |
| 140 | + """ |
| 141 | + def finalize(self): |
| 142 | + # Get the most common begin and end combo |
| 143 | + begins_and_ends = list(zip(self.begins, self.ends)) |
| 144 | + self.begin, self.end = get_most_common_item(begins_and_ends) |
| 145 | + |
| 146 | + for i, (begin, end) in enumerate(begins_and_ends): |
| 147 | + if begin == self.begin and end == self.end: |
| 148 | + if i > 0: |
| 149 | + self.infos[0] = self.infos[i] |
| 150 | + self.refs[0] = self.refs[i] |
| 151 | + self.alts[0] = self.alts[i] |
| 152 | + return |
| 153 | + |
| 154 | + """ |
| 155 | + Defines how to represent the SV (in printing) |
| 156 | + """ |
| 157 | + def __repr__(self): |
| 158 | + return self.__str__() |
| 159 | + |
| 160 | + """ |
| 161 | + Check if some SV should merge with this one. Merging should happen when both SVs are the same type with and overlap |
| 162 | + highly with each other. |
| 163 | +
|
| 164 | + :parameter otherSV The other SV to check. |
| 165 | + :returns True if the two SVs should merge. |
| 166 | + """ |
| 167 | + def should_merge(self, other_sv, max_sv_distance, max_size_difference): |
| 168 | + assert isinstance(other_sv, SV) |
| 169 | + assert len(self.begins) == len(self.ends) |
| 170 | + |
| 171 | + if other_sv.type != self.type: |
| 172 | + return False # SVs of different types or chromosome should not merge |
| 173 | + |
| 174 | + # For sanity |
| 175 | + if abs(other_sv.max_begin - self.min_begin) > 10000 or\ |
| 176 | + abs(other_sv.min_begin - self.max_begin) > 10000: |
| 177 | + return False |
| 178 | + |
| 179 | + for begin, end in self.unique_begins_and_ends: |
| 180 | + # For other SVs, calculate their overlap |
| 181 | + if abs(begin - other_sv.begin) <= max_sv_distance and\ |
| 182 | + abs(end - other_sv.end) <= max_sv_distance and\ |
| 183 | + abs((other_sv.end - other_sv.begin) - (end - begin)) <= max_size_difference: |
| 184 | + return True # Overlap is enough to merge |
| 185 | + |
| 186 | + # We could not find any interval in this SV that did not enough, we should therefore merge these two SVs |
| 187 | + return False |
| 188 | + |
| 189 | + """ |
| 190 | + Merges two SVs. Assumes that they should be merged, which can be checked with the 'should merge' function |
| 191 | +
|
| 192 | + :parameter otherSV The other SV to merge with. |
| 193 | + :returns Nothing |
| 194 | + """ |
| 195 | + def merge(self, other_sv): |
| 196 | + assert isinstance(other_sv, SV) |
| 197 | + assert self.type == other_sv.type |
| 198 | + |
| 199 | + self.min_begin = min(self.min_begin, other_sv.min_begin) |
| 200 | + self.max_begin = max(self.max_begin, other_sv.max_begin) |
| 201 | + self.begins += other_sv.begins |
| 202 | + self.ends += other_sv.ends |
| 203 | + self.infos += other_sv.infos |
| 204 | + self.refs += other_sv.refs |
| 205 | + self.alts += other_sv.alts |
| 206 | + self.unique_begins_and_ends = self.unique_begins_and_ends.union(other_sv.unique_begins_and_ends) |
| 207 | + |
| 208 | + if self.is_outputting_ids: |
| 209 | + self.ids += other_sv.ids |
0 commit comments