-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert_coordinate_ref2donor.py
107 lines (91 loc) · 2.77 KB
/
convert_coordinate_ref2donor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
input: vcf file, 1 based
bed file, 0 based
output: recover file, 0 based
Given snp information that stored in vcf file, the snp is donor compared to reference, and coordinate is in reference
bed file containing coordinate info in reference
Convert coordinate to corresponding one on donor genome
Pay attention that the coordinate may not exist on donor genome because of deletion happened on reference,
here we did not deal with it.
"""
from sys import argv
if len(argv) < 3:
print "Error: parameter required!\n"
print "usage: " + argv[0] +" vcfFile bedFile > new_bedFile"
exit()
vcfFilename = argv[1]
bedFilename = argv[2]
vcfFile = open(vcfFilename)
pos_refLen = {}
pos_altLen = {}
# readfile
for line in vcfFile.readlines():
if(line.startswith("#")):
continue
line = line.strip()
columns = line.split("\t")
pos = int(columns[1]) - 1 #vcf file is 1 based
ref = columns[3]
alt = columns[4]
refLen = len(ref)
altLen = len(alt)
if (refLen == altLen):
continue
pos_refLen[pos] = refLen
pos_altLen[pos] = altLen
vcfFile.close()
"""
Traverse all snp position, start from largest position
For one position, change the coordinate after it.
"""
posList = pos_refLen.keys()
posList.sort(reverse=True)
"""
Read bed file position.
"""
bedFile = open(bedFilename)
bedId = 0
bedId_bedStart = {}
bedId_bedEnd = {}
for line in bedFile.readlines():
bedId += 1
if line.startswith("#"):
continue
line = line.strip()
columns = line.split("\t")
startPosition = int(columns[1])
endPosition = int(columns[2])
bedId_bedStart[bedId] = startPosition
bedId_bedEnd[bedId] = endPosition
bedFile.close()
for bedId in bedId_bedStart:
startPosition = bedId_bedStart[bedId]
for pos in posList:
if pos > startPosition:
continue
refLen = pos_refLen[pos]
altLen = pos_altLen[pos]
if (refLen > altLen):
#del happen
eventLen = refLen-altLen
bedId_bedStart[bedId] -= eventLen
elif (refLen < altLen):
#ins happen
eventLen = altLen - refLen
bedId_bedStart[bedId] += eventLen
endPosition = bedId_bedEnd[bedId]
for pos in posList:
if pos > endPosition:
continue
refLen = pos_refLen[pos]
altLen = pos_altLen[pos]
if (refLen > altLen):
#del happen
eventLen = refLen-altLen
bedId_bedEnd[bedId] -= eventLen
elif (refLen < altLen):
#ins happen
eventLen = altLen - refLen
bedId_bedEnd[bedId] += eventLen
for bedId in bedId_bedStart:
print "chr17\t" + str(bedId_bedStart[bedId]) + "\t" + str(bedId_bedEnd[bedId])