-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAutomateMutationDAta.py
245 lines (222 loc) · 10.7 KB
/
AutomateMutationDAta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Script that will take in a mutation csv from AleDb and convert it to the Camel Mutation format
# Starting by just doing one file
# Then move to go through all files in a folder
import csv
import re
from openpyxl import load_workbook
import pandas as pd
import shutil
import glob
# Folder of AleDB CSV exports to convert (machine-specific absolute path;
# consumed by the commented-out glob driver at the bottom of the file).
path = "C:/Users/samue/Desktop/Thesis/ALEDB_conversion/Experiment_Files_to_convert"
def mutTranslate(csvfile):
    """Convert one AleDB mutation CSV into the Camel mutation Excel template.

    Reads the AleDB export at ``csvfile``, rewrites every mutation row into
    the Camel column layout (one output record per isolate the mutation was
    observed in), and writes the result into a copy of the Excel template
    saved next to the input as ``<csvfile>.xlsx``.

    Parameters:
        csvfile : path of the AleDB CSV export to convert.

    Returns: None (the converted workbook is written as a side effect).
    """
    # Read the whole export up front so the file handle is released before
    # the (comparatively slow) Excel work begins.
    with open(csvfile, 'r', encoding='UTF-8') as f:
        reader = csv.reader(f)
        header = next(reader)
        data = [row for row in reader]

    # Header columns 0-5 are fixed metadata; every later column is one
    # isolate (experiment replicate) whose label embeds an "A# F# I# R#"
    # token (ALE / Flask / Isolate / Replicate ids).
    Isolates = header[6:]
    Groups = [re.findall("[A][0-9]+[ ][F][0-9]+[ ][I][0-9]+[ ][R][0-9]+", label)
              for label in Isolates]

    # --- Pass 1: normalise each mutation row in place -----------------------
    for row in data:
        # Two blank columns are inserted to match the template: afterwards
        # row[2] is END POS and row[5] is ALT, and the isolate frequency
        # columns shift from raw offset 6 to offset 8.
        row.insert(4, "")
        row.insert(2, "")
        # SNP "A→T": REF in col 4, ALT in col 5, end = start + 1.
        if row[3] == "SNP":
            splitter = re.split("→", row[4])
            row[4] = splitter[0]
            row[5] = splitter[1]
            row[2] = int(row[1].replace(',', "")) + 1
        if row[3] == "DEL":
            if row[4][0] == "Δ":
                # "Δ<n> bp": n bases deleted; drop REF/ALT, end = start + n.
                count = re.split(" ", re.split('Δ', row[4])[1])[0]
                row[2] = int(row[1].replace(",", "")) + int(count.replace(",", ""))
                row[4] = ""
                row[5] = ""
            else:
                # "(a→b)" style: deleted length is a - b.
                valuesplit = re.split("→", re.split("[(]|[)]", row[4])[2])
                vali = int(valuesplit[0]) - int(valuesplit[1])
                row[5] = ""
                row[4] = ""
                row[2] = int(row[1].replace(',', "")) + vali
        if row[3] == "INS":
            if row[4][0] == "+":
                # "+SEQ": the inserted bases become ALT, end = start + 1.
                row[5] = re.split("[+]", row[4])[1]
                row[4] = ""
                row[2] = int(row[1].replace(',', "")) + 1
            else:
                # "NAME (a→b)": repeat the identifier by the copy gain b - a.
                valuesplit = re.split("→", re.split("[(]|[)]", row[4])[2])
                vali = int(valuesplit[1]) - int(valuesplit[0])
                row[5] = re.split("[(]|[)]", row[4])[1] * vali
                row[4] = ""
                row[2] = int(row[1].replace(',', "")) + 1
        if row[3] == "MOB":
            # Mobile-element hops are recorded as insertions in the template.
            row[3] = "INS"
            if row[4][0] == "I":
                # Normal case, e.g. "IS186 (+) +9 bp": element name → ALT.
                temp = re.split(" [+]", row[4])
                row[4] = ""
                row[5] = temp[0]
                row[2] = int(row[1].replace(",", "")) + 1
            else:
                # Special case, e.g. "Δ2 bp :: IS186 (+) +9 bp": treated as
                # insertion of the element (token 3 of the annotation).
                row[5] = re.split(" ", row[4])[3]
                row[4] = ""
                row[2] = int(row[1].replace(',', "")) + 1
        if row[3] == "AMP":
            # Amplification "LEN bp x N …": TYPE becomes "AMP<N>", ALT holds
            # the amplified length, and the ∆AA column is cleared.
            ampsplit = re.split(" ", row[4])
            row[3] = "AMP" + str(ampsplit[3])
            row[4] = ""
            row[5] = ampsplit[0]
            row[2] = int(row[1].replace(",", "")) + 1
            row[7] = ""
        if row[3] == "SUB":
            # Multi-base substitution "<n> bp→SEQ", reported as MNV with
            # end = start + n and the new sequence in ALT.
            row[3] = "MNV"
            subsplit = re.split("→", row[4])
            row[2] = int(row[1].replace(',', "")) + int(re.split(" ", subsplit[0])[0])
            row[4] = ""
            row[5] = subsplit[1]
        if row[3] == "INV":
            invsplit = re.split(" ", row[4])
            # BUG FIX: this previously read `subsplit`, the leftover variable
            # from the SUB branch (NameError if no SUB row preceded, wrong
            # end position otherwise). Use this row's own annotation.
            row[2] = int(row[1].replace(',', "")) + int(invsplit[0].replace(",", ""))
            row[4] = ""
        if row[3] == "CON":
            # Gene conversion: simultaneous deletion of a segment and
            # insertion of one copied from elsewhere in the genome, reported
            # as a replacement.
            row[3] = "REPL"

    # --- Pass 2: emit one record per isolate the mutation occurred in -------
    stors = []
    # Isolate i's frequency now sits at row index i + 8 (raw offset 6 plus
    # the two columns inserted above).
    first_isolate = 8
    # BUG FIX: the upper bound was len(header) + 1, which silently dropped
    # the final isolate column (rows are len(header) + 2 wide after the two
    # inserts).
    last_isolate = len(header) + 2
    for row in data:
        for o in range(first_isolate, last_isolate):
            if row[o] != "":
                # str(Groups[i]) renders the one-element findall list, e.g.
                # "['A1 F20 I30 R1']"; the slices below strip the brackets
                # and letter prefixes.
                group = str(Groups[o - first_isolate]).split(" ")
                stors.append([
                    row[0],                      # CHROM
                    row[1].replace(",", ""),     # START POS
                    row[2],                      # END POS
                    row[3],                      # TYPE
                    row[4],                      # REF
                    row[5],                      # ALT
                    row[6],                      # GEN
                    row[7].split(' ')[0],        # ∆AA
                    group[0][3:],                # POP  (A number)
                    group[2][1:],                # CLON (I number)
                    # BUG FIX: TIME previously read Groups[o - 7] — the NEXT
                    # isolate's flask number — while POP/CLON on the same
                    # record used Groups[o - 8]. Read this isolate's F field.
                    group[1][1:],                # TIME (F / flask number)
                    row[o],                      # FREQ
                    row[7],                      # COM
                ])

    # --- Pass 3: tidy the ∆AA column to the template's conventions ----------
    for rec in stors:
        if rec[7] == "coding":
            rec[7] = ""
        elif rec[7] == "intergenic":
            rec[6] = "NA"  # intergenic mutations carry no gene name
            rec[7] = ""
        elif (rec[3] == "DEL") and (rec[7] != ""):
            rec[7] = ""

    # Pour the fully converted records into a fresh copy of the Excel
    # template, starting below the template's header rows.
    df = pd.DataFrame(stors,
                      columns=["CHROM", "START POS", "END POS", 'TYPE', 'REF', 'ALT', 'GEN', '∆AA', 'POP', 'CLON', 'TIME',
                               'FREQ', 'COM'])
    shutil.copy2("C:/Users/samue/Desktop/Thesis/Mutationlisttemplate_new.xlsx",
                 csvfile + '.xlsx')
    append_df_to_excel(csvfile + '.xlsx', df, index=False, header=False, startrow=5)
    # Label the time unit in the template header.
    wb = load_workbook(filename=csvfile + '.xlsx')
    ws = wb.worksheets[0]
    ws["K4"] = "Flask"
    wb.save(csvfile + '.xlsx')
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False,
                       **to_excel_kwargs):
    """
    Append a DataFrame [df] to an existing Excel file [filename] in sheet
    [sheet_name], preserving the workbook's other contents.
    If [filename] doesn't exist, the function creates it.

    Parameters:
    filename : File path of the target workbook
               (Example: '/path/to/file.xlsx')
    df : DataFrame to write into the workbook
    sheet_name : Name of the sheet which will contain the DataFrame.
                 (default: 'Sheet1')
    startrow : upper-left cell row at which to dump the DataFrame.
               By default (startrow=None) the first free row below the
               sheet's existing content is used.
    truncate_sheet : remove and recreate [sheet_name] before writing,
                     discarding its previous contents
    to_excel_kwargs : extra keyword arguments forwarded to
                      `DataFrame.to_excel()` (e.g. index=False, header=False)

    Returns: None

    NOTE(review): this is the classic Stack Overflow append recipe. It pokes
    pandas ExcelWriter internals (assigning `writer.book` / `writer.sheets`
    and calling `writer.save()`) that only work on older pandas; pandas >=
    1.5 deprecated and 2.0 removed them in favour of
    `ExcelWriter(..., mode='a', if_sheet_exists=...)` — confirm the pinned
    pandas version before upgrading.
    """
    # ignore [engine] parameter if it was passed: openpyxl is required below,
    # so a caller-supplied engine would conflict
    if 'engine' in to_excel_kwargs:
        to_excel_kwargs.pop('engine')
    writer = pd.ExcelWriter(filename, engine='openpyxl')
    # Python 2.x: define [FileNotFoundError] exception if it doesn't exist
    # (py3-only legacy shim; a no-op on any modern interpreter)
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = IOError
    try:
        # try to open an existing workbook and graft it onto the writer so
        # to_excel() writes into it instead of a blank one
        writer.book = load_workbook(filename)
        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row
        # truncate sheet
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)
        # copy existing sheets so to_excel() targets them rather than
        # creating duplicates with suffixed names
        writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass
    if startrow is None:
        startrow = 0
    # write out the new sheet
    df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)
    # save the workbook
    writer.save()
# mutTranslate('C:/Users/samue/Desktop/Thesis/ALEDB_conversion/Experiment_Files_to_convert/OxidizeMe.csv')
# Goes through every file in the folder and runs our function to convert the CSV file into our template
# for file in glob.glob(path + '\\*'):
# mutTranslate(file)