|
51 | 51 | barcode_assignment = match.group(1)
|
52 | 52 | print(f"IFC barcode: {barcode_assignment}")
|
53 | 53 |
|
| 54 | +##### |
| 55 | +# instantiate Data Reader from reader.py |
| 56 | +reader = DataReader() # make an empty class object that corresponds to the DataReader() object from reader.py |
54 | 57 |
|
55 | 58 | # then load in the data file
|
56 | 59 | data_files = sorted([fname for fname in all_files if fname.suffix == ".csv"])
|
|
64 | 67 | file_like_object = BytesIO(data_file.read_bytes())
|
65 | 68 | df_data = pd.read_csv(file_like_object, on_bad_lines="skip")
|
66 | 69 | print(f"This is the Data File that was loaded: {data_file.name}")
|
67 |
| - |
68 |
| - reader = DataReader() # make am empty class object that corresponds to the DataReader() object from reader.py |
69 | 70 |
|
70 | 71 | phrases_to_find = [
|
71 | 72 | "Raw Data for Passive Reference ROX",
|
|
80 | 81 |
|
81 | 82 | # at this point, we have loaded the assignment sheet and have sorted through the loaded data file to create a dict of dataframes
|
82 | 83 |
|
| 84 | +##### |
83 | 85 | # instantiate DataProcessor from norm.py
|
84 | 86 | processor = DataProcessor()
|
85 | 87 | # normalize the signal
|
86 | 88 | normalized_dataframes = processor.background_processing(dataframes)
|
87 | 89 |
|
| 90 | +# The premise of the code is that different viral panels require different thresholding |
| 91 | +# So the user will specify command line arguments as per the ReadMe instructions |
| 92 | +# and this portion of the code is meant to access the CLI arguments and modify the threshold specified in the code |
| 93 | + |
| 94 | +CLI_arg = sys.argv |
| 95 | + |
88 | 96 | # make an output folder in your path's wd if it hasn't been made already
|
89 |
| -output_folder = f'output_{barcode_assignment}' |
| 97 | +output_folder = f'output_{barcode_assignment}_[{CLI_arg[1]}]' |
90 | 98 | if not os.path.exists(output_folder):
|
91 | 99 | os.makedirs(output_folder)
|
92 | 100 |
|
|
95 | 103 | normalized_dataframes['signal_norm'].to_csv(os.path.join(output_folder, 'signal_norm.csv'), index=True)
|
96 | 104 | normalized_dataframes['ref_norm'].to_csv(os.path.join(output_folder, 'ref_norm.csv'), index=True)
|
97 | 105 |
|
| 106 | +##### |
98 | 107 | # instantiate DataMatcher from matcher.py
|
99 | 108 | matcher = DataMatcher()
|
100 | 109 | assigned_norms, assigned_lists = matcher.assign_assays(assignment_files[0], normalized_dataframes['ref_norm'], normalized_dataframes['signal_norm'])
|
|
104 | 113 | assigned_norms['signal_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_signal_norm.csv'), index=True)
|
105 | 114 | assigned_norms['ref_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_ref_norm.csv'), index=True)
|
106 | 115 |
|
107 |
| -samples_list = assigned_lists['samples_list'] |
| 116 | +# collect the assays/samples from the layout assays/samples in the assignment sheet (this extraction is done in matcher.py) |
108 | 117 | crRNA_assays = assigned_lists['assay_list']
|
| 118 | +samples_list = assigned_lists['samples_list'] |
109 | 119 |
|
| 120 | +##### |
110 | 121 | # instantiate ntcContaminationChecker from ntc_con_check.py
|
111 | 122 | ntcCheck = ntcContaminationChecker()
|
112 | 123 |
|
|
115 | 126 | # create df of filtered assigned_signal_norm by applying the NTC check to remove any NTCs whose raw signal suggests contamination
|
116 | 127 | assigned_signal_norm_with_NTC_check = ntcCheck.ntc_cont(assigned_signal_norm) # feed this into MedianSort
|
117 | 128 |
|
| 129 | +# temporarily save assigned_signal_norm_with_NTC_check |
| 130 | +assigned_signal_norm_with_NTC_check.to_csv(os.path.join(output_folder, 'assigned_signal_norm_with_NTC_check.csv'), index=True) |
| 131 | + |
| 132 | + |
| 133 | +##### |
118 | 134 | # instantiate MedianSort from median_frame.py
|
119 | 135 | median = MedianSort(crRNA_assays)
|
120 | 136 | final_med_frames = median.create_median(assigned_signal_norm_with_NTC_check)
|
121 | 137 |
|
| 138 | +# temporarily print final_med_frames |
| 139 | +#print(final_med_frames) |
| 140 | + |
122 | 141 | # Output needs to be rounded to 5 decimal places (see `decimals` below)
|
123 | 142 | rounded_final_med_frames = {}
|
124 | 143 | # Define the number of decimals for rounding
|
125 | 144 | decimals = 5
|
126 | 145 |
|
127 |
| -# Iterate through each row and column |
| 146 | +# Iterate through each row and column, round each value |
128 | 147 | for key, df in final_med_frames.items():
|
129 | 148 | rounded_df = pd.DataFrame(index=df.index, columns=df.columns)
|
130 | 149 | for i in range(len(df.index)):
|
|
133 | 152 | rounded_df.iloc[i, j] = round(df.iloc[i, j], decimals)
|
134 | 153 | rounded_final_med_frames[key] = rounded_df
|
135 | 154 |
|
| 155 | +# Make subfolder in the output folder in your path's wd if it hasn't been made already |
| 156 | +timepoints_subfolder = os.path.join(output_folder, f'timepoints_quantData_{barcode_assignment}') |
| 157 | +if not os.path.exists(timepoints_subfolder): |
| 158 | + os.makedirs(timepoints_subfolder) |
| 159 | + |
| 160 | +# Save the dataframes per timepoint in the timepoints subfolder |
136 | 161 | timepoints = list(rounded_final_med_frames.keys())
|
137 | 162 | for i, t in enumerate(timepoints, start=1):
|
138 |
| - filename = os.path.join(output_folder, f't{i}_{barcode_assignment}.csv') |
| 163 | + filename = os.path.join(timepoints_subfolder, f't{i}_{barcode_assignment}.csv') |
139 | 164 | csv = rounded_final_med_frames[t].to_csv(filename, index=True)
|
140 | 165 |
|
| 166 | + |
141 | 167 | # since we want to explicitly manipulate t13_csv, it is helpful to have the t13 df referenced outside of the for loop
|
142 | 168 | last_key = list(rounded_final_med_frames.keys())[-1]
|
143 | 169 | t13_dataframe_orig = rounded_final_med_frames[last_key]
|
|
147 | 173 | # at this point, we have created a t1 thru t13 dataframe and exported all these dataframes as csv files in our output folder
|
148 | 174 | # now we need to threshold the t13 csv and mark signals >= threshold as positive and < threshold as negative
|
149 | 175 |
|
150 |
| -# The premise of the code is that different viral panels require different thresholding |
151 |
| -# So the user will specifiy command line arguments as per the ReadMe instructions |
152 |
| -# and this portion of the code is meant to access the CI arguments and modify the threshold specified in the code |
153 |
| - |
154 |
| -CLI_arg = sys.argv |
155 |
| - |
| 176 | +##### |
156 | 177 | # instantiate Thresholder from threshold.py
|
157 | 178 | thresholdr = Thresholder()
|
158 | 179 | unique_crRNA_assays = list(set(crRNA_assays))
|
|
161 | 182 | # and save the file to your working directory
|
162 | 183 | ntc_PerAssay, ntc_thresholds_output, t13_hit_output = thresholdr.raw_thresholder(unique_crRNA_assays, assigned_norms['signal_norm_raw'], t13_dataframe_copy1, CLI_arg[1])
|
163 | 184 |
|
164 |
| - |
165 | 185 | # make copies of t13_hit_output csv for downstream summaries and quality control checks
|
166 | 186 | t13_hit_output_copy1 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in ndc qual check
|
167 | 187 | t13_hit_output_copy2 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in cpc qual check
|
|
176 | 196 | ntc_thresholds_output.to_csv(ntc_thresholds_output_file_path, index=True)
|
177 | 197 | t13_hit_output.to_csv(hit_output_file_path, index=True)
|
178 | 198 |
|
| 199 | +##### |
179 | 200 | # instantiate NTC_Normalized from ntcnorm.py
|
180 | 201 | ntcNorm = Normalized()
|
181 | 202 | # apply normalizr to the t13_dataframe to produce a new dataframe with all values divided by the mean NTC for that assay
|
182 |
| -t13_quant_hit_norm = ntcNorm.normalizr(t13_dataframe_copy2) |
183 |
| -quant_hit_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_quant_ntcNorm.csv') |
184 |
| -t13_quant_hit_norm.to_csv(quant_hit_output_ntcNorm_file_path, index=True) |
| 203 | +t13_quant_norm = ntcNorm.normalizr(t13_dataframe_copy2) |
| 204 | +quant_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_normalized.csv') |
| 205 | +t13_quant_norm.to_csv(quant_output_ntcNorm_file_path, index=True) |
185 | 206 |
|
| 207 | +##### |
186 | 208 | # instantiate Binary_Converter from binary_results.py
|
187 | 209 | binary_num_converter = Binary_Converter()
|
188 | 210 | # apply hit_numeric_conv to the t13_hit_output to produce a new dataframe with all pos/neg converted to binary 1/0 output
|
|
194 | 216 | t13_hit_binary_output_copy1 = pd.DataFrame(t13_hit_binary_output).copy() # used in coinf check
|
195 | 217 | t13_hit_binary_output_copy2 = pd.DataFrame(t13_hit_binary_output).copy()
|
196 | 218 |
|
| 219 | +##### |
197 | 220 | # instantiate Summarized from summary.py
|
198 | 221 | summary = Summarized()
|
199 | 222 | # apply summarizer to the t13_dataframe to produce a new dataframe tabulating all of the positive samples
|
200 | 223 | summary_samples_df = summary.summarizer(t13_hit_output)
|
201 | 224 | summary_pos_samples_file_path = os.path.join(output_folder, f'Positives_Summary_{barcode_assignment}.csv')
|
202 | 225 | summary_samples_df.to_csv(summary_pos_samples_file_path, index=True)
|
203 | 226 |
|
204 |
| - |
| 227 | +##### |
205 | 228 | # instantiate Plotter from plotting.py
|
206 | 229 | heatmap_generator = Plotter()
|
207 | 230 |
|
|
211 | 234 | unique_crRNA_assays = list(OrderedDict.fromkeys(crRNA_assays))
|
212 | 235 | heatmap = heatmap_generator.plt_heatmap(tgap, barcode_assignment,final_med_frames, samples_list, unique_crRNA_assays, timepoints)
|
213 | 236 |
|
| 237 | +# Make subfolder in the output folder in your path's wd if it hasn't been made already |
| 238 | +heatmaps_subfolder = os.path.join(output_folder, f'heatmaps_{barcode_assignment}') |
| 239 | +if not os.path.exists(heatmaps_subfolder): |
| 240 | + os.makedirs(heatmaps_subfolder) |
| 241 | + |
214 | 242 | # save heatmap per timepoint
|
215 | 243 | for i, t in enumerate(timepoints, start=1):
|
216 | 244 | #csv = convert_df(final_med_frames[t])
|
217 |
| - heatmap_filename = os.path.join(output_folder, f'Heatmap_t{i}_{barcode_assignment}.png') |
| 245 | + heatmap_filename = os.path.join(heatmaps_subfolder, f'Heatmap_t{i}_{barcode_assignment}.png') |
218 | 246 | fig = heatmap[t].savefig(heatmap_filename, bbox_inches = 'tight', dpi=80)
|
219 | 247 | plt.close(fig)
|
220 | 248 |
|
221 |
| -print(f"The heatmap plots saved to the folder, {output_folder}") |
| 249 | +print(f"The heatmap plots saved to the folder, {heatmaps_subfolder} in {output_folder}") |
222 | 250 |
|
| 251 | +heatmap_t13_quant_norm = heatmap_generator.t13_plt_heatmap(tgap, barcode_assignment,t13_quant_norm, samples_list, unique_crRNA_assays, timepoints) |
| 252 | +heatmap_t13_quant_norm_filename = os.path.join(output_folder, f'Heatmap_t13_{barcode_assignment}_normalized.png') |
| 253 | +fig = heatmap_t13_quant_norm.savefig(heatmap_t13_quant_norm_filename, bbox_inches = 'tight', dpi=80) |
| 254 | +plt.close(fig) |
| 255 | + |
| 256 | + |
| 257 | +##### |
223 | 258 | # instantiate Assay_QC_Score
|
224 | 259 | assayScorer = Assay_QC_Score()
|
225 | 260 | # take in t13_hit_binary_output as the df to build off of
|
226 | 261 | QC_score_per_assay_df = assayScorer.assay_level_score(t13_hit_binary_output)
|
227 |
| -assay_lvl_QC_score_file_path = os.path.join(output_folder, f'Assay_Level_QC_Metrics_{barcode_assignment}.csv') |
| 262 | + |
| 263 | +# Make subfolder in the output folder in your path's wd if it hasn't been made already |
| 264 | +assayQC_subfolder = os.path.join(output_folder, f'assay_performance_evaluation_{barcode_assignment}') |
| 265 | +if not os.path.exists(assayQC_subfolder): |
| 266 | + os.makedirs(assayQC_subfolder) |
| 267 | + |
| 268 | +assay_lvl_QC_score_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Test_Results_{barcode_assignment}.csv') |
228 | 269 | QC_score_per_assay_df.to_csv(assay_lvl_QC_score_file_path, index=True)
|
229 | 270 |
|
230 | 271 | # write text file explaining the QC score
|
|
264 | 305 | assayScores_Explanation.append("The final score per assay is included for easy comparison of assay performance.\n")
|
265 | 306 |
|
266 | 307 | # create and save an output text file containing the quality control checks
|
267 |
| -assayScores_file_path = os.path.join(output_folder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt') |
| 308 | +assayScores_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt') |
268 | 309 | with open(assayScores_file_path, 'w') as f:
|
269 | 310 | for line in assayScores_Explanation:
|
270 | 311 | f.write(line + '\n')
|
271 | 312 |
|
272 |
| -print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {output_folder}") |
| 313 | +print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {assayQC_subfolder}") |
273 | 314 |
|
| 315 | +##### |
274 | 316 | # instantiate Qual_Ctrl_Checks from qual-checks.py
|
275 | 317 | qual_checks = Qual_Ctrl_Checks()
|
276 | 318 |
|
| 319 | +# Make subfolder in the output folder in your path's wd if it hasn't been made already |
| 320 | +qc_subfolder = os.path.join(output_folder, 'quality_control_flags') |
| 321 | +if not os.path.exists(qc_subfolder): |
| 322 | + os.makedirs(qc_subfolder) |
| 323 | + |
277 | 324 | # initialize a list to collect all quality control checks
|
278 | 325 | QC_lines = []
|
279 | 326 |
|
|
332 | 379 | # apply ntc_check to the t13_hit_output df to generate a list of all ntc positive assays
|
333 | 380 | assigned_signal_norm_2 = pd.DataFrame(assigned_norms['signal_norm_raw']).copy() # make a copy of assigned_signal_norm dataframe
|
334 | 381 |
|
335 |
| - |
336 | 382 | high_raw_ntc_signal = qual_checks.ntc_check(assigned_signal_norm_2)
|
337 | 383 | QC_lines.append("4. Evaluation of No Target Control (NTC) Contamination \n")
|
338 | 384 | if high_raw_ntc_signal:
|
|
348 | 394 | coinfection_df = qual_checks.coinf_check(t13_hit_binary_output_copy1)
|
349 | 395 | QC_lines.append("5. Evaluation of Potential Co-Infected Samples\n")
|
350 | 396 |
|
351 |
| -coinfection_df_file_path = os.path.join(output_folder, f'Coinfection_Check_{barcode_assignment}.csv') |
| 397 | +coinfection_df_file_path = os.path.join(qc_subfolder, f'Coinfection_Check_{barcode_assignment}.csv') |
352 | 398 | coinfection_df.to_csv(coinfection_df_file_path, index=True) # output needs to be csv of coinfection check
|
353 | 399 |
|
354 | 400 | # in Qual_Check text file, add message saying "see coinf check csv" and "if any samples except CPC are flagged as being coinfected, there is risk of these samples being coinfected"
|
|
359 | 405 | QC_lines.append(" - All other flagged samples should be further evaluated for potential co-infection.\n")
|
360 | 406 | QC_lines.append("Please be advised to check the output files as well.")
|
361 | 407 |
|
362 |
| - |
363 | 408 | # create and save an output text file containing the quality control checks
|
364 |
| -QCs_file_path = os.path.join(output_folder, f'Quality_Control_Flags_{barcode_assignment}.txt') |
| 409 | +QCs_file_path = os.path.join(qc_subfolder, f'Quality_Control_Flags_{barcode_assignment}.txt') |
365 | 410 | with open(QCs_file_path, 'w') as f:
|
366 | 411 | for line in QC_lines:
|
367 | 412 | f.write(line + '\n')
|
368 | 413 |
|
369 |
| -print(f"The quality control checks are complete and saved to the folder, {output_folder}") |
| 414 | +print(f"The quality control checks are complete and saved to the folder, {qc_subfolder}") |
370 | 415 |
|
371 | 416 |
|
0 commit comments