From 470b3d31da1ab20a8b756358f0500e008e1ee0d0 Mon Sep 17 00:00:00 2001 From: albeez Date: Mon, 26 Aug 2024 17:52:19 -0400 Subject: [PATCH] Software Update 3.2.3 Improvements to user interface and plotting normalized t13 data --- analyze_run.py | 99 +++++++++++++++++++++++++++++++++++------------- ntc_con_check.py | 16 +++++--- ntcnorm.py | 2 + plotting.py | 75 +++++++++++++++++++++++++++++++++++- 4 files changed, 158 insertions(+), 34 deletions(-) diff --git a/analyze_run.py b/analyze_run.py index 275da1c..8c07b67 100644 --- a/analyze_run.py +++ b/analyze_run.py @@ -51,6 +51,9 @@ barcode_assignment = match.group(1) print(f"IFC barcode: {barcode_assignment}") +##### +# instantiate Data Reader from reader.py +reader = DataReader() # make am empty class object that corresponds to the DataReader() object from reader.py # then load in the data file data_files = sorted([fname for fname in all_files if fname.suffix == ".csv"]) @@ -64,8 +67,6 @@ file_like_object = BytesIO(data_file.read_bytes()) df_data = pd.read_csv(file_like_object, on_bad_lines="skip") print(f"This is the Data File that was loaded: {data_file.name}") - - reader = DataReader() # make am empty class object that corresponds to the DataReader() object from reader.py phrases_to_find = [ "Raw Data for Passive Reference ROX", @@ -80,13 +81,20 @@ # at this point, we have loaded the assignment sheet and have sorted through the loaded data file to create a dict of dataframes +##### # instantiate DataProcessor from norm.py processor = DataProcessor() # normalize the signal normalized_dataframes = processor.background_processing(dataframes) +# The premise of the code is that different viral panels require different thresholding +# So the user will specifiy command line arguments as per the ReadMe instructions +# and this portion of the code is meant to access the CI arguments and modify the threshold specified in the code + +CLI_arg = sys.argv + # make an output folder in your path's wd if it hasn't been made already -output_folder = f'output_{barcode_assignment}' +output_folder = f'output_{barcode_assignment}_[{CLI_arg[1]}]' if not os.path.exists(output_folder): os.makedirs(output_folder) @@ -95,6 +103,7 @@ normalized_dataframes['signal_norm'].to_csv(os.path.join(output_folder, 'signal_norm.csv'), index=True) normalized_dataframes['ref_norm'].to_csv(os.path.join(output_folder, 'ref_norm.csv'), index=True) +##### # instantiate DataMatcher from matcher.py matcher = DataMatcher() assigned_norms, assigned_lists = matcher.assign_assays(assignment_files[0], normalized_dataframes['ref_norm'], normalized_dataframes['signal_norm']) @@ -104,9 +113,11 @@ assigned_norms['signal_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_signal_norm.csv'), index=True) assigned_norms['ref_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_ref_norm.csv'), index=True) -samples_list = assigned_lists['samples_list'] +# collect the assays/samples from the layout assays/samples in the assignment sheet (this extraction is done in matcher.py) crRNA_assays = assigned_lists['assay_list'] +samples_list = assigned_lists['samples_list'] +##### # instantiate ntcContaminationChecker from ntc_con_check.py ntcCheck = ntcContaminationChecker() @@ -115,16 +126,24 @@ # create df of filtered assigned_signal_norm by applying the NTC check to remove any NTCs whose raw signal suggests contamination assigned_signal_norm_with_NTC_check = ntcCheck.ntc_cont(assigned_signal_norm) # feed this into MedianSort +# temporarily save assigned_signal_norm_with_NTC_check +assigned_signal_norm_with_NTC_check.to_csv(os.path.join(output_folder, 'assigned_signal_norm_with_NTC_check.csv'), index=True) + + +##### # instantiate MedianSort from median_frame.py median = MedianSort(crRNA_assays) final_med_frames = median.create_median(assigned_signal_norm_with_NTC_check) +# temporarily print final_med_frames +#print(final_med_frames) + # Output needs to be rounded to 4 digits rounded_final_med_frames = {} # Define the number of decimals for rounding decimals = 5 -# Iterate through each row and column +# Iterate through each row and column, round each value for key, df in final_med_frames.items(): rounded_df = pd.DataFrame(index=df.index, columns=df.columns) for i in range(len(df.index)): @@ -133,11 +152,18 @@ rounded_df.iloc[i, j] = round(df.iloc[i, j], decimals) rounded_final_med_frames[key] = rounded_df +# Make subfolder in the output folder in your path's wd if it hasn't been made already +timepoints_subfolder = os.path.join(output_folder, f'timepoints_quantData_{barcode_assignment}') +if not os.path.exists(timepoints_subfolder): + os.makedirs(timepoints_subfolder) + +# Save the dataframes per timepoint in subfolder timp timepoints = list(rounded_final_med_frames.keys()) for i, t in enumerate(timepoints, start=1): - filename = os.path.join(output_folder, f't{i}_{barcode_assignment}.csv') + filename = os.path.join(timepoints_subfolder, f't{i}_{barcode_assignment}.csv') csv = rounded_final_med_frames[t].to_csv(filename, index=True) + # since we want to explicitly manipulate t13_csv, it is helpful to have the t13 df referenced outside of the for loop last_key = list(rounded_final_med_frames.keys())[-1] t13_dataframe_orig = rounded_final_med_frames[last_key] @@ -147,12 +173,7 @@ # at this point, we have created a t1 thru t13 dataframe and exported all these dataframes as csv files in our output folder # now we need to threshold the t13 csv and mark signals >= threshold as positive and < threshold as negative -# The premise of the code is that different viral panels require different thresholding -# So the user will specifiy command line arguments as per the ReadMe instructions -# and this portion of the code is meant to access the CI arguments and modify the threshold specified in the code - -CLI_arg = sys.argv - +##### # instantiate Thresholder from threshold.py thresholdr = Thresholder() unique_crRNA_assays = list(set(crRNA_assays)) @@ -161,7 +182,6 @@ # and save the file to your working directory ntc_PerAssay, ntc_thresholds_output, t13_hit_output = thresholdr.raw_thresholder(unique_crRNA_assays, assigned_norms['signal_norm_raw'], t13_dataframe_copy1, CLI_arg[1]) - # make copies of t13_hit_output csv for downstream summaries and quality control checks t13_hit_output_copy1 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in ndc qual check t13_hit_output_copy2 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in cpc qual check @@ -176,13 +196,15 @@ ntc_thresholds_output.to_csv(ntc_thresholds_output_file_path, index=True) t13_hit_output.to_csv(hit_output_file_path, index=True) +##### # instantiate NTC_Normalized from ntcnorm.py ntcNorm = Normalized() # apply ntc_normalizr to the t13_dataframe to produce a new dataframe with all values divided by the mean NTC for that assay -t13_quant_hit_norm = ntcNorm.normalizr(t13_dataframe_copy2) -quant_hit_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_quant_ntcNorm.csv') -t13_quant_hit_norm.to_csv(quant_hit_output_ntcNorm_file_path, index=True) +t13_quant_norm = ntcNorm.normalizr(t13_dataframe_copy2) +quant_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_normalized.csv') +t13_quant_norm.to_csv(quant_output_ntcNorm_file_path, index=True) +##### # instantiate Binary_Converter from binary_results.py binary_num_converter = Binary_Converter() # apply hit_numeric_conv to the the t13_hit_output to produce a new dataframe with all pos/neg converted to binary 1/0 output @@ -194,6 +216,7 @@ t13_hit_binary_output_copy1 = pd.DataFrame(t13_hit_binary_output).copy() # used in coninf check t13_hit_binary_output_copy2 = pd.DataFrame(t13_hit_binary_output).copy() +##### # instantiate Summarized from summary.py summary = Summarized() # apply summarizer to the t13_dataframe to produce a new dataframe tabulating all of the positive samples @@ -201,7 +224,7 @@ summary_pos_samples_file_path = os.path.join(output_folder, f'Positives_Summary_{barcode_assignment}.csv') summary_samples_df.to_csv(summary_pos_samples_file_path, index=True) - +##### # instantiate Plotter from plotting.py heatmap_generator = Plotter() @@ -211,20 +234,38 @@ unique_crRNA_assays = list(OrderedDict.fromkeys(crRNA_assays)) heatmap = heatmap_generator.plt_heatmap(tgap, barcode_assignment,final_med_frames, samples_list, unique_crRNA_assays, timepoints) +# Make subfolder in the output folder in your path's wd if it hasn't been made already +heatmaps_subfolder = os.path.join(output_folder, f'heatmaps_{barcode_assignment}') +if not os.path.exists(heatmaps_subfolder): + os.makedirs(heatmaps_subfolder) + # save heatmap per timepoint for i, t in enumerate(timepoints, start=1): #csv = convert_df(final_med_frames[t]) - heatmap_filename = os.path.join(output_folder, f'Heatmap_t{i}_{barcode_assignment}.png') + heatmap_filename = os.path.join(heatmaps_subfolder, f'Heatmap_t{i}_{barcode_assignment}.png') fig = heatmap[t].savefig(heatmap_filename, bbox_inches = 'tight', dpi=80) plt.close(fig) -print(f"The heatmap plots saved to the folder, {output_folder}") +print(f"The heatmap plots saved to the folder, {heatmaps_subfolder} in {output_folder}") +heatmap_t13_quant_norm = heatmap_generator.t13_plt_heatmap(tgap, barcode_assignment,t13_quant_norm, samples_list, unique_crRNA_assays, timepoints) +heatmap_t13_quant_norm_filename = os.path.join(output_folder, f'Heatmap_t13_{barcode_assignment}_normalized.png') +fig = heatmap_t13_quant_norm.savefig(heatmap_t13_quant_norm_filename, bbox_inches = 'tight', dpi=80) +plt.close(fig) + + +##### # instantiate Assay_QC_Score assayScorer = Assay_QC_Score() # take in t13_hit_binary_output as the df to build off of QC_score_per_assay_df = assayScorer.assay_level_score(t13_hit_binary_output) -assay_lvl_QC_score_file_path = os.path.join(output_folder, f'Assay_Level_QC_Metrics_{barcode_assignment}.csv') + +# Make subfolder in the output folder in your path's wd if it hasn't been made already +assayQC_subfolder = os.path.join(output_folder, f'assay_performance_evaluation_{barcode_assignment}') +if not os.path.exists(assayQC_subfolder): + os.makedirs(assayQC_subfolder) + +assay_lvl_QC_score_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Test_Results_{barcode_assignment}.csv') QC_score_per_assay_df.to_csv(assay_lvl_QC_score_file_path, index=True) # write text file explaining the QC score @@ -264,16 +305,22 @@ assayScores_Explanation.append("The final score per assay is included for easy comparison of assay performance.\n") # create and save an output text file containing the quality control checks -assayScores_file_path = os.path.join(output_folder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt') +assayScores_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt') with open(assayScores_file_path, 'w') as f: for line in assayScores_Explanation: f.write(line + '\n') -print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {output_folder}") +print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {assayQC_subfolder}") +##### # instantiate Qual_Ctrl_Checks from qual-checks.py qual_checks = Qual_Ctrl_Checks() +# Make subfolder in the output folder in your path's wd if it hasn't been made already +qc_subfolder = os.path.join(output_folder, 'quality_control_flags') +if not os.path.exists(qc_subfolder): + os.makedirs(qc_subfolder) + # initialize a list to collect all quality control checks QC_lines = [] @@ -332,7 +379,6 @@ # apply ntc_check to the t13_hit_output df to generate a list of all ntc positive assays assigned_signal_norm_2 = pd.DataFrame(assigned_norms['signal_norm_raw']).copy() # make a copy of assigned_signal_norm dataframe - high_raw_ntc_signal = qual_checks.ntc_check(assigned_signal_norm_2) QC_lines.append("4. Evaluation of No Target Control (NTC) Contamination \n") if high_raw_ntc_signal: @@ -348,7 +394,7 @@ coinfection_df = qual_checks.coinf_check(t13_hit_binary_output_copy1) QC_lines.append("5. Evaluation of Potential Co-Infected Samples\n") -coinfection_df_file_path = os.path.join(output_folder, f'Coinfection_Check_{barcode_assignment}.csv') +coinfection_df_file_path = os.path.join(qc_subfolder, f'Coinfection_Check_{barcode_assignment}.csv') coinfection_df.to_csv(coinfection_df_file_path, index=True) # output needs to be csv of coninfection check # in Qual_Check text file, add message saying "see coinf check csv" and "if any samples excpet CPC are flagged as being coinfected, there is risk of these samples being coinfected" @@ -359,13 +405,12 @@ QC_lines.append(" - All other flagged samples should be further evaluated for potential co-infection.\n") QC_lines.append("Please be advised to check the output files as well.") - # create and save an output text file containing the quality control checks -QCs_file_path = os.path.join(output_folder, f'Quality_Control_Flags_{barcode_assignment}.txt') +QCs_file_path = os.path.join(qc_subfolder, f'Quality_Control_Flags_{barcode_assignment}.txt') with open(QCs_file_path, 'w') as f: for line in QC_lines: f.write(line + '\n') -print(f"The quality control checks are complete and saved to the folder, {output_folder}") +print(f"The quality control checks are complete and saved to the folder, {qc_subfolder}") diff --git a/ntc_con_check.py b/ntc_con_check.py index f6e3d18..1d1b54e 100644 --- a/ntc_con_check.py +++ b/ntc_con_check.py @@ -12,29 +12,33 @@ def ntc_cont(self, assigned_sig_norm): # filter sample for anything that contains NTC (case-insensitive) ntc_filtered_df = assigned_sig_norm[assigned_sig_norm['sample'].str.contains('NTC', case=False, na=False)] + # filter all other samples and save it in a new df all_else_filtered_df = assigned_sig_norm[~assigned_sig_norm['sample'].str.contains('NTC', case=False, na=False)] - - ntc_assay_dfs = [] + # initiate list of ntc_assay_dfs + ntc_assay_dfs = [] # check per assay_df for assay in ntc_filtered_df['assay'].unique(): # filter the df for the current assay assay_df = ntc_filtered_df[ntc_filtered_df['assay'] == assay] # Check if t13 value is greater than 0.5 for any sample - for _, row in assay_df.iterrows(): + for index, row in assay_df.iterrows(): if row['t13'] > 0.5: # Check if there is still more than one row left in assay_df if len(assay_df) > 1: # Remove the row where t13 is greater than 0.5 - assay_df = assay_df[assay_df['t13'] != row['t13']] + assay_df = assay_df.drop(index) else: # If there is only one row left, set row['t13'] to 0.5 - assay_df.at[row.name, 't13'] = 0.5 - + assay_df.at[index, 't13'] = 0.5 + # add assay_df back to list of ntc_assay_dfs ntc_assay_dfs.append(assay_df) + # convert ntc_assay_dfs list into a df combined_ntc_assay_dfs = pd.concat(ntc_assay_dfs, ignore_index=True) assigned_sig_norm = pd.concat([combined_ntc_assay_dfs, all_else_filtered_df], ignore_index=True) + + # pull the samples list return assigned_sig_norm \ No newline at end of file diff --git a/ntcnorm.py b/ntcnorm.py index 72f3e64..1e356cb 100644 --- a/ntcnorm.py +++ b/ntcnorm.py @@ -28,6 +28,8 @@ def normalizr(self, t13_df): ntc_mean = ntc_mean_df.loc['NTC Mean', col_name] # Divide the value by the NTC mean per assay t13_df.at[index, col_name] = value/ntc_mean + + t13_df = t13_df.apply(pd.to_numeric, errors = 'coerce') return t13_df diff --git a/plotting.py b/plotting.py index bc6d81c..75a2ad3 100644 --- a/plotting.py +++ b/plotting.py @@ -92,4 +92,77 @@ def plt_heatmap(self, tgap, barcode_number, df_dict, sample_list, assay_list, tp axes.vlines(v_lines, colors = 'silver',alpha=0.9,linewidths = 0.35,*axes.get_ylim()) fig_timepoints[i] = fig - return fig_timepoints \ No newline at end of file + return fig_timepoints + + def t13_plt_heatmap(self, tgap, barcode_number, df, sample_list, assay_list, tp): + # Create a dictonary for timepoints + time_assign = {} + + for cycle in range(1,len(tp)+1): + tpoint = "t" + str(cycle) + time_assign[tpoint] = tgap + 3 + (cycle-1) * 5 + last_key = list(time_assign.keys())[-1] + + half_samples = int(len(sample_list)/2) + + if len(sample_list) == 192: + # Split the sample list into two halves + first_half_samples = sample_list[:half_samples] + second_half_samples = sample_list[half_samples:] + + fig, axes = plt.subplots(2, 1, figsize=(len(first_half_samples) * 0.5, len(assay_list) * 0.5 * 2)) + + df = df.transpose() + + # First heatmap (first 96 samples) + frame1 = df[first_half_samples].reindex(assay_list) + ax1 = sns.heatmap(frame1, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20}, ax=axes[0]) + ax1.set_title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes (First {half_samples} Samples)', size=28) + ax1.set_xlabel('Samples', size=14) + ax1.set_ylabel('Assays', size=14) + bottom, top = ax1.get_ylim() + ax1.set_ylim(bottom + 0.5, top - 0.5) + ax1.tick_params(axis="y", labelsize=16) + ax1.tick_params(axis="x", labelsize=16) + plt.yticks(rotation=0) + + # Second heatmap (next 96 samples) + frame2 = df[second_half_samples].reindex(assay_list) + ax2 = sns.heatmap(frame2, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20}, ax=axes[1]) + ax2.set_title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes (Next {half_samples} Samples)', size=28) + ax2.set_xlabel('Samples', size=14) + ax2.set_ylabel('Assays', size=14) + bottom, top = ax2.get_ylim() + ax2.set_ylim(bottom + 0.5, top - 0.5) + ax2.tick_params(axis="y", labelsize=16) + ax2.tick_params(axis="x", labelsize=16) + plt.yticks(rotation=0) + + # Adjust layout + plt.tight_layout() + + else: + df = df.transpose() + frame = df[sample_list].reindex(assay_list) + fig, axes = plt.subplots(1,1,figsize=(len(frame.columns.values)*0.5,len(frame.index.values)*0.5)) + ax = sns.heatmap(frame,cmap='Reds',square = True,cbar_kws={'pad':0.002}, annot_kws={"size": 20}) + + plt.title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[-1]} minutes', size = 28) + plt.xlabel('Samples', size = 14) + plt.ylabel('Assays', size = 14) + bottom, top = ax.get_ylim() + ax.set_ylim(bottom + 0.5, top - 0.5) + ax.tick_params(axis="y", labelsize=16) + ax.tick_params(axis="x", labelsize=16) + plt.yticks(rotation=0) + + tgt_num = len(sample_list) + gd_num = len(assay_list) + bottom, top = ax.get_ylim() + ax.set_ylim(bottom + 0.5, top - 0.5) + h_lines = np.arange(3,gd_num,3) + v_lines = np.arange(3,tgt_num,3) + axes.hlines(h_lines, colors = 'silver',alpha=0.9,linewidths = 0.35,*axes.get_xlim()) + axes.vlines(v_lines, colors = 'silver',alpha=0.9,linewidths = 0.35,*axes.get_ylim()) + + return fig \ No newline at end of file