Skip to content

Commit

Permalink
Software Update 3.2.3
Browse files Browse the repository at this point in the history
Improvements to user interface and plotting normalized t13 data
  • Loading branch information
fsalbeez committed Aug 26, 2024
1 parent 6e34b82 commit 470b3d3
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 34 deletions.
99 changes: 72 additions & 27 deletions analyze_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
barcode_assignment = match.group(1)
print(f"IFC barcode: {barcode_assignment}")

#####
# instantiate Data Reader from reader.py
reader = DataReader() # make an empty class object that corresponds to the DataReader() object from reader.py

# then load in the data file
data_files = sorted([fname for fname in all_files if fname.suffix == ".csv"])
Expand All @@ -64,8 +67,6 @@
file_like_object = BytesIO(data_file.read_bytes())
df_data = pd.read_csv(file_like_object, on_bad_lines="skip")
print(f"This is the Data File that was loaded: {data_file.name}")

reader = DataReader() # make am empty class object that corresponds to the DataReader() object from reader.py

phrases_to_find = [
"Raw Data for Passive Reference ROX",
Expand All @@ -80,13 +81,20 @@

# at this point, we have loaded the assignment sheet and have sorted through the loaded data file to create a dict of dataframes

#####
# instantiate DataProcessor from norm.py
processor = DataProcessor()
# normalize the signal
normalized_dataframes = processor.background_processing(dataframes)

# The premise of the code is that different viral panels require different thresholding
# So the user will specify command line arguments as per the ReadMe instructions
# and this portion of the code is meant to access the CLI arguments and modify the threshold specified in the code

CLI_arg = sys.argv

# make an output folder in your path's wd if it hasn't been made already
output_folder = f'output_{barcode_assignment}'
output_folder = f'output_{barcode_assignment}_[{CLI_arg[1]}]'
if not os.path.exists(output_folder):
os.makedirs(output_folder)

Expand All @@ -95,6 +103,7 @@
normalized_dataframes['signal_norm'].to_csv(os.path.join(output_folder, 'signal_norm.csv'), index=True)
normalized_dataframes['ref_norm'].to_csv(os.path.join(output_folder, 'ref_norm.csv'), index=True)

#####
# instantiate DataMatcher from matcher.py
matcher = DataMatcher()
assigned_norms, assigned_lists = matcher.assign_assays(assignment_files[0], normalized_dataframes['ref_norm'], normalized_dataframes['signal_norm'])
Expand All @@ -104,9 +113,11 @@
assigned_norms['signal_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_signal_norm.csv'), index=True)
assigned_norms['ref_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_ref_norm.csv'), index=True)

samples_list = assigned_lists['samples_list']
# collect the assays/samples from the layout assays/samples in the assignment sheet (this extraction is done in matcher.py)
crRNA_assays = assigned_lists['assay_list']
samples_list = assigned_lists['samples_list']

#####
# instantiate ntcContaminationChecker from ntc_con_check.py
ntcCheck = ntcContaminationChecker()

Expand All @@ -115,16 +126,24 @@
# create df of filtered assigned_signal_norm by applying the NTC check to remove any NTCs whose raw signal suggests contamination
assigned_signal_norm_with_NTC_check = ntcCheck.ntc_cont(assigned_signal_norm) # feed this into MedianSort

# temporarily save assigned_signal_norm_with_NTC_check
assigned_signal_norm_with_NTC_check.to_csv(os.path.join(output_folder, 'assigned_signal_norm_with_NTC_check.csv'), index=True)


#####
# instantiate MedianSort from median_frame.py
median = MedianSort(crRNA_assays)
final_med_frames = median.create_median(assigned_signal_norm_with_NTC_check)

# temporarily print final_med_frames
#print(final_med_frames)

# Output needs to be rounded to 4 digits
rounded_final_med_frames = {}
# Define the number of decimals for rounding
decimals = 5

# Iterate through each row and column
# Iterate through each row and column, round each value
for key, df in final_med_frames.items():
rounded_df = pd.DataFrame(index=df.index, columns=df.columns)
for i in range(len(df.index)):
Expand All @@ -133,11 +152,18 @@
rounded_df.iloc[i, j] = round(df.iloc[i, j], decimals)
rounded_final_med_frames[key] = rounded_df

# Make subfolder in the output folder in your path's wd if it hasn't been made already
timepoints_subfolder = os.path.join(output_folder, f'timepoints_quantData_{barcode_assignment}')
if not os.path.exists(timepoints_subfolder):
os.makedirs(timepoints_subfolder)

# Save the dataframes per timepoint in the timepoints subfolder
timepoints = list(rounded_final_med_frames.keys())
for i, t in enumerate(timepoints, start=1):
filename = os.path.join(output_folder, f't{i}_{barcode_assignment}.csv')
filename = os.path.join(timepoints_subfolder, f't{i}_{barcode_assignment}.csv')
csv = rounded_final_med_frames[t].to_csv(filename, index=True)


# since we want to explicitly manipulate t13_csv, it is helpful to have the t13 df referenced outside of the for loop
last_key = list(rounded_final_med_frames.keys())[-1]
t13_dataframe_orig = rounded_final_med_frames[last_key]
Expand All @@ -147,12 +173,7 @@
# at this point, we have created a t1 thru t13 dataframe and exported all these dataframes as csv files in our output folder
# now we need to threshold the t13 csv and mark signals >= threshold as positive and < threshold as negative

# The premise of the code is that different viral panels require different thresholding
# So the user will specify command line arguments as per the ReadMe instructions
# and this portion of the code is meant to access the CLI arguments and modify the threshold specified in the code

CLI_arg = sys.argv

#####
# instantiate Thresholder from threshold.py
thresholdr = Thresholder()
unique_crRNA_assays = list(set(crRNA_assays))
Expand All @@ -161,7 +182,6 @@
# and save the file to your working directory
ntc_PerAssay, ntc_thresholds_output, t13_hit_output = thresholdr.raw_thresholder(unique_crRNA_assays, assigned_norms['signal_norm_raw'], t13_dataframe_copy1, CLI_arg[1])


# make copies of t13_hit_output csv for downstream summaries and quality control checks
t13_hit_output_copy1 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in ndc qual check
t13_hit_output_copy2 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in cpc qual check
Expand All @@ -176,13 +196,15 @@
ntc_thresholds_output.to_csv(ntc_thresholds_output_file_path, index=True)
t13_hit_output.to_csv(hit_output_file_path, index=True)

#####
# instantiate NTC_Normalized from ntcnorm.py
ntcNorm = Normalized()
# apply ntc_normalizr to the t13_dataframe to produce a new dataframe with all values divided by the mean NTC for that assay
t13_quant_hit_norm = ntcNorm.normalizr(t13_dataframe_copy2)
quant_hit_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_quant_ntcNorm.csv')
t13_quant_hit_norm.to_csv(quant_hit_output_ntcNorm_file_path, index=True)
t13_quant_norm = ntcNorm.normalizr(t13_dataframe_copy2)
quant_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_normalized.csv')
t13_quant_norm.to_csv(quant_output_ntcNorm_file_path, index=True)

#####
# instantiate Binary_Converter from binary_results.py
binary_num_converter = Binary_Converter()
# apply hit_numeric_conv to the the t13_hit_output to produce a new dataframe with all pos/neg converted to binary 1/0 output
Expand All @@ -194,14 +216,15 @@
t13_hit_binary_output_copy1 = pd.DataFrame(t13_hit_binary_output).copy() # used in coninf check
t13_hit_binary_output_copy2 = pd.DataFrame(t13_hit_binary_output).copy()

#####
# instantiate Summarized from summary.py
summary = Summarized()
# apply summarizer to the t13_dataframe to produce a new dataframe tabulating all of the positive samples
summary_samples_df = summary.summarizer(t13_hit_output)
summary_pos_samples_file_path = os.path.join(output_folder, f'Positives_Summary_{barcode_assignment}.csv')
summary_samples_df.to_csv(summary_pos_samples_file_path, index=True)


#####
# instantiate Plotter from plotting.py
heatmap_generator = Plotter()

Expand All @@ -211,20 +234,38 @@
unique_crRNA_assays = list(OrderedDict.fromkeys(crRNA_assays))
heatmap = heatmap_generator.plt_heatmap(tgap, barcode_assignment,final_med_frames, samples_list, unique_crRNA_assays, timepoints)

# Make subfolder in the output folder in your path's wd if it hasn't been made already
heatmaps_subfolder = os.path.join(output_folder, f'heatmaps_{barcode_assignment}')
if not os.path.exists(heatmaps_subfolder):
os.makedirs(heatmaps_subfolder)

# save heatmap per timepoint
for i, t in enumerate(timepoints, start=1):
#csv = convert_df(final_med_frames[t])
heatmap_filename = os.path.join(output_folder, f'Heatmap_t{i}_{barcode_assignment}.png')
heatmap_filename = os.path.join(heatmaps_subfolder, f'Heatmap_t{i}_{barcode_assignment}.png')
fig = heatmap[t].savefig(heatmap_filename, bbox_inches = 'tight', dpi=80)
plt.close(fig)

print(f"The heatmap plots saved to the folder, {output_folder}")
print(f"The heatmap plots saved to the folder, {heatmaps_subfolder} in {output_folder}")

heatmap_t13_quant_norm = heatmap_generator.t13_plt_heatmap(tgap, barcode_assignment,t13_quant_norm, samples_list, unique_crRNA_assays, timepoints)
heatmap_t13_quant_norm_filename = os.path.join(output_folder, f'Heatmap_t13_{barcode_assignment}_normalized.png')
fig = heatmap_t13_quant_norm.savefig(heatmap_t13_quant_norm_filename, bbox_inches = 'tight', dpi=80)
plt.close(fig)


#####
# instantiate Assay_QC_Score
assayScorer = Assay_QC_Score()
# take in t13_hit_binary_output as the df to build off of
QC_score_per_assay_df = assayScorer.assay_level_score(t13_hit_binary_output)
assay_lvl_QC_score_file_path = os.path.join(output_folder, f'Assay_Level_QC_Metrics_{barcode_assignment}.csv')

# Make subfolder in the output folder in your path's wd if it hasn't been made already
assayQC_subfolder = os.path.join(output_folder, f'assay_performance_evaluation_{barcode_assignment}')
if not os.path.exists(assayQC_subfolder):
os.makedirs(assayQC_subfolder)

assay_lvl_QC_score_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Test_Results_{barcode_assignment}.csv')
QC_score_per_assay_df.to_csv(assay_lvl_QC_score_file_path, index=True)

# write text file explaining the QC score
Expand Down Expand Up @@ -264,16 +305,22 @@
assayScores_Explanation.append("The final score per assay is included for easy comparison of assay performance.\n")

# create and save an output text file containing the quality control checks
assayScores_file_path = os.path.join(output_folder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt')
assayScores_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt')
with open(assayScores_file_path, 'w') as f:
for line in assayScores_Explanation:
f.write(line + '\n')

print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {output_folder}")
print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {assayQC_subfolder}")

#####
# instantiate Qual_Ctrl_Checks from qual-checks.py
qual_checks = Qual_Ctrl_Checks()

# Make subfolder in the output folder in your path's wd if it hasn't been made already
qc_subfolder = os.path.join(output_folder, 'quality_control_flags')
if not os.path.exists(qc_subfolder):
os.makedirs(qc_subfolder)

# initialize a list to collect all quality control checks
QC_lines = []

Expand Down Expand Up @@ -332,7 +379,6 @@
# apply ntc_check to the t13_hit_output df to generate a list of all ntc positive assays
assigned_signal_norm_2 = pd.DataFrame(assigned_norms['signal_norm_raw']).copy() # make a copy of assigned_signal_norm dataframe


high_raw_ntc_signal = qual_checks.ntc_check(assigned_signal_norm_2)
QC_lines.append("4. Evaluation of No Target Control (NTC) Contamination \n")
if high_raw_ntc_signal:
Expand All @@ -348,7 +394,7 @@
coinfection_df = qual_checks.coinf_check(t13_hit_binary_output_copy1)
QC_lines.append("5. Evaluation of Potential Co-Infected Samples\n")

coinfection_df_file_path = os.path.join(output_folder, f'Coinfection_Check_{barcode_assignment}.csv')
coinfection_df_file_path = os.path.join(qc_subfolder, f'Coinfection_Check_{barcode_assignment}.csv')
coinfection_df.to_csv(coinfection_df_file_path, index=True) # output needs to be csv of coninfection check

# in Qual_Check text file, add message saying "see coinf check csv" and "if any samples excpet CPC are flagged as being coinfected, there is risk of these samples being coinfected"
Expand All @@ -359,13 +405,12 @@
QC_lines.append(" - All other flagged samples should be further evaluated for potential co-infection.\n")
QC_lines.append("Please be advised to check the output files as well.")


# create and save an output text file containing the quality control checks
QCs_file_path = os.path.join(output_folder, f'Quality_Control_Flags_{barcode_assignment}.txt')
QCs_file_path = os.path.join(qc_subfolder, f'Quality_Control_Flags_{barcode_assignment}.txt')
with open(QCs_file_path, 'w') as f:
for line in QC_lines:
f.write(line + '\n')

print(f"The quality control checks are complete and saved to the folder, {output_folder}")
print(f"The quality control checks are complete and saved to the folder, {qc_subfolder}")


16 changes: 10 additions & 6 deletions ntc_con_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,33 @@ def ntc_cont(self, assigned_sig_norm):

# filter sample for anything that contains NTC (case-insensitive)
ntc_filtered_df = assigned_sig_norm[assigned_sig_norm['sample'].str.contains('NTC', case=False, na=False)]
# filter all other samples and save it in a new df
all_else_filtered_df = assigned_sig_norm[~assigned_sig_norm['sample'].str.contains('NTC', case=False, na=False)]

ntc_assay_dfs = []
# initiate list of ntc_assay_dfs
ntc_assay_dfs = []
# check per assay_df
for assay in ntc_filtered_df['assay'].unique():
# filter the df for the current assay
assay_df = ntc_filtered_df[ntc_filtered_df['assay'] == assay]

# Check if t13 value is greater than 0.5 for any sample
for _, row in assay_df.iterrows():
for index, row in assay_df.iterrows():
if row['t13'] > 0.5:
# Check if there is still more than one row left in assay_df
if len(assay_df) > 1:
# Remove the row where t13 is greater than 0.5
assay_df = assay_df[assay_df['t13'] != row['t13']]
assay_df = assay_df.drop(index)
else:
# If there is only one row left, set row['t13'] to 0.5
assay_df.at[row.name, 't13'] = 0.5
assay_df.at[index, 't13'] = 0.5
# add assay_df back to list of ntc_assay_dfs
ntc_assay_dfs.append(assay_df)

# convert ntc_assay_dfs list into a df
combined_ntc_assay_dfs = pd.concat(ntc_assay_dfs, ignore_index=True)

assigned_sig_norm = pd.concat([combined_ntc_assay_dfs, all_else_filtered_df], ignore_index=True)

# pull the samples list

return assigned_sig_norm
2 changes: 2 additions & 0 deletions ntcnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def normalizr(self, t13_df):
ntc_mean = ntc_mean_df.loc['NTC Mean', col_name]
# Divide the value by the NTC mean per assay
t13_df.at[index, col_name] = value/ntc_mean

t13_df = t13_df.apply(pd.to_numeric, errors = 'coerce')

return t13_df

Expand Down
75 changes: 74 additions & 1 deletion plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,77 @@ def plt_heatmap(self, tgap, barcode_number, df_dict, sample_list, assay_list, tp
axes.vlines(v_lines, colors = 'silver',alpha=0.9,linewidths = 0.35,*axes.get_ylim())
fig_timepoints[i] = fig

return fig_timepoints
return fig_timepoints

def t13_plt_heatmap(self, tgap, barcode_number, df, sample_list, assay_list, tp):
    """Plot a heatmap of the NTC-normalized t13 dataframe.

    Parameters
    ----------
    tgap : int
        Minutes of delay before the first imaging cycle; used to compute
        the acquisition time shown in the title.
    barcode_number : str
        IFC barcode assignment, used in the plot title.
    df : pandas.DataFrame
        t13 data normalized against the assay-specific NTC mean
        (samples as the index, assays as the columns before transposition
        — assumed from the `df.transpose()` below; confirm against caller).
    sample_list : list
        Ordered sample names taken from the assignment sheet.
    assay_list : list
        Ordered unique crRNA assay names taken from the assignment sheet.
    tp : list
        Timepoint keys; only its length is used to build the time mapping.

    Returns
    -------
    matplotlib.figure.Figure
        A figure with one heatmap, or two stacked heatmaps when the run
        has exactly 192 samples.
    """
    # Map each timepoint label ("t1", "t2", ...) to its acquisition time in
    # minutes: t1 occurs at tgap + 3, and each later cycle adds 5 minutes.
    time_assign = {}
    for cycle in range(1, len(tp) + 1):
        time_assign["t" + str(cycle)] = tgap + 3 + (cycle - 1) * 5
    last_key = list(time_assign.keys())[-1]

    half_samples = int(len(sample_list) / 2)

    if len(sample_list) == 192:
        # A full 192-sample IFC is split into two stacked heatmaps of 96
        # samples each so the sample labels stay legible.
        first_half_samples = sample_list[:half_samples]
        second_half_samples = sample_list[half_samples:]

        fig, axes = plt.subplots(2, 1, figsize=(len(first_half_samples) * 0.5, len(assay_list) * 0.5 * 2))

        df = df.transpose()

        # First heatmap (first 96 samples)
        frame1 = df[first_half_samples].reindex(assay_list)
        ax1 = sns.heatmap(frame1, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20}, ax=axes[0])
        ax1.set_title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes (First {half_samples} Samples)', size=28)
        ax1.set_xlabel('Samples', size=14)
        ax1.set_ylabel('Assays', size=14)
        # Nudge the y-limits by half a cell to work around the matplotlib
        # heatmap row-clipping issue.
        bottom, top = ax1.get_ylim()
        ax1.set_ylim(bottom + 0.5, top - 0.5)
        ax1.tick_params(axis="y", labelsize=16)
        ax1.tick_params(axis="x", labelsize=16)
        plt.yticks(rotation=0)

        # Second heatmap (next 96 samples)
        frame2 = df[second_half_samples].reindex(assay_list)
        ax2 = sns.heatmap(frame2, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20}, ax=axes[1])
        ax2.set_title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes (Next {half_samples} Samples)', size=28)
        ax2.set_xlabel('Samples', size=14)
        ax2.set_ylabel('Assays', size=14)
        bottom, top = ax2.get_ylim()
        ax2.set_ylim(bottom + 0.5, top - 0.5)
        ax2.tick_params(axis="y", labelsize=16)
        ax2.tick_params(axis="x", labelsize=16)
        plt.yticks(rotation=0)

        # Adjust layout
        plt.tight_layout()

    else:
        df = df.transpose()
        frame = df[sample_list].reindex(assay_list)
        fig, axes = plt.subplots(1, 1, figsize=(len(frame.columns.values) * 0.5, len(frame.index.values) * 0.5))
        ax = sns.heatmap(frame, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20})

        # BUG FIX: time_assign is keyed by timepoint label ("t1"...), so
        # time_assign[-1] raised KeyError; use the final timepoint key, as
        # the 192-sample branch above already does.
        plt.title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes', size=28)
        plt.xlabel('Samples', size=14)
        plt.ylabel('Assays', size=14)
        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        ax.tick_params(axis="y", labelsize=16)
        ax.tick_params(axis="x", labelsize=16)
        plt.yticks(rotation=0)

        # Draw silver gridlines every 3 assays/samples to ease reading.
        h_lines = np.arange(3, len(assay_list), 3)
        v_lines = np.arange(3, len(sample_list), 3)
        axes.hlines(h_lines, *axes.get_xlim(), colors='silver', alpha=0.9, linewidths=0.35)
        axes.vlines(v_lines, *axes.get_ylim(), colors='silver', alpha=0.9, linewidths=0.35)

    return fig

0 comments on commit 470b3d3

Please sign in to comment.