Commit 8df49fc (v5.1.0)
1 parent: f5d5108

File tree: 5 files changed, +138 -33 lines changed

README.md
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 CARMEN is a diagnostic tool designed for surveillance purposes. Below are the instructions to complete your CARMEN analysis.

 ## Software Version
-When cloning this repository, you will be using software version 5.0.0.
+When cloning this repository, you will be using software version 5.1.0.

 ## Overview
 At this point, you have run the $Standard\ BioTools\ Dynamic\ Array^{TM}$ IFC (integrated fluidic circuit) on the $Standard\ BioTools\ Biomark^{TM}$ instrument and have completed the experimental portion of CARMEN. By running this code, you will complete the data analysis portion of CARMEN and generate both binary positive/negative and quantitative signal outputs of your diagnostic assay.

analyze_run.py
Lines changed: 43 additions & 15 deletions

@@ -9,6 +9,7 @@
 import seaborn as sns
 #file imports
 from io import BytesIO
+from datetime import datetime
 import base64
 from pathlib import Path
 import os
@@ -51,7 +52,7 @@

 ######################################################################################################################################################
 # assign software version
-software_version = '5.0.0'
+software_version = '5.1.0'

 ######################################################################################################################################################
 # data loading
@@ -102,7 +103,7 @@
 file_like_object.seek(0)

 # Extract dataframes from each CSV file
-read_dataframes = reader.extract_dataframes_from_csv(file_like_object, phrases_to_find)
+read_dataframes, date = reader.extract_dataframes_from_csv(file_like_object, phrases_to_find)

 # at this point, we have loaded the assignment sheet and have sorted through the loaded data file to create a dict of dataframes

@@ -831,13 +832,38 @@
 fail_nocrRNA_check_df.to_csv(fail_nocrRNA_check_df_file_path, index=True)
 print(f"CSV created with data at {fail_nocrRNA_check_df_file_path}")

+"""
+flagged_file = t13_hit_output.copy()
+processed_samples = set()
+for _, row in high_raw_ntc_signal_df.iterrows():
+    for col in high_raw_ntc_signal_df.columns: # cols are Sample, Assay, t13
+        cont_ntc_sample = row['Sample'] # NEG NTC sample
+        cont_ntc_assay = row['Assay'] # NTC assay
+        # now iterate over the flagged file
+        for idx, sample_row in flagged_file.iterrows():
+            if cont_ntc_sample == idx:
+                # add † to each cell value
+                for assay_col in flagged_file.columns:
+                    if assay_col.upper() == cont_ntc_assay.upper():
+                        # check whether the sample-assay pair has already been processed
+                        if (cont_ntc_sample, cont_ntc_assay) not in processed_samples:
+                            processed_samples.add((cont_ntc_sample, cont_ntc_assay))
+                            # check if the value is NA (NaN)
+                            if pd.isna(sample_row[assay_col]) or sample_row[assay_col] == '':
+                                flagged_file.loc[idx, assay_col] = '†' # only dagger if value is NA
+                            else:
+                                flagged_file[assay_col] = flagged_file[assay_col].astype(str)
+                                flagged_file.at[idx, assay_col] = f"{sample_row[assay_col]}†" # add dagger to the value
+


+"""
 ######################################################################################################################################################
 # instantiate Flagger from flags.py
 flagger = Flagger()

-invalid_assays, invalid_samples, flagged_files = flagger.assign_flags(fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df_heatmap, QC_score_per_assay_df, t13_hit_output, rounded_t13_quant_norm, summary_samples_df, rounded_ntc_thresholds_output, t13_hit_binary_output)
+invalid_assays, invalid_samples, flagged_files, processed_samples, cont_ntc_sample, cont_ntc_assay = flagger.assign_flags(fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df_heatmap, QC_score_per_assay_df, t13_hit_output, rounded_t13_quant_norm, summary_samples_df, rounded_ntc_thresholds_output, t13_hit_binary_output)

 fl_t13_hit_output = flagged_files[0] # Results_Summary
 fl_rounded_t13_quant_norm = flagged_files[1] # NTC_Normalized_Quantitative_Results_Summary
@@ -1000,25 +1026,27 @@
 fig = heatmap_t13_quant_norm.savefig(heatmap_t13_quant_norm_filename, bbox_inches = 'tight', dpi=80)
 plt.close(fig)

-print("Operation complete.")
-

 ######################################################################################################################################################
 # RedCap Integration
-# set it as you have to enter a CLI for Redcap to run this code
+# the user has to enter a CLI arg for the RedCap code to run

-# if CLI[1]
+if len(CLI_arg) > 2 and CLI_arg[2] == 'REDCAP':
+    # instantiate RedCapper from redcap_builder.py
+    redcapper = RedCapper()

+    # make copy of binary output file from RESULTS Excel sheet
+    fl_t13_hit_binary_output_2 = fl_t13_hit_binary_output.copy()

-# instantiate RedCapper from flags.py
-redcapper = RedCapper()
+    redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)

-# make copy of binary output file from RESULTS Excel sheet
-fl_t13_hit_binary_output_2 = fl_t13_hit_binary_output.copy()
+    redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
+    redcap_t13_hit_binary_output.to_csv(redcap_t13_hit_binary_output_file_path, index=True)
+    print("REDCAP file generated.")
+    print("Operation complete.")

-redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2)
-
-redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
-redcap_t13_hit_binary_output.to_csv(redcap_t13_hit_binary_output_file_path, index=True)
+else:
+    print("User did not specify REDCAP as a command line argument, so the REDCAP file was not generated.")
+    print("Operation complete.")
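The REDCAP export above is gated on a positional command-line argument rather than a named flag. A minimal sketch of how that gate behaves, assuming argv[1] is the run data file and argv[2] is the optional switch (the diff only confirms the CLI_arg[2] == 'REDCAP' check itself):

import sys

# Hypothetical invocation: python analyze_run.py <run_data.csv> REDCAP
CLI_arg = sys.argv

if len(CLI_arg) > 2 and CLI_arg[2] == 'REDCAP':
    # third positional token is the REDCAP switch, as in analyze_run.py
    print("REDCAP export will be generated.")
else:
    print("REDCAP export skipped; pass REDCAP as the second positional argument to enable it.")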

flags.py
Lines changed: 4 additions & 4 deletions

@@ -89,12 +89,12 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,
                                if (cont_ntc_sample, cont_ntc_assay) not in processed_samples:
                                    processed_samples.add((cont_ntc_sample, cont_ntc_assay))
                                    # check if the value is NA (NaN)
-                                   if pd.isna(sample_row[assay_col]):
+                                   if pd.isna(sample_row[assay_col]) or sample_row[assay_col] == '':
                                        flagged_file.loc[idx, assay_col] = '†' # only dagger if value is NA
                                    else:
                                        flagged_file[assay_col] = flagged_file[assay_col].astype(str)
                                        #flagged_file.at[idx, assay_col] = str(flagged_file.at[idx, assay_col])
-                                       flagged_file.loc[idx, assay_col] = f"{sample_row[assay_col]}†" # add dagger to the value
+                                       flagged_file.at[idx, assay_col] = f"{sample_row[assay_col]}†" # add dagger to the value

        for _, row in high_raw_ntc_signal_df.iterrows():
            for col in high_raw_ntc_signal_df.columns:
@@ -141,7 +141,7 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,
                    flagged_file = pd.concat([flagged_file, cont_NTC_thresh_legend_label_filled], ignore_index=False) # concatenate
                    legend_added = True
                    break
-
+
        ### no-crRNA flags
        ## need to be added to t13_hit, t13_quant_norm, pos_samples_df, t13_hit_binary
        if i in {0,1,4}: # t13_hit, t13_quant_norm,t13_hit_binary
@@ -280,4 +280,4 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,

            flagged_files.append(flagged_file) # add flagged file to the list

-       return invalid_assays, invalid_samples, flagged_files
+       return invalid_assays, invalid_samples, flagged_files, processed_samples, cont_ntc_sample, cont_ntc_assay
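The two fixes above work together: the column is first cast to str, and the dagger is then written with .at, which sets a single scalar cell and avoids dtype issues on a mixed numeric/string column. A simplified, self-contained sketch of the same pattern with hypothetical sample and assay names (the snapshot-then-write ordering is a restructuring for clarity, not the library code verbatim):

import pandas as pd

# Toy stand-in for t13_hit_output: rows = samples, cols = assays (hypothetical names)
flagged_file = pd.DataFrame({'CCHFV': [1.0, None]}, index=['NTC_A', 'NTC_B'])

# Snapshot the original values before mutating, then flag each cell
original = flagged_file['CCHFV'].copy()
flagged_file['CCHFV'] = flagged_file['CCHFV'].astype(str)  # cast column so daggers can be appended

for idx, val in original.items():
    if pd.isna(val) or val == '':
        flagged_file.at[idx, 'CCHFV'] = '†'        # dagger alone when the cell was empty
    else:
        flagged_file.at[idx, 'CCHFV'] = f"{val}†"  # .at writes one scalar cell

print(flagged_file)
#        CCHFV
# NTC_A  1.0†
# NTC_B     †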

reader.py
Lines changed: 11 additions & 3 deletions

@@ -1,5 +1,6 @@
 import pandas as pd
 import io
+from datetime import datetime

 class DataReader:
     def __init__(self):
@@ -27,7 +28,7 @@ def extract_dataframes_from_csv(self, file_like_object, phrases_to_find):

        # Read the content of the file-like object (only once)
        file_like_object.seek(0)
-       content = file_like_object.read().decode('utf-8')
+       content = file_like_object.read().decode('utf-8') # bytesIO obj

        new_phrase = [
            "ref_raw",
@@ -62,6 +63,13 @@ def extract_dataframes_from_csv(self, file_like_object, phrases_to_find):
            df = pd.read_csv(content_io, nrows=section_rows[phrase], skiprows=start_index)
            dataframes[phrase] = self.clean_dataframe(df) # Clean and store the dataframe
            #print(phrase, len(df))
-       # Return a dictionary of the dataframes

-       return dataframes
+       # Collect the date from the run file's header row
+       df = pd.read_csv(io.StringIO(content), nrows=0)
+       date_header = df.columns[7]
+       date_str = date_header.split(' ')[0] # split breaks the header at the first space; keep only the date portion
+       date_obj = datetime.strptime(date_str, "%d/%m/%Y")
+       date = date_obj.strftime("%m/%d/%Y")
+
+       # Return the dictionary of dataframes and the run date
+       return dataframes, date
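The new date logic assumes the eighth header cell of the raw CSV begins with a day-first date, which is reformatted month-first. A standalone sketch of just that conversion, with an invented header string:

from datetime import datetime

# Hypothetical header cell as it might appear in column 8 of the raw CSV
date_header = "17/03/2025 14:02:11"

date_str = date_header.split(' ')[0]                # keep only the date portion, '17/03/2025'
date_obj = datetime.strptime(date_str, "%d/%m/%Y")  # parse day-first
date = date_obj.strftime("%m/%d/%Y")                # reformat month-first

print(date)  # 03/17/2025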

redcap_builder.py
Lines changed: 79 additions & 10 deletions

@@ -10,8 +10,7 @@ def __init__(self):
        pass

    # method
-   def build_redcap(self, fl_t13_hit_binary_output_2):
-
+   def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):

        ### convert 0 to 2 (negative)
        redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2.replace(0, 2)
@@ -62,20 +61,90 @@ def build_redcap(self, fl_t13_hit_binary_output_2):
            if col not in redcap_t13_hit_binary_output.columns:
                redcap_t13_hit_binary_output[col] = 4

-       # reorder cols
+       ### reorder cols
        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[column_order]
-
-
-
-
-
-

+       ### add in the metadata columns
+       sampleid = []
+       for idx in redcap_t13_hit_binary_output.index:
+           cleaned_idx = re.sub(r'[\*\|†\s]', '', idx)
+           sampleid.append(cleaned_idx)

+       redcap_t13_hit_binary_output.insert(0, "date", date)
+       redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
+       redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)

+       record_id = []
+       for row in redcap_t13_hit_binary_output.itertuples():
+           samp_id = row.sampleid
+           record_id_val = barcode_assignment + '_' + samp_id
+           record_id.append(record_id_val)

+       redcap_t13_hit_binary_output.insert(0, "record_id", record_id)

-
+       ### merge same samples run on different panels
+       # extract sampleid before panel
+       redcap_t13_hit_binary_output['sampleid_prefix'] = redcap_t13_hit_binary_output['sampleid'].str.replace(r'(_P1|_P2|_RVP)$', '', regex=True)
+
+       # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
+       def merge_group(group):
+           # select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
+           merged_row = pd.DataFrame(columns=group.columns)
+           merged_row.loc[0] = group.iloc[0]
+
+           # the group is the unique sampleid_prefix - each group should have max 2 rows
+           for col in group.columns:
+               if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
+                   # if group['cchfv'] = [5,4], filtering out the 4 then dropna leaves [5]
+                   # .unique ensures that only unique vals are retained
+                   if all(group[col] == 4):
+                       merged_row[col] = 4
+                   else: # if group[col] = [5,4] or [3,4] - there's no world where it would be [5,3]
+                       filtered_values = group.loc[group[col] != 4, col].dropna().unique()
+                       merged_row[col] = filtered_values[0]
+
+           # each record_id is split and the panel suffixes are joined - .unique ensures all distinct suffixes are added together
+           merged_row['record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
+
+           # assign a sampleid to the merged_row (doesn't matter as sampleid col will be dropped later)
+           merged_row['sampleid'] = group['sampleid'].iloc[0]
+
+           return merged_row
+
+       # apply the merge_group function to each group in the groupby obj (which is a df)
+       redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)
+
+       # make record_id col be ifc + sampleid_prefix + record_id (which is just the panel suffixes, e.g. P1_P2)
+       record_id_fix = []
+       for row in redcap_t13_hit_binary_output.itertuples():
+           record_id = row.record_id
+           sampleid_prefix = row.sampleid_prefix
+           sampleid = row.sampleid
+           if not any(control in sampleid_prefix for control in ['NTC', 'CPC', 'NDC']):
+               record_id_val = barcode_assignment + '_' + sampleid_prefix + '_' + record_id
+           else:
+               record_id_val = barcode_assignment + '_' + sampleid + '_' + record_id
+           record_id_fix.append(record_id_val)
+
+       redcap_t13_hit_binary_output['record_id'] = record_id_fix
+
+       ### drop sampleid
+       redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop(columns=['sampleid'])
+
+       ### rename sampleid_prefix as sampleid and insert it as the 4th col
+       redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.rename(columns={'sampleid_prefix': 'sampleid'})
+       cols = list(redcap_t13_hit_binary_output.columns)
+       cols.remove('sampleid')
+       cols.insert(3, 'sampleid')
+       redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[cols]
+
+       ### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
+       redcap_t13_hit_binary_output.columns = redcap_t13_hit_binary_output.columns.str.lower()
+
+       ### reset index
+       redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.reset_index(drop=True)

        return redcap_t13_hit_binary_output
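In merge_group, code 4 marks an assay that was not run on a given panel, so when the same sample appears on two panels the 4 is overridden by whichever panel actually ran the assay. A toy illustration of that core rule, with invented assay columns and barcode:

import pandas as pd

# Two rows for the same sample run on panels P1 and P2 (all names hypothetical);
# 4 = assay not on that panel, 2 = negative, 5 = positive
group = pd.DataFrame({
    'record_id': ['IFC123_S01_P1', 'IFC123_S01_P2'],
    'sampleid':  ['S01_P1', 'S01_P2'],
    'sampleid_prefix': ['S01', 'S01'],
    'cchfv':   [5, 4],  # only P1 carries this assay
    'rvp_flu': [4, 2],  # only P2 carries this assay
})

merged = group.iloc[[0]].copy()
for col in ['cchfv', 'rvp_flu']:
    if all(group[col] == 4):
        merged[col] = 4  # assay absent from every panel
    else:
        # keep the value from the panel that actually ran the assay
        merged[col] = group.loc[group[col] != 4, col].dropna().unique()[0]

print(merged[['sampleid_prefix', 'cchfv', 'rvp_flu']])
#   sampleid_prefix  cchfv  rvp_flu
# 0             S01      5        2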
