v5.2.0

fsalbeez · fsalbeez · commit aae6589c03bb · 2025-02-10T18:56:29.000-05:00
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 CARMEN is a diagnostic tool designed for surveillance purposes. Below are the instructions to complete your CARMEN analysis. 
 
 ## Software Version
-When cloning this repository, you will be using software version 5.1.0.
+When cloning this repository, you will be using software version 5.2.0.
 
 ## Overview
 At this point, you have ran the $Standard\ BioTools\ Dynamic\ Array^{TM}$ IFC (integrated fluidic circuit) on the $Standard\ BioTools\ Biomark^{TM}$ instrument and have completed the experimental portion of CARMEN. In running this code, you will be able to complete the data analysis portion of CARMEN and generate both binary positive/negative and quantitative signal output of your diagnostic assay. 
diff --git a/analyze_run.py b/analyze_run.py
@@ -52,7 +52,7 @@
 
 ######################################################################################################################################################
 # assign software version
-software_version = '5.1.0'
+software_version = '5.2.0'
 
 ######################################################################################################################################################
 # data loading
@@ -116,7 +116,8 @@
     print("Please include the command line arguments when running analyze_run.py")
 else:
     # Proceed with your script logic
-    print("Threshold provided:", sys.argv[1:])
+    print("Threshold provided:", CLI_arg[1])
+    #print("Threshold provided:", sys.argv[1:])
 
 
 ## Set up structure of the output folder - simplify into RESUTLS, QUALITY CONTROL, R&D
@@ -1012,10 +1013,12 @@
     # make copy of binary output file from RESULTS Excel sheet
     fl_t13_hit_binary_output_2 = fl_t13_hit_binary_output.copy()
 
-    concat_redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)
-    # concat_redcap_t13_hit_binary_output, samplesDF, controlsDF
-    concat_redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
-    concat_redcap_t13_hit_binary_output.to_csv(concat_redcap_t13_hit_binary_output_file_path, index=True)
+    # apply redcapper to fl_t13_hit_binary_output_2 df
+    redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)
+    
+    # save REDCAP file
+    redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
+    redcap_t13_hit_binary_output.to_csv(redcap_t13_hit_binary_output_file_path, index=False)
     print("REDCAP file generated.")
     print("Operation complete.")
 
diff --git a/flags.py b/flags.py
@@ -86,11 +86,12 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,
                             if cont_ntc_sample == idx:
                                 # add † to each cell value
                                 for assay_col in flagged_file.columns:  
-                                    if assay_col.strip().upper() == cont_ntc_assay:
+                                    stripped_assay_col = re.sub(r'[\*\|†\s]', '', assay_col) # strip *, |, † and whitespace from assay_col
+                                    if stripped_assay_col.upper() == cont_ntc_assay: # should now be met
                                         # check that the sample-assay pair has alr been processed
                                         if (cont_ntc_sample, cont_ntc_assay) not in processed_samples:
                                             processed_samples.add((cont_ntc_sample, cont_ntc_assay))
-                                            # check if the value is NA (NaN)
+                                            # check if the value is NA (NaN) or an empty string
                                             if pd.isna(sample_row[assay_col]) or sample_row[assay_col] == '':
                                                 flagged_file.loc[idx, assay_col] = '†'  # only dagger if value is NA
                                             else:
diff --git a/reader.py b/reader.py
@@ -68,8 +68,8 @@ def extract_dataframes_from_csv(self, file_like_object, phrases_to_find):
         df = pd.read_csv(io.StringIO(content), nrows=0)  
         date_header = df.columns[7]
         date_str = date_header.split(' ')[0] # strip removes the character to concat rest and split breaks string at character
-        date_obj = datetime.strptime(date_str, "%d/%m/%Y")
-        date = date_obj.strftime("%m/%d/%Y")
+        #date_obj = datetime.strptime(date_str, "%d/%m/%Y")
+        #date = date_obj.strftime("%m/%d/%Y")
         
         # Return a dictionary of the dataframes
-        return dataframes, date
+        return dataframes, date_str
diff --git a/redcap_builder.py b/redcap_builder.py
@@ -12,8 +12,11 @@ def __init__(self):
     # method
     def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
 
+        # legend
+        # 1 = pos, 2 = neg, 3 = pending, 4 = not run, 5 = invalid, 6 = NTC contaminated
+
         # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
-        def merge_group(group):
+        def merge_group(group, bbp_P1_assays, bbp_P2_assays, rvp_assays):
             # select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
             merged_row = pd.DataFrame(columns=group.columns)
             merged_row.loc[0] = group.iloc[0]
@@ -24,106 +27,114 @@ def merge_group(group):
                     # if merged_row['cchfv'] = [5,4], then lambda fn will produce [5,None]
                     # dropna will make it merged_row['cchfv'] = [5]
                     # .unique ensures that only unique vals are retained
-                   
-                    if all(group[col] == 4):
+                    if all(group[col] == 4): # not run
                         merged_row[col] = 4
-                    else: # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
-                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
-                        merged_row[col] = filtered_values[0] if len(filtered_values) ==1 else filtered_values[1]
+                    elif all(group[col] ==5): # both assays are invalid
+                        merged_row[col] = 5
+                    elif all(group[col] ==2): # both assays are negative
+                        merged_row[col] = 2
+                    else: 
+                        p1_value = group.loc[group["sampleid"].str.endswith("_P1"), col].dropna().unique()
+                        p2_value = group.loc[group["sampleid"].str.endswith("_P2"), col].dropna().unique()
+                        rvp_value = group.loc[group["sampleid"].str.endswith("_RVP"), col].dropna().unique()
+                        
+                        if col in bbp_P1_assays and len(p1_value) > 0:
+                            merged_row[col] = p1_value #p1_value[0]
+                        elif col in bbp_P2_assays and len(p2_value) > 0:
+                            merged_row[col] = p2_value #p2_value[0]
+                        elif col in rvp_assays and len(p2_value) > 0:
+                            merged_row[col] = rvp_value #rvp_value[0]
+                        """  
+                        else:
+                            # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
+                            filtered_values = group.loc[group[col] != 4, col].dropna().unique() 
+                            # ^ group.loc[group[col] != 4, col] filters the rows in group where the column col is NOT equal to 4
+                            merged_row[col] = filtered_values[0] #if len(filtered_values) ==1 else filtered_values[1]
+                        """
                      
             # each record_id is split and the unique panel suffixes are added to suffix_record_id 
             merged_row['suffix_record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
 
-            # assign a sampleid to the merged_row (doesn't matter as sampleid col will be dropped later)
-            #merged_row['sampleid'] = group['sampleid'].iloc[0]
-
             return merged_row
        
         ### format input flagged t13 binary hit file 
         redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2.copy()
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
 
+        ### convert any cell val with a dagger † to 6 (NTC contaminated)
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.map(lambda x: '6' if '0.0†' in x else x) 
+        
         ### convert 0 to 2 (negative)
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(to_replace=r'^0.*', value=2, regex=True)
 
+        ### convert 1.0 to 1 (positive)
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(to_replace=r'^1.0', value=1, regex=True)
+        
         ### drop any rows incl and below 'Summary' row
         if 'Summary' in redcap_t13_hit_binary_output.index:
             idx = redcap_t13_hit_binary_output.index.get_loc('Summary')
             redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.iloc[:idx]
-
-        ### convert any cell val with a dagger † to 6 (NTC contaminated)
-        """ 
-        test_redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.copy() 
-        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.astype(str)
-        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.map(lambda x: str(x) if not isinstance(x, str) else x)
-         
-        for _, row in high_raw_ntc_signal_df.iterrows():
-            cont_ntc_sample = row['Sample']
-            cont_ntc_assay = row['Assay'].upper()
-            
-            # Check if the sample exists in the row index and the assay exists in the column header
-            if cont_ntc_sample in test_redcap_t13_hit_binary_output.index and cont_ntc_assay in test_redcap_t13_hit_binary_output.columns:
-                current_value = test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay]
-                if '†' in current_value:
-                    test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay] = '6'
-        """
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.map(lambda x: '6' if '†' in x else x)
-        
-
         
-        ### convert col vals for invalid assays to 5 (invalid)
+        ### convert col vals for invalid samples to 5 (invalid)
         # for all invalid samples
         redcap_t13_hit_binary_output.loc[redcap_t13_hit_binary_output['SAMPLE VALID? Y/N'] == 'N***', :] = 5
-
+        
+        ### convert col vals for invalid assays to 5 (invalid)
         # for all invalid assays
         assay_valid_cols = redcap_t13_hit_binary_output.columns[redcap_t13_hit_binary_output.loc['Assay Valid?'] == 'INVALID ASSAY']
         for col in assay_valid_cols:
             redcap_t13_hit_binary_output[col] = 5
-
+        
         ### drop the 'SAMPLE VALID? Y/N' col
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop('SAMPLE VALID? Y/N', axis=1)
-
+        
         ### drop the 'Assay Valid?' row
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop('Assay Valid?', axis=0)
-
+        
         ### drop any columns containing no_crRNA
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.loc[:, ~redcap_t13_hit_binary_output.columns.str.lower().str.contains('no_crrna')]
-
+        
         ### strip all _ and asterisks from the column names
         for i, col in enumerate(redcap_t13_hit_binary_output.columns):
-            if not re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
-                new_col = re.split(r'[_*]', col)[0]
+            if not re.search(r'rnasep', col, re.IGNORECASE):
+                new_col = re.split(r'[_*]', col)[0] # keep only the text before the first _ or * in assay col names
                 redcap_t13_hit_binary_output.columns.values[i] = new_col
-            if  re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
-                new_col = re.split(r'[*]', col)[0]
+            if  re.search(r'rnasep', col, re.IGNORECASE):
+                new_col = re.split(r'[*]', col)[0] # we don't want to remove the _P1 or _RVP or _P2 part from RNASEP column header
                 redcap_t13_hit_binary_output.columns.values[i] = new_col
         
         ### add columns for the assay that wasn't run with since REDCAP format needs all assays (RVP and BBP) headers in 
+        # define assays
+        bbp_P1_assays = ['CCHFV','EBOV','HIV_1','HIV_2','LASV', 'MBV','MPOX_DNA','PF_3_DNA','WNV','YFV']
+        bbp_P2_assays = ['CHI', 'DENV','HBV_DNA','HCV', 'HTV', 'MMV', 'ONN','RBV','RVFV','SYPH_DNA','ZIKV']
+        rvp_assays = ['SARS_COV-2', 'HCOV_HKU1', 'HCOV_NL63', 'HCOV_OC43', 'FLUAV', 'FLUBV', 'HMPV', 'HRSV', 'HPIV_3']
         bbp_assays = ['CCHFV', 'CHI', 'DENV', 'EBOV', 'HBV_DNA', 'HCV', 'HIV_1', 'HIV_2', 'HTV', 'LASV', 'MBV', 'MMV', 
                     'MPOX_DNA', 'ONN', 'PF_3_DNA', 'RBV', 'RVFV', 'SYPH_DNA', 'WNV', 'YFV', 'ZIKV']
-        rvp_assays = ['SARS_COV-2', 'HCOV_HKU1', 'HCOV_NL63', 'HCOV_OC43', 'FLUAV', 'FLUBV', 'HMPV', 'HRSV', 'HPIV_3']
+    
         # set column order
-        column_order = bbp_assays + rvp_assays + ['RNASEP_P1','RNASEP_P2']
+        column_order = bbp_assays + rvp_assays + ['RNASEP_P1','RNASEP_P2', 'RNASEP_RVP']
         # when adding the new columns, enter the value as 4 (not run)
         for col in column_order:
             if col not in redcap_t13_hit_binary_output.columns:
                 redcap_t13_hit_binary_output[col] = 4
-       
+        
         ### reorder cols
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[column_order]
-
+        
         ### add in the metadata columns
         # date
         redcap_t13_hit_binary_output.insert(0, "date", date)
+        
         # barcode assignment
         redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
+        
         # sampleid
         sampleid = []
         for idx in redcap_t13_hit_binary_output.index: # strip all _ and asterisks from the sample names
             cleaned_idx = re.sub(r'[\*\|†\s]', '', idx)
             sampleid.append(cleaned_idx)
         redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)
+        
         # recordid
         record_id = []
         for row in redcap_t13_hit_binary_output.itertuples():
@@ -132,18 +143,18 @@ def merge_group(group):
             record_id.append(record_id_val)
 
         redcap_t13_hit_binary_output.insert(0, "record_id", record_id)
-
+        
         ### merge same samples ran on different panels 
         # extract sampleid before panel _P1 or _P2 or _RVP
         redcap_t13_hit_binary_output['sampleid_prefix'] = redcap_t13_hit_binary_output['sampleid'].str.replace(r'(_P1|_P2|_RVP)$', '', regex=True)
-       
+        
         # subset redcap into two dfs 
         controlsDF = redcap_t13_hit_binary_output[redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
         samplesDF = redcap_t13_hit_binary_output[~redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
-
+        
         # apply the merge_group function to each group in the groupby obj (which is a df)
-        samplesDF = samplesDF.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)
-
+        samplesDF = samplesDF.groupby('sampleid_prefix').apply(lambda group: merge_group(group, bbp_P1_assays, bbp_P2_assays, rvp_assays)).reset_index(drop=True)
+        
         # fix the suffix in record_id
         record_id_fix = []
         for row in samplesDF.itertuples():
@@ -154,26 +165,25 @@ def merge_group(group):
             new_record_id = record_id + "_" + suffix_record_id
             record_id_fix.append(new_record_id)
         samplesDF['record_id'] = record_id_fix  
-
+        
         # drop suffix_record_id
         samplesDF = samplesDF.drop(columns=['suffix_record_id'])
-
+        
         # concatenate back to redcap
-        concat_redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
-
+        redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
+        
         ### write sampleid as the sample_prefix for all samples but those containing CPC, NTC, and NDC
-        mask = ~concat_redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
-        concat_redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = concat_redcap_t13_hit_binary_output['sampleid_prefix']
-
+        mask = ~redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
+        redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = redcap_t13_hit_binary_output['sampleid_prefix']
+        
         # drop sample_prefix_id
-        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])
-
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])
+        
         ### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
-        concat_redcap_t13_hit_binary_output.columns = concat_redcap_t13_hit_binary_output.columns.str.lower()
+        redcap_t13_hit_binary_output.columns = redcap_t13_hit_binary_output.columns.str.lower()
 
         ### reset index
-        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.reset_index(drop=True)
-
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.reset_index(drop=True)
 
-        return concat_redcap_t13_hit_binary_output # redcap_t13_hit_binary_output, samplesDF, controlsDF
+        return redcap_t13_hit_binary_output