@@ -11,18 +11,63 @@ def __init__(self):
    # method
    def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
+
+        # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
+        def merge_group(group):
+            # select the first row in the subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
+            merged_row = pd.DataFrame(columns=group.columns)
+            merged_row.loc[0] = group.iloc[0]
+
+            # the group is the unique sampleid_prefix - each group should have at most 2 rows
+            for col in group.columns:
+                if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
+                    # if group['cchfv'] = [5, 4], filtering out the 4 produces [5, None]
+                    # dropna() reduces it to [5]
+                    # .unique() ensures that only unique vals are retained
+
+                    if all(group[col] == 4):
+                        merged_row[col] = 4
+                    else:  # if group[col] = [5, 4] or [3, 4] - there's no world where it would be [5, 3]
+                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
+                        merged_row[col] = filtered_values[0] if len(filtered_values) == 1 else filtered_values[1]
+
+            # each record_id is split and the unique panel suffixes are joined into suffix_record_id
+            merged_row['suffix_record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
+
+            # assigning a sampleid to the merged_row is unnecessary (the sampleid col is dropped later)
+            # merged_row['sampleid'] = group['sampleid'].iloc[0]
+
+            return merged_row

        ### convert 0 to 2 (negative)
        redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2.replace(0, 2)

-        ### drop any rows incl. and below the 'Summary' row
+        ### drop any rows incl. and below the 'Summary' row
        if 'Summary' in redcap_t13_hit_binary_output.index:
            idx = redcap_t13_hit_binary_output.index.get_loc('Summary')
            redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.iloc[:idx]

        ### convert any cell val with a dagger † to 6 (NTC contaminated)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(r'.*†.*', 6, regex=True)
+        """
+        test_redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.copy()
+        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.astype(str)
+        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.map(lambda x: str(x) if not isinstance(x, str) else x)
+
+        for _, row in high_raw_ntc_signal_df.iterrows():
+            cont_ntc_sample = row['Sample']
+            cont_ntc_assay = row['Assay'].upper()
+
+            # Check if the sample exists in the row index and the assay exists in the column header
+            if cont_ntc_sample in test_redcap_t13_hit_binary_output.index and cont_ntc_assay in test_redcap_t13_hit_binary_output.columns:
+                current_value = test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay]
+                if '†' in current_value:
+                    test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay] = '6'
+        """
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.applymap(lambda x: '6' if '†' in x else x)
+

+
        ### convert col vals for invalid assays to 5 (invalid)
        # for all invalid samples
        redcap_t13_hit_binary_output.loc[redcap_t13_hit_binary_output['SAMPLE VALID? Y/N'] == 'N***', :] = 5
@@ -49,7 +94,7 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
            if re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
                new_col = re.split(r'[*]', col)[0]
                redcap_t13_hit_binary_output.columns.values[i] = new_col
-
+
        ### add columns for the assays that weren't run, since the REDCAP format needs headers for all assays (RVP and BBP)
        bbp_assays = ['CCHFV', 'CHI', 'DENV', 'EBOV', 'HBV_DNA', 'HCV', 'HIV_1', 'HIV_2', 'HTV', 'LASV', 'MBV', 'MMV',
                      'MPOX_DNA', 'ONN', 'PF_3_DNA', 'RBV', 'RVFV', 'SYPH_DNA', 'WNV', 'YFV', 'ZIKV']
@@ -60,20 +105,22 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
        for col in column_order:
            if col not in redcap_t13_hit_binary_output.columns:
                redcap_t13_hit_binary_output[col] = 4
-
+
        ### reorder cols
        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[column_order]

        ### add in the metadata columns
+        # date
+        redcap_t13_hit_binary_output.insert(0, "date", date)
+        # barcode assignment
+        redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
+        # sampleid
        sampleid = []
-        for idx in redcap_t13_hit_binary_output.index:
+        for idx in redcap_t13_hit_binary_output.index:  # strip asterisks, pipes, daggers, and whitespace from the sample names
            cleaned_idx = re.sub(r'[\*\|†\s]', '', idx)
            sampleid.append(cleaned_idx)
-
-        redcap_t13_hit_binary_output.insert(0, "date", date)
-        redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
        redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)
-
+        # record_id
        record_id = []
        for row in redcap_t13_hit_binary_output.itertuples():
            samp_id = row.sampleid
@@ -83,68 +130,46 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
        redcap_t13_hit_binary_output.insert(0, "record_id", record_id)

        ### merge same samples run on different panels
-        # extract sampleid before panel
+        # extract sampleid before the panel suffix _P1, _P2, or _RVP
        redcap_t13_hit_binary_output['sampleid_prefix'] = redcap_t13_hit_binary_output['sampleid'].str.replace(r'(_P1|_P2|_RVP)$', '', regex=True)

-        # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
-        def merge_group(group):
-            # select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
-            merged_row = pd.DataFrame(columns=group.columns)
-            merged_row.loc[0] = group.iloc[0]
+        # subset redcap into two dfs: controls vs. samples
+        controlsDF = redcap_t13_hit_binary_output[redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
+        samplesDF = redcap_t13_hit_binary_output[~redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]

-            # the group is the unique sampleid_prefix - each group should have at most 2 rows
-            for col in group.columns:
-                if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
-                    # if group['cchfv'] = [5, 4], filtering out the 4 produces [5, None]
-                    # dropna() reduces it to [5]
-                    # .unique() ensures that only unique vals are retained
-
-                    if all(group[col] == 4):
-                        merged_row[col] = 4
-                    else:  # if group[col] = [5, 4] or [3, 4] - there's no world where it would be [5, 3]
-                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
-                        merged_row[col] = filtered_values[0]
-
-            # each record_id is split and the panel suffixes are appended to the record_id - .unique ensures that all distinct splits are joined together
-            merged_row['record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
-
-            # assign a sampleid to the merged_row (doesn't matter as the sampleid col will be dropped later)
-            merged_row['sampleid'] = group['sampleid'].iloc[0]
-
-            return merged_row
-
        # apply the merge_group function to each group in the groupby obj (which is a df)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)
+        samplesDF = samplesDF.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)

-        # make record_id col be ifc + sampleid_prefix + record_id (which is just _P1_P2)
+        # fix the suffix in record_id
        record_id_fix = []
-        for row in redcap_t13_hit_binary_output.itertuples():
+        for row in samplesDF.itertuples():
            record_id = row.record_id
-            sampleid_prefix = row.sampleid_prefix
-            sampleid = row.sampleid
-            if not any(control in sampleid_prefix for control in ['NTC', 'CPC', 'NDC']):
-                record_id_val = barcode_assignment + '_' + sampleid_prefix + '_' + record_id
-                record_id_fix.append(record_id_val)
-            else:
-                record_id_val = barcode_assignment + '_' + sampleid + '_' + record_id
-
-        redcap_t13_hit_binary_output['record_id'] = record_id_fix
-
-        ### drop sampleid
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop(columns=['sampleid'])
-
-        ### rename sampleid_prefix as sampleid and insert it as the 4th col
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.rename(columns={'sampleid_prefix': 'sampleid'})
-        cols = list(redcap_t13_hit_binary_output.columns)
-        cols.remove('sampleid')
-        cols.insert(3, 'sampleid')
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[cols]
+            suffix_record_id = row.suffix_record_id
+            record_id = record_id.split("_")[:-1]
+            record_id = "_".join(record_id)
+            new_record_id = record_id + "_" + suffix_record_id
+            record_id_fix.append(new_record_id)
+        samplesDF['record_id'] = record_id_fix
+
+        # drop suffix_record_id
+        samplesDF = samplesDF.drop(columns=['suffix_record_id'])
+
+        # concatenate samplesDF and controlsDF back into one redcap df
+        concat_redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
+
+        ### write sampleid as the sampleid_prefix for all samples but those containing CPC, NTC, and NDC
+        mask = ~concat_redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
+        concat_redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = concat_redcap_t13_hit_binary_output['sampleid_prefix']
+
+        # drop sampleid_prefix
+        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])

        ### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
-        redcap_t13_hit_binary_output.columns = redcap_t13_hit_binary_output.columns.str.lower()
+        concat_redcap_t13_hit_binary_output.columns = concat_redcap_t13_hit_binary_output.columns.str.lower()

        ### reset index
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.reset_index(drop=True)
+        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.reset_index(drop=True)
+

-        return redcap_t13_hit_binary_output
+        return concat_redcap_t13_hit_binary_output  # redcap_t13_hit_binary_output, samplesDF, controlsDF
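
A minimal sketch of how the panel-merge step introduced in this diff is intended to behave on a toy frame. The sample names (X01_P1 / X01_P2), assay columns, and values below are hypothetical, and the `merge_group` body simply mirrors the one added above; this is an illustration, not part of the module.

```python
import pandas as pd

# toy stand-in for redcap_t13_hit_binary_output after the metadata columns are added
# (hypothetical sample names and assay values)
df = pd.DataFrame({
    "record_id": ["ifc1_X01_P1", "ifc1_X01_P2"],
    "date": ["2024-01-01", "2024-01-01"],
    "ifc": ["ifc1", "ifc1"],
    "sampleid": ["X01_P1", "X01_P2"],
    "CCHFV": [5, 4],        # 4 = assay not on that panel, 5 = invalid
    "SARS_COV_2": [4, 2],   # 2 = negative
})
df["sampleid_prefix"] = df["sampleid"].str.replace(r"(_P1|_P2|_RVP)$", "", regex=True)

def merge_group(group):
    # same idea as the merge_group added in the diff: collapse each sampleid_prefix group into one row
    merged_row = pd.DataFrame(columns=group.columns)
    merged_row.loc[0] = group.iloc[0]
    for col in group.columns:
        if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
            if all(group[col] == 4):
                merged_row[col] = 4
            else:
                filtered = group.loc[group[col] != 4, col].dropna().unique()
                merged_row[col] = filtered[0] if len(filtered) == 1 else filtered[1]
    # collect the distinct panel suffixes (P1, P2, RVP) from the record_ids
    merged_row["suffix_record_id"] = "_".join(group["record_id"].apply(lambda x: x.split("_")[-1]).unique())
    return merged_row

merged = df.groupby("sampleid_prefix").apply(merge_group).reset_index(drop=True)
print(merged[["sampleid_prefix", "CCHFV", "SARS_COV_2", "suffix_record_id"]])
```

Running this collapses the two panel rows into a single X01 row with CCHFV=5, SARS_COV_2=2, and suffix_record_id="P1_P2", which the loop in the diff then splices back onto record_id.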
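One compatibility note on the dagger conversion above: `DataFrame.applymap` is deprecated from pandas 2.1 onward in favor of `DataFrame.map`. If the project runs on pandas >= 2.1 (an assumption, not something this diff states), an equivalent would be:

```python
# element-wise dagger check; DataFrame.map replaces the deprecated applymap on pandas >= 2.1
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.map(lambda x: '6' if '†' in x else x)
```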