
Commit 0b4ee5a (v5.1.0)
1 parent 8df49fc

File tree: 3 files changed, +121 -98 lines

analyze_run.py

Lines changed: 7 additions & 33 deletions
@@ -260,9 +260,9 @@
 # save t13_hit_output as Results_Summary after flags are added below in Flagger()

 ######################################################################################################################################################
-# instantiate NTC_Normalized from ntcnorm.py
+# instantiate NTC_Normalized from ntcnorm.py
 ntcNorm = Normalized()
-# apply ntc_normalizr to the t13_dataframe to produce a new dataframe with all values divided by the mean NTC for that assay
+# apply ntc_normalizr to the t13_dataframe to produce a new dataframe with all values divided by the mean NTC for that assay
 t13_quant_norm = ntcNorm.normalizr(t13_dataframe_copy2)
 # round the NTC normalized quantitative results file
 decimals = 5 # Define the number of decimals for rounding

(Where a removed and an added line are textually identical, as with the two comment lines above, the change is whitespace-only.)
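
For context, Normalized.normalizr is defined in ntcnorm.py and is not shown in this diff. Below is a minimal sketch of what NTC normalization means here, per the comment in the hunk above (every value divided by the mean NTC signal for that assay); the rule used to identify NTC rows is an assumption:

    import pandas as pd

    # Hypothetical stand-in for ntcnorm.Normalized.normalizr: divide every value
    # in each assay column by the mean signal of that assay's NTC rows.
    # Assumption: NTC rows are identified by 'NTC' in the sample index.
    def normalizr_sketch(t13_df: pd.DataFrame) -> pd.DataFrame:
        ntc_rows = t13_df.index.str.contains("NTC", case=False, na=False)
        ntc_means = t13_df.loc[ntc_rows].mean()       # per-assay NTC mean
        return t13_df.div(ntc_means, axis="columns")  # broadcast across all rows

    df = pd.DataFrame({"CCHFV": [2.0, 8.0], "DENV": [1.0, 3.0]},
                      index=["NTC_1", "Sample_A"])
    print(normalizr_sketch(df).round(5))  # mirrors the decimals = 5 rounding above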
@@ -832,33 +832,7 @@
 fail_nocrRNA_check_df.to_csv(fail_nocrRNA_check_df_file_path, index=True)
 print(f"CSV created with data at {fail_nocrRNA_check_df_file_path}")

-"""
-flagged_file = t13_hit_output.copy()
-processed_samples = set()
-for _, row in high_raw_ntc_signal_df.iterrows():
-    for col in high_raw_ntc_signal_df.columns: # cols are Sample, Assay, t13
-        cont_ntc_sample = row['Sample'] # NEG NTC sample
-        cont_ntc_assay = row['Assay'] # NTC assay
-        # now iterate over the flagged file
-        for idx, sample_row in flagged_file.iterrows():
-            if cont_ntc_sample == idx:
-                # add † to each cell value
-                for assay_col in flagged_file.columns:
-                    if assay_col.upper() == cont_ntc_assay.upper():
-                        # check that the sample-assay pair has already been processed
-                        if (cont_ntc_sample, cont_ntc_assay) not in processed_samples:
-                            processed_samples.add((cont_ntc_sample, cont_ntc_assay))
-                            # check if the value is NA (NaN)
-                            if pd.isna(sample_row[assay_col]) or sample_row[assay_col] == '':
-                                flagged_file.loc[idx, assay_col] = '†' # only dagger if value is NA
-                            else:
-                                flagged_file[assay_col] = flagged_file[assay_col].astype(str)
-                                #flagged_file.at[idx, assay_col] = str(flagged_file.at[idx, assay_col])
-                                flagged_file.at[idx, assay_col] = f"{sample_row[assay_col]}†" # add dagger to the value
-
-
-
-"""
+
 ######################################################################################################################################################
 # instantiate Flagger from flags.py
 flagger = Flagger()

The commented-out dagger-flagging block deleted here reappears, updated and active, in flags.py below.
@@ -1038,10 +1012,10 @@
 # make copy of binary output file from RESULTS Excel sheet
 fl_t13_hit_binary_output_2 = fl_t13_hit_binary_output.copy()

-redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)
-
-redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
-redcap_t13_hit_binary_output.to_csv(redcap_t13_hit_binary_output_file_path, index=True)
+concat_redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)
+# concat_redcap_t13_hit_binary_output, samplesDF, controlsDF
+concat_redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
+concat_redcap_t13_hit_binary_output.to_csv(concat_redcap_t13_hit_binary_output_file_path, index=True)
 print("REDCAP file generated.")
 print("Operation complete.")

flags.py

Lines changed: 28 additions & 4 deletions

@@ -75,13 +75,37 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,
         ## dagger flag needs to be added to t13_hit_output, rounded_t13_quant_norm, summary_samples_df, t13_hit_binary_output
         if i in {0,1,2,4}:
             processed_samples = set()
+
+            for _, row in high_raw_ntc_signal_df.iterrows(): # cols are Sample, Assay, t13
+                cont_ntc_sample = row['Sample'].upper() # NEG NTC sample
+                cont_ntc_sample = cont_ntc_sample.strip()
+                cont_ntc_assay = row['Assay'].upper() # NTC assay
+                # now iterate over the flagged file
+                for idx, sample_row in flagged_file.iterrows():
+                    idx = str(idx).strip().upper()
+                    if cont_ntc_sample == idx:
+                        # add † to each cell value
+                        for assay_col in flagged_file.columns:
+                            if assay_col.strip().upper() == cont_ntc_assay:
+                                # check that the sample-assay pair has already been processed
+                                if (cont_ntc_sample, cont_ntc_assay) not in processed_samples:
+                                    processed_samples.add((cont_ntc_sample, cont_ntc_assay))
+                                    # check if the value is NA (NaN)
+                                    if pd.isna(sample_row[assay_col]) or sample_row[assay_col] == '':
+                                        flagged_file.loc[idx, assay_col] = '†' # only dagger if value is NA
+                                    else:
+                                        flagged_file[assay_col] = flagged_file[assay_col].astype(str)
+                                        #flagged_file.at[idx, assay_col] = str(flagged_file.at[idx, assay_col])
+                                        flagged_file.at[idx, assay_col] = f"{sample_row[assay_col]}†" # add dagger to the value
+            """
             for _, row in high_raw_ntc_signal_df.iterrows():
                 for col in high_raw_ntc_signal_df.columns: # cols are Sample, Assay, t13
-                    cont_ntc_sample = row['Sample'] # NEG NTC sample
-                    cont_ntc_assay = row['Assay'] # NTC assay
+                    cont_ntc_sample = row['Sample'].upper() # NEG NTC sample
+                    cont_ntc_sample = cont_ntc_sample.strip()
+                    cont_ntc_assay = row['Assay'].upper() # NTC assay
                     # now iterate over the flagged file
                     for idx, sample_row in flagged_file.iterrows():
-                        if cont_ntc_sample == idx:
+                        if cont_ntc_sample == str(idx).upper():
                             # add † to each cell value
                             for assay_col in flagged_file.columns:
                                 if assay_col.upper() == cont_ntc_assay.upper():

@@ -95,7 +119,7 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,
                                             flagged_file[assay_col] = flagged_file[assay_col].astype(str)
                                             #flagged_file.at[idx, assay_col] = str(flagged_file.at[idx, assay_col])
                                             flagged_file.at[idx, assay_col] = f"{sample_row[assay_col]}†" # add dagger to the value
-
+            """
             for _, row in high_raw_ntc_signal_df.iterrows():
                 for col in high_raw_ntc_signal_df.columns:
                     cont_ntc_sample = row['Sample'] # NEG NTC sample
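
One caveat in the new active loop: idx is rebound to its stripped, uppercased form before being used as a .loc/.at label, so a write can miss (or silently create) a row whenever the real index label differs in case or whitespace. Below is a minimal sketch of the same case- and whitespace-insensitive dagger flagging on hypothetical data, keeping a mapping back to the original labels instead:

    import pandas as pd

    # Hypothetical data: labels differ from the NTC table in case and whitespace.
    flagged_file = pd.DataFrame(
        {"CCHFV": [1.2, 0.3], "DENV": [0.1, None]},
        index=["NTC_1", "Sample_A"],
    ).astype(object)  # object dtype so cells can hold "1.2†" strings later
    high_raw_ntc_signal_df = pd.DataFrame(
        {"Sample": [" ntc_1 "], "Assay": ["cchfv"], "t13": [1.2]}
    )

    # Map normalized labels back to the real ones so writes hit existing cells.
    norm_index = {str(i).strip().upper(): i for i in flagged_file.index}
    norm_cols = {str(c).strip().upper(): c for c in flagged_file.columns}

    # drop_duplicates plays the role of the processed_samples set above
    for _, row in high_raw_ntc_signal_df.drop_duplicates(["Sample", "Assay"]).iterrows():
        samp = str(row["Sample"]).strip().upper()
        assay = str(row["Assay"]).strip().upper()
        if samp in norm_index and assay in norm_cols:
            i, c = norm_index[samp], norm_cols[assay]
            val = flagged_file.at[i, c]
            # dagger alone for empty cells, otherwise append it to the value
            flagged_file.at[i, c] = "†" if pd.isna(val) or val == "" else f"{val}†"

    print(flagged_file)  # NTC_1 / CCHFV now reads "1.2†"

The dictionary lookups also avoid the nested iterrows scan over the whole flagged file for every NTC row.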

redcap_builder.py

Lines changed: 86 additions & 61 deletions
@@ -11,18 +11,63 @@ def __init__(self):

     # method
     def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
+
+        # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
+        def merge_group(group):
+            # select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
+            merged_row = pd.DataFrame(columns=group.columns)
+            merged_row.loc[0] = group.iloc[0]
+
+            # the group is the unique sampleid_prefix - each group should have max 2 rows
+            for col in group.columns:
+                if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
+                    # if merged_row['cchfv'] = [5,4], then lambda fn will produce [5,None]
+                    # dropna will make it merged_row['cchfv'] = [5]
+                    # .unique ensures that only unique vals are retained
+
+                    if all(group[col] == 4):
+                        merged_row[col] = 4
+                    else: # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
+                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
+                        merged_row[col] = filtered_values[0] if len(filtered_values) == 1 else filtered_values[1]
+
+            # each record_id is split and the unique panel suffixes are added to suffix_record_id
+            merged_row['suffix_record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
+
+            # assign a sampleid to the merged_row (doesn't matter as sampleid col will be dropped later)
+            #merged_row['sampleid'] = group['sampleid'].iloc[0]
+
+            return merged_row

         ### convert 0 to 2 (negative)
         redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2.replace(0, 2)

-        ### drop any rows incl and below 'Summary' row
+        ### drop any rows incl and below 'Summary' row
         if 'Summary' in redcap_t13_hit_binary_output.index:
             idx = redcap_t13_hit_binary_output.index.get_loc('Summary')
             redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.iloc[:idx]

         ### convert any cell val with a dagger † to 6 (NTC contaminated)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(r'.*†.*', 6, regex=True)
+        """
+        test_redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.copy()
+        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.astype(str)
+        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.map(lambda x: str(x) if not isinstance(x, str) else x)
+
+        for _, row in high_raw_ntc_signal_df.iterrows():
+            cont_ntc_sample = row['Sample']
+            cont_ntc_assay = row['Assay'].upper()
+
+            # Check if the sample exists in the row index and the assay exists in the column header
+            if cont_ntc_sample in test_redcap_t13_hit_binary_output.index and cont_ntc_assay in test_redcap_t13_hit_binary_output.columns:
+                current_value = test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay]
+                if '†' in current_value:
+                    test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay] = '6'
+        """
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.applymap(lambda x: '6' if '†' in x else x)
+

+
         ### convert col vals for invalid assays to 5 (invalid)
         # for all invalid samples
         redcap_t13_hit_binary_output.loc[redcap_t13_hit_binary_output['SAMPLE VALID? Y/N'] == 'N***', :] = 5
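
A side note on the replacement for the regex replace: DataFrame.applymap has been deprecated in favor of the element-wise DataFrame.map since pandas 2.1. An equivalent one-liner on hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"CCHFV": ["1†", "2"], "DENV": ["2", "0.5†"]})  # hypothetical

    # Same effect as the astype(str) + applymap pair in the hunk above,
    # using the non-deprecated element-wise DataFrame.map (pandas >= 2.1):
    df = df.astype(str).map(lambda x: "6" if "†" in x else x)
    print(df)  # every dagger-containing cell becomes the string "6"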
@@ -49,7 +94,7 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
             if re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
                 new_col = re.split(r'[*]', col)[0]
                 redcap_t13_hit_binary_output.columns.values[i] = new_col
-
+
         ### add columns for the assays that weren't run, since the REDCAP format needs all assay (RVP and BBP) headers
         bbp_assays = ['CCHFV', 'CHI', 'DENV', 'EBOV', 'HBV_DNA', 'HCV', 'HIV_1', 'HIV_2', 'HTV', 'LASV', 'MBV', 'MMV',
                       'MPOX_DNA', 'ONN', 'PF_3_DNA', 'RBV', 'RVFV', 'SYPH_DNA', 'WNV', 'YFV', 'ZIKV']
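
The header cleanup above strips asterisk suffixes from the control columns only; a small demonstration of the re.search / re.split pair on hypothetical headers:

    import re

    cols = ["RNASEP_P1***", "no_crRNA**", "CCHFV"]  # hypothetical chip headers
    cleaned = [
        re.split(r"[*]", c)[0] if re.search(r"rnasep|no_crrna", c, re.IGNORECASE) else c
        for c in cols
    ]
    print(cleaned)  # ['RNASEP_P1', 'no_crRNA', 'CCHFV']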
@@ -60,20 +105,22 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
         for col in column_order:
             if col not in redcap_t13_hit_binary_output.columns:
                 redcap_t13_hit_binary_output[col] = 4
-
+
         ### reorder cols
         redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[column_order]

         ### add in the metadata columns
+        # date
+        redcap_t13_hit_binary_output.insert(0, "date", date)
+        # barcode assignment
+        redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
+        # sampleid
         sampleid = []
-        for idx in redcap_t13_hit_binary_output.index:
+        for idx in redcap_t13_hit_binary_output.index: # strip asterisks, pipes, daggers, and whitespace from the sample names
             cleaned_idx = re.sub(r'[\*\|†\s]', '', idx)
             sampleid.append(cleaned_idx)
-
-        redcap_t13_hit_binary_output.insert(0, "date", date)
-        redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
         redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)
-
+        # recordid
         record_id = []
         for row in redcap_t13_hit_binary_output.itertuples():
             samp_id = row.sampleid
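
Note the character class in the sampleid loop removes asterisks, pipes, daggers, and whitespace (not underscores); for example, on hypothetical labels:

    import re

    for idx in ["Sample_A_P1***", "NTC_1 †", "CPC | RVP"]:  # hypothetical labels
        print(re.sub(r'[\*\|†\s]', '', idx))
    # Sample_A_P1
    # NTC_1
    # CPCRVP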
@@ -83,68 +130,46 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
         redcap_t13_hit_binary_output.insert(0, "record_id", record_id)

         ### merge same samples run on different panels
-        # extract sampleid before panel
+        # extract sampleid before panel _P1 or _P2 or _RVP
         redcap_t13_hit_binary_output['sampleid_prefix'] = redcap_t13_hit_binary_output['sampleid'].str.replace(r'(_P1|_P2|_RVP)$', '', regex=True)

-        # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
-        def merge_group(group):
-            # select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
-            merged_row = pd.DataFrame(columns=group.columns)
-            merged_row.loc[0] = group.iloc[0]
+        # subset redcap into two dfs
+        controlsDF = redcap_t13_hit_binary_output[redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
+        samplesDF = redcap_t13_hit_binary_output[~redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]

-            # the group is the unique sampleid_prefix - each group should have max 2 rows
-            for col in group.columns:
-                if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
-                    # if merged_row['cchfv'] = [5,4], then lambda fn will produce [5,None]
-                    # dropna will make it merged_row['cchfv'] = [5]
-                    # .unique ensures that only unique vals are retained
-
-                    if all(group[col] == 4):
-                        merged_row[col] = 4
-                    else: # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
-                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
-                        merged_row[col] = filtered_values[0]
-
-            # each record_id is split and the two panel suffixes are added to the record_id - the .unique ensures that all distinct splits are added together
-            merged_row['record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
-
-            # assign a sampleid to the merged_row (doesn't matter as sampleid col will be dropped later)
-            merged_row['sampleid'] = group['sampleid'].iloc[0]
-
-            return merged_row
-
         # apply the merge_group function to each group in the groupby obj (which is a df)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)
+        samplesDF = samplesDF.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)

-        # make record_id col be ifc + sample_id_prefix + record_id (which is just _P1_P2)
+        # fix the suffix in record_id
         record_id_fix = []
-        for row in redcap_t13_hit_binary_output.itertuples():
+        for row in samplesDF.itertuples():
             record_id = row.record_id
-            sampleid_prefix = row.sampleid_prefix
-            sampleid = row.sampleid
-            if not any(control in sampleid_prefix for control in ['NTC', 'CPC', 'NDC']):
-                record_id_val = barcode_assignment + '_' + sampleid_prefix + '_' + record_id
-                record_id_fix.append(record_id_val)
-            else:
-                record_id_val = barcode_assignment + '_' + sampleid + '_' + record_id
-
-        redcap_t13_hit_binary_output['record_id'] = record_id_fix
-
-        ### drop sampleid
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop(columns=['sampleid'])
-
-        ### rename sampleid_prefix as sampleid and insert it as the 4th col
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.rename(columns={'sampleid_prefix': 'sampleid'})
-        cols = list(redcap_t13_hit_binary_output.columns)
-        cols.remove('sampleid')
-        cols.insert(3, 'sampleid')
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[cols]
+            suffix_record_id = row.suffix_record_id
+            record_id = record_id.split("_")[:-1]
+            record_id = "_".join(record_id)
+            new_record_id = record_id + "_" + suffix_record_id
+            record_id_fix.append(new_record_id)
+        samplesDF['record_id'] = record_id_fix
+
+        # drop suffix_record_id
+        samplesDF = samplesDF.drop(columns=['suffix_record_id'])
+
+        # concatenate back to redcap
+        concat_redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
+
+        ### write sampleid as the sampleid_prefix for all samples but those containing CPC, NTC, and NDC
+        mask = ~concat_redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
+        concat_redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = concat_redcap_t13_hit_binary_output['sampleid_prefix']
+
+        # drop sampleid_prefix
+        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])

         ### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
-        redcap_t13_hit_binary_output.columns = redcap_t13_hit_binary_output.columns.str.lower()
+        concat_redcap_t13_hit_binary_output.columns = concat_redcap_t13_hit_binary_output.columns.str.lower()

         ### reset index
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.reset_index(drop=True)
+        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.reset_index(drop=True)
+

-        return redcap_t13_hit_binary_output
+        return concat_redcap_t13_hit_binary_output # redcap_t13_hit_binary_output, samplesDF, controlsDF
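
To make the merge semantics concrete: with hypothetical values, where 4 means the assay was not run on that panel, merge_group keeps the real result from whichever panel ran each assay and joins the panel suffixes into suffix_record_id (calling the nested helper here as if it were standalone):

    import pandas as pd

    group = pd.DataFrame({
        "record_id":       ["IFC01_S1_P1", "IFC01_S1_P2"],
        "date":            ["2024-01-01"] * 2,
        "ifc":             ["IFC01"] * 2,
        "sampleid":        ["S1_P1", "S1_P2"],
        "sampleid_prefix": ["S1"] * 2,
        "CCHFV":           [5, 4],  # ran on P1 only -> merged value 5
        "DENV":            [4, 2],  # ran on P2 only -> merged value 2
    })

    merged = merge_group(group)  # merge_group as defined in the diff above
    print(merged[["suffix_record_id", "CCHFV", "DENV"]])
    #   suffix_record_id  CCHFV  DENV
    # 0            P1_P2      5     2

The record-id fix loop then drops the trailing P1 from the first row's record_id and appends the joined suffix, giving IFC01_S1_P1_P2.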
