Skip to content

Commit aae6589

Browse files
committed
v5.2.0
1 parent e0a6cc7 commit aae6589

File tree

5 files changed

+88
-74
lines changed

5 files changed

+88
-74
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
CARMEN is a diagnostic tool designed for surveillance purposes. Below are the instructions to complete your CARMEN analysis.
33

44
## Software Version
5-
When cloning this repository, you will be using software version 5.1.0.
5+
When cloning this repository, you will be using software version 5.2.0.
66

77
## Overview
88
At this point, you have run the $Standard\ BioTools\ Dynamic\ Array^{TM}$ IFC (integrated fluidic circuit) on the $Standard\ BioTools\ Biomark^{TM}$ instrument and have completed the experimental portion of CARMEN. In running this code, you will be able to complete the data analysis portion of CARMEN and generate both binary positive/negative and quantitative signal output of your diagnostic assay.

analyze_run.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252

5353
######################################################################################################################################################
5454
# assign software version
55-
software_version = '5.1.0'
55+
software_version = '5.2.0'
5656

5757
######################################################################################################################################################
5858
# data loading
@@ -116,7 +116,8 @@
116116
print("Please include the command line arguments when running analyze_run.py")
117117
else:
118118
# Proceed with your script logic
119-
print("Threshold provided:", sys.argv[1:])
119+
print("Threshold provided:", CLI_arg[1])
120+
#print("Threshold provided:", sys.argv[1:])
120121

121122

122123
## Set up structure of the output folder - simplify into RESUTLS, QUALITY CONTROL, R&D
@@ -1012,10 +1013,12 @@
10121013
# make copy of binary output file from RESULTS Excel sheet
10131014
fl_t13_hit_binary_output_2 = fl_t13_hit_binary_output.copy()
10141015

1015-
concat_redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)
1016-
# concat_redcap_t13_hit_binary_output, samplesDF, controlsDF
1017-
concat_redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
1018-
concat_redcap_t13_hit_binary_output.to_csv(concat_redcap_t13_hit_binary_output_file_path, index=True)
1016+
# apply redcapper to fl_t13_hit_binary_output_2 df
1017+
redcap_t13_hit_binary_output = redcapper.build_redcap(fl_t13_hit_binary_output_2, date, barcode_assignment)
1018+
1019+
# save REDCAP file
1020+
redcap_t13_hit_binary_output_file_path = os.path.join(res_subfolder, f'REDCAP_{barcode_assignment}.csv')
1021+
redcap_t13_hit_binary_output.to_csv(redcap_t13_hit_binary_output_file_path, index=False)
10191022
print("REDCAP file generated.")
10201023
print("Operation complete.")
10211024

flags.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,12 @@ def assign_flags(self, fail_nocrRNA_check_df, high_raw_ntc_signal_df, rnasep_df,
8686
if cont_ntc_sample == idx:
8787
# add † to each cell value
8888
for assay_col in flagged_file.columns:
89-
if assay_col.strip().upper() == cont_ntc_assay:
89+
stripped_assay_col = re.sub(r'[\*\|†\s]', '', assay_col) # strip the * from assay_col
90+
if stripped_assay_col.upper() == cont_ntc_assay: # shld now be met
9091
# check that the sample-assay pair has alr been processed
9192
if (cont_ntc_sample, cont_ntc_assay) not in processed_samples:
9293
processed_samples.add((cont_ntc_sample, cont_ntc_assay))
93-
# check if the value is NA (NaN)
94+
# check if the value is NA (NaN) re.search(r'rnasep|no_crrna', col, re.IGNORECASE)
9495
if pd.isna(sample_row[assay_col]) or sample_row[assay_col] == '':
9596
flagged_file.loc[idx, assay_col] = '†' # only dagger if value is NA
9697
else:

reader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ def extract_dataframes_from_csv(self, file_like_object, phrases_to_find):
6868
df = pd.read_csv(io.StringIO(content), nrows=0)
6969
date_header = df.columns[7]
7070
date_str = date_header.split(' ')[0] # strip removes the character to concat rest and split breaks string at character
71-
date_obj = datetime.strptime(date_str, "%d/%m/%Y")
72-
date = date_obj.strftime("%m/%d/%Y")
71+
#date_obj = datetime.strptime(date_str, "%d/%m/%Y")
72+
#date = date_obj.strftime("%m/%d/%Y")
7373

7474
# Return a dictionary of the dataframes
75-
return dataframes, date
75+
return dataframes, date_str

redcap_builder.py

Lines changed: 72 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@ def __init__(self):
1212
# method
1313
def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
1414

15+
# legend
16+
# 1 = pos, 2 = neg, 3 = pending, 4 = not run, 5 = invalid, 6 = NTC contaminated
17+
1518
# merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
16-
def merge_group(group):
19+
def merge_group(group, bbp_P1_assays, bbp_P2_assays, rvp_assays):
1720
# select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
1821
merged_row = pd.DataFrame(columns=group.columns)
1922
merged_row.loc[0] = group.iloc[0]
@@ -24,106 +27,114 @@ def merge_group(group):
2427
# if merged_row['cchfv'] = [5,4], then lambda fn will produce [5,None]
2528
# dropna will make it merged_row['cchfv'] = [5]
2629
# .unique ensures that only unique vals are retained
27-
28-
if all(group[col] == 4):
30+
if all(group[col] == 4): # not run
2931
merged_row[col] = 4
30-
else: # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
31-
filtered_values = group.loc[group[col] != 4, col].dropna().unique()
32-
merged_row[col] = filtered_values[0] if len(filtered_values) ==1 else filtered_values[1]
32+
elif all(group[col] ==5): # both assays are invalid
33+
merged_row[col] = 5
34+
elif all(group[col] ==2): # both assays are negative
35+
merged_row[col] = 2
36+
else:
37+
p1_value = group.loc[group["sampleid"].str.endswith("_P1"), col].dropna().unique()
38+
p2_value = group.loc[group["sampleid"].str.endswith("_P2"), col].dropna().unique()
39+
rvp_value = group.loc[group["sampleid"].str.endswith("_RVP"), col].dropna().unique()
40+
41+
if col in bbp_P1_assays and len(p1_value) > 0:
42+
merged_row[col] = p1_value #p1_value[0]
43+
elif col in bbp_P2_assays and len(p2_value) > 0:
44+
merged_row[col] = p2_value #p2_value[0]
45+
elif col in rvp_assays and len(p2_value) > 0:
46+
merged_row[col] = rvp_value #rvp_value[0]
47+
"""
48+
else:
49+
# if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
50+
filtered_values = group.loc[group[col] != 4, col].dropna().unique()
51+
# ^ group.loc[group[col] != 4, col] filters the rows in group where the column col is NOT equal to 4
52+
merged_row[col] = filtered_values[0] #if len(filtered_values) ==1 else filtered_values[1]
53+
"""
3354

3455
# each record_id is split and the unique panel suffixes are added to suffix_record_id
3556
merged_row['suffix_record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
3657

37-
# assign a sampleid to the merged_row (doesn't matter as sampleid col will be dropped later)
38-
#merged_row['sampleid'] = group['sampleid'].iloc[0]
39-
4058
return merged_row
4159

4260
### format input flagged t13 binary hit file
4361
redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2.copy()
4462
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
4563

64+
### convert any cell val with a dagger † to 6 (NTC contaminated)
65+
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.map(lambda x: '6' if '0.0†' in x else x)
66+
4667
### convert 0 to 2 (negative)
4768
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(to_replace=r'^0.*', value=2, regex=True)
4869

70+
### convert 1.0 to 1 (positive)
71+
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(to_replace=r'^1.0', value=1, regex=True)
72+
4973
### drop any rows incl and below 'Summary' row
5074
if 'Summary' in redcap_t13_hit_binary_output.index:
5175
idx = redcap_t13_hit_binary_output.index.get_loc('Summary')
5276
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.iloc[:idx]
53-
54-
### convert any cell val with a dagger † to 6 (NTC contaminated)
55-
"""
56-
test_redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.copy()
57-
test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.astype(str)
58-
test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.map(lambda x: str(x) if not isinstance(x, str) else x)
59-
60-
for _, row in high_raw_ntc_signal_df.iterrows():
61-
cont_ntc_sample = row['Sample']
62-
cont_ntc_assay = row['Assay'].upper()
63-
64-
# Check if the sample exists in the row index and the assay exists in the column header
65-
if cont_ntc_sample in test_redcap_t13_hit_binary_output.index and cont_ntc_assay in test_redcap_t13_hit_binary_output.columns:
66-
current_value = test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay]
67-
if '†' in current_value:
68-
test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay] = '6'
69-
"""
70-
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
71-
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.map(lambda x: '6' if '†' in x else x)
72-
73-
7477

75-
### convert col vals for invalid assays to 5 (invalid)
78+
### convert col vals for invalid samples to 5 (invalid)
7679
# for all invalid samples
7780
redcap_t13_hit_binary_output.loc[redcap_t13_hit_binary_output['SAMPLE VALID? Y/N'] == 'N***', :] = 5
78-
81+
82+
### convert col vals for invalid assays to 5 (invalid)
7983
# for all invalid assays
8084
assay_valid_cols = redcap_t13_hit_binary_output.columns[redcap_t13_hit_binary_output.loc['Assay Valid?'] == 'INVALID ASSAY']
8185
for col in assay_valid_cols:
8286
redcap_t13_hit_binary_output[col] = 5
83-
87+
8488
### drop the 'SAMPLE VALID? Y/N' col
8589
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop('SAMPLE VALID? Y/N', axis=1)
86-
90+
8791
### drop the 'Assay Valid?' row
8892
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop('Assay Valid?', axis=0)
89-
93+
9094
### drop any columns containing no_crRNA
9195
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.loc[:, ~redcap_t13_hit_binary_output.columns.str.lower().str.contains('no_crrna')]
92-
96+
9397
### strip all _ and asterisks from the column names
9498
for i, col in enumerate(redcap_t13_hit_binary_output.columns):
95-
if not re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
96-
new_col = re.split(r'[_*]', col)[0]
99+
if not re.search(r'rnasep', col, re.IGNORECASE):
100+
new_col = re.split(r'[_*]', col)[0] # remove _ and * from all col names for assays
97101
redcap_t13_hit_binary_output.columns.values[i] = new_col
98-
if re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
99-
new_col = re.split(r'[*]', col)[0]
102+
if re.search(r'rnasep', col, re.IGNORECASE):
103+
new_col = re.split(r'[*]', col)[0] # we don't want to remove the _P1 or _RVP or _P2 part from RNASEP column header
100104
redcap_t13_hit_binary_output.columns.values[i] = new_col
101105

102106
### add columns for the assay that wasn't run with since REDCAP format needs all assays (RVP and BBP) headers in
107+
# define assays
108+
bbp_P1_assays = ['CCHFV','EBOV','HIV_1','HIV_2','LASV', 'MBV','MPOX_DNA','PF_3_DNA','WNV','YFV']
109+
bbp_P2_assays = ['CHI', 'DENV','HBV_DNA','HCV', 'HTV', 'MMV', 'ONN','RBV','RVFV','SYPH_DNA','ZIKV']
110+
rvp_assays = ['SARS_COV-2', 'HCOV_HKU1', 'HCOV_NL63', 'HCOV_OC43', 'FLUAV', 'FLUBV', 'HMPV', 'HRSV', 'HPIV_3']
103111
bbp_assays = ['CCHFV', 'CHI', 'DENV', 'EBOV', 'HBV_DNA', 'HCV', 'HIV_1', 'HIV_2', 'HTV', 'LASV', 'MBV', 'MMV',
104112
'MPOX_DNA', 'ONN', 'PF_3_DNA', 'RBV', 'RVFV', 'SYPH_DNA', 'WNV', 'YFV', 'ZIKV']
105-
rvp_assays = ['SARS_COV-2', 'HCOV_HKU1', 'HCOV_NL63', 'HCOV_OC43', 'FLUAV', 'FLUBV', 'HMPV', 'HRSV', 'HPIV_3']
113+
106114
# set column order
107-
column_order = bbp_assays + rvp_assays + ['RNASEP_P1','RNASEP_P2']
115+
column_order = bbp_assays + rvp_assays + ['RNASEP_P1','RNASEP_P2', 'RNASEP_RVP']
108116
# when adding the new columns, enter the value as 4 (not run)
109117
for col in column_order:
110118
if col not in redcap_t13_hit_binary_output.columns:
111119
redcap_t13_hit_binary_output[col] = 4
112-
120+
113121
### reorder cols
114122
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[column_order]
115-
123+
116124
### add in the metadata columns
117125
# date
118126
redcap_t13_hit_binary_output.insert(0, "date", date)
127+
119128
# barcode assignment
120129
redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
130+
121131
# sampleid
122132
sampleid = []
123133
for idx in redcap_t13_hit_binary_output.index: # strip all _ and asterisks from the sample names
124134
cleaned_idx = re.sub(r'[\*\|†\s]', '', idx)
125135
sampleid.append(cleaned_idx)
126136
redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)
137+
127138
# recordid
128139
record_id = []
129140
for row in redcap_t13_hit_binary_output.itertuples():
@@ -132,18 +143,18 @@ def merge_group(group):
132143
record_id.append(record_id_val)
133144

134145
redcap_t13_hit_binary_output.insert(0, "record_id", record_id)
135-
146+
136147
### merge same samples ran on different panels
137148
# extract sampleid before panel _P1 or _P2 or _RVP
138149
redcap_t13_hit_binary_output['sampleid_prefix'] = redcap_t13_hit_binary_output['sampleid'].str.replace(r'(_P1|_P2|_RVP)$', '', regex=True)
139-
150+
140151
# subset redcap into two dfs
141152
controlsDF = redcap_t13_hit_binary_output[redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
142153
samplesDF = redcap_t13_hit_binary_output[~redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
143-
154+
144155
# apply the merge_group function to each group in the groupby obj (which is a df)
145-
samplesDF = samplesDF.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)
146-
156+
samplesDF = samplesDF.groupby('sampleid_prefix').apply(lambda group: merge_group(group, bbp_P1_assays, bbp_P2_assays, rvp_assays)).reset_index(drop=True)
157+
147158
# fix the suffix in record_id
148159
record_id_fix = []
149160
for row in samplesDF.itertuples():
@@ -154,26 +165,25 @@ def merge_group(group):
154165
new_record_id = record_id + "_" + suffix_record_id
155166
record_id_fix.append(new_record_id)
156167
samplesDF['record_id'] = record_id_fix
157-
168+
158169
# drop suffix_record_id
159170
samplesDF = samplesDF.drop(columns=['suffix_record_id'])
160-
171+
161172
# concatenate back to redcap
162-
concat_redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
163-
173+
redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
174+
164175
### write sampleid as the sample_prefix for all samples but those containing CPC, NTC, and NDC
165-
mask = ~concat_redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
166-
concat_redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = concat_redcap_t13_hit_binary_output['sampleid_prefix']
167-
176+
mask = ~redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
177+
redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = redcap_t13_hit_binary_output['sampleid_prefix']
178+
168179
# drop sample_prefix_id
169-
concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])
170-
180+
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])
181+
171182
### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
172-
concat_redcap_t13_hit_binary_output.columns = concat_redcap_t13_hit_binary_output.columns.str.lower()
183+
redcap_t13_hit_binary_output.columns = redcap_t13_hit_binary_output.columns.str.lower()
173184

174185
### reset index
175-
concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.reset_index(drop=True)
176-
186+
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.reset_index(drop=True)
177187

178-
return concat_redcap_t13_hit_binary_output # redcap_t13_hit_binary_output, samplesDF, controlsDF
188+
return redcap_t13_hit_binary_output
179189

0 commit comments

Comments
 (0)