@@ -12,8 +12,11 @@ def __init__(self):
12
12
# method
13
13
def build_redcap (self , fl_t13_hit_binary_output_2 , date , barcode_assignment ):
14
14
15
+ # legend
16
+ # 1 = pos, 2 = neg, 3 = pending, 4 = not run, 5 = invalid, 6 = NTC contaminated
17
+
15
18
# merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
16
- def merge_group (group ):
19
+ def merge_group (group , bbp_P1_assays , bbp_P2_assays , rvp_assays ):
17
20
# select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
18
21
merged_row = pd .DataFrame (columns = group .columns )
19
22
merged_row .loc [0 ] = group .iloc [0 ]
@@ -24,106 +27,114 @@ def merge_group(group):
24
27
# if merged_row['cchfv'] = [5,4], then lambda fn will produce [5,None]
25
28
# dropna will make it merged_row['cchfv'] = [5]
26
29
# .unique ensures that only unique vals are retained
27
-
28
- if all (group [col ] == 4 ):
30
+ if all (group [col ] == 4 ): # not run
29
31
merged_row [col ] = 4
30
- else : # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
31
- filtered_values = group .loc [group [col ] != 4 , col ].dropna ().unique ()
32
- merged_row [col ] = filtered_values [0 ] if len (filtered_values ) == 1 else filtered_values [1 ]
32
+ elif all (group [col ] == 5 ): # both assays are invalid
33
+ merged_row [col ] = 5
34
+ elif all (group [col ] == 2 ): # both assays are negative
35
+ merged_row [col ] = 2
36
+ else :
37
+ p1_value = group .loc [group ["sampleid" ].str .endswith ("_P1" ), col ].dropna ().unique ()
38
+ p2_value = group .loc [group ["sampleid" ].str .endswith ("_P2" ), col ].dropna ().unique ()
39
+ rvp_value = group .loc [group ["sampleid" ].str .endswith ("_RVP" ), col ].dropna ().unique ()
40
+
41
# Mixed results across panels: keep the value reported by the rows of the
# panel that this assay belongs to (P1, P2 or RVP).
# NOTE(review): the value assigned is the array returned by .unique(); this
# assumes a single distinct value per panel within a group - confirm.
if col in bbp_P1_assays and len(p1_value) > 0:
    merged_row[col] = p1_value
elif col in bbp_P2_assays and len(p2_value) > 0:
    merged_row[col] = p2_value
# Guard on the RVP rows themselves. The previous check used p2_value, so a
# sample without a _P2 row never received its RVP result.
elif col in rvp_assays and len(rvp_value) > 0:
    merged_row[col] = rvp_value
47
+ """
48
+ else:
49
+ # if group[col] = [5,4] or [3, 4] - there's no world where it would be [5,3]
50
+ filtered_values = group.loc[group[col] != 4, col].dropna().unique()
51
+ # ^ group.loc[group[col] != 4, col] filters the rows in group where the column col is NOT equal to 4
52
+ merged_row[col] = filtered_values[0] #if len(filtered_values) ==1 else filtered_values[1]
53
+ """
33
54
34
55
# each record_id is split and the unique panel suffixes are added to suffix_record_id
35
56
merged_row ['suffix_record_id' ] = '_' .join (group ['record_id' ].apply (lambda x : x .split ('_' )[- 1 ]).unique ())
36
57
37
- # assign a sampleid to the merged_row (doesn't matter as sampleid col will be dropped later)
38
- #merged_row['sampleid'] = group['sampleid'].iloc[0]
39
-
40
58
return merged_row
41
59
42
60
### format input flagged t13 binary hit file
43
61
redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2 .copy ()
44
62
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .astype (str )
45
63
64
+ ### convert any cell val with a dagger † to 6 (NTC contaminated)
65
+ redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .map (lambda x : '6' if '0.0†' in x else x )
66
+
46
67
### convert 0 to 2 (negative)
47
68
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .replace (to_replace = r'^0.*' , value = 2 , regex = True )
48
69
70
+ ### convert 1.0 to 1 (positive)
71
+ redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .replace (to_replace = r'^1.0' , value = 1 , regex = True )
72
+
49
73
### drop any rows incl and below 'Summary' row
50
74
if 'Summary' in redcap_t13_hit_binary_output .index :
51
75
idx = redcap_t13_hit_binary_output .index .get_loc ('Summary' )
52
76
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .iloc [:idx ]
53
-
54
- ### convert any cell val with a dagger † to 6 (NTC contaminated)
55
- """
56
- test_redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.copy()
57
- test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.astype(str)
58
- test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.map(lambda x: str(x) if not isinstance(x, str) else x)
59
-
60
- for _, row in high_raw_ntc_signal_df.iterrows():
61
- cont_ntc_sample = row['Sample']
62
- cont_ntc_assay = row['Assay'].upper()
63
-
64
- # Check if the sample exists in the row index and the assay exists in the column header
65
- if cont_ntc_sample in test_redcap_t13_hit_binary_output.index and cont_ntc_assay in test_redcap_t13_hit_binary_output.columns:
66
- current_value = test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay]
67
- if '†' in current_value:
68
- test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay] = '6'
69
- """
70
- redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .astype (str )
71
- redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .map (lambda x : '6' if '†' in x else x )
72
-
73
-
74
77
75
- ### convert col vals for invalid assays to 5 (invalid)
78
+ ### convert col vals for invalid samples to 5 (invalid)
76
79
# for all invalid samples
77
80
redcap_t13_hit_binary_output .loc [redcap_t13_hit_binary_output ['SAMPLE VALID? Y/N' ] == 'N***' , :] = 5
78
-
81
+
82
+ ### convert col vals for invalid assays to 5 (invalid)
79
83
# for all invalid assays
80
84
assay_valid_cols = redcap_t13_hit_binary_output .columns [redcap_t13_hit_binary_output .loc ['Assay Valid?' ] == 'INVALID ASSAY' ]
81
85
for col in assay_valid_cols :
82
86
redcap_t13_hit_binary_output [col ] = 5
83
-
87
+
84
88
### drop the 'SAMPLE VALID? Y/N' col
85
89
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .drop ('SAMPLE VALID? Y/N' , axis = 1 )
86
-
90
+
87
91
### drop the 'Assay Valid?' row
88
92
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .drop ('Assay Valid?' , axis = 0 )
89
-
93
+
90
94
### drop any columns containing no_crRNA
91
95
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .loc [:, ~ redcap_t13_hit_binary_output .columns .str .lower ().str .contains ('no_crrna' )]
92
-
96
+
93
97
### clean up the column headers
# Assay headers are cut at the first '_' or '*'.
# RNASEP headers are cut at the first '*' only, so their _P1 / _P2 / _RVP
# panel suffix is preserved.
# NOTE(review): cutting at the first '_' would also shorten a header that
# contains an underscore in the assay name itself (the assay lists below use
# names such as HIV_1) - confirm the incoming header format.
cleaned_columns = []
for col in redcap_t13_hit_binary_output.columns:
    if re.search(r'rnasep', col, re.IGNORECASE):
        cleaned_columns.append(re.split(r'[*]', col)[0])
    else:
        cleaned_columns.append(re.split(r'[_*]', col)[0])
# Assign a fresh set of labels instead of writing into columns.values:
# an Index is meant to be immutable and in-place edits can leave its cached
# lookups out of date.
redcap_t13_hit_binary_output.columns = cleaned_columns
101
105
102
106
### add columns for the assays of the panel that wasn't run, since the REDCAP format needs headers for all assays (RVP and BBP)
107
+ # define assays
108
+ bbp_P1_assays = ['CCHFV' ,'EBOV' ,'HIV_1' ,'HIV_2' ,'LASV' , 'MBV' ,'MPOX_DNA' ,'PF_3_DNA' ,'WNV' ,'YFV' ]
109
+ bbp_P2_assays = ['CHI' , 'DENV' ,'HBV_DNA' ,'HCV' , 'HTV' , 'MMV' , 'ONN' ,'RBV' ,'RVFV' ,'SYPH_DNA' ,'ZIKV' ]
110
+ rvp_assays = ['SARS_COV-2' , 'HCOV_HKU1' , 'HCOV_NL63' , 'HCOV_OC43' , 'FLUAV' , 'FLUBV' , 'HMPV' , 'HRSV' , 'HPIV_3' ]
103
111
bbp_assays = ['CCHFV' , 'CHI' , 'DENV' , 'EBOV' , 'HBV_DNA' , 'HCV' , 'HIV_1' , 'HIV_2' , 'HTV' , 'LASV' , 'MBV' , 'MMV' ,
104
112
'MPOX_DNA' , 'ONN' , 'PF_3_DNA' , 'RBV' , 'RVFV' , 'SYPH_DNA' , 'WNV' , 'YFV' , 'ZIKV' ]
105
- rvp_assays = [ 'SARS_COV-2' , 'HCOV_HKU1' , 'HCOV_NL63' , 'HCOV_OC43' , 'FLUAV' , 'FLUBV' , 'HMPV' , 'HRSV' , 'HPIV_3' ]
113
+
106
114
# set column order
107
- column_order = bbp_assays + rvp_assays + ['RNASEP_P1' ,'RNASEP_P2' ]
115
+ column_order = bbp_assays + rvp_assays + ['RNASEP_P1' ,'RNASEP_P2' , 'RNASEP_RVP' ]
108
116
# when adding the new columns, enter the value as 4 (not run)
109
117
for col in column_order :
110
118
if col not in redcap_t13_hit_binary_output .columns :
111
119
redcap_t13_hit_binary_output [col ] = 4
112
-
120
+
113
121
### reorder cols
114
122
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output [column_order ]
115
-
123
+
116
124
### add in the metadata columns
117
125
# date
118
126
redcap_t13_hit_binary_output .insert (0 , "date" , date )
127
+
119
128
# barcode assignment
120
129
redcap_t13_hit_binary_output .insert (1 , "ifc" , barcode_assignment )
130
+
121
131
# sampleid: the row label with asterisks, pipes, daggers and whitespace removed
sampleid = [
    re.sub(r'[\*\|†\s]', '', label)
    for label in redcap_t13_hit_binary_output.index
]
redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)
137
+
127
138
# recordid
128
139
record_id = []
129
140
for row in redcap_t13_hit_binary_output .itertuples ():
@@ -132,18 +143,18 @@ def merge_group(group):
132
143
record_id .append (record_id_val )
133
144
134
145
redcap_t13_hit_binary_output .insert (0 , "record_id" , record_id )
135
-
146
+
136
147
### merge same samples ran on different panels
137
148
# extract sampleid before panel _P1 or _P2 or _RVP
138
149
redcap_t13_hit_binary_output ['sampleid_prefix' ] = redcap_t13_hit_binary_output ['sampleid' ].str .replace (r'(_P1|_P2|_RVP)$' , '' , regex = True )
139
-
150
+
140
151
# split the table in two: rows whose sampleid contains NDC, CPC or NTC
# (controls) and every remaining row (samples)
is_control = redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)
controlsDF = redcap_t13_hit_binary_output[is_control]
samplesDF = redcap_t13_hit_binary_output[~is_control]
143
-
154
+
144
155
# apply the merge_group function to each group in the groupby obj (which is a df)
145
- samplesDF = samplesDF .groupby ('sampleid_prefix' ).apply (merge_group ).reset_index (drop = True )
146
-
156
+ samplesDF = samplesDF .groupby ('sampleid_prefix' ).apply (lambda group : merge_group ( group , bbp_P1_assays , bbp_P2_assays , rvp_assays ) ).reset_index (drop = True )
157
+
147
158
# fix the suffix in record_id
148
159
record_id_fix = []
149
160
for row in samplesDF .itertuples ():
@@ -154,26 +165,25 @@ def merge_group(group):
154
165
new_record_id = record_id + "_" + suffix_record_id
155
166
record_id_fix .append (new_record_id )
156
167
samplesDF ['record_id' ] = record_id_fix
157
-
168
+
158
169
# drop suffix_record_id
159
170
samplesDF = samplesDF .drop (columns = ['suffix_record_id' ])
160
-
171
+
161
172
# concatenate back to redcap
162
- concat_redcap_t13_hit_binary_output = pd .concat ((samplesDF , controlsDF ), axis = 0 , ignore_index = True )
163
-
173
+ redcap_t13_hit_binary_output = pd .concat ((samplesDF , controlsDF ), axis = 0 , ignore_index = True )
174
+
164
175
### write sampleid as the sample_prefix for all samples but those containing CPC, NTC, and NDC
165
- mask = ~ concat_redcap_t13_hit_binary_output ['sampleid' ].str .contains ('NTC|CPC|NDC' , regex = True , na = False )
166
- concat_redcap_t13_hit_binary_output .loc [mask , 'sampleid' ] = concat_redcap_t13_hit_binary_output ['sampleid_prefix' ]
167
-
176
+ mask = ~ redcap_t13_hit_binary_output ['sampleid' ].str .contains ('NTC|CPC|NDC' , regex = True , na = False )
177
+ redcap_t13_hit_binary_output .loc [mask , 'sampleid' ] = redcap_t13_hit_binary_output ['sampleid_prefix' ]
178
+
168
179
# drop the helper column sampleid_prefix
169
- concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output .drop (columns = ['sampleid_prefix' ])
170
-
180
+ redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .drop (columns = ['sampleid_prefix' ])
181
+
171
182
### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
172
- concat_redcap_t13_hit_binary_output .columns = concat_redcap_t13_hit_binary_output .columns .str .lower ()
183
+ redcap_t13_hit_binary_output .columns = redcap_t13_hit_binary_output .columns .str .lower ()
173
184
174
185
### reset index
175
- concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output .reset_index (drop = True )
176
-
186
+ redcap_t13_hit_binary_output = redcap_t13_hit_binary_output .reset_index (drop = True )
177
187
178
- return concat_redcap_t13_hit_binary_output # redcap_t13_hit_binary_output, samplesDF, controlsDF
188
+ return redcap_t13_hit_binary_output
179
189
0 commit comments