@@ -11,18 +11,63 @@ def __init__(self):
    # method
    def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
+
+        # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
+        def merge_group(group):
+            # select the first row in the subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
+            merged_row = pd.DataFrame(columns=group.columns)
+            merged_row.loc[0] = group.iloc[0]
+
+            # the group is the unique sampleid_prefix - each group should have at most 2 rows
+            for col in group.columns:
+                if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
+                    # if group['cchfv'] = [5, 4], filtering out the 4 produces [5, None]
+                    # dropna() reduces it to [5]
+                    # .unique() ensures that only unique vals are retained
+
+                    if all(group[col] == 4):
+                        merged_row[col] = 4
+                    else:  # if group[col] = [5, 4] or [3, 4] - there's no world where it would be [5, 3]
+                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
+                        merged_row[col] = filtered_values[0] if len(filtered_values) == 1 else filtered_values[1]
+
+            # each record_id is split and the unique panel suffixes are joined into suffix_record_id
+            merged_row['suffix_record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
+
+            # assigning a sampleid to the merged_row is unnecessary (the sampleid col is dropped later)
+            # merged_row['sampleid'] = group['sampleid'].iloc[0]
+
+            return merged_row

        ### convert 0 to 2 (negative)
        redcap_t13_hit_binary_output = fl_t13_hit_binary_output_2.replace(0, 2)

-        ### drop any rows incl. and below the 'Summary' row
+        ### drop any rows incl. and below the 'Summary' row
        if 'Summary' in redcap_t13_hit_binary_output.index:
            idx = redcap_t13_hit_binary_output.index.get_loc('Summary')
            redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.iloc[:idx]

        ### convert any cell val with a dagger † to 6 (NTC contaminated)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.replace(r'.*†.*', 6, regex=True)
+        """
+        test_redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.copy()
+        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.astype(str)
+        test_redcap_t13_hit_binary_output = test_redcap_t13_hit_binary_output.map(lambda x: str(x) if not isinstance(x, str) else x)
+
+        for _, row in high_raw_ntc_signal_df.iterrows():
+            cont_ntc_sample = row['Sample']
+            cont_ntc_assay = row['Assay'].upper()
+
+            # Check if the sample exists in the row index and the assay exists in the column header
+            if cont_ntc_sample in test_redcap_t13_hit_binary_output.index and cont_ntc_assay in test_redcap_t13_hit_binary_output.columns:
+                current_value = test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay]
+                if '†' in current_value:
+                    test_redcap_t13_hit_binary_output.loc[cont_ntc_sample, cont_ntc_assay] = '6'
+        """
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
+        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.applymap(lambda x: '6' if '†' in x else x)
+

+
        ### convert col vals for invalid assays to 5 (invalid)
        # for all invalid samples
        redcap_t13_hit_binary_output.loc[redcap_t13_hit_binary_output['SAMPLE VALID? Y/N'] == 'N***', :] = 5
@@ -49,7 +94,7 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
            if re.search(r'rnasep|no_crrna', col, re.IGNORECASE):
                new_col = re.split(r'[*]', col)[0]
                redcap_t13_hit_binary_output.columns.values[i] = new_col
-
+
        ### add columns for the assays that weren't run, since the REDCAP format needs headers for all assays (RVP and BBP)
        bbp_assays = ['CCHFV', 'CHI', 'DENV', 'EBOV', 'HBV_DNA', 'HCV', 'HIV_1', 'HIV_2', 'HTV', 'LASV', 'MBV', 'MMV',
                      'MPOX_DNA', 'ONN', 'PF_3_DNA', 'RBV', 'RVFV', 'SYPH_DNA', 'WNV', 'YFV', 'ZIKV']
@@ -60,20 +105,22 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
        for col in column_order:
            if col not in redcap_t13_hit_binary_output.columns:
                redcap_t13_hit_binary_output[col] = 4
-
+
        ### reorder cols
        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[column_order]

        ### add in the metadata columns
+        # date
+        redcap_t13_hit_binary_output.insert(0, "date", date)
+        # barcode assignment
+        redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
+        # sampleid
        sampleid = []
-        for idx in redcap_t13_hit_binary_output.index:
+        for idx in redcap_t13_hit_binary_output.index:  # strip asterisks, pipes, daggers, and whitespace from the sample names
            cleaned_idx = re.sub(r'[\*\|†\s]', '', idx)
            sampleid.append(cleaned_idx)
-
-        redcap_t13_hit_binary_output.insert(0, "date", date)
-        redcap_t13_hit_binary_output.insert(1, "ifc", barcode_assignment)
        redcap_t13_hit_binary_output.insert(2, "sampleid", sampleid)
-
+        # record_id
        record_id = []
        for row in redcap_t13_hit_binary_output.itertuples():
            samp_id = row.sampleid
@@ -83,68 +130,46 @@ def build_redcap(self, fl_t13_hit_binary_output_2, date, barcode_assignment):
        redcap_t13_hit_binary_output.insert(0, "record_id", record_id)

        ### merge same samples run on different panels
-        # extract sampleid before panel
+        # extract sampleid before the panel suffix _P1, _P2, or _RVP
        redcap_t13_hit_binary_output['sampleid_prefix'] = redcap_t13_hit_binary_output['sampleid'].str.replace(r'(_P1|_P2|_RVP)$', '', regex=True)

-        # merge rows with the same sampleid_prefix - keep assay results unique, combine record_ids, update sampleid
-        def merge_group(group):
-            # select first row in subset of redcap_t13_hit_binary_output grouped by sampleid_prefix
-            merged_row = pd.DataFrame(columns=group.columns)
-            merged_row.loc[0] = group.iloc[0]
+        # subset redcap into two dfs: controls vs. samples
+        controlsDF = redcap_t13_hit_binary_output[redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]
+        samplesDF = redcap_t13_hit_binary_output[~redcap_t13_hit_binary_output['sampleid'].str.contains('NDC|CPC|NTC', regex=True, na=False)]

-            # the group is the unique sampleid_prefix - each group should have at most 2 rows
-            for col in group.columns:
-                if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
-                    # if group['cchfv'] = [5, 4], filtering out the 4 produces [5, None]
-                    # dropna() reduces it to [5]
-                    # .unique() ensures that only unique vals are retained
-
-                    if all(group[col] == 4):
-                        merged_row[col] = 4
-                    else:  # if group[col] = [5, 4] or [3, 4] - there's no world where it would be [5, 3]
-                        filtered_values = group.loc[group[col] != 4, col].dropna().unique()
-                        merged_row[col] = filtered_values[0]
-
-            # each record_id is split and the panel suffixes are appended to the record_id - .unique ensures that all distinct splits are joined together
-            merged_row['record_id'] = '_'.join(group['record_id'].apply(lambda x: x.split('_')[-1]).unique())
-
-            # assign a sampleid to the merged_row (doesn't matter as the sampleid col will be dropped later)
-            merged_row['sampleid'] = group['sampleid'].iloc[0]
-
-            return merged_row
-
        # apply the merge_group function to each group in the groupby obj (which is a df)
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)
+        samplesDF = samplesDF.groupby('sampleid_prefix').apply(merge_group).reset_index(drop=True)

-        # make record_id col be ifc + sampleid_prefix + record_id (which is just _P1_P2)
+        # fix the suffix in record_id
        record_id_fix = []
-        for row in redcap_t13_hit_binary_output.itertuples():
+        for row in samplesDF.itertuples():
            record_id = row.record_id
-            sampleid_prefix = row.sampleid_prefix
-            sampleid = row.sampleid
-            if not any(control in sampleid_prefix for control in ['NTC', 'CPC', 'NDC']):
-                record_id_val = barcode_assignment + '_' + sampleid_prefix + '_' + record_id
-                record_id_fix.append(record_id_val)
-            else:
-                record_id_val = barcode_assignment + '_' + sampleid + '_' + record_id
-
-        redcap_t13_hit_binary_output['record_id'] = record_id_fix
-
-        ### drop sampleid
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.drop(columns=['sampleid'])
-
-        ### rename sampleid_prefix as sampleid and insert it as the 4th col
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.rename(columns={'sampleid_prefix': 'sampleid'})
-        cols = list(redcap_t13_hit_binary_output.columns)
-        cols.remove('sampleid')
-        cols.insert(3, 'sampleid')
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output[cols]
+            suffix_record_id = row.suffix_record_id
+            record_id = record_id.split("_")[:-1]
+            record_id = "_".join(record_id)
+            new_record_id = record_id + "_" + suffix_record_id
+            record_id_fix.append(new_record_id)
+        samplesDF['record_id'] = record_id_fix
+
+        # drop suffix_record_id
+        samplesDF = samplesDF.drop(columns=['suffix_record_id'])
+
+        # concatenate samplesDF and controlsDF back into one redcap df
+        concat_redcap_t13_hit_binary_output = pd.concat((samplesDF, controlsDF), axis=0, ignore_index=True)
+
+        ### write sampleid as the sampleid_prefix for all samples but those containing CPC, NTC, and NDC
+        mask = ~concat_redcap_t13_hit_binary_output['sampleid'].str.contains('NTC|CPC|NDC', regex=True, na=False)
+        concat_redcap_t13_hit_binary_output.loc[mask, 'sampleid'] = concat_redcap_t13_hit_binary_output['sampleid_prefix']
+
+        # drop sampleid_prefix
+        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.drop(columns=['sampleid_prefix'])

        ### lowercase all columns in redcap_t13_hit_binary_output for REDCAP data entry
-        redcap_t13_hit_binary_output.columns = redcap_t13_hit_binary_output.columns.str.lower()
+        concat_redcap_t13_hit_binary_output.columns = concat_redcap_t13_hit_binary_output.columns.str.lower()

        ### reset index
-        redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.reset_index(drop=True)
+        concat_redcap_t13_hit_binary_output = concat_redcap_t13_hit_binary_output.reset_index(drop=True)
+

-        return redcap_t13_hit_binary_output
+        return concat_redcap_t13_hit_binary_output  # redcap_t13_hit_binary_output, samplesDF, controlsDF
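
A minimal sketch of how the panel-merge step introduced in this diff is intended to behave on a toy frame. The sample names (X01_P1 / X01_P2), assay columns, and values below are hypothetical, and the `merge_group` body simply mirrors the one added above; this is an illustration, not part of the module.

```python
import pandas as pd

# toy stand-in for redcap_t13_hit_binary_output after the metadata columns are added
# (hypothetical sample names and assay values)
df = pd.DataFrame({
    "record_id": ["ifc1_X01_P1", "ifc1_X01_P2"],
    "date": ["2024-01-01", "2024-01-01"],
    "ifc": ["ifc1", "ifc1"],
    "sampleid": ["X01_P1", "X01_P2"],
    "CCHFV": [5, 4],        # 4 = assay not on that panel, 5 = invalid
    "SARS_COV_2": [4, 2],   # 2 = negative
})
df["sampleid_prefix"] = df["sampleid"].str.replace(r"(_P1|_P2|_RVP)$", "", regex=True)

def merge_group(group):
    # same idea as the merge_group added in the diff: collapse each sampleid_prefix group into one row
    merged_row = pd.DataFrame(columns=group.columns)
    merged_row.loc[0] = group.iloc[0]
    for col in group.columns:
        if col not in ["record_id", "date", "ifc", "sampleid", "sampleid_prefix"]:
            if all(group[col] == 4):
                merged_row[col] = 4
            else:
                filtered = group.loc[group[col] != 4, col].dropna().unique()
                merged_row[col] = filtered[0] if len(filtered) == 1 else filtered[1]
    # collect the distinct panel suffixes (P1, P2, RVP) from the record_ids
    merged_row["suffix_record_id"] = "_".join(group["record_id"].apply(lambda x: x.split("_")[-1]).unique())
    return merged_row

merged = df.groupby("sampleid_prefix").apply(merge_group).reset_index(drop=True)
print(merged[["sampleid_prefix", "CCHFV", "SARS_COV_2", "suffix_record_id"]])
```

Running this collapses the two panel rows into a single X01 row with CCHFV=5, SARS_COV_2=2, and suffix_record_id="P1_P2", which the loop in the diff then splices back onto record_id.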
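One compatibility note on the dagger conversion above: `DataFrame.applymap` is deprecated from pandas 2.1 onward in favor of `DataFrame.map`. If the project runs on pandas >= 2.1 (an assumption, not something this diff states), an equivalent would be:

```python
# element-wise dagger check; DataFrame.map replaces the deprecated applymap on pandas >= 2.1
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.astype(str)
redcap_t13_hit_binary_output = redcap_t13_hit_binary_output.map(lambda x: '6' if '†' in x else x)
```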