-
Notifications
You must be signed in to change notification settings - Fork 0
/
threshold.py
108 lines (77 loc) · 4.83 KB
/
threshold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import numpy as np
import pandas as pd
import sys
class Thresholder:
def __init__(self):
pass
def convert_df(self,df):
return df.to_csv(index=False).encode('utf-8')
def raw_thresholder(self, unique_crRNA_assays, assigned_only, t13_df, CLI_thresh_arg):
# Filter rows containing 'NTC' in any column, produces df
ntc_PerAssay = t13_df[t13_df.index.str.contains('NTC')]
# Collapse any columns that have the same name in the NTCs df
# there should not be duplicates for NTCs in t13_df - as this would have been taken care of by MedianSort
# but the .mean() below is to ensure this - but more so simplify the groupby and transpose functions
ntc_PerAssay = ntc_PerAssay.T.groupby(by=ntc_PerAssay.columns).mean()
# Calculate mean of each column
mean_ntc = ntc_PerAssay.mean(axis=1)
# Make a new df with these mean values
ntc_mean_df = pd.DataFrame({'NTC Mean': mean_ntc})
# Transpose the results to have assays as columns
ntc_mean_df = ntc_mean_df.transpose()
if CLI_thresh_arg == '1.8_Mean':
# Scale the NTC mean by 1.8 to generate thresholds
raw_thresholds_df = ntc_mean_df * 1.8
# Calculate the Normalized NTC threshold too
norm_thresholds_df = raw_thresholds_df / ntc_mean_df
# Append ntc_mean_df, raw_thresholds_df, and norm_thresholds_df
raw_thresholds_df = pd.concat([ntc_mean_df, raw_thresholds_df, norm_thresholds_df], ignore_index=True, axis=0)
raw_thresholds_df.index = ['NTC Mean', 'NTC Threshold', 'Normalized NTC Threshold']
elif CLI_thresh_arg == '3_SD':
# Calculate the standard deviation per assay of all NTC values (replicates included)
filtered_assigned_only = assigned_only.loc[:,["t13","assay","sample"]]
filtered_assigned_only = filtered_assigned_only[filtered_assigned_only['sample'].str.contains('NTC')]
ntc_sd_df = pd.DataFrame(index = ['SD'], columns = unique_crRNA_assays)
for i in unique_crRNA_assays:
filtered_assigned_only_i = filtered_assigned_only[filtered_assigned_only['assay'].isin([i])]
t13_data_i = filtered_assigned_only_i['t13']
for col_name in ntc_sd_df.columns:
if i == col_name:
ntc_sd_df.at['SD',col_name] = t13_data_i.std()
# raw_thresholds_df = ntc_sd_df
# Make a copy of ntc_mean_df
raw_thresholds_df = ntc_mean_df.copy(deep=True)
# Scale the SD by 3 to generate 3*SD per assay
ntc_3sd_df = ntc_sd_df * 3
ntc_3sd_df.index = ['NTC 3*SD']
# Sum the NTC mean and 3*SD per assay to generate thresholds
for col_name, col_data in raw_thresholds_df.items():
for index, value in col_data.items():
# Find the NTC 3*SD for the given assay
ntc_3sd = ntc_3sd_df.loc['NTC 3*SD', col_name]
# Sum the mean and the 3*SD value
raw_thresholds_df.at[index, col_name] = value + ntc_3sd
# Append the threshold to ntc_mean_df
raw_thresholds_df = pd.concat([ntc_mean_df, ntc_sd_df, ntc_3sd_df, raw_thresholds_df], ignore_index=True, axis=0)
raw_thresholds_df.index = ['NTC Mean', 'NTC Standard Deviation', 'NTC 3*SD', 'NTC Threshold']
# Calculate the Normalized NTC threshold too
norm_thresholds_df = raw_thresholds_df.loc['NTC Threshold'] / ntc_mean_df
raw_thresholds_df = pd.concat([raw_thresholds_df, norm_thresholds_df], ignore_index=True, axis=0)
raw_thresholds_df.index = ['NTC Mean', 'NTC Standard Deviation', 'NTC 3*SD', 'NTC Threshold', 'Normalized NTC Threshold']
else:
print("Consult ReadME and input appropriate command-line arguments to specify thresholding method.")
sys.exit()
# Produce the t13 output as Binary Pos/Neg Results
binary = ['POSITIVE', 'NEGATIVE']
for col_name, col_data in t13_df.items():
for index, value in col_data.items():
threshold = raw_thresholds_df.loc['NTC Threshold', col_name]
t13_df[col_name] = t13_df[col_name].map(str)
# Compare the value with the threshold and assign positive/negative accordingly
if value >= threshold:
t13_df.at[index, col_name] = binary[0]
else:
t13_df.at[index, col_name] = binary[1]
# Create a new row called 'Summary' at the bottom of the hit output sheet
t13_df.loc['Summary'] = t13_df.apply(lambda col: col.value_counts().get('POSITIVE', 0))
return raw_thresholds_df, t13_df