Skip to content

Commit

Permalink
Software Update 3.2.3
Browse files Browse the repository at this point in the history
Improvements to user interface and plotting normalized t13 data
  • Loading branch information
fsalbeez committed Aug 26, 2024
1 parent 6e34b82 commit 470b3d3
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 34 deletions.
99 changes: 72 additions & 27 deletions analyze_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
barcode_assignment = match.group(1)
print(f"IFC barcode: {barcode_assignment}")

#####
# instantiate Data Reader from reader.py
reader = DataReader() # make an empty class object that corresponds to the DataReader() object from reader.py

# then load in the data file
data_files = sorted([fname for fname in all_files if fname.suffix == ".csv"])
Expand All @@ -64,8 +67,6 @@
file_like_object = BytesIO(data_file.read_bytes())
df_data = pd.read_csv(file_like_object, on_bad_lines="skip")
print(f"This is the Data File that was loaded: {data_file.name}")

reader = DataReader() # make am empty class object that corresponds to the DataReader() object from reader.py

phrases_to_find = [
"Raw Data for Passive Reference ROX",
Expand All @@ -80,13 +81,20 @@

# at this point, we have loaded the assignment sheet and have sorted through the loaded data file to create a dict of dataframes

#####
# instantiate DataProcessor from norm.py
processor = DataProcessor()
# normalize the signal
normalized_dataframes = processor.background_processing(dataframes)

# The premise of the code is that different viral panels require different thresholding
# So the user will specify command line arguments as per the ReadMe instructions
# and this portion of the code is meant to access the CLI arguments and modify the threshold specified in the code

CLI_arg = sys.argv

# make an output folder in your path's wd if it hasn't been made already
output_folder = f'output_{barcode_assignment}'
output_folder = f'output_{barcode_assignment}_[{CLI_arg[1]}]'
if not os.path.exists(output_folder):
os.makedirs(output_folder)

Expand All @@ -95,6 +103,7 @@
normalized_dataframes['signal_norm'].to_csv(os.path.join(output_folder, 'signal_norm.csv'), index=True)
normalized_dataframes['ref_norm'].to_csv(os.path.join(output_folder, 'ref_norm.csv'), index=True)

#####
# instantiate DataMatcher from matcher.py
matcher = DataMatcher()
assigned_norms, assigned_lists = matcher.assign_assays(assignment_files[0], normalized_dataframes['ref_norm'], normalized_dataframes['signal_norm'])
Expand All @@ -104,9 +113,11 @@
assigned_norms['signal_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_signal_norm.csv'), index=True)
assigned_norms['ref_norm_raw'].to_csv(os.path.join(output_folder, 'assigned_ref_norm.csv'), index=True)

samples_list = assigned_lists['samples_list']
# collect the assays/samples from the layout assays/samples in the assignment sheet (this extraction is done in matcher.py)
crRNA_assays = assigned_lists['assay_list']
samples_list = assigned_lists['samples_list']

#####
# instantiate ntcContaminationChecker from ntc_con_check.py
ntcCheck = ntcContaminationChecker()

Expand All @@ -115,16 +126,24 @@
# create df of filtered assigned_signal_norm by applying the NTC check to remove any NTCs whose raw signal suggests contamination
assigned_signal_norm_with_NTC_check = ntcCheck.ntc_cont(assigned_signal_norm) # feed this into MedianSort

# temporarily save assigned_signal_norm_with_NTC_check
assigned_signal_norm_with_NTC_check.to_csv(os.path.join(output_folder, 'assigned_signal_norm_with_NTC_check.csv'), index=True)


#####
# instantiate MedianSort from median_frame.py
median = MedianSort(crRNA_assays)
final_med_frames = median.create_median(assigned_signal_norm_with_NTC_check)

# temporarily print final_med_frames
#print(final_med_frames)

# Output needs to be rounded to 4 digits
rounded_final_med_frames = {}
# Define the number of decimals for rounding
decimals = 5

# Iterate through each row and column
# Iterate through each row and column, round each value
for key, df in final_med_frames.items():
rounded_df = pd.DataFrame(index=df.index, columns=df.columns)
for i in range(len(df.index)):
Expand All @@ -133,11 +152,18 @@
rounded_df.iloc[i, j] = round(df.iloc[i, j], decimals)
rounded_final_med_frames[key] = rounded_df

# Make subfolder in the output folder in your path's wd if it hasn't been made already
timepoints_subfolder = os.path.join(output_folder, f'timepoints_quantData_{barcode_assignment}')
if not os.path.exists(timepoints_subfolder):
os.makedirs(timepoints_subfolder)

# Save the dataframes per timepoint in the timepoints subfolder
timepoints = list(rounded_final_med_frames.keys())
for i, t in enumerate(timepoints, start=1):
filename = os.path.join(output_folder, f't{i}_{barcode_assignment}.csv')
filename = os.path.join(timepoints_subfolder, f't{i}_{barcode_assignment}.csv')
csv = rounded_final_med_frames[t].to_csv(filename, index=True)


# since we want to explicitly manipulate t13_csv, it is helpful to have the t13 df referenced outside of the for loop
last_key = list(rounded_final_med_frames.keys())[-1]
t13_dataframe_orig = rounded_final_med_frames[last_key]
Expand All @@ -147,12 +173,7 @@
# at this point, we have created a t1 thru t13 dataframe and exported all these dataframes as csv files in our output folder
# now we need to threshold the t13 csv and mark signals >= threshold as positive and < threshold as negative

# The premise of the code is that different viral panels require different thresholding
# So the user will specify command line arguments as per the ReadMe instructions
# and this portion of the code is meant to access the CLI arguments and modify the threshold specified in the code

CLI_arg = sys.argv

#####
# instantiate Thresholder from threshold.py
thresholdr = Thresholder()
unique_crRNA_assays = list(set(crRNA_assays))
Expand All @@ -161,7 +182,6 @@
# and save the file to your working directory
ntc_PerAssay, ntc_thresholds_output, t13_hit_output = thresholdr.raw_thresholder(unique_crRNA_assays, assigned_norms['signal_norm_raw'], t13_dataframe_copy1, CLI_arg[1])


# make copies of t13_hit_output csv for downstream summaries and quality control checks
t13_hit_output_copy1 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in ndc qual check
t13_hit_output_copy2 = pd.DataFrame(t13_hit_output).copy() # make a copy of t13_hit_output # used in cpc qual check
Expand All @@ -176,13 +196,15 @@
ntc_thresholds_output.to_csv(ntc_thresholds_output_file_path, index=True)
t13_hit_output.to_csv(hit_output_file_path, index=True)

#####
# instantiate NTC_Normalized from ntcnorm.py
ntcNorm = Normalized()
# apply ntc_normalizr to the t13_dataframe to produce a new dataframe with all values divided by the mean NTC for that assay
t13_quant_hit_norm = ntcNorm.normalizr(t13_dataframe_copy2)
quant_hit_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_quant_ntcNorm.csv')
t13_quant_hit_norm.to_csv(quant_hit_output_ntcNorm_file_path, index=True)
t13_quant_norm = ntcNorm.normalizr(t13_dataframe_copy2)
quant_output_ntcNorm_file_path = os.path.join(output_folder, f't13_{barcode_assignment}_normalized.csv')
t13_quant_norm.to_csv(quant_output_ntcNorm_file_path, index=True)

#####
# instantiate Binary_Converter from binary_results.py
binary_num_converter = Binary_Converter()
# apply hit_numeric_conv to the the t13_hit_output to produce a new dataframe with all pos/neg converted to binary 1/0 output
Expand All @@ -194,14 +216,15 @@
t13_hit_binary_output_copy1 = pd.DataFrame(t13_hit_binary_output).copy() # used in coninf check
t13_hit_binary_output_copy2 = pd.DataFrame(t13_hit_binary_output).copy()

#####
# instantiate Summarized from summary.py
summary = Summarized()
# apply summarizer to the t13_dataframe to produce a new dataframe tabulating all of the positive samples
summary_samples_df = summary.summarizer(t13_hit_output)
summary_pos_samples_file_path = os.path.join(output_folder, f'Positives_Summary_{barcode_assignment}.csv')
summary_samples_df.to_csv(summary_pos_samples_file_path, index=True)


#####
# instantiate Plotter from plotting.py
heatmap_generator = Plotter()

Expand All @@ -211,20 +234,38 @@
unique_crRNA_assays = list(OrderedDict.fromkeys(crRNA_assays))
heatmap = heatmap_generator.plt_heatmap(tgap, barcode_assignment,final_med_frames, samples_list, unique_crRNA_assays, timepoints)

# Make subfolder in the output folder in your path's wd if it hasn't been made already
heatmaps_subfolder = os.path.join(output_folder, f'heatmaps_{barcode_assignment}')
if not os.path.exists(heatmaps_subfolder):
os.makedirs(heatmaps_subfolder)

# save heatmap per timepoint
for i, t in enumerate(timepoints, start=1):
#csv = convert_df(final_med_frames[t])
heatmap_filename = os.path.join(output_folder, f'Heatmap_t{i}_{barcode_assignment}.png')
heatmap_filename = os.path.join(heatmaps_subfolder, f'Heatmap_t{i}_{barcode_assignment}.png')
fig = heatmap[t].savefig(heatmap_filename, bbox_inches = 'tight', dpi=80)
plt.close(fig)

print(f"The heatmap plots saved to the folder, {output_folder}")
print(f"The heatmap plots saved to the folder, {heatmaps_subfolder} in {output_folder}")

heatmap_t13_quant_norm = heatmap_generator.t13_plt_heatmap(tgap, barcode_assignment,t13_quant_norm, samples_list, unique_crRNA_assays, timepoints)
heatmap_t13_quant_norm_filename = os.path.join(output_folder, f'Heatmap_t13_{barcode_assignment}_normalized.png')
fig = heatmap_t13_quant_norm.savefig(heatmap_t13_quant_norm_filename, bbox_inches = 'tight', dpi=80)
plt.close(fig)


#####
# instantiate Assay_QC_Score
assayScorer = Assay_QC_Score()
# take in t13_hit_binary_output as the df to build off of
QC_score_per_assay_df = assayScorer.assay_level_score(t13_hit_binary_output)
assay_lvl_QC_score_file_path = os.path.join(output_folder, f'Assay_Level_QC_Metrics_{barcode_assignment}.csv')

# Make subfolder in the output folder in your path's wd if it hasn't been made already
assayQC_subfolder = os.path.join(output_folder, f'assay_performance_evaluation_{barcode_assignment}')
if not os.path.exists(assayQC_subfolder):
os.makedirs(assayQC_subfolder)

assay_lvl_QC_score_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Test_Results_{barcode_assignment}.csv')
QC_score_per_assay_df.to_csv(assay_lvl_QC_score_file_path, index=True)

# write text file explaining the QC score
Expand Down Expand Up @@ -264,16 +305,22 @@
assayScores_Explanation.append("The final score per assay is included for easy comparison of assay performance.\n")

# create and save an output text file containing the quality control checks
assayScores_file_path = os.path.join(output_folder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt')
assayScores_file_path = os.path.join(assayQC_subfolder, f'Assay_Performance_QC_Tests_{barcode_assignment}.txt')
with open(assayScores_file_path, 'w') as f:
for line in assayScores_Explanation:
f.write(line + '\n')

print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {output_folder}")
print(f"The four quality control tests to evaluate assay performance are complete. Their results have been saved to the folder, {assayQC_subfolder}")

#####
# instantiate Qual_Ctrl_Checks from qual-checks.py
qual_checks = Qual_Ctrl_Checks()

# Make subfolder in the output folder in your path's wd if it hasn't been made already
qc_subfolder = os.path.join(output_folder, 'quality_control_flags')
if not os.path.exists(qc_subfolder):
os.makedirs(qc_subfolder)

# initialize a list to collect all quality control checks
QC_lines = []

Expand Down Expand Up @@ -332,7 +379,6 @@
# apply ntc_check to the t13_hit_output df to generate a list of all ntc positive assays
assigned_signal_norm_2 = pd.DataFrame(assigned_norms['signal_norm_raw']).copy() # make a copy of assigned_signal_norm dataframe


high_raw_ntc_signal = qual_checks.ntc_check(assigned_signal_norm_2)
QC_lines.append("4. Evaluation of No Target Control (NTC) Contamination \n")
if high_raw_ntc_signal:
Expand All @@ -348,7 +394,7 @@
coinfection_df = qual_checks.coinf_check(t13_hit_binary_output_copy1)
QC_lines.append("5. Evaluation of Potential Co-Infected Samples\n")

coinfection_df_file_path = os.path.join(output_folder, f'Coinfection_Check_{barcode_assignment}.csv')
coinfection_df_file_path = os.path.join(qc_subfolder, f'Coinfection_Check_{barcode_assignment}.csv')
coinfection_df.to_csv(coinfection_df_file_path, index=True) # output needs to be csv of coninfection check

# in Qual_Check text file, add message saying "see coinf check csv" and "if any samples excpet CPC are flagged as being coinfected, there is risk of these samples being coinfected"
Expand All @@ -359,13 +405,12 @@
QC_lines.append(" - All other flagged samples should be further evaluated for potential co-infection.\n")
QC_lines.append("Please be advised to check the output files as well.")


# create and save an output text file containing the quality control checks
QCs_file_path = os.path.join(output_folder, f'Quality_Control_Flags_{barcode_assignment}.txt')
QCs_file_path = os.path.join(qc_subfolder, f'Quality_Control_Flags_{barcode_assignment}.txt')
with open(QCs_file_path, 'w') as f:
for line in QC_lines:
f.write(line + '\n')

print(f"The quality control checks are complete and saved to the folder, {output_folder}")
print(f"The quality control checks are complete and saved to the folder, {qc_subfolder}")


16 changes: 10 additions & 6 deletions ntc_con_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,33 @@ def ntc_cont(self, assigned_sig_norm):

# filter sample for anything that contains NTC (case-insensitive)
ntc_filtered_df = assigned_sig_norm[assigned_sig_norm['sample'].str.contains('NTC', case=False, na=False)]
# filter all other samples and save it in a new df
all_else_filtered_df = assigned_sig_norm[~assigned_sig_norm['sample'].str.contains('NTC', case=False, na=False)]

ntc_assay_dfs = []
# initiate list of ntc_assay_dfs
ntc_assay_dfs = []
# check per assay_df
for assay in ntc_filtered_df['assay'].unique():
# filter the df for the current assay
assay_df = ntc_filtered_df[ntc_filtered_df['assay'] == assay]

# Check if t13 value is greater than 0.5 for any sample
for _, row in assay_df.iterrows():
for index, row in assay_df.iterrows():
if row['t13'] > 0.5:
# Check if there is still more than one row left in assay_df
if len(assay_df) > 1:
# Remove the row where t13 is greater than 0.5
assay_df = assay_df[assay_df['t13'] != row['t13']]
assay_df = assay_df.drop(index)
else:
# If there is only one row left, set row['t13'] to 0.5
assay_df.at[row.name, 't13'] = 0.5
assay_df.at[index, 't13'] = 0.5
# add assay_df back to list of ntc_assay_dfs
ntc_assay_dfs.append(assay_df)

# convert ntc_assay_dfs list into a df
combined_ntc_assay_dfs = pd.concat(ntc_assay_dfs, ignore_index=True)

assigned_sig_norm = pd.concat([combined_ntc_assay_dfs, all_else_filtered_df], ignore_index=True)

# pull the samples list

return assigned_sig_norm
2 changes: 2 additions & 0 deletions ntcnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def normalizr(self, t13_df):
ntc_mean = ntc_mean_df.loc['NTC Mean', col_name]
# Divide the value by the NTC mean per assay
t13_df.at[index, col_name] = value/ntc_mean

t13_df = t13_df.apply(pd.to_numeric, errors = 'coerce')

return t13_df

Expand Down
75 changes: 74 additions & 1 deletion plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,77 @@ def plt_heatmap(self, tgap, barcode_number, df_dict, sample_list, assay_list, tp
axes.vlines(v_lines, colors = 'silver',alpha=0.9,linewidths = 0.35,*axes.get_ylim())
fig_timepoints[i] = fig

return fig_timepoints
return fig_timepoints

def t13_plt_heatmap(self, tgap, barcode_number, df, sample_list, assay_list, tp):
    """Plot a heatmap of the NTC-normalized t13 dataframe.

    Parameters
    ----------
    tgap : int
        Minutes of delay before the first imaging cycle; used to compute
        the acquisition time shown in the title.
    barcode_number : str
        IFC barcode assignment, used in the plot title.
    df : pandas.DataFrame
        t13 data normalized against the assay-specific NTC mean
        (samples as the index, assays as the columns before transposition
        — assumed from the `df.transpose()` below; confirm against caller).
    sample_list : list
        Ordered sample names taken from the assignment sheet.
    assay_list : list
        Ordered unique crRNA assay names taken from the assignment sheet.
    tp : list
        Timepoint keys; only its length is used to build the time mapping.

    Returns
    -------
    matplotlib.figure.Figure
        A figure with one heatmap, or two stacked heatmaps when the run
        has exactly 192 samples.
    """
    # Map each timepoint label ("t1", "t2", ...) to its acquisition time in
    # minutes: t1 occurs at tgap + 3, and each later cycle adds 5 minutes.
    time_assign = {}
    for cycle in range(1, len(tp) + 1):
        time_assign["t" + str(cycle)] = tgap + 3 + (cycle - 1) * 5
    last_key = list(time_assign.keys())[-1]

    half_samples = int(len(sample_list) / 2)

    if len(sample_list) == 192:
        # A full 192-sample IFC is split into two stacked heatmaps of 96
        # samples each so the sample labels stay legible.
        first_half_samples = sample_list[:half_samples]
        second_half_samples = sample_list[half_samples:]

        fig, axes = plt.subplots(2, 1, figsize=(len(first_half_samples) * 0.5, len(assay_list) * 0.5 * 2))

        df = df.transpose()

        # First heatmap (first 96 samples)
        frame1 = df[first_half_samples].reindex(assay_list)
        ax1 = sns.heatmap(frame1, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20}, ax=axes[0])
        ax1.set_title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes (First {half_samples} Samples)', size=28)
        ax1.set_xlabel('Samples', size=14)
        ax1.set_ylabel('Assays', size=14)
        # Nudge the y-limits by half a cell to work around the matplotlib
        # heatmap row-clipping issue.
        bottom, top = ax1.get_ylim()
        ax1.set_ylim(bottom + 0.5, top - 0.5)
        ax1.tick_params(axis="y", labelsize=16)
        ax1.tick_params(axis="x", labelsize=16)
        plt.yticks(rotation=0)

        # Second heatmap (next 96 samples)
        frame2 = df[second_half_samples].reindex(assay_list)
        ax2 = sns.heatmap(frame2, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20}, ax=axes[1])
        ax2.set_title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes (Next {half_samples} Samples)', size=28)
        ax2.set_xlabel('Samples', size=14)
        ax2.set_ylabel('Assays', size=14)
        bottom, top = ax2.get_ylim()
        ax2.set_ylim(bottom + 0.5, top - 0.5)
        ax2.tick_params(axis="y", labelsize=16)
        ax2.tick_params(axis="x", labelsize=16)
        plt.yticks(rotation=0)

        # Adjust layout
        plt.tight_layout()

    else:
        df = df.transpose()
        frame = df[sample_list].reindex(assay_list)
        fig, axes = plt.subplots(1, 1, figsize=(len(frame.columns.values) * 0.5, len(frame.index.values) * 0.5))
        ax = sns.heatmap(frame, cmap='Reds', square=True, cbar_kws={'pad': 0.002}, annot_kws={"size": 20})

        # BUG FIX: time_assign is keyed by timepoint label ("t1"...), so
        # time_assign[-1] raised KeyError; use the final timepoint key, as
        # the 192-sample branch above already does.
        plt.title(f'Heatmap for data normalized against assay-specific NTC mean, for {barcode_number} at {time_assign[last_key]} minutes', size=28)
        plt.xlabel('Samples', size=14)
        plt.ylabel('Assays', size=14)
        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        ax.tick_params(axis="y", labelsize=16)
        ax.tick_params(axis="x", labelsize=16)
        plt.yticks(rotation=0)

        # Draw silver gridlines every 3 assays/samples to ease reading.
        h_lines = np.arange(3, len(assay_list), 3)
        v_lines = np.arange(3, len(sample_list), 3)
        axes.hlines(h_lines, *axes.get_xlim(), colors='silver', alpha=0.9, linewidths=0.35)
        axes.vlines(v_lines, *axes.get_ylim(), colors='silver', alpha=0.9, linewidths=0.35)

    return fig

0 comments on commit 470b3d3

Please sign in to comment.