From 5d60150d236abff5886c320b3170609de8cabfa9 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Mon, 20 Jan 2020 11:40:45 +0000 Subject: [PATCH 1/2] Now the windowing is fixed. --- peak_finder.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/peak_finder.py b/peak_finder.py index 85be79e..67bb93e 100755 --- a/peak_finder.py +++ b/peak_finder.py @@ -20,13 +20,15 @@ def get_log(pv): return(math.log10(float(pv))) # Function to find and annotate top association: -def find_top_association(chrom): +def find_top_association(chrom): global input_df global window global threshold test_df = input_df.loc[input_df.chromosome == chrom] - # test_df.loc[:,'pvalue'] = test_df['pvalue'].astype(float) # changing type of p-value column <- too dangerous. Underflow hazard. + print(chrom) + # print(test_df) + test_df['mLogPv'] = test_df.pvalue.apply(get_log) test_df.loc[:,'bp_location'] = test_df['bp_location'].astype(float) # changing type of position column @@ -43,17 +45,20 @@ def find_top_association(chrom): test_df.loc[index, 'isTopAssociation'] = 'false' continue - # We have found a top snp! + # We have found a top snp! else: pos = test_df.loc[index]['bp_location'] + pval = test_df.loc[index]['mLogPv'] + # Setting false flag for ALL variants within the window. test_df.loc[test_df[abs( test_df.bp_location - pos ) <= window].index,'isTopAssociation'] = 'false' - # If there are multiple variants with the same p-value, request review, othervise it's a true peak. - if len(test_df.loc[test_df['mLogPv'] == test_df.loc[index]['mLogPv']].chromosome) > 1: - test_df.loc[ test_df['mLogPv'] == test_df.loc[index]['mLogPv'], 'isTopAssociation'] = 'REQUIRES REVIEW' + # If there are multiple variants with the same p-value within the window they require review: + if len(test_df.loc[(test_df.mLogPv == pval) & (abs( test_df.bp_location - pos ) <= window)]) > 1: + test_df.loc[ (test_df.mLogPv == pval) & (abs( test_df.bp_location - pos ) <= window), 'isTopAssociation'] = 'REQUIRES REVIEW' + # Otherwise it's a true peak: else: - test_df.loc[ test_df['mLogPv'] == test_df.loc[index]['mLogPv'], 'isTopAssociation'] = 'true' + test_df.loc[ index, 'isTopAssociation'] = 'true' # Modifying the original dataframe: input_df.loc[ test_df.index, 'isTopAssociation'] = test_df.isTopAssociation From dc2973ccfd9b56f5289dad2fb99ba9ac2b7f832e Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Mon, 20 Jan 2020 11:43:43 +0000 Subject: [PATCH 2/2] Cleaning code --- peak_finder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/peak_finder.py b/peak_finder.py index 67bb93e..f853a75 100755 --- a/peak_finder.py +++ b/peak_finder.py @@ -25,12 +25,12 @@ def find_top_association(chrom): global window global threshold + # The dataframe is filtered by chromosome: test_df = input_df.loc[input_df.chromosome == chrom] - print(chrom) - # print(test_df) + # Calculate the -log(pv) and update column type of the position: test_df['mLogPv'] = test_df.pvalue.apply(get_log) - test_df.loc[:,'bp_location'] = test_df['bp_location'].astype(float) # changing type of position column + test_df.loc[:,'bp_location'] = test_df['bp_location'].astype(float) # sorting rows by p-value: for index in test_df.sort_values(['mLogPv']).index: