diff --git a/peak_finder.py b/peak_finder.py index 85be79e..f853a75 100755 --- a/peak_finder.py +++ b/peak_finder.py @@ -20,15 +20,17 @@ def get_log(pv): return(math.log10(float(pv))) # Function to find and annotate top association: -def find_top_association(chrom): +def find_top_association(chrom): global input_df global window global threshold + # The dataframe is filtered by chromosome: test_df = input_df.loc[input_df.chromosome == chrom] - # test_df.loc[:,'pvalue'] = test_df['pvalue'].astype(float) # changing type of p-value column <- too dangerous. Underflow hazard. + + # Calculate the -log(pv) and update column type of the position: test_df['mLogPv'] = test_df.pvalue.apply(get_log) - test_df.loc[:,'bp_location'] = test_df['bp_location'].astype(float) # changing type of position column + test_df.loc[:,'bp_location'] = test_df['bp_location'].astype(float) # sorting rows by p-value: for index in test_df.sort_values(['mLogPv']).index: @@ -43,17 +45,20 @@ def find_top_association(chrom): test_df.loc[index, 'isTopAssociation'] = 'false' continue - # We have found a top snp! + # We have found a top snp! else: pos = test_df.loc[index]['bp_location'] + pval = test_df.loc[index]['mLogPv'] + # Setting false flag for ALL variants within the window. test_df.loc[test_df[abs( test_df.bp_location - pos ) <= window].index,'isTopAssociation'] = 'false' - # If there are multiple variants with the same p-value, request review, othervise it's a true peak. - if len(test_df.loc[test_df['mLogPv'] == test_df.loc[index]['mLogPv']].chromosome) > 1: - test_df.loc[ test_df['mLogPv'] == test_df.loc[index]['mLogPv'], 'isTopAssociation'] = 'REQUIRES REVIEW' + # If there are multiple variants with the same p-value within the window they require review: + if len(test_df.loc[(test_df.mLogPv == pval) & (abs( test_df.bp_location - pos ) <= window)]) > 1: + test_df.loc[ (test_df.mLogPv == pval) & (abs( test_df.bp_location - pos ) <= window), 'isTopAssociation'] = 'REQUIRES REVIEW' + # Otherwise it's a true peak: else: - test_df.loc[ test_df['mLogPv'] == test_df.loc[index]['mLogPv'], 'isTopAssociation'] = 'true' + test_df.loc[ index, 'isTopAssociation'] = 'true' # Modifying the original dataframe: input_df.loc[ test_df.index, 'isTopAssociation'] = test_df.isTopAssociation