From 77ab9929bc11b54897e47dbe321b1ad248696b8d Mon Sep 17 00:00:00 2001 From: acferris <29984203+acferris@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:46:05 -0700 Subject: [PATCH] reorganizing reference processing --- base.py | 1 + referenceProcessing.py | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/base.py b/base.py index 972b033..8528933 100644 --- a/base.py +++ b/base.py @@ -13,6 +13,7 @@ import scipy.cluster.hierarchy as sch import graphs as plot import randMatrix as rand +import referenceProcessing def filterData(countsFile, metaFile, minloci, minSample, refFilter = None): ''' diff --git a/referenceProcessing.py b/referenceProcessing.py index 7cf5b53..b37a3db 100644 --- a/referenceProcessing.py +++ b/referenceProcessing.py @@ -35,7 +35,7 @@ def histogramTechnicalRep(snpProportion, sampleMeta): distance.append(correlation(subset[:,j[0]],subset[:,j[1]])) sample.append(i) - + #histogram of distance between technical replicates plt.figure() plt.hist(distance, bins = 20) @@ -44,7 +44,7 @@ def histogramTechnicalRep(snpProportion, sampleMeta): return np.asarray(distance), np.asarray(sample) -def heatmapTechnicalRep(snpProportion, sampleMeta, divergentDistance, divergentInventory): +def heatmapTechnicalRep(snpProportion, sampleMeta, percentile): """ Generate a heatmap and dendrogram for each cluster @@ -54,6 +54,12 @@ def heatmapTechnicalRep(snpProportion, sampleMeta, divergentDistance, divergentI divergentDistance: list of divergence values for samples divergentInventory: sample names in the same order as divergentDistance """ + distance, sample = histogramTechnicalRep(snpProportion, sampleMeta) + + cutoff = distance[np.argsort(distance)[int(np.floor(percentile*len(distance)))]] + divergentDistance = distance[distance > cutoff] + divergentInventory = sample[distance > cutoff] + refSubset = sampleMeta[pd.notna(sampleMeta['reference_original'])] w = refSubset[refSubset['short_name'].isin(snpProportion.columns.astype('int'))] @@ -119,15 +125,4 @@ def referenceDistance(snpProportion, sampleMeta): plt.ylabel('Max distance') plt.xlabel('Proportion of references') plt.xticks(len(maxDistance)*np.arange(0,1.1,0.1), np.around(np.arange(0,1.1,0.1),1)) - plt.tight_layout() - -#Flag references where the technical replicates don't match and generate a heatmap -percentile = 0.95 #percentile cutoff -distance, sample = histogramTechnicalRep(snpProportion, sampleMeta) -cutoff = distance[np.argsort(distance)[int(np.floor(percentile*len(distance)))]] -heatmapTechnicalRep(snpProportion, sampleMeta, distance[distance > cutoff], sample[distance > cutoff]) -print('Inventory numbers to check: ', sample[distance > cutoff]) - -#Flag references with the same original label that turn up in different DBSCAN clusters and manually check -#It may be helpful to use plot.heatmapManyClusters() and plot.heatmapSingleVariety() to understand how to best relabel references -splitReferences(snpProportion, sampleMeta, db_communities) + plt.tight_layout() \ No newline at end of file