Skip to content

Commit

Permalink
added fn to run entire pipeline
Browse files Browse the repository at this point in the history
also cleaned up some descriptions and stopped adding alt_name to output
  • Loading branch information
acferris committed Oct 12, 2024
1 parent eb9463c commit 70c8ec6
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 18 deletions.
22 changes: 13 additions & 9 deletions base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def filterData(countsFile, metaFile, minloci, minSample, refFilter = None):
Args:
countsFile: path to reformatted counts file
metaFile: path to the metadata file paired with the countsFile
minSample: samples must be missing counts data from less than X proportion of markers
minloci: markers must be have counts data from more than than Y proportion of samples
refFilter: (optional) remove references with a divergence score above this value
'''
#import counts data
Expand All @@ -29,7 +31,7 @@ def filterData(countsFile, metaFile, minloci, minSample, refFilter = None):

#check that all marker names are unique
marker, markerCount = np.unique(counts.index, return_counts=True)
if len(np.where(markerCount > 2)[0]) > 0: print('Multiple markers in same gene, differentiate marker names in the count file')
if len(np.where(markerCount > 2)[0]) > 0: print('More than two rows with the same marker name, please differentiate marker names in the count file')

#import sample metadata
sampleMeta = pd.read_csv(metaFile)
Expand Down Expand Up @@ -151,14 +153,7 @@ def labelSamples(snpProportion,sampleMeta,db_communities,embedding, cutHeight, a
output['short_name'] = snpProportion.columns
if admixedCutoff:
output['divergence'] = plot.homozygousDivergence(snpProportion)
output['variety'] = pd.NA

#for each short_name, find the index in sampleMeta and grab the alt_name
countryID = []
for short_name in snpProportion.columns:
countryID.append(sampleMeta[sampleMeta['short_name'] == int(short_name)]['alt_name'].values[0])
output['countryID'] = countryID

output['variety'] = pd.NA

for cluster in np.unique(db_communities):
if cluster == -1: #-1 indicates samples that are disconnected from the rest of the clusters
Expand Down Expand Up @@ -220,6 +215,15 @@ def loadParameters(parameterFile):

return minSample, minloci, umapSeed, epsilon, cutHeight, admixedCutoff, filePrefix, inputCountsFile, inputMetaFile

def runPipeline(parameterFile):
minSample, minloci, umapSeed, epsilon, cutHeight, admixedCutoff, filePrefix, inputCountsFile, inputMetaFile = loadParameters(parameterFile)
snpProportion, snpProportionNoInterpolation, sampleMeta = filterData(inputCountsFile, inputMetaFile, minloci, minSample)
embedding = embedData(snpProportion, umapSeed)
db_communities = clusteringDBSCAN(snpProportion, sampleMeta, embedding, epsilon, filePrefix, admixedCutoff)
output = labelSamples(snpProportion, sampleMeta, db_communities, embedding, cutHeight, admixedCutoff, filePrefix)

return snpProportion, snpProportionNoInterpolation, sampleMeta, embedding, db_communities, output

if __name__ == '__main__':
minSample, minloci, umapSeed, epsilon, cutHeight, admixedCutoff, filePrefix, inputCountsFile, inputMetaFile = loadParameters(parameterFile)
snpProportion, snpProportionNoInterpolation, sampleMeta = filterData(inputCountsFile, inputMetaFile, minloci, minSample)
Expand Down
14 changes: 5 additions & 9 deletions graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,22 +690,17 @@ def heatmapDendrogramAll(snpProportion, sampleMeta, communities, filePrefix, cut
plt.savefig(filePrefix+' dendrogram cluster '+str(clusterNum)+' (cut height'+str(cutHeight)+').png', dpi = 300)

def barchartLandrace(snpProportion, output, sampleMeta, filePrefix = None, cutoff = 2):

"""
Barchart with the prevalence of each observed reference variety
Barchart with the prevalence of each observed genetic entity
Args:
snpProportion: processed SNP proportion data
output: output dataframe frome base.py
sampleMeta: metadata paired with genotyping data
filePrefix: optional prefix for output filenames to save
"""
w = sampleMeta[sampleMeta['short_name'].isin(snpProportion.columns.astype('int'))]
var, counts = np.unique(output['variety'], return_counts=True)

refshort = w['short_name'][(sampleMeta['reference'].notna())].values.astype('str') #references
admixedshort = output['short_name'][output['variety'] == 'Admixed'].values.astype('str') #admixed


landraceName = var[np.flatnonzero(np.core.defchararray.find(var.astype('str'),'Genetic entity')!=-1)]
landraceShort = output['short_name'][np.isin(output['variety'],landraceName)].values.astype('str')
landraceVarieties, landraceVarietiesCount = np.unique(output[output['short_name'].isin(landraceShort)]['variety'], return_counts=True)
Expand All @@ -719,14 +714,15 @@ def barchartLandrace(snpProportion, output, sampleMeta, filePrefix = None, cutof
ax.set_yticks(np.arange(len(landraceVarietiesFilter)),landraceVarietiesFilter[np.argsort(landraceVarietiesCountFilter)])
plt.tight_layout()
if filePrefix:
plt.savefig(filePrefix + ' barchart landrace horizontal.png', dpi = 300)
plt.savefig(filePrefix + ' genetic entity barchart.png', dpi = 300)

#histogram of occurences of genetic entities
plt.figure()
plt.hist(landraceVarietiesCount, bins = np.arange(max(landraceVarietiesCount)+2))
plt.xticks(np.arange(1,max(landraceVarietiesCount)+2))
plt.tight_layout()
if filePrefix:
plt.savefig(filePrefix + ' barchart landrace vertical.png', dpi = 300)
plt.savefig(filePrefix + ' genetic entity histogram.png', dpi = 300)

def heatmapDendrogram(snpProportion, sampleMeta, communities, COI, cutHeight):
"""
Expand Down

0 comments on commit 70c8ec6

Please sign in to comment.