From 15408538b0bdc3c297ff89ec2a4638f7d7d0f9e1 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 22 Jul 2024 13:00:42 +0200 Subject: [PATCH] Remove abundance.py script I didn't use this script, do not want to maintain it, and generally don't trust that it provides reliable answers. --- src/abundance.py | 61 ------------------------------------------------ 1 file changed, 61 deletions(-) delete mode 100644 src/abundance.py diff --git a/src/abundance.py b/src/abundance.py deleted file mode 100644 index b5cad2f3..00000000 --- a/src/abundance.py +++ /dev/null @@ -1,61 +0,0 @@ -import sys -import os -import argparse -import numpy as np -import vamb - -parser = argparse.ArgumentParser( - description="""Command-line bin abundance estimator. -Print the median RPKM abundance for each bin in each sample to STDOUT. -Will read the RPKM file into memory - beware.""", - formatter_class=argparse.RawDescriptionHelpFormatter, - add_help=False, -) - -parser.add_argument("rpkmpath", help="Path to RPKM file") -parser.add_argument("clusterspath", help="Path to clusters.tsv") -parser.add_argument("headerpath", help="Path to list of headers") - -if len(sys.argv) == 1: - parser.print_help() - sys.exit() - -args = parser.parse_args() - -# Check files -for infile in (args.rpkmpath, args.clusterspath, args.headerpath): - if not os.path.isfile(infile): - raise FileNotFoundError(infile) - -# Load in files -with open(args.headerpath) as file: - indexof = {line.strip(): i for i, line in enumerate(file)} - -with open(args.clusterspath) as file: - clusters = vamb.vambtools.read_clusters(file) - -# Check that all clusters names are in headers: -for cluster in clusters.values(): - for header in cluster: - if header not in indexof: - raise KeyError("Header not found in headerlist: {}".format(header)) - -# Load RPKM and check it -rpkm = vamb.vambtools.read_npz(args.rpkmpath) -nsamples = rpkm.shape[1] - -if len(indexof) != len(rpkm): - raise ValueError("Not the same number of headers as rows in RPKM file") - -# Now estimate abundances -for clustername, cluster in clusters.items(): - depths = np.empty((len(cluster), nsamples), dtype=np.float32) - - for row, header in enumerate(cluster): - index = indexof[header] - depths[row] = rpkm[index] - - median_depths = np.median(depths, axis=0) - - print(clustername, end="\t") - print("\t".join([str(i) for i in median_depths]))