From d95a83325bd1c9e726254c17560df0c3a1f91e00 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward"
Date: Mon, 26 Aug 2024 12:27:34 -0700
Subject: [PATCH 1/9] clearer summary for lingroup results

---
 src/sourmash/tax/__main__.py  | 19 +++++++++++--------
 src/sourmash/tax/tax_utils.py | 28 ++++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index f0c8b4f2d..b5dd36538 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -154,6 +154,15 @@ def metagenome(args):
         error(f"ERROR: {str(exc)}")
         sys.exit(-1)
 
+    # if lingroup file is passed in, read it
+    lingroups=None
+    if args.lingroup is not None:
+        try:
+            lingroups = tax_utils.read_lingroups(args.lingroup)
+        except ValueError as exc:
+            error(f"ERROR: {str(exc)}")
+            sys.exit(-1)
+
     # write summarized output in human-readable format
     if "lineage_summary" in args.output_format:
         lineage_outfile, limit_float = make_outfile(
@@ -202,7 +211,7 @@ def metagenome(args):
         )
         with FileOutputCSV(summary_outfile) as out_fp:
             tax_utils.write_summary(
-                query_gather_results, out_fp, limit_float_decimals=limit_float
+                query_gather_results, out_fp, limit_float_decimals=limit_float, lingroups=lingroups,
             )
 
     # write summarized --> kreport output tsv
@@ -218,13 +227,7 @@ def metagenome(args):
         )
 
     # write summarized --> LINgroup output tsv
-    if "lingroup" in args.output_format:
-        try:
-            lingroups = tax_utils.read_lingroups(args.lingroup)
-        except ValueError as exc:
-            error(f"ERROR: {str(exc)}")
-            sys.exit(-1)
-
+    if "lingroup" in args.output_format and lingroups is not None:
         lingroupfile, limit_float = make_outfile(
             args.output_base, "lingroup", output_dir=args.output_dir
         )
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 5af08f91f..857687554 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1116,6 +1116,7 @@ def write_summary(
     sep=",",
     limit_float_decimals=False,
     classification=False,
+    lingroups=None,
 ):
     """
     Write taxonomy-summarized gather results for each rank.
@@ -1123,7 +1124,7 @@ def write_summary(
     w = None
     for q_res in query_gather_results:
         header, summary = q_res.make_full_summary(
-            limit_float=limit_float_decimals, classification=classification
+            limit_float=limit_float_decimals, classification=classification, lingroups=lingroups
         )
         if w is None:
             w = csv.DictWriter(csv_fp, header, delimiter=sep)
@@ -2073,9 +2074,15 @@ def as_lineage_dict(self, query_info, ranks):
                 lD[rank] = lin_name
         return lD
 
-    def as_summary_dict(self, query_info, limit_float=False):
+    def as_summary_dict(self, query_info, limit_float=False, lingroups=None):
         sD = asdict(self)
         sD["lineage"] = self.lineage.display_lineage(null_as_unclassified=True)
+        # if lingroups, convert lingroup number to lingroup name
+        if lingroups is not None and sD["lineage"] in lingroups.keys():
+            sD["lineage"] = lingroups[sD["lineage"]]
+        elif lingroups and sD["lineage"] != "unclassified" and sD["lineage"] not in lingroups.keys():
+            return None
+        # import pdb;pdb.set_trace()
         sD["query_name"] = query_info.query_name
         sD["query_md5"] = query_info.query_md5
         sD["query_filename"] = query_info.query_filename
@@ -2553,7 +2560,7 @@ def make_human_summary(self, display_rank, classification=False):
             results.append(res.as_human_friendly_dict(query_info=self.query_info))
         return results
 
-    def make_full_summary(self, classification=False, limit_float=False):
+    def make_full_summary(self, classification=False, limit_float=False, lingroups=None):
         results = []
         rD = {}
         if classification:
@@ -2590,7 +2597,18 @@ def make_full_summary(self, classification=False, limit_float=False):
             "total_weighted_hashes",
         ]
 
+        lingroup_ranks = set()
+        if lingroups is not None:
+            for lin in lingroups.keys():
+                # e.g. "14;1;0;0;0;0;0;0;0;0" => 9
+                lin_rank = len(lin.split(';')) - 1
+                lingroup_ranks.add(lin_rank)
+
         for rank in self.summarized_ranks[::-1]:  # descending
+            # if lingroups are provided, only report summary for specified lingroups
+            if lingroup_ranks:
+                if int(rank) not in lingroup_ranks:
+                    continue
             unclassified = []
             rank_results = self.summarized_lineage_results[rank]
             rank_results.sort(
                 key=lambda res: res.fraction, reverse=True
             )  # v5?: f_weighted_at_rank)
             for res in rank_results:
                 rD = res.as_summary_dict(
-                    query_info=self.query_info, limit_float=limit_float
+                    query_info=self.query_info, limit_float=limit_float, lingroups=lingroups,
                 )
+                if rD is None:
+                    continue
                 # save unclassified for the end
                 if rD["lineage"] == "unclassified":
                     unclassified.append(rD)

From 63b0d7561c97e809c9a82e5aad1700f5bb27c34a Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward"
Date: Mon, 26 Aug 2024 13:29:05 -0700
Subject: [PATCH 2/9] add test for lingroup csv_summary file

---
 tests/test_tax.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 389a53661..0190a5227 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -6061,6 +6061,91 @@ def test_metagenome_LIN_lingroups(runtmp):
     )
 
 
+def test_metagenome_LIN_lingroups_summary(runtmp):
+    # test lingroups summary file. Can no longer output stdout, b/c will produce 2 files
+    c = runtmp
+    csv_base = "out"
+    sum_csv = csv_base + ".summarized.csv"
+    csvout = runtmp.output(sum_csv)
+    outdir = os.path.dirname(csvout)
+
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
+
+    lg_file = runtmp.output("test.lg.csv")
+    with open(lg_file, "w") as out:
+        out.write("lin,name\n")
+        out.write("0;0;0,lg1\n")
+        out.write("1;0;0,lg2\n")
+        out.write("2;0;0,lg3\n")
+        out.write("1;0;1,lg3\n")
+        # write a 19 so we can check the end
+        out.write("1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--lingroup",
+        lg_file,
+        "-o",
+        csv_base,
+        '--output-dir',
+        outdir,
+        "-F", "csv_summary"
+    )
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert (
+        "Read 5 lingroup rows and found 5 distinct lingroup prefixes."
+        in c.last_result.err
+    )
+    assert os.path.exists(csvout)
+    sum_gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()]
+    print(sum_gather_results)
+    assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in sum_gather_results[0]
+    )
+    assert (
+        "test1,2,0.08815317112086159,lg1,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.05815279361459521,442000,0.9246458342627294,6139"
+        in sum_gather_results[1]
+    )
+    assert (
+        "test1,2,0.07778220981252493,lg2,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.050496823586903404,390000,0.920920083987624,6139"
+        in sum_gather_results[2]
+    )
+    assert (
+        "test1,2,0.027522935779816515,lg3,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.015637726014008795,138000,0.8905689983332759,6139"
+        in sum_gather_results[3]
+    )
+    assert (
+        "test1,2,0.010769844435580374,lg3,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.006515719172503665,54000,0.8640181883213995,6139"
+        in sum_gather_results[4]
+    )
+    assert (
+        "test1,2,0.7957718388512166,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.8691969376119889,3990000,,6139"
+        in sum_gather_results[5]
+    )
+    assert (
+        "test1,19,0.010769844435580374,lg4,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.006515719172503665,54000,0.8640181883213995,6139"
+        in sum_gather_results[6]
+    )
+    assert (
+        "test1,19,0.7957718388512166,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.8691969376119889,3990000,,6139"
+        in sum_gather_results[7]
+    )
+
+
 def test_metagenome_LIN_human_summary_no_lin_position(runtmp):
     c = runtmp
 

From 77c83c3c23d7b17c24062ed3969873090da2851c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward"
Date: Mon, 26 Aug 2024 13:48:46 -0700
Subject: [PATCH 3/9] add note for LINgroup csv_summary; add file exts to docs

---
 doc/command-line.md | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/doc/command-line.md b/doc/command-line.md
index 05f741475..e911d7519 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -637,13 +637,16 @@ sourmash tax metagenome
     --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 
-The possible output formats are:
-- `human`
-- `csv_summary`
-- `lineage_summary`
-- `krona`
-- `kreport`
-- `lingroup_report`
+The possible output formats are listed below, followed by the file extension used when writing to a file rather than stdout. When using more than one output format, you must provide an output basename (`--output-base`) that will be used to name the output files. If an `--output-dir` is provided, files will output to that directory.
+
+- `human`: ".human.txt",
+- `csv_summary`: ".summarized.csv",
+- `lineage_summary`: ".lineage_summary.tsv",
+- `krona`: ".krona.tsv",
+- `kreport`: ".kreport.txt",
+- `lingroup`: ".lingroup.tsv",
+- `bioboxes`: ".bioboxes.profile",
+
 
 #### `csv_summary` output format
 
@@ -672,6 +675,9 @@ o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus
 ```
 The `query_md5` and `query_filename` columns are omitted here for brevity.
 
+Note: When using `--lins` with a `--lingroup` file, the `csv_summary` file will report
+summarization for each specified `lingroup`, rather than all possible `lin` ranks (v4.8.12+).
+
 #### `krona` output format
 
 `krona` format is a tab-separated list of these results at a specific rank.
@@ -971,7 +977,12 @@ sourmash tax genome
 > This command uses the default classification strategy, which uses a
 > containment threshold of 0.1 (10%).
 
-There are two possible output formats, `csv_summary` and `krona`.
+`sourmash tax genome` can produce the following output formats:
+
+- `human`: ".human.txt",
+- `csv_summary`: ".classifications.csv",
+- `krona`: ".krona.tsv",
+- `lineage_summary`: ".lineage_summary.tsv",
 
 #### `csv_summary` output format
 

From acb81206be140fffc7e91fe6328959be26cb6688 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 26 Aug 2024 20:54:31 +0000
Subject: [PATCH 4/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/sourmash/tax/__main__.py  |  7 +++++--
 src/sourmash/tax/tax_utils.py | 20 +++++++++++++++-----
 tests/test_tax.py             |  5 +++--
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index b5dd36538..1c7ca2e69 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -155,7 +155,7 @@ def metagenome(args):
         sys.exit(-1)
 
     # if lingroup file is passed in, read it
-    lingroups=None
+    lingroups = None
     if args.lingroup is not None:
         try:
             lingroups = tax_utils.read_lingroups(args.lingroup)
@@ -211,7 +211,10 @@ def metagenome(args):
         )
         with FileOutputCSV(summary_outfile) as out_fp:
             tax_utils.write_summary(
-                query_gather_results, out_fp, limit_float_decimals=limit_float, lingroups=lingroups,
+                query_gather_results,
+                out_fp,
+                limit_float_decimals=limit_float,
+                lingroups=lingroups,
             )
 
     # write summarized --> kreport output tsv
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 857687554..a1d34da07 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1124,7 +1124,9 @@ def write_summary(
     w = None
     for q_res in query_gather_results:
         header, summary = q_res.make_full_summary(
-            limit_float=limit_float_decimals, classification=classification, lingroups=lingroups
+            limit_float=limit_float_decimals,
+            classification=classification,
+            lingroups=lingroups,
         )
         if w is None:
             w = csv.DictWriter(csv_fp, header, delimiter=sep)
@@ -2080,7 +2082,11 @@ def as_summary_dict(self, query_info, limit_float=False, lingroups=None):
         # if lingroups, convert lingroup number to lingroup name
         if lingroups is not None and sD["lineage"] in lingroups.keys():
             sD["lineage"] = lingroups[sD["lineage"]]
-        elif lingroups and sD["lineage"] != "unclassified" and sD["lineage"] not in lingroups.keys():
+        elif (
+            lingroups
+            and sD["lineage"] != "unclassified"
+            and sD["lineage"] not in lingroups.keys()
+        ):
             return None
         # import pdb;pdb.set_trace()
         sD["query_name"] = query_info.query_name
@@ -2560,7 +2566,9 @@ def make_human_summary(self, display_rank, classification=False):
             results.append(res.as_human_friendly_dict(query_info=self.query_info))
         return results
 
-    def make_full_summary(self, classification=False, limit_float=False, lingroups=None):
+    def make_full_summary(
+        self, classification=False, limit_float=False, lingroups=None
+    ):
         results = []
         rD = {}
         if classification:
@@ -2601,7 +2609,7 @@ def make_full_summary(self, classification=False, limit_float=False, lingroups=N
         if lingroups is not None:
             for lin in lingroups.keys():
                 # e.g. "14;1;0;0;0;0;0;0;0;0" => 9
-                lin_rank = len(lin.split(';')) - 1
+                lin_rank = len(lin.split(";")) - 1
                 lingroup_ranks.add(lin_rank)
 
         for rank in self.summarized_ranks[::-1]:  # descending
@@ -2616,7 +2624,9 @@ def make_full_summary(self, classification=False, limit_float=False, lingroups=N
             )  # v5?: f_weighted_at_rank)
             for res in rank_results:
                 rD = res.as_summary_dict(
-                    query_info=self.query_info, limit_float=limit_float, lingroups=lingroups,
+                    query_info=self.query_info,
+                    limit_float=limit_float,
+                    lingroups=lingroups,
                 )
                 if rD is None:
                     continue
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 0190a5227..bb123cc74 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -6094,9 +6094,10 @@ def test_metagenome_LIN_lingroups_summary(runtmp):
         lg_file,
         "-o",
         csv_base,
-        '--output-dir',
+        "--output-dir",
         outdir,
-        "-F", "csv_summary"
+        "-F",
+        "csv_summary",
     )
 
     print(c.last_result.status)

From 4855569a5720da8ba08f80faf2468666295791b5 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward"
Date: Wed, 4 Sep 2024 09:58:00 -0700
Subject: [PATCH 5/9] allow csv_summary output for multiple queries

---
 doc/command-line.md                |  2 ++
 src/sourmash/cli/tax/metagenome.py |  4 ++--
 src/sourmash/tax/__main__.py       |  2 +-
 tests/test_tax.py                  | 31 +++++++++++++++++++++++++-----
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/doc/command-line.md b/doc/command-line.md
index e911d7519..d6370a307 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -848,6 +848,8 @@ lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000
 
 Related lingroup subpaths will be grouped in output, but exact ordering may change between runs.
 
+Note: this output format requires a single sample only. For a similar output with multiple query samples, provide the `lingroup` file and use the 'csv_summary' output format.
+
 #### `bioboxes` output format
 
 When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o/--output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges.
diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py
index f8e31902e..2f04665dc 100644
--- a/src/sourmash/cli/tax/metagenome.py
+++ b/src/sourmash/cli/tax/metagenome.py
@@ -114,14 +114,14 @@ def subparser(subparsers):
         "--lingroups",
         metavar="FILE",
         default=None,
-        help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.",
+        help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. For 'tax metagenome' runs with a single 'gather' file (single query), providing this file will allow us to output a 'lingroup' report containing taxonomic summarization for each group. For multiple queries, we recommend the 'csv_summary' output format.",
     )
     subparser.add_argument(
         "--ictv",
         "--ictv-taxonomy",
         action="store_true",
         default=False,
-        help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.",
+        help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.",
     )
 
     add_rank_arg(subparser)
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 1c7ca2e69..c4e22d07f 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -126,7 +126,7 @@ def metagenome(args):
         notify("No gather results loaded. Exiting.")
         sys.exit(-1)
 
-    single_query_output_formats = ["csv_summary", "kreport"]
+    single_query_output_formats = ["kreport", "lingroup", "bioboxes"]
     desired_single_outputs = []
     if len(query_gather_results) > 1:  # working with multiple queries
         desired_single_outputs = [
diff --git a/tests/test_tax.py b/tests/test_tax.py
index bb123cc74..92f9840e4 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1980,6 +1980,7 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp):
             fp.write(line)
 
     csv_summary_out = runtmp.output("tst.summarized.csv")
+    bioboxes_out = runtmp.output("tst.bioboxes.out")
     kreport_out = runtmp.output("tst.kreport.txt")
 
     with pytest.raises(SourmashCommandFailed) as exc:
@@ -1992,7 +1993,7 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp):
             "--taxonomy-csv",
            taxonomy_csv,
             "-F",
-            "csv_summary",
+            "bioboxes",
             "kreport",
             "--rank",
             "phylum",
@@ -2001,13 +2002,13 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp):
         )
 
     print(str(exc.value))
-    assert not os.path.exists(csv_summary_out)
+    assert not os.path.exists(bioboxes_out)
     assert not os.path.exists(kreport_out)
 
     assert c.last_result.status == -1
     assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err
     assert (
-        "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport"
+        "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping bioboxes, kreport"
         in c.last_result.err
     )
     assert "ERROR: No output formats remaining." in c.last_result.err
@@ -2028,6 +2029,7 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp):
 
     csv_summary_out = runtmp.output("tst.summarized.csv")
     kreport_out = runtmp.output("tst.kreport.txt")
+    bioboxes_out = runtmp.output("tst.bioboxes.txt")
     lineage_summary_out = runtmp.output("tst.lineage_summary.tsv")
 
     c.run_sourmash(
@@ -2040,6 +2042,7 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp):
         taxonomy_csv,
         "-F",
         "csv_summary",
+        "bioboxes",
         "kreport",
         "lineage_summary",
         "--rank",
@@ -2048,17 +2051,35 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp):
         "tst",
     )
 
-    assert not os.path.exists(csv_summary_out)
     assert not os.path.exists(kreport_out)
+    assert not os.path.exists(bioboxes_out)
+    assert os.path.exists(csv_summary_out)
     assert os.path.exists(lineage_summary_out)
 
     assert c.last_result.status == 0
     assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err
     assert (
-        "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport"
+        "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping bioboxes, kreport"
         in c.last_result.err
     )
 
+    assert f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'" in runtmp.last_result.err
+    sum_gather_results = [x.rstrip() for x in Path(csv_summary_out).read_text().splitlines()]
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in sum_gather_results[0]
+    )
+    # check both queries exist in csv_summary results
+    assert (
+        "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000,0.9500482567175479,0"
+        in sum_gather_results[1]
+    )
+    assert (
+        "test2,superkingdom,0.2042281611487834,d__Bacteria,md5,test2.sig,0.13080306238801107,1024000,0.9500482567175479,0"
+        in sum_gather_results[23]
+    )
+
+
 
 def test_metagenome_two_queries_krona(runtmp):
     # for now, we enable multi-query krona. Is this desired?

From 0ddea9f586ffeffd51daf0ab022c58c6feb106e4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 4 Sep 2024 16:59:29 +0000
Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_tax.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 92f9840e4..6d5becf34 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1979,7 +1979,7 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp):
             line = line.replace("test1", "test2") + "\n"
             fp.write(line)
 
-    csv_summary_out = runtmp.output("tst.summarized.csv")
+    runtmp.output("tst.summarized.csv")
     bioboxes_out = runtmp.output("tst.bioboxes.out")
     kreport_out = runtmp.output("tst.kreport.txt")
 
@@ -2063,8 +2063,13 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp):
         in c.last_result.err
     )
 
-    assert f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'" in runtmp.last_result.err
-    sum_gather_results = [x.rstrip() for x in Path(csv_summary_out).read_text().splitlines()]
+    assert (
+        f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'"
+        in runtmp.last_result.err
+    )
+    sum_gather_results = [
+        x.rstrip() for x in Path(csv_summary_out).read_text().splitlines()
+    ]
     assert (
         "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
         in sum_gather_results[0]
@@ -2080,7 +2085,6 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp):
         in sum_gather_results[23]
     )
 
-
 def test_metagenome_two_queries_krona(runtmp):
     # for now, we enable multi-query krona. Is this desired?
     c = runtmp

From 2042b92f89d5abb1762388420d6e6b53b25d9681 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward"
Date: Wed, 4 Sep 2024 10:23:05 -0700
Subject: [PATCH 7/9] add explicit test for second sample for csv_summary

---
 tests/test_tax.py | 128 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 92f9840e4..fbcd097b2 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1966,6 +1966,134 @@ def test_metagenome_two_queries_human_output(runtmp):
     assert "test2 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus"
 
 
+def test_metagenome_two_queries_csv_summary_output(runtmp):
+    # remove single-query outputs when working with multiple queries
+    c = runtmp
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
+
+    # make a second query with same output
+    g_res2 = runtmp.output("test2.gather.csv")
+    with open(g_res2, "w") as fp:
+        for line in Path(g_res).read_text().splitlines():
+            line = line.replace("test1", "test2") + "\n"
+            fp.write(line)
+
+    csv_summary_out = runtmp.output("tst.summarized.csv")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_res,
+        g_res2,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "-F",
+        "csv_summary",
+        "--rank",
+        "phylum",
+        "-o",
+        "tst",
+    )
+
+    assert os.path.exists(csv_summary_out)
+
+    assert c.last_result.status == 0
+    assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err
+    assert f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'" in runtmp.last_result.err
+    sum_gather_results = [x.rstrip() for x in Path(csv_summary_out).read_text().splitlines()]
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in sum_gather_results[0]
+    )
+    # check both queries exist in csv_summary results; check several
+    assert (
+        "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000,0.9500482567175479,0"
+        in sum_gather_results[1]
+    )
+    assert (
+        "test2,superkingdom,0.2042281611487834,d__Bacteria,md5,test2.sig,0.13080306238801107,1024000,0.9500482567175479,0"
+        in sum_gather_results[23]
+    )
+    assert (
+        "test2,phylum,0.11607499002792182,d__Bacteria;p__Bacteroidota,md5,test2.sig,0.07265026877341586,582000"
+        in sum_gather_results[25]
+    )
+    assert (
+        "test2,phylum,0.08815317112086159,d__Bacteria;p__Proteobacteria,md5,test2.sig,0.05815279361459521,442000"
+        in sum_gather_results[26]
+    )
+    assert (
+        "test2,phylum,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000"
+        in sum_gather_results[27]
+    )
+    assert (
+        "test2,class,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test2.sig,0.07265026877341586,582000"
+        in sum_gather_results[28]
+    )
+    assert (
+        "test2,class,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test2.sig,0.05815279361459521,442000"
+        in sum_gather_results[29]
+    )
+    assert (
+        "test2,class,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000"
+        in sum_gather_results[30]
+    )
+    assert (
+        "test2,order,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test2.sig,0.07265026877341586,582000"
+        in sum_gather_results[31]
+    )
+    assert (
+        "test2,order,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test2.sig,0.05815279361459521,442000"
+        in sum_gather_results[32]
+    )
+    assert (
+        "test2,order,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000"
+        in sum_gather_results[33]
+    )
+    assert (
+        "test2,family,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test2.sig,0.07265026877341586,582000"
+        in sum_gather_results[34]
+    )
+    assert (
+        "test2,family,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test2.sig,0.05815279361459521,442000"
+        in sum_gather_results[35]
+    )
+    assert (
+        "test2,family,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000"
+        in sum_gather_results[36]
+    )
+    assert (
+        "test2,genus,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test2.sig,0.05701254275940707,444000"
+        in sum_gather_results[37]
+    )
+    assert (
+        "test2,genus,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test2.sig,0.05815279361459521,442000"
+        in sum_gather_results[38]
+    )
+    assert (
+        "test2,genus,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test2.sig,0.015637726014008795,138000"
+        in sum_gather_results[39]
+    )
+    assert (
+        "test2,genus,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000"
+        in sum_gather_results[40]
+    )
+    assert (
+        "test2,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.05701254275940707,444000"
+        in sum_gather_results[41]
+    )
+    assert (
+        "test2,species,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test2.sig,0.05815279361459521,442000"
+        in sum_gather_results[42]
+    )
+    assert (
+        "test2,species,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test2.sig,0.015637726014008795,138000"
+        in sum_gather_results[43]
+    )
+
+
 def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp):
     # fail on multiple queries with single query output formats
     c = runtmp

From 2395db57cd73cf319fda820cdfb7f60fc94f193a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 4 Sep 2024 18:55:37 +0000
Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_tax.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index a54694744..72889aaed 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -2001,13 +2001,18 @@ def test_metagenome_two_queries_csv_summary_output(runtmp):
 
     assert c.last_result.status == 0
     assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err
-    assert f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'" in runtmp.last_result.err
-    sum_gather_results = [x.rstrip() for x in Path(csv_summary_out).read_text().splitlines()]
+    assert (
+        f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'"
+        in runtmp.last_result.err
+    )
+    sum_gather_results = [
+        x.rstrip() for x in Path(csv_summary_out).read_text().splitlines()
+    ]
     assert (
         "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
         in sum_gather_results[0]
     )
-    # check both queries exist in csv_summary results; check several
+    # check both queries exist in csv_summary results; check several
     assert (
         "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000,0.9500482567175479,0"
         in sum_gather_results[1]

From cffbf38338aa8cc78cb39af7de028dca5f396e0e Mon Sep 17 00:00:00 2001
From: Tessa Pierce Ward
Date: Tue, 17 Sep 2024 09:25:26 -0700
Subject: [PATCH 9/9] clean up

---
 src/sourmash/tax/tax_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index a1d34da07..60a967c6b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -2088,7 +2088,6 @@ def as_summary_dict(self, query_info, limit_float=False, lingroups=None):
             and sD["lineage"] not in lingroups.keys()
         ):
             return None
-        # import pdb;pdb.set_trace()
         sD["query_name"] = query_info.query_name
         sD["query_md5"] = query_info.query_md5
         sD["query_filename"] = query_info.query_filename