FIX: update get-ncbi-genomes to use genome accession IDs as taxonom…

…y IDs (#193) * FIX: use NCBI accession numbers to represent fetched chromosomes * More updates and fixes * Update the tests + small other updates * Lint * Add missing test files * Review requests
bokulich-lab · Jun 24, 2024 · 7d9ef27 · 7d9ef27
1 parent 3c179ca
commit 7d9ef27
Show file tree

Hide file tree

Showing 11 changed files with 38,638 additions and 56 deletions.
diff --git a/rescript/ncbi_datasets.py b/rescript/ncbi_datasets.py
@@ -7,12 +7,13 @@
 # ----------------------------------------------------------------------------
 
 import glob
+import json
 import os
 import shutil
 import tempfile
 import zipfile
 from copy import deepcopy
-from typing import List
+from typing import List, Dict
 
 import ncbi.datasets as nd
 from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
@@ -30,9 +31,8 @@
 def _get_assembly_descriptors(
         api_instance, assembly_levels, assembly_source, only_reference,
         page_size, taxon, tax_exact_match
-):
-    all_acc_ids = []
-    all_tax_ids = []
+) -> Dict[str, str]:
+    assembly_to_taxon = {}
     next_page_token = ''
     while True:
         try:
@@ -72,16 +72,28 @@ def _get_assembly_descriptors(
         next_page_token = genome_summary.next_page_token
 
         genome_assembly = [x.assembly for x in genome_summary.assemblies]
-
-        all_acc_ids.extend([x.assembly_accession for x in genome_assembly])
-        all_tax_ids.extend([x.org.tax_id for x in genome_assembly])
+        assembly_to_taxon.update(
+            {x.assembly_accession: x.org.tax_id for x in genome_assembly}
+        )
 
         if not next_page_token:
             break
-    return all_acc_ids, all_tax_ids
+    return assembly_to_taxon
 
 
-def _fetch_and_extract_dataset(api_response):
+def _extract_accession_ids(seq_reports: List[dict]):
+    accessions, assembly_id = {'refseq': [], 'genbank': []}, None
+    for rep in seq_reports:
+        if not assembly_id:
+            assembly_id = rep.get('assemblyAccession')
+        accessions['refseq'].append(rep.get('refseqAccession'))
+        accessions['genbank'].append(rep.get('genbankAccession'))
+    return assembly_id, accessions
+
+
+def _fetch_and_extract_dataset(
+        api_response, genomes, loci, proteins, only_genomic
+):
     with tempfile.TemporaryDirectory() as tmp:
         result_path = os.path.join(tmp, 'datasets.zip')
         with open(result_path, 'wb') as f:
@@ -91,37 +103,61 @@ def _fetch_and_extract_dataset(api_response):
             zipped.extractall(tmp)
 
         # find and move all the genome sequences
-        genomes = DNAFASTAFormat()
         genome_seq_fps = glob.glob(
             os.path.join(tmp, 'ncbi_dataset', 'data', '*', '*_genomic.fna')
         )
+        acc_to_assembly = {}
         with open(str(genomes), 'a') as fin:
             for f in genome_seq_fps:
+                dp = os.path.dirname(f)
+                summary_fp = os.path.join(dp, 'sequence_report.jsonl')
+                with open(summary_fp, 'r') as jsonl_file:
+                    molecules = [json.loads(line) for line in jsonl_file]
+
+                if only_genomic:
+                    molecules = [
+                        x for x in molecules
+                        if x['assignedMoleculeLocationType'] == 'Chromosome'
+                    ]
+
+                assembly_id, accession_ids = _extract_accession_ids(molecules)
+                molecule_ids = [
+                    *accession_ids['refseq'], *accession_ids['genbank']
+                ]
+
                 seq = skbio.read(
                     f, format='fasta', constructor=skbio.DNA, lowercase=True
                 )
-                skbio.io.write(seq, format='fasta', into=fin)
+                acc_to_assembly[assembly_id] = []
+                for s in seq:
+                    _id = s.metadata['id']
+                    if s.metadata['id'] in molecule_ids:
+                        skbio.io.write(s, format='fasta', into=fin)
+                        acc_to_assembly[assembly_id].append(_id)
 
         # find and move all the gff files
-        loci = LociDirectoryFormat()
         loci_fps = glob.glob(
             os.path.join(tmp, 'ncbi_dataset', 'data', '*', 'genomic.gff'))
         for f in loci_fps:
             _id = f.split('/')[-2]
             shutil.move(f, os.path.join(str(loci), f'{_id}_loci.gff'))
 
         # find and move all the protein translations
-        proteins = ProteinsDirectoryFormat()
         protein_fps = glob.glob(
             os.path.join(tmp, 'ncbi_dataset', 'data', '*', 'protein.faa'))
         for f in protein_fps:
             _id = f.split('/')[-2]
             shutil.move(f, os.path.join(
                 str(proteins), f'{_id}_proteins.fasta'))
-    return genomes, loci, proteins
+
+    return acc_to_assembly
 
 
-def _fetch_taxonomy(all_acc_ids, all_tax_ids):
+def _fetch_taxonomy(
+        all_acc_ids: list,
+        all_tax_ids: list,
+        accession_to_assembly: pd.Series
+):
     manager = Manager()
     taxa, bad_accs = get_taxonomies(
         taxids={k: v for k, v in zip(all_acc_ids, all_tax_ids)},
@@ -135,15 +171,46 @@ def _fetch_taxonomy(all_acc_ids, all_tax_ids):
                         f'{", ".join(bad_accs)}. Please check your query '
                         f'and try again.')
     taxa = pd.DataFrame(taxa, index=['Taxon']).T
+    taxa = pd.merge(
+        taxa, accession_to_assembly,
+        left_index=True, right_index=True, how="outer"
+    ).set_index('assembly_id')
     taxa.index.name = 'Feature ID'
     return taxa
 
 
+def _fetch_and_extract_all(api_instance, assemblies, only_genomic):
+    genomes = DNAFASTAFormat()
+    loci = LociDirectoryFormat()
+    proteins = ProteinsDirectoryFormat()
+    accession_map = {}
+
+    chunks = [
+        assemblies[i:i + 20] for i in range(0, len(assemblies), 20)
+    ]
+    for assembly_subset in chunks:
+        api_response = api_instance.download_assembly_package(
+            assembly_subset,
+            exclude_sequence=False,
+            include_annotation_type=['PROT_FASTA', 'GENOME_GFF'],
+            _preload_content=False
+        )
+        accessions = _fetch_and_extract_dataset(
+            api_response, genomes, loci, proteins,
+            only_genomic=only_genomic
+        )
+        accession_map.update(accessions)
+    accession_map = pd.Series(accession_map, name='assembly_id')
+
+    return genomes, loci, proteins, accession_map
+
+
 def get_ncbi_genomes(
         taxon: str,
         assembly_source: str = 'refseq',
         assembly_levels: List[str] = ['complete_genome'],
         only_reference: bool = True,
+        only_genomic: bool = False,
         tax_exact_match: bool = False,
         page_size: int = 20,
 ) -> (DNAFASTAFormat, LociDirectoryFormat,
@@ -161,18 +228,18 @@ def get_ncbi_genomes(
     with DatasetsApiClient() as api_client:
         api_instance = nd.GenomeApi(api_client)
 
-        all_acc_ids, all_tax_ids = _get_assembly_descriptors(
+        assembly_to_taxon = _get_assembly_descriptors(
             api_instance=api_instance, **assembly_descriptors_params
         )
 
-        api_response = api_instance.download_assembly_package(
-            all_acc_ids,
-            exclude_sequence=False,
-            include_annotation_type=['PROT_FASTA', 'GENOME_GFF'],
-            _preload_content=False
+        genomes, loci, proteins, accession_map = _fetch_and_extract_all(
+            api_instance, list(assembly_to_taxon.keys()), only_genomic
         )
 
-        genomes, loci, proteins = _fetch_and_extract_dataset(api_response)
-        taxa = _fetch_taxonomy(all_acc_ids, all_tax_ids)
+        taxa = _fetch_taxonomy(
+            assembly_to_taxon.keys(),
+            assembly_to_taxon.values(),
+            accession_map.explode()
+        )
 
     return genomes, loci, proteins, taxa
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py
@@ -1174,8 +1174,9 @@
     inputs={},
     parameters={
         'taxon': Str,
-        'assembly_source': Str % Choices(['refseq', 'genbank']),
+        'assembly_source': Str % Choices(['refseq', 'genbank', 'all']),
         'only_reference': Bool,
+        'only_genomic': Bool,
         'assembly_levels': List[Str % Choices(
             ['complete_genome', 'chromosome', 'scaffold', 'contig'])],
         'tax_exact_match': Bool,
@@ -1194,6 +1195,9 @@
         'assembly_source': 'Fetch only RefSeq or GenBank genome assemblies.',
         'only_reference': 'Fetch only reference and representative '
                           'genome assemblies.',
+        'only_genomic': 'Exclude plasmid, mitochondrial and chloroplast '
+                        'molecules from the final results (i.e., keep '
+                        'only genomic DNA).',
         'assembly_levels': 'Fetch only genome assemblies that are one of the '
                            'specified assembly levels.',
         'tax_exact_match': 'If true, only return assemblies with the given '

diff --git a/rescript/tests/data/ncbi-dataset.zip b/rescript/tests/data/ncbi-dataset.zip
diff --git a/.../data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/GCA_000008865.2_ASM886v2_genomic.fna b/.../data/ncbi-dataset/ncbi_dataset/data/GCA_000008865.2/GCA_000008865.2_ASM886v2_genomic.fna
@@ -0,0 +1,11 @@
+>BA000007.3 Escherichia coli O157:H7 str. Sakai DNA, complete genome
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTCTCTGACAGCAGCTTCTGAACTG
+AAGAAAGCTTCGTAGAAGCT
+--
+>AB011548.2 Escherichia coli O157:H7 str. Sakai plasmid pOSAK1 DNA, complete sequence
+TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGCGTCCAGTTTTTCGAGCAGCG
+TCAGGCTCTGGCTTTTTATGAATCCCGCCATGTTGAGTGCAGTTTGCTGCTGCTTGTTCATCTTTCTGTTTTCTCCGTTC
+--
+>AB011549.2 Escherichia coli O157:H7 str. Sakai plasmid pO157 DNA, complete sequence
+AGCCAGATTTTACCCGCCCATCCTAAAGAAGGGGATAGTCAACCACATCTGACCAGCCTGCGGAAAAGTCTGCTGCTTGT
+CCGTCCGGTGAAAGCTGATGATAAAACACCTGTTCAGGTGGAAGCCCGCGATGATAATAATAAAATTCTCGGTACGTTAA