Skip to content

Commit

Permalink
FIX: update get-ncbi-genomes to use genome accession IDs as taxonom…
Browse files Browse the repository at this point in the history
…y IDs (#193)

* FIX: use NCBI accession numbers to represent fetched chromosomes

* More updates and fixes

* Update the tests + small other updates

* Lint

* Add missing test files

* Review requests
  • Loading branch information
misialq committed Jun 24, 2024
1 parent 3c179ca commit 7d9ef27
Show file tree
Hide file tree
Showing 11 changed files with 38,638 additions and 56 deletions.
113 changes: 90 additions & 23 deletions rescript/ncbi_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
# ----------------------------------------------------------------------------

import glob
import json
import os
import shutil
import tempfile
import zipfile
from copy import deepcopy
from typing import List
from typing import List, Dict

import ncbi.datasets as nd
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
Expand All @@ -30,9 +31,8 @@
def _get_assembly_descriptors(
api_instance, assembly_levels, assembly_source, only_reference,
page_size, taxon, tax_exact_match
):
all_acc_ids = []
all_tax_ids = []
) -> Dict[str, str]:
assembly_to_taxon = {}
next_page_token = ''
while True:
try:
Expand Down Expand Up @@ -72,16 +72,28 @@ def _get_assembly_descriptors(
next_page_token = genome_summary.next_page_token

genome_assembly = [x.assembly for x in genome_summary.assemblies]

all_acc_ids.extend([x.assembly_accession for x in genome_assembly])
all_tax_ids.extend([x.org.tax_id for x in genome_assembly])
assembly_to_taxon.update(
{x.assembly_accession: x.org.tax_id for x in genome_assembly}
)

if not next_page_token:
break
return all_acc_ids, all_tax_ids
return assembly_to_taxon


def _fetch_and_extract_dataset(api_response):
def _extract_accession_ids(seq_reports: List[dict]):
accessions, assembly_id = {'refseq': [], 'genbank': []}, None
for rep in seq_reports:
if not assembly_id:
assembly_id = rep.get('assemblyAccession')
accessions['refseq'].append(rep.get('refseqAccession'))
accessions['genbank'].append(rep.get('genbankAccession'))
return assembly_id, accessions


def _fetch_and_extract_dataset(
api_response, genomes, loci, proteins, only_genomic
):
with tempfile.TemporaryDirectory() as tmp:
result_path = os.path.join(tmp, 'datasets.zip')
with open(result_path, 'wb') as f:
Expand All @@ -91,37 +103,61 @@ def _fetch_and_extract_dataset(api_response):
zipped.extractall(tmp)

# find and move all the genome sequences
genomes = DNAFASTAFormat()
genome_seq_fps = glob.glob(
os.path.join(tmp, 'ncbi_dataset', 'data', '*', '*_genomic.fna')
)
acc_to_assembly = {}
with open(str(genomes), 'a') as fin:
for f in genome_seq_fps:
dp = os.path.dirname(f)
summary_fp = os.path.join(dp, 'sequence_report.jsonl')
with open(summary_fp, 'r') as jsonl_file:
molecules = [json.loads(line) for line in jsonl_file]

if only_genomic:
molecules = [
x for x in molecules
if x['assignedMoleculeLocationType'] == 'Chromosome'
]

assembly_id, accession_ids = _extract_accession_ids(molecules)
molecule_ids = [
*accession_ids['refseq'], *accession_ids['genbank']
]

seq = skbio.read(
f, format='fasta', constructor=skbio.DNA, lowercase=True
)
skbio.io.write(seq, format='fasta', into=fin)
acc_to_assembly[assembly_id] = []
for s in seq:
_id = s.metadata['id']
if s.metadata['id'] in molecule_ids:
skbio.io.write(s, format='fasta', into=fin)
acc_to_assembly[assembly_id].append(_id)

# find and move all the gff files
loci = LociDirectoryFormat()
loci_fps = glob.glob(
os.path.join(tmp, 'ncbi_dataset', 'data', '*', 'genomic.gff'))
for f in loci_fps:
_id = f.split('/')[-2]
shutil.move(f, os.path.join(str(loci), f'{_id}_loci.gff'))

# find and move all the protein translations
proteins = ProteinsDirectoryFormat()
protein_fps = glob.glob(
os.path.join(tmp, 'ncbi_dataset', 'data', '*', 'protein.faa'))
for f in protein_fps:
_id = f.split('/')[-2]
shutil.move(f, os.path.join(
str(proteins), f'{_id}_proteins.fasta'))
return genomes, loci, proteins

return acc_to_assembly


def _fetch_taxonomy(all_acc_ids, all_tax_ids):
def _fetch_taxonomy(
all_acc_ids: list,
all_tax_ids: list,
accession_to_assembly: pd.Series
):
manager = Manager()
taxa, bad_accs = get_taxonomies(
taxids={k: v for k, v in zip(all_acc_ids, all_tax_ids)},
Expand All @@ -135,15 +171,46 @@ def _fetch_taxonomy(all_acc_ids, all_tax_ids):
f'{", ".join(bad_accs)}. Please check your query '
f'and try again.')
taxa = pd.DataFrame(taxa, index=['Taxon']).T
taxa = pd.merge(
taxa, accession_to_assembly,
left_index=True, right_index=True, how="outer"
).set_index('assembly_id')
taxa.index.name = 'Feature ID'
return taxa


def _fetch_and_extract_all(api_instance, assemblies, only_genomic):
genomes = DNAFASTAFormat()
loci = LociDirectoryFormat()
proteins = ProteinsDirectoryFormat()
accession_map = {}

chunks = [
assemblies[i:i + 20] for i in range(0, len(assemblies), 20)
]
for assembly_subset in chunks:
api_response = api_instance.download_assembly_package(
assembly_subset,
exclude_sequence=False,
include_annotation_type=['PROT_FASTA', 'GENOME_GFF'],
_preload_content=False
)
accessions = _fetch_and_extract_dataset(
api_response, genomes, loci, proteins,
only_genomic=only_genomic
)
accession_map.update(accessions)
accession_map = pd.Series(accession_map, name='assembly_id')

return genomes, loci, proteins, accession_map


def get_ncbi_genomes(
taxon: str,
assembly_source: str = 'refseq',
assembly_levels: List[str] = ['complete_genome'],
only_reference: bool = True,
only_genomic: bool = False,
tax_exact_match: bool = False,
page_size: int = 20,
) -> (DNAFASTAFormat, LociDirectoryFormat,
Expand All @@ -161,18 +228,18 @@ def get_ncbi_genomes(
with DatasetsApiClient() as api_client:
api_instance = nd.GenomeApi(api_client)

all_acc_ids, all_tax_ids = _get_assembly_descriptors(
assembly_to_taxon = _get_assembly_descriptors(
api_instance=api_instance, **assembly_descriptors_params
)

api_response = api_instance.download_assembly_package(
all_acc_ids,
exclude_sequence=False,
include_annotation_type=['PROT_FASTA', 'GENOME_GFF'],
_preload_content=False
genomes, loci, proteins, accession_map = _fetch_and_extract_all(
api_instance, list(assembly_to_taxon.keys()), only_genomic
)

genomes, loci, proteins = _fetch_and_extract_dataset(api_response)
taxa = _fetch_taxonomy(all_acc_ids, all_tax_ids)
taxa = _fetch_taxonomy(
assembly_to_taxon.keys(),
assembly_to_taxon.values(),
accession_map.explode()
)

return genomes, loci, proteins, taxa
6 changes: 5 additions & 1 deletion rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,8 +1174,9 @@
inputs={},
parameters={
'taxon': Str,
'assembly_source': Str % Choices(['refseq', 'genbank']),
'assembly_source': Str % Choices(['refseq', 'genbank', 'all']),
'only_reference': Bool,
'only_genomic': Bool,
'assembly_levels': List[Str % Choices(
['complete_genome', 'chromosome', 'scaffold', 'contig'])],
'tax_exact_match': Bool,
Expand All @@ -1194,6 +1195,9 @@
'assembly_source': 'Fetch only RefSeq or GenBank genome assemblies.',
'only_reference': 'Fetch only reference and representative '
'genome assemblies.',
'only_genomic': 'Exclude plasmid, mitochondrial and chloroplast '
'molecules from the final results (i.e., keep '
'only genomic DNA).',
'assembly_levels': 'Fetch only genome assemblies that are one of the '
'specified assembly levels.',
'tax_exact_match': 'If true, only return assemblies with the given '
Expand Down
Binary file removed rescript/tests/data/ncbi-dataset.zip
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
>BA000007.3 Escherichia coli O157:H7 str. Sakai DNA, complete genome
AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTCTCTGACAGCAGCTTCTGAACTG
AAGAAAGCTTCGTAGAAGCT
--
>AB011548.2 Escherichia coli O157:H7 str. Sakai plasmid pOSAK1 DNA, complete sequence
TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGCGTCCAGTTTTTCGAGCAGCG
TCAGGCTCTGGCTTTTTATGAATCCCGCCATGTTGAGTGCAGTTTGCTGCTGCTTGTTCATCTTTCTGTTTTCTCCGTTC
--
>AB011549.2 Escherichia coli O157:H7 str. Sakai plasmid pO157 DNA, complete sequence
AGCCAGATTTTACCCGCCCATCCTAAAGAAGGGGATAGTCAACCACATCTGACCAGCCTGCGGAAAAGTCTGCTGCTTGT
CCGTCCGGTGAAAGCTGATGATAAAACACCTGTTCAGGTGGAAGCCCGCGATGATAATAATAAAATTCTCGGTACGTTAA
Loading

0 comments on commit 7d9ef27

Please sign in to comment.