Skip to content

Commit

Permalink
Add complexportal
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Mar 15, 2020
1 parent 5139077 commit 2626c48
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

CONVERTED = {
'cgnc': 'cgnc',
'complexportal': 'complexportal',
'ncbigene': 'entrez',
'ec-code': 'expasy',
'hgnc': 'hgnc',
Expand Down
175 changes: 175 additions & 0 deletions src/pyobo/sources/complexportal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# -*- coding: utf-8 -*-

"""Converter for ComplexPortal."""

import logging
from typing import Iterable, List, Tuple

import pandas as pd
from tqdm import tqdm

from pyobo import get_id_name_mapping
from pyobo.path_utils import ensure_df
from pyobo.struct import Obo, Reference, Synonym, Term, from_species, has_part

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Identifier used for this resource throughout the module (ontology name, term prefix, download key).
PREFIX = 'complexportal'
# Release tag; it is interpolated into the download location below and reported as the data version.
VERSION = '2020-03-11'
URL_BASE = f'ftp://ftp.ebi.ac.uk/pub/databases/intact/complex/{VERSION}/complextab'

# One ``<species>.tsv`` file under URL_BASE is loaded for each entry here.
SPECIES = [
    'arabidopsis_thaliana',
    'bos_taurus',
    'caenorhabditis_elegans',
    'canis_familiaris',
    'danio_rerio',
    'drosophila_melanogaster',
    'escherichia_coli',
    'gallus_gallus',
    'homo_sapiens',
    'lymnaea_stagnalis',
    'mus_musculus',
    'oryctolagus_cuniculus',
    'ovis_aries',
    'pseudomonas_aeruginosa',
    'rattus_norvegicus',
    'saccharomyces_cerevisiae',
    'schizosaccharomyces_pombe',
    'sus_scrofa',
    'torpedo_californica',
    'torpedo_marmorata',
    'xenopus_laevis',
]
URLS = [
    f'{URL_BASE}/{organism}.tsv'
    for organism in SPECIES
]

# Column names handed to the loader (as ``names=``) for every species file, in file order.
COLUMNS = [
    'complexportal_id',
    'name',
    'aliases',
    'taxonomy_id',
    'members',
    'confidence',
    'experimental_evidence',
    'goa',
    'xrefs',
    'definition',
    'Complex properties',
    'Complex assembly',
    'Ligand',
    'Disease',
    'Agonist',
    'Antagonist',
    'Comment',
    'Source',
]

# Per-column dtype overrides handed to the loader (as ``dtype=``): taxonomy identifiers are kept as strings.
DTYPE = dict(taxonomy_id=str)


def _parse_members(s) -> List[Tuple[Reference, str]]:
if pd.isna(s):
return []

rv = []
for member in s.split('|'):
entity_id, count = member.split('(')
count = count.rstrip(')')
if ':' in entity_id:
prefix, identifier = entity_id.split(':', 1)
else:
prefix, identifier = 'uniprot', entity_id
rv.append((Reference(prefix=prefix, identifier=identifier), count))
return rv


def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
if pd.isna(s):
return []

rv = []
for xref in s.split('|'):
entity_id, note = xref.split('(')
note = note.rstrip(')')
prefix, identifier = entity_id.split(':', 1)
rv.append((Reference(prefix=prefix, identifier=identifier), note))
return rv


def get_obo() -> Obo:
    """Build the ComplexPortal :class:`Obo` object.

    The term generator is consumed into a list before the ontology is constructed.

    :return: An ontology with prefix ``complexportal`` for the pinned data version
    """
    return Obo(
        terms=list(get_terms()),
        data_version=VERSION,
        name='Complex Portal',
        ontology=PREFIX,
    )


def get_df() -> pd.DataFrame:
    """Get a combined ComplexPortal dataframe.

    One table is loaded per entry in ``URLS`` and the tables are stacked in that order.
    ``-`` cells are treated as missing values and the columns are named by ``COLUMNS``.

    :return: The concatenation of all species tables. The row index is not reset, so
        labels repeat across species.
    """
    frames = []
    for species_url in URLS:
        frame = ensure_df(
            PREFIX,
            species_url,
            version=VERSION,
            na_values={'-'},
            names=COLUMNS,
            header=0,
            dtype=DTYPE,
        )
        frames.append(frame)
    return pd.concat(frames)


def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms.

    The combined dataframe is loaded, its ``aliases``, ``members`` and ``xrefs`` cells
    are parsed, and one term is yielded per row. For each term:

    - aliases become synonyms
    - cross-references noted as ``identity`` become xrefs
    - ``pubmed`` cross-references noted as ``see-also`` become provenance
    - every other (note, prefix) combination is reported once on the progress bar and skipped
    - a ``from_species`` relationship points to the row's taxonomy identifier
    - a ``has_part`` relationship is added for each member (the member count is not used)

    :return: An iterable of terms, in dataframe row order
    """
    df = get_df()

    df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values, total=len(slim_df.index))
    # Remember which (note, prefix) combinations were already reported so each is written only once
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [
            Synonym(name=alias)
            for alias in aliases
        ]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                it.write(f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            name=name,
            # The table is loaded with '-' as a missing-value marker, so an absent definition
            # arrives as a float NaN, which has no ``strip`` method. Pass None in that case.
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        # NOTE(review): names are looked up under 'ncbitaxon' above but the reference uses the
        # 'taxonomy' prefix - confirm which prefix downstream consumers expect.
        term.append_relationship(from_species, Reference(prefix='taxonomy', identifier=taxonomy_id, name=taxonomy_name))

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term


if __name__ == '__main__':
    # When run as a script, build the ontology and hand it to its own ``write_default`` method.
    complexportal_obo = get_obo()
    complexportal_obo.write_default()

0 comments on commit 2626c48

Please sign in to comment.