From 1dcbfd50673b6712f0700c666f3b2abb9e5ce29c Mon Sep 17 00:00:00 2001
From: Anthony Bretaudeau
Date: Mon, 18 Nov 2019 10:02:59 +0100
Subject: [PATCH] Fix interproscan loader failing to load IPR by name

---
 README.md              |  1 +
 chado/client.py        | 16 ++++++++++++++++
 chado/load/__init__.py | 21 ++++++++++++++-------
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 52dc86b..0e1abd3 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@ $ chakin feature load_fasta \
 
 - 2.3.2
     - Fix interproscan loader only loading the first result of XML v5
+    - Fix interproscan loader failing to load IPR by name
 
 - 2.3.1
     - Fix data loading in Tripal database
diff --git a/chado/client.py b/chado/client.py
index d9d5a34..51ea42c 100644
--- a/chado/client.py
+++ b/chado/client.py
@@ -46,6 +46,7 @@ def _reset_cache(self):
         self._featured_dirty_rels = None
         self._analysisfeature_cache = None
         self._analysisprop_cache = None
+        self._interpro_cache = None
 
         self.cache_existing = True
 
@@ -336,3 +337,18 @@ def _add_feat_cvterm_with_id(self, feat, cvterm_id, pub_id=None):
         if feat not in self._featcvterm_cache:
             self._featcvterm_cache[feat] = []
         self._featcvterm_cache[feat].append(cvterm_id)
+
+    def _init_interpro_cache(self, force=False):
+        """Build the INTERPRO accession -> cvterm_id cache; force=True rebuilds it."""
+        if self._interpro_cache is not None and force:
+            self._interpro_cache = None
+
+        if self._interpro_cache is None:
+            self._interpro_cache = {}
+            if self.cache_existing:
+                res = self.session.query(self.model.dbxref.accession, self.model.cvterm.cvterm_id) \
+                    .join(self.model.db, self.model.db.db_id == self.model.dbxref.db_id) \
+                    .filter(self.model.db.name == "INTERPRO") \
+                    .join(self.model.cvterm, self.model.dbxref.dbxref_id == self.model.cvterm.dbxref_id)
+
+                self._interpro_cache = {x.accession: x.cvterm_id for x in res}
diff --git a/chado/load/__init__.py b/chado/load/__init__.py
index 76a1f67..011f40d 100644
--- a/chado/load/__init__.py
+++ b/chado/load/__init__.py
@@ -275,6 +275,7 @@ def interpro(self, analysis_id, organism_id, input, parse_go=False, re_name=None
         # Cache analysisfeature content for given analysis_id
         self._init_analysisfeature_cache(analysis_id)
         self._init_featcvterm_cache()
+        self._init_interpro_cache()
 
         # Cache all existing cvterms from GO cv
         db = 'GO'
@@ -501,13 +502,19 @@ def _load_ipr_terms(self, ipr_terms, feature_id, analysis_id, skip_missing):
             # load the IPR terms that way, we need to just add them
             # as we encounter them. If the term already exists
             # we do not want to update it.
-            cvterm_id = self.ci.create_cvterm(ipr_term['ipr_name'], 'INTERPRO', 'INTERPRO', term_definition=ipr_term['ipr_desc'], accession=ipr_id)
-            if not cvterm_id:
-                if skip_missing:
-                    warn('Could not find cvterm %s %s, skipping it', ipr_id, ipr_term['ipr_name'])
-                    continue
-                else:
-                    raise Exception('Could not find cvterm %s %s' % ipr_id, ipr_term['ipr_name'])
+
+            # Check using IPRnumber (in case ipr_name changed at some point in time)
+            if ipr_id in self._interpro_cache:
+                cvterm_id = self._interpro_cache[ipr_id]
+            else:
+                cvterm_id = self.ci.create_cvterm(ipr_term['ipr_name'], 'INTERPRO', 'INTERPRO', term_definition=ipr_term['ipr_desc'], accession=ipr_id)
+                if not cvterm_id:
+                    if skip_missing:
+                        warn('Could not find cvterm %s %s, skipping it', ipr_id, ipr_term['ipr_name'])
+                        continue
+                    else:
+                        raise Exception('Could not find cvterm %s %s' % (ipr_id, ipr_term['ipr_name']))
+                self._interpro_cache[ipr_id] = cvterm_id
 
             # Insert IPR terms into the feature_cvterm table
             # the default pub_id of 1 (NULL) is used. if the cvterm already exists then just skip adding it