Fix interproscan loader failing to load IPR by name

galaxy-genome-annotation · Nov 18, 2019 · 1dcbfd5 · 1dcbfd5
1 parent 7685251
commit 1dcbfd5
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -89,6 +89,7 @@ $ chakin feature load_fasta \
 
 - 2.3.2
     - Fix interproscan loader only loading the first result of XML v5
+    - Fix interproscan loader failing to load IPR by name
 
 - 2.3.1
     - Fix data loading in Tripal database

diff --git a/chado/client.py b/chado/client.py
@@ -46,6 +46,7 @@ def _reset_cache(self):
         self._featured_dirty_rels = None
         self._analysisfeature_cache = None
         self._analysisprop_cache = None
+        self._interpro_cache = None
 
         self.cache_existing = True
 
@@ -336,3 +337,18 @@ def _add_feat_cvterm_with_id(self, feat, cvterm_id, pub_id=None):
             if feat not in self._featcvterm_cache:
                 self._featcvterm_cache[feat] = []
             self._featcvterm_cache[feat].append(cvterm_id)
+
+    def _init_interpro_cache(self, force=False):
+        """Cache {dbxref accession: cvterm_id} for the "INTERPRO" db; force=True rebuilds an existing cache."""
+        if self._interpro_cache is not None and force:  # drop the current cache so it is rebuilt below
+            self._interpro_cache = None
+
+        if self._interpro_cache is None:  # nothing cached yet, or just dropped by force
+            self._interpro_cache = {}
+            if self.cache_existing:  # when falsy, the cache is left as an empty dict
+                res = self.session.query(self.model.dbxref.accession, self.model.cvterm.cvterm_id) \
+                    .join(self.model.db, self.model.db.db_id == self.model.dbxref.db_id) \
+                    .filter(self.model.db.name == "INTERPRO") \
+                    .join(self.model.cvterm, self.model.dbxref.dbxref_id == self.model.cvterm.dbxref_id)
+
+                self._interpro_cache = {x.accession: x.cvterm_id for x in res}  # one entry per row of the query
diff --git a/chado/load/__init__.py b/chado/load/__init__.py
@@ -275,6 +275,7 @@ def interpro(self, analysis_id, organism_id, input, parse_go=False, re_name=None
         # Cache analysisfeature content for given analysis_id
         self._init_analysisfeature_cache(analysis_id)
         self._init_featcvterm_cache()
+        self._init_interpro_cache()
 
         # Cache all existing cvterms from GO cv
         db = 'GO'
@@ -501,13 +502,19 @@ def _load_ipr_terms(self, ipr_terms, feature_id, analysis_id, skip_missing):
                 # load the IPR terms that way, we need to just add them
                 # as we encounter them. If the term already exists
                 # we do not want to update it.
-                cvterm_id = self.ci.create_cvterm(ipr_term['ipr_name'], 'INTERPRO', 'INTERPRO', term_definition=ipr_term['ipr_desc'], accession=ipr_id)
-                if not cvterm_id:
-                    if skip_missing:
-                        warn('Could not find cvterm %s %s, skipping it', ipr_id, ipr_term['ipr_name'])
-                        continue
-                    else:
-                        raise Exception('Could not find cvterm %s %s' % ipr_id, ipr_term['ipr_name'])
+
+                # Check using IPRnumber (in case ipr_name changed at some point in time)
+                if ipr_id in self._interpro_cache:
+                    cvterm_id = self._interpro_cache[ipr_id]
+                else:
+                    cvterm_id = self.ci.create_cvterm(ipr_term['ipr_name'], 'INTERPRO', 'INTERPRO', term_definition=ipr_term['ipr_desc'], accession=ipr_id)
+                    if not cvterm_id:
+                        if skip_missing:
+                            warn('Could not find cvterm %s %s, skipping it', ipr_id, ipr_term['ipr_name'])
+                            continue
+                        else:
+                            raise Exception('Could not find cvterm %s %s' % ipr_id, ipr_term['ipr_name'])
+                    self._interpro_cache[ipr_id] = cvterm_id
 
                 # Insert IPR terms into the feature_cvterm table
                 # the default pub_id of 1 (NULL) is used. if the cvterm already exists then just skip adding it