From 674402bdc3c5d2a4fdba89491d8979e36432afea Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Tue, 2 Jan 2024 13:38:44 +0100
Subject: [PATCH 1/2] Improve safety of NER build

This PR updates the NER index build to skip entries whose normalized
text is None, empty, or contains only whitespace. This shouldn't happen
in practice, but depending on the pipeline used to construct terms, it
might.

A few alternatives to consider:

1. Have the Term class raise errors when None is given for norm_text
2. Raise an error in the NER index build
---
 gilda/grounder.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gilda/grounder.py b/gilda/grounder.py
index 6413ccb..1ddb300 100644
--- a/gilda/grounder.py
+++ b/gilda/grounder.py
@@ -111,7 +111,11 @@ def __init__(
     def _build_prefix_index(self):
         prefix_index = defaultdict(set)
         for norm_term in self.entries:
+            if not norm_term:
+                continue
             parts = norm_term.split()
+            if not parts:
+                continue
             prefix_index[parts[0]].add(len(parts))
         self.prefix_index = dict(prefix_index)
 

From 7b72202d2d65f6a06bd70e1c8578f1e7524ac4b3 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Mon, 5 Feb 2024 14:23:01 +0100
Subject: [PATCH 2/2] Update term.py

Reject a Term whose normalized text is None, empty, or whitespace-only
by raising a ValueError. The None case is checked before calling
.strip() so that it produces the ValueError rather than an
AttributeError.
---
 gilda/term.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gilda/term.py b/gilda/term.py
index b44d164..6d5f988 100644
--- a/gilda/term.py
+++ b/gilda/term.py
@@ -52,6 +52,8 @@ def __init__(self, norm_text, text, db, id, entry_name, status, source,
                  organism=None, source_db=None, source_id=None):
         if not text:
             raise ValueError('Text for Term cannot be empty')
+        if not norm_text or not norm_text.strip():
+            raise ValueError('Normalized text for Term cannot be empty')
         self.norm_text = norm_text
         self.text = text
         self.db = db