Skip to content

Commit b4ae1e7

Browse files
authored
Merge pull request #1477 from gyorilab/edirect-fix
Edirect esearch fix
2 parents 0e4d97f + acc58b0 commit b4ae1e7

File tree

2 files changed

+14
-7
lines changed

2 files changed

+14
-7
lines changed

indra/literature/pubmed_client.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -370,7 +370,11 @@ def get_full_xml_by_pmids(
370370
"for instructions.")
371371

372372
tree = lxml_etree.fromstring(xml_bytes, parser=parser)
373-
# Each article is in a <PubmedArticle> tag, encapsulated in a <PubmedArticleSet> tag
373+
# Each article is in a <PubmedArticle> tag, encapsulated in a
374+
# <PubmedArticleSet> tag.
375+
# Note that the <PubmedArticle> tags are sorted by PMID numerically e.g.,
376+
# 10, 11, 20, 1000, and not lexicographically e.g., 10, 1000, 11, 20,
377+
# regardless of the order in which the pmids are passed.
374378
if fname is not None:
375379
pretty_save_xml(tree, fname)
376380
return tree
@@ -1250,9 +1254,10 @@ def get_all_ids(search_term):
12501254
"for instructions.")
12511255
# Output is divided by new lines
12521256
elements = res.split('\n')
1253-
# If there are more than 10k IDs, the CLI outputs a . for each
1254-
# iteration, these have to be filtered out
1255-
pmids = [e for e in elements if '.' not in e]
1257+
# The CLI automatically retries on errors; subprocess.getoutput unfortunately
1258+
# adds the error message associated with the retry to the output, so we need
1259+
# to filter out non-numeric elements
1260+
pmids = [e for e in elements if e.isdigit()]
12561261
return pmids
12571262

12581263

indra/tests/test_pubmed_client.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -283,14 +283,16 @@ def test_get_article_from_full_xml2():
283283
@pytest.mark.nogha
284284
def test_get_full_xml_by_pmids():
285285
# Uses edirect CLI
286-
pmids = ["35814366", "35814367"]
286+
pmids = ["35814367", "35814366"]
287+
pmids_int_sort = sorted(pmids, key=int)
287288
full_xml = pubmed_client.get_full_xml_by_pmids(pmids)
288289
pubmed_articles = full_xml.findall(".//PubmedArticle")
289290
assert len(pubmed_articles) == 2, len(pubmed_articles)
290-
for pmid, pubmed_article in zip(pmids, pubmed_articles):
291+
for pmid_int_sort, pubmed_article in zip(pmids_int_sort, pubmed_articles):
291292
xml_pmid = pubmed_article.find(".//PMID")
292293
assert xml_pmid is not None
293-
assert xml_pmid.text == pmid, xml_pmid.text
294+
# Check that the pubmed articles are sorted numerically by PMID
295+
assert xml_pmid.text == pmid_int_sort, xml_pmid.text
294296

295297

296298
@pytest.mark.webservice

0 commit comments

Comments
 (0)