Skip to content

Commit 42c8ccc

Browse files
authored
Merge pull request #100 from Genesis-Research/master
Added issue, page number, and firstname to medline_parse_xml()
2 parents f5d52d2 + b63ee03 commit 42c8ccc

File tree

3 files changed

+42
-10
lines changed

3 files changed

+42
-10
lines changed

Diff for: pubmed_parser/medline_parser.py

+39-9
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ def parse_author_affiliation(medline):
325325
----------
326326
medline: Element
327327
The lxml node pointing to a medline document
328-
328+
329329
Returns
330330
-------
331331
authors: list
@@ -343,13 +343,17 @@ def parse_author_affiliation(medline):
343343
else:
344344
forename = ""
345345
if author.find("Initials") is not None:
346-
firstname = (author.find("Initials").text or "").strip() or ""
346+
initials = (author.find("Initials").text or "").strip() or ""
347347
else:
348-
firstname = ""
348+
initials = ""
349349
if author.find("LastName") is not None:
350350
lastname = (author.find("LastName").text or "").strip() or ""
351351
else:
352352
lastname = ""
353+
if author.find("Identifier") is not None:
354+
identifier = (author.find("Identifier").text or "").strip() or ""
355+
else:
356+
identifier = ""
353357
if author.find("AffiliationInfo/Affiliation") is not None:
354358
affiliation = author.find("AffiliationInfo/Affiliation").text or ""
355359
affiliation = affiliation.replace(
@@ -360,9 +364,10 @@ def parse_author_affiliation(medline):
360364
affiliation = ""
361365
authors.append(
362366
{
363-
"forename": forename,
364-
"firstname": firstname,
365367
"lastname": lastname,
368+
"forename": forename,
369+
"initials": initials,
370+
"identifier": identifier,
366371
"affiliation": affiliation,
367372
}
368373
)
@@ -426,7 +431,7 @@ def parse_references(pubmed_article, reference_list):
426431
pubmed_article: Element
427432
The lxml element pointing to a medline document
428433
429-
reference_list: bool
434+
reference_list: bool
430435
if it is True, return a list of dictionaries
431436
if it is False, return a string of PMIDs separated by semicolon ';'
432437
@@ -495,7 +500,7 @@ def parse_article_info(
495500
article: dict
496501
Dictionary containing information about the article, including
497502
`title`, `abstract`, `journal`, `authors`, `affiliations`, `pubdate`,
498-
`pmid`, `other_id`, `mesh_terms`, and `keywords`. The field
503+
`pmid`, `other_id`, `mesh_terms`, `pages`, `issue`, and `keywords`. The field
499504
`delete` is always `False` because this function parses
500505
articles that by definition are not deleted.
501506
"""
@@ -507,6 +512,26 @@ def parse_article_info(
507512
else:
508513
title = ""
509514

515+
if article.find("Journal/JournalIssue/Volume") is not None:
516+
volume = article.find("Journal/JournalIssue/Volume").text or ""
517+
else:
518+
volume = ""
519+
520+
if article.find("Journal/JournalIssue/Issue") is not None:
521+
issue = article.find("Journal/JournalIssue/Issue").text or ""
522+
else:
523+
issue = ""
524+
525+
if volume == "":
526+
issue = ""
527+
else:
528+
issue = f"{volume}({issue})"
529+
530+
if article.find("Pagination/MedlinePgn") is not None:
531+
pages = article.find("Pagination/MedlinePgn").text or ""
532+
else:
533+
pages = ""
534+
510535
category = "NlmCategory" if nlm_category else "Label"
511536
if article.find("Abstract/AbstractText") is not None:
512537
# parsing structured abstract
@@ -540,7 +565,8 @@ def parse_article_info(
540565
)
541566
authors = ";".join(
542567
[
543-
author.get("firstname", "") + " " + author.get("lastname", "")
568+
author.get("lastname", "") + "|" + author.get("forename", "") + "|" +
569+
author.get("initials", "") + "|" + author.get("identifier", "")
544570
for author in authors_dict
545571
]
546572
)
@@ -561,6 +587,8 @@ def parse_article_info(
561587
journal_info_dict = parse_journal_info(medline)
562588
dict_out = {
563589
"title": title,
590+
"issue": issue,
591+
"pages": pages,
564592
"abstract": abstract,
565593
"journal": journal_name,
566594
"authors": authors,
@@ -609,7 +637,7 @@ def parse_medline_xml(
609637
if False, this will parse structured abstract where each section will be assigned to
610638
NLM category of each sections
611639
default: False
612-
author_list: bool
640+
author_list: bool
613641
if True, return parsed author output as a list of authors
614642
if False, return parsed author output as a string of authors concatenated with ``;``
615643
default: False
@@ -664,6 +692,8 @@ def parse_medline_xml(
664692
"issn_linking": np.nan,
665693
"country": np.nan,
666694
"references": np.nan,
695+
"issue": np.nan,
696+
"pages": np.nan,
667697
}
668698
for p in delete_citations
669699
]

Diff for: setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
if __name__ == "__main__":
55
setup(
66
name="pubmed_parser",
7-
version="0.2.2",
7+
version="0.3.0",
88
description="A python parser for Pubmed Open-Access Subset and MEDLINE XML repository",
99
url="https://github.com/titipata/pubmed_parser",
1010
download_url="https://github.com/titipata/pubmed_parser.git",

Diff for: tests/test_medline_parser.py

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ def test_parse_medline_xml():
1818
len([p for p in parsed_medline if len(p["title"]) > 0]) == 30000
1919
), "Expect every records to have title"
2020
assert parsed_medline[0]["title"][0:50] == expected_title
21+
assert parsed_medline[0]["issue"] == "50(2)"
22+
assert parsed_medline[0]["pages"] == "123-33"
2123
assert parsed_medline[0]["abstract"][0:50] == expected_abstract
2224
assert parsed_medline[0]["pmid"] == "399296"
2325

0 commit comments

Comments
 (0)