@@ -325,7 +325,7 @@ def parse_author_affiliation(medline):
325
325
----------
326
326
medline: Element
327
327
The lxml node pointing to a medline document
328
-
328
+
329
329
Returns
330
330
-------
331
331
authors: list
@@ -343,13 +343,17 @@ def parse_author_affiliation(medline):
343
343
else :
344
344
forename = ""
345
345
if author .find ("Initials" ) is not None :
346
- firstname = (author .find ("Initials" ).text or "" ).strip () or ""
346
+ initials = (author .find ("Initials" ).text or "" ).strip () or ""
347
347
else :
348
- firstname = ""
348
+ initials = ""
349
349
if author .find ("LastName" ) is not None :
350
350
lastname = (author .find ("LastName" ).text or "" ).strip () or ""
351
351
else :
352
352
lastname = ""
353
+ if author .find ("Identifier" ) is not None :
354
+ identifier = (author .find ("Identifier" ).text or "" ).strip () or ""
355
+ else :
356
+ identifier = ""
353
357
if author .find ("AffiliationInfo/Affiliation" ) is not None :
354
358
affiliation = author .find ("AffiliationInfo/Affiliation" ).text or ""
355
359
affiliation = affiliation .replace (
@@ -360,9 +364,10 @@ def parse_author_affiliation(medline):
360
364
affiliation = ""
361
365
authors .append (
362
366
{
363
- "forename" : forename ,
364
- "firstname" : firstname ,
365
367
"lastname" : lastname ,
368
+ "forename" : forename ,
369
+ "initials" : initials ,
370
+ "identifier" : identifier ,
366
371
"affiliation" : affiliation ,
367
372
}
368
373
)
@@ -426,7 +431,7 @@ def parse_references(pubmed_article, reference_list):
426
431
pubmed_article: Element
427
432
The lxml element pointing to a medline document
428
433
429
- reference_list: bool
434
+ reference_list: bool
430
435
if it is True, return a list of dictionaries
431
436
if it is False, return a string of PMIDs separated by a semicolon ';'
432
437
@@ -495,7 +500,7 @@ def parse_article_info(
495
500
article: dict
496
501
Dictionary containing information about the article, including
497
502
`title`, `abstract`, `journal`, `authors`, `affiliations`, `pubdate`,
498
- `pmid`, `other_id`, `mesh_terms`, and `keywords`. The field
503
+ `pmid`, `other_id`, `mesh_terms`, `pages`, `issue`, and `keywords`. The field
499
504
`delete` is always `False` because this function parses
500
505
articles that by definition are not deleted.
501
506
"""
@@ -507,6 +512,26 @@ def parse_article_info(
507
512
else :
508
513
title = ""
509
514
515
+ if article .find ("Journal/JournalIssue/Volume" ) is not None :
516
+ volume = article .find ("Journal/JournalIssue/Volume" ).text or ""
517
+ else :
518
+ volume = ""
519
+
520
+ if article .find ("Journal/JournalIssue/Issue" ) is not None :
521
+ issue = article .find ("Journal/JournalIssue/Issue" ).text or ""
522
+ else :
523
+ issue = ""
524
+
525
+ if volume == "" :
526
+ issue = ""
527
+ else :
528
+ issue = f"{ volume } ({ issue } )"
529
+
530
+ if article .find ("Pagination/MedlinePgn" ) is not None :
531
+ pages = article .find ("Pagination/MedlinePgn" ).text or ""
532
+ else :
533
+ pages = ""
534
+
510
535
category = "NlmCategory" if nlm_category else "Label"
511
536
if article .find ("Abstract/AbstractText" ) is not None :
512
537
# parsing structured abstract
@@ -540,7 +565,8 @@ def parse_article_info(
540
565
)
541
566
authors = ";" .join (
542
567
[
543
- author .get ("firstname" , "" ) + " " + author .get ("lastname" , "" )
568
+ author .get ("lastname" , "" ) + "|" + author .get ("forename" , "" ) + "|" +
569
+ author .get ("initials" , "" ) + "|" + author .get ("identifier" , "" )
544
570
for author in authors_dict
545
571
]
546
572
)
@@ -561,6 +587,8 @@ def parse_article_info(
561
587
journal_info_dict = parse_journal_info (medline )
562
588
dict_out = {
563
589
"title" : title ,
590
+ "issue" : issue ,
591
+ "pages" : pages ,
564
592
"abstract" : abstract ,
565
593
"journal" : journal_name ,
566
594
"authors" : authors ,
@@ -609,7 +637,7 @@ def parse_medline_xml(
609
637
if False, this will parse structured abstract where each section will be assigned to
610
638
NLM category of each sections
611
639
default: False
612
- author_list: bool
640
+ author_list: bool
613
641
if True, return parsed author output as a list of authors
614
642
if False, return parsed author output as a string of authors concatenated with ``;``
615
643
default: False
@@ -664,6 +692,8 @@ def parse_medline_xml(
664
692
"issn_linking" : np .nan ,
665
693
"country" : np .nan ,
666
694
"references" : np .nan ,
695
+ "issue" : np .nan ,
696
+ "pages" : np .nan ,
667
697
}
668
698
for p in delete_citations
669
699
]
0 commit comments