From 2b82771bd5da1272cc32006d9843a2171e8166c5 Mon Sep 17 00:00:00 2001
From: Marcel Bollmann
Date: Sun, 8 Jun 2025 16:56:15 +0200
Subject: [PATCH 1/2] Port MIT Press ingestion script to use our library

---
 bin/ingest_mitpress.py | 277 ++++++++++++-----------------------------
 1 file changed, 78 insertions(+), 199 deletions(-)

diff --git a/bin/ingest_mitpress.py b/bin/ingest_mitpress.py
index 29fd53fd7d..c042de18ce 100755
--- a/bin/ingest_mitpress.py
+++ b/bin/ingest_mitpress.py
@@ -2,6 +2,7 @@
 """
 Convert MIT Press XML files for CL and TACL to Anthology XML.
 
+version 0.6 - uses new Python library for ingestion
 version 0.5 - reads from new MIT Press format.
 version 0.4 - now updates XML directly, skips existing papers, sorts by page number
 version 0.3 - produces anthology ID in new format 2020.cl-1.1
@@ -30,7 +31,7 @@
 
 Warning (August 2020): not yet tested with CL, but should work!
 
-Authors: Arya D. McCarthy, Matt Post
+Authors: Arya D. McCarthy, Matt Post, Marcel Bollmann
 """
 import os
 import shutil
@@ -40,11 +41,16 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 
-from anthology import Anthology, Paper, Volume
-from normalize_anth import normalize
-from anthology.utils import make_simple_element, indent, compute_hash_from_file
+from acl_anthology import Anthology
+from acl_anthology.files import PDFReference
+from acl_anthology.people import Name, NameSpecification as NameSpec
+from acl_anthology.text import MarkupText
+from acl_anthology.utils import setup_rich_logging
 
-__version__ = "0.5"
+from fixedcase.protect import protect
+
+
+__version__ = "0.6"
 
 TACL = "tacl"
 CL = "cl"
@@ -108,43 +114,6 @@ def get_title(xml_front_node: etree.Element) -> str:
     return title_text
 
 
-def get_year(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
-
-    article_meta = xml_front_node.find("article-meta", nsmap)
-    pub_date = article_meta.find("pub-date", nsmap)
-    year_text = pub_date.find("year", nsmap).text
-    return year_text
-
-
-def get_month(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
-
-    article_meta = xml_front_node.find("article-meta", nsmap)
-    pub_date = article_meta.find("pub-date", nsmap)
-    try:
-        month_id = pub_date.find("month", nsmap).text
-    except AttributeError:
-        return None
-    months = [
-        None,
-        "January",
-        "February",
-        "March",
-        "April",
-        "May",
-        "June",
-        "July",
-        "August",
-        "September",
-        "October",
-        "November",
-        "December",
-    ]
-    month_text = months[int(month_id)]
-    return month_text
-
-
 def get_abstract(xml_front_node: etree.Element) -> str:
     nsmap = xml_front_node.nsmap
 
@@ -268,86 +237,25 @@ def process_xml(xml: Path, is_tacl: bool) -> Optional[etree.Element]:
 
     info, issue, volume = get_article_journal_info(front, is_tacl)
 
-    paper = etree.Element("paper")
-
-    title_text = get_title(front)
-    title = etree.Element("title")
-    title.text = title_text
-    paper.append(title)
-
-    authors = get_authors(front)
-    for given_names, surname in authors:
-        first = etree.Element("first")
-        first.text = given_names
-
-        last = etree.Element("last")
-        last.text = surname
-
-        author = etree.Element("author")
-        author.append(first)
-        author.append(last)
-
-        paper.append(author)
-
-    doi_text = get_doi(front)
-    doi = etree.Element("doi")
-    doi.text = doi_text
-    paper.append(doi)
-
-    abstract_text = get_abstract(front)
-    if abstract_text:
-        make_simple_element("abstract", abstract_text, parent=paper)
-
-    pages_tuple = get_pages(front)
-    pages = etree.Element("pages")
-    pages.text = "–".join(pages_tuple)  # en-dash, not hyphen!
-    paper.append(pages)
+    paper = {
+        "title": get_title(front),
+        "authors": [
+            {
+                "first": given_names,
+                "last": surname,
+            }
+            for given_names, surname in get_authors(front)
+        ],
+        "doi": get_doi(front),
+        "abstract": get_abstract(front),
+        "pages": get_pages(front),  # tuple
+    }
 
     return paper, info, issue, volume
 
 
-def issue_info_to_node(
-    issue_info: str, year_: str, journal_issue: str, venue: str, volume: str
-) -> etree.Element:
-    """Creates the meta block for a new issue / volume"""
-    meta = make_simple_element("meta")
-
-    assert int(year_)
-
-    make_simple_element("booktitle", issue_info, parent=meta)
-    make_simple_element("publisher", "MIT Press", parent=meta)
-    make_simple_element("address", "Cambridge, MA", parent=meta)
-
-    if venue == "cl":
-        month_text = issue_info.split()[-2]  # blah blah blah month year
-        if month_text not in {
-            "January",
-            "February",
-            "March",
-            "April",
-            "May",
-            "June",
-            "July",
-            "August",
-            "September",
-            "October",
-            "November",
-            "December",
-        }:
-            logging.error("Unknown month: " + month_text)
-        make_simple_element("month", month_text, parent=meta)
-
-    make_simple_element("year", str(year_), parent=meta)
-    make_simple_element("venue", venue, parent=meta)
-    make_simple_element("journal-volume", volume, parent=meta)
-
-    return meta
-
-
 def main(args):
-    anthology = Anthology(
-        importdir=os.path.join(args.anthology_dir, "data"), require_bibkeys=False
-    )
+    anthology = Anthology(datadir=os.path.join(args.anthology_dir, "data"))
 
     is_tacl = "tacl" in args.root_dir.stem
     logging.info(f"Looks like a {'TACL' if is_tacl else 'CL'} ingestion")
@@ -363,24 +271,13 @@ def main(args):
         sys.exit(-1)
 
     collection_id = str(year) + "." + venue
-
-    collection_file = os.path.join(
-        args.anthology_dir, "data", "xml", f"{collection_id}.xml"
-    )
-    if os.path.exists(collection_file):
-        collection = etree.parse(collection_file).getroot()
-    else:
-        collection = make_simple_element("collection", attrib={"id": collection_id})
-
-    # volume_info = get_volume_info(list(args.year_root.glob("*.*.*/*.*.*.xml"))[0])
-    # volume.append(volume_info)
-
-    previous_issue_info = None
+    if (collection := anthology.collections.get(collection_id)) is None:
+        collection = anthology.collections.create(collection_id)
 
     papers = []
     for xml in sorted(args.root_dir.glob("*.xml")):
-        papernode, issue_info, issue, volume = process_xml(xml, is_tacl)
-        if papernode is None or papernode.find("title").text.startswith("Erratum: “"):
+        paper_dict, issue_info, issue, volume_num = process_xml(xml, is_tacl)
+        if paper_dict["title"].startswith("Erratum: “"):
             continue
 
         pdf_path = xml.parent / xml.with_suffix(".pdf").name
@@ -388,7 +285,7 @@ def main(args):
            logging.error(f"Missing pdf for {pdf_path}")
            sys.exit(1)
 
-        papers.append((papernode, pdf_path, issue_info, issue))
+        papers.append((paper_dict, pdf_path, issue_info, issue))
 
     pdf_destination = Path(args.pdfs_dir)
     pdf_destination = pdf_destination / "pdf" / venue
@@ -396,89 +293,71 @@
 
     # MIT Press does assign its IDs in page order, so we have to sort by page
     def sort_papers_by_page(paper_tuple):
-        papernode = paper_tuple[0]
-        startpage = int(papernode.find("./pages").text.split("–")[0])
+        startpage = int(paper_tuple[0]["pages"][0])
         return startpage
 
-    paper_id = 1  # Stupid non-enumerate counter because of "Erratum: " papers interleaved with real ones.
-    for papernode, pdf_path, issue_info, issue in sorted(papers, key=sort_papers_by_page):
+    for paper_dict, pdf_path, issue_info, issue in sorted(
+        papers, key=sort_papers_by_page
+    ):
         issue = issue or "1"
 
-        if issue_info != previous_issue_info:
-            # Emit the new volume info before the paper.
-            logging.info("New issue")
-            logging.info(f"{issue_info} vs. {previous_issue_info}")
-            previous_issue_info = issue_info
-
-            # Look for node in tree, else create it
-            volume_xml = collection.find(f'./volume[@id="{issue}"]')
-            if volume_xml is None:
-                # xml volume = journal issue
-                volume_xml = make_simple_element(
-                    "volume", attrib={"id": issue, "type": "journal"}, parent=collection
-                )
-                volume_xml.append(
-                    issue_info_to_node(issue_info, year, issue, venue, volume)
-                )
-                paper_id = 1
-            else:
-                for paper in volume_xml.findall(".//paper"):
-                    paper_id = max(paper_id, int(paper.attrib["id"]))
+        if (volume := collection.get(issue)) is None:
+            logging.info(f"New issue: {issue_info}")
 
-            paper_id += 1
-
-        anth_id = f"{collection_id}-{issue}.{paper_id}"
+            if venue == "cl":
+                month = issue_info.split()[-2]  # blah blah blah month year
+                if month not in MONTHS.values():
+                    logging.error("Unknown month: " + month)
+            else:
+                month = None
+
+            volume = collection.create_volume(
+                issue,
+                title=MarkupText.from_string(issue_info),
+                type="journal",
+                year=str(year),
+                month=month,
+                publisher="MIT Press",
+                address="Cambridge, MA",
+                venue_ids=[venue],
+                journal_volume=volume_num,
+                journal_issue=issue,
+            )
 
         # Check if the paper is already present in the volume
-        doi_text = papernode.find("./doi").text
-        doi_node = collection.xpath(f'.//doi[text()="{doi_text}"]')
-        if len(doi_node):
-            logging.info(
-                f"Skipping existing paper {anth_id}/{doi_text} with title {papernode.find('title').text}"
-            )
+        if any(paper.doi == paper_dict["doi"] for paper in volume.papers()):
+            logging.info(f"Skipping existing paper with DOI {paper_dict['doi']}")
             continue
 
-        papernode.attrib["id"] = f"{paper_id}"
+        paper = volume.create_paper(
+            title=MarkupText.from_latex_maybe(paper_dict["title"]),
+            abstract=MarkupText.from_latex_maybe(paper_dict["abstract"]),
+            doi=paper_dict["doi"],
+            pages="–".join(paper_dict["pages"]),  # bit inelegant?
+            authors=[
+                NameSpec(Name.from_dict(author)) for author in paper_dict["authors"]
+            ],
+        )
+        anth_id = paper.full_id
         destination = pdf_destination / f"{anth_id}.pdf"
-        print(f"Copying {pdf_path} to {destination}")
+        logging.info(f"Copying {pdf_path} to {destination}")
         shutil.copyfile(pdf_path, destination)
-        checksum = compute_hash_from_file(pdf_path)
-
-        url_text = anth_id
-        url = etree.Element("url")
-        url.attrib["hash"] = checksum
-        url.text = url_text
-        papernode.append(url)
-
-        # Generate bibkey
-        volume = Volume.from_xml(
-            volume_xml,
-            collection_id,
-            anthology.venues,
-            anthology.sigs,
-            anthology.formatter,
-        )
-        paper = Paper.from_xml(papernode, volume, anthology.formatter)
-        bibkey = anthology.pindex.create_bibkey(paper, vidx=anthology.venues)
-        make_simple_element("bibkey", bibkey, parent=papernode)
-
-        # Normalize
-        for oldnode in papernode:
-            normalize(oldnode, informat="latex")
-        volume_xml.append(papernode)
+        paper.pdf = PDFReference.from_file(destination)
 
-        paper_id += 1
+        # TODO: fixedcase currently not ported to new library
+        xml_title = paper.title.to_xml("title")
+        protect(xml_title)
+        paper.title = MarkupText.from_xml(xml_title)
 
-    indent(collection)  # from anthology.utils
-    et = etree.ElementTree(collection)
-    et.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
+    # All serialization to XML happens here
+    collection.save()
 
 
 if __name__ == "__main__":
     import sys
 
-    if sys.version_info < (3, 6):
-        sys.stderr.write("Python >=3.6 required.\n")
+    if sys.version_info < (3, 10):
+        sys.stderr.write("Python >=3.10 required.\n")
         sys.exit(1)
 
     import argparse
@@ -519,6 +398,6 @@ def sort_papers_by_page(paper_tuple):
     args = parser.parse_args()
     args.root_dir = args.root_dir.resolve()  # Get absolute path.
 
-    logging.basicConfig(level=args.verbose)
+    setup_rich_logging(level=args.verbose)
 
     main(args)

From cc1d45226a9ae6c2bd1230e567073c30b540a4ae Mon Sep 17 00:00:00 2001
From: Marcel Bollmann
Date: Sun, 8 Jun 2025 17:33:11 +0200
Subject: [PATCH 2/2] Attempt to satisfy type-checker, needs quite a lot of ignores currently

---
 bin/ingest_mitpress.py | 72 ++++++++++++++++++++++++------------------
 justfile               | 12 +++++++
 2 files changed, 53 insertions(+), 31 deletions(-)
 create mode 100644 justfile

diff --git a/bin/ingest_mitpress.py b/bin/ingest_mitpress.py
index c042de18ce..f6887c189f 100755
--- a/bin/ingest_mitpress.py
+++ b/bin/ingest_mitpress.py
@@ -33,13 +33,17 @@
 
 Authors: Arya D. McCarthy, Matt Post, Marcel Bollmann
 """
+# mypy: disable-error-code="union-attr"
+# ^---- Reason for this and most "type: ignore" comments:
+#       code never checks if etree functions return None
+
 import os
 import shutil
 import logging
 import lxml.etree as etree
 
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Any, Optional, cast
 
 from acl_anthology import Anthology
 from acl_anthology.files import PDFReference
@@ -75,7 +79,7 @@ def collapse_spaces(text: str) -> str:
     return " ".join(text.split())
 
 
-def get_volume_info(xml: Path) -> str:
+def get_volume_info(xml: Path) -> etree._Element:
     logging.info("Getting volume info from {}".format(xml))
     # So far, their XML for the volume doesn't play nicely with xml.etree. Thus, we hack.
     paper = etree.Element("paper")
@@ -104,23 +108,23 @@ def get_paperid(xml: Path, count: int, issue_count: int) -> str:
     return f"{issue_count}.{count}"  # after dash in new anth id
 
 
-def get_title(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
+def get_title(xml_front_node: etree._Element) -> str:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     title_group = article_meta.find("title-group", nsmap)
     title = title_group.find("article-title", nsmap)
-    title_text = collapse_spaces("".join(title.itertext()))
+    title_text = collapse_spaces("".join(title.itertext()))  # type: ignore[arg-type]
     return title_text
 
 
-def get_abstract(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
+def get_abstract(xml_front_node: etree._Element) -> Optional[str]:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     abstract = article_meta.find("abstract", nsmap)
     if abstract is not None:
-        abstract_text = collapse_spaces("".join(abstract.itertext()))
+        abstract_text = collapse_spaces("".join(abstract.itertext()))  # type: ignore[arg-type]
         # 2022/June abstracts all started with "Abstract "
         if abstract_text.startswith("Abstract "):
             abstract_text = abstract_text[9:]
@@ -129,48 +133,50 @@ def get_abstract(xml_front_node: etree.Element) -> str:
     return None
 
 
-def get_authors(xml_front_node: etree.Element) -> List[Tuple[str, str]]:
-    nsmap = xml_front_node.nsmap
+def get_authors(xml_front_node: etree._Element) -> list[tuple[str | None, str]]:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     contrib_group = article_meta.find("contrib-group", nsmap)
 
     authors = []
     for author in contrib_group.findall("contrib", nsmap):
         string_name = author.find("name", nsmap)
-        try:
-            given_names = string_name.find("given-names", nsmap).text
-        except AttributeError:
-            given_names = ""  # Special case for Mausam, and potentially Madonna.
-        surname = string_name.find("surname", nsmap).text
-        try:
-            suffix = string_name.find("suffix", nsmap).text
-            surname = surname + " " + suffix
-        except AttributeError:
-            pass
+        if (node := string_name.find("given-names", nsmap)) is not None:
+            given_names = str(node.text) if node.text else None
+        else:
+            given_names = None  # Special case for Mausam, and potentially Madonna.
+        surname = cast(str, string_name.find("surname", nsmap).text)
+        if (node := string_name.find("suffix", nsmap)) is not None:
+            if node.text is not None:
+                surname = f"{surname} {node.text}"
         authors.append((given_names, surname))
     return authors
 
 
-def get_pages(xml_front_node: etree.Element) -> Tuple[str, str]:
-    nsmap = xml_front_node.nsmap
+def get_pages(xml_front_node: etree._Element) -> tuple[str, str]:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     fpage = article_meta.find("fpage", nsmap)
     lpage = article_meta.find("lpage", nsmap)
+    assert fpage.text is not None and lpage.text is not None
     return fpage.text, lpage.text
 
 
-def get_doi(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
+def get_doi(xml_front_node: etree._Element) -> str:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     doi_ = article_meta.find("*[@pub-id-type='doi']", nsmap)
+    assert doi_.text is not None
     return doi_.text
 
 
-def get_article_journal_info(xml_front_node: etree.Element, is_tacl: bool) -> str:
+def get_article_journal_info(
+    xml_front_node: etree._Element, is_tacl: bool
+) -> tuple[str, str | None, str]:
     """ """
-    nsmap = xml_front_node.nsmap
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     journal_meta = xml_front_node.find("journal-meta", nsmap)
@@ -192,7 +198,8 @@ def get_article_journal_info(xml_front_node: etree.Element, is_tacl: bool) -> st
         "Association of Computational Linguistics",
         "Association for Computational Linguistics",
     )
-    volume_text = volume.text.lstrip("0")  # Sometimes we find "06" instead of "6"
+    assert volume.text is not None
+    volume_text = str(volume.text).lstrip("0")  # Sometimes we find "06" instead of "6"
 
     if is_tacl:
         issue_text = None
@@ -200,7 +207,8 @@
         format_string = "{journal}, Volume {volume}"
     else:
         issue = article_meta.find("issue", nsmap)
-        issue_text = issue.text
+        assert issue.text is not None
+        issue_text = str(issue.text)
 
         string_date_text = None
         for pub_date in article_meta.findall("pub-date", nsmap):
@@ -212,7 +220,7 @@
                 break
 
         if string_date_text is None:
-            print("Fatal: found no year/date", file=sys.stderr)
+            logging.critical("Found no year/date")
             sys.exit(1)
 
         format_string = "{journal}, Volume {volume}, Issue {issue} - {date}"
@@ -227,13 +235,15 @@
     return format_string.format(**data), issue_text, volume_text
 
 
-def process_xml(xml: Path, is_tacl: bool) -> Optional[etree.Element]:
+def process_xml(xml: Path, is_tacl: bool) -> tuple[dict[str, Any], str, str | None, str]:
     """ """
     logging.info("Reading {}".format(xml))
     tree = etree.parse(open(str(xml)))
     root = tree.getroot()
 
-    front = root.find("front", root.nsmap)
+    nsmap = {k: v for k, v in root.nsmap.items() if k is not None}
+    front = root.find("front", nsmap)
+    assert isinstance(front, etree._Element)
 
     info, issue, volume = get_article_journal_info(front, is_tacl)
 
diff --git a/justfile b/justfile
new file mode 100644
index 0000000000..06298516a4
--- /dev/null
+++ b/justfile
@@ -0,0 +1,12 @@
+@_default:
+    just -l
+
+
+# Access commands from the Python library (`just -l python` to list them)
+mod python
+
+
+# Run type-checker on a single file, intended for bin/ files
+[no-cd]
+typecheck FILE:
+    env MYPYPATH={{justfile_directory()}}/python mypy --follow-imports silent {{FILE}}