From 2b82771bd5da1272cc32006d9843a2171e8166c5 Mon Sep 17 00:00:00 2001
From: Marcel Bollmann
Date: Sun, 8 Jun 2025 16:56:15 +0200
Subject: [PATCH 1/2] Port MIT Press ingestion script to use our library

---
 bin/ingest_mitpress.py | 277 ++++++++++++-----------------------------
 1 file changed, 78 insertions(+), 199 deletions(-)

diff --git a/bin/ingest_mitpress.py b/bin/ingest_mitpress.py
index 29fd53fd7d..c042de18ce 100755
--- a/bin/ingest_mitpress.py
+++ b/bin/ingest_mitpress.py
@@ -2,6 +2,7 @@
 """
 Convert MIT Press XML files for CL and TACL to Anthology XML.
 
+version 0.6 - uses new Python library for ingestion
 version 0.5 - reads from new MIT Press format.
 version 0.4 - now updates XML directly, skips existing papers, sorts by page number
 version 0.3 - produces anthology ID in new format 2020.cl-1.1
@@ -30,7 +31,7 @@
 
 Warning (August 2020): not yet tested with CL, but should work!
 
-Authors: Arya D. McCarthy, Matt Post
+Authors: Arya D. McCarthy, Matt Post, Marcel Bollmann
 """
 import os
 import shutil
@@ -40,11 +41,16 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 
-from anthology import Anthology, Paper, Volume
-from normalize_anth import normalize
-from anthology.utils import make_simple_element, indent, compute_hash_from_file
+from acl_anthology import Anthology
+from acl_anthology.files import PDFReference
+from acl_anthology.people import Name, NameSpecification as NameSpec
+from acl_anthology.text import MarkupText
+from acl_anthology.utils import setup_rich_logging
 
-__version__ = "0.5"
+from fixedcase.protect import protect
+
+
+__version__ = "0.6"
 
 TACL = "tacl"
 CL = "cl"
@@ -108,43 +114,6 @@ def get_title(xml_front_node: etree.Element) -> str:
     return title_text
 
 
-def get_year(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
-
-    article_meta = xml_front_node.find("article-meta", nsmap)
-    pub_date = article_meta.find("pub-date", nsmap)
-    year_text = pub_date.find("year", nsmap).text
-    return year_text
-
-
-def get_month(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
-
-    article_meta = xml_front_node.find("article-meta", nsmap)
-    pub_date = article_meta.find("pub-date", nsmap)
-    try:
-        month_id = pub_date.find("month", nsmap).text
-    except AttributeError:
-        return None
-    months = [
-        None,
-        "January",
-        "February",
-        "March",
-        "April",
-        "May",
-        "June",
-        "July",
-        "August",
-        "September",
-        "October",
-        "November",
-        "December",
-    ]
-    month_text = months[int(month_id)]
-    return month_text
-
-
 def get_abstract(xml_front_node: etree.Element) -> str:
     nsmap = xml_front_node.nsmap
 
@@ -268,86 +237,25 @@ def process_xml(xml: Path, is_tacl: bool) -> Optional[etree.Element]:
 
     info, issue, volume = get_article_journal_info(front, is_tacl)
 
-    paper = etree.Element("paper")
-
-    title_text = get_title(front)
-    title = etree.Element("title")
-    title.text = title_text
-    paper.append(title)
-
-    authors = get_authors(front)
-    for given_names, surname in authors:
-        first = etree.Element("first")
-        first.text = given_names
-
-        last = etree.Element("last")
-        last.text = surname
-
-        author = etree.Element("author")
-        author.append(first)
-        author.append(last)
-
-        paper.append(author)
-
-    doi_text = get_doi(front)
-    doi = etree.Element("doi")
-    doi.text = doi_text
-    paper.append(doi)
-
-    abstract_text = get_abstract(front)
-    if abstract_text:
-        make_simple_element("abstract", abstract_text, parent=paper)
-
-    pages_tuple = get_pages(front)
-    pages = etree.Element("pages")
-    pages.text = "–".join(pages_tuple)  # en-dash, not hyphen!
-    paper.append(pages)
+    paper = {
+        "title": get_title(front),
+        "authors": [
+            {
+                "first": given_names,
+                "last": surname,
+            }
+            for given_names, surname in get_authors(front)
+        ],
+        "doi": get_doi(front),
+        "abstract": get_abstract(front),
+        "pages": get_pages(front),  # tuple
+    }
 
     return paper, info, issue, volume
 
 
-def issue_info_to_node(
-    issue_info: str, year_: str, journal_issue: str, venue: str, volume: str
-) -> etree.Element:
-    """Creates the meta block for a new issue / volume"""
-    meta = make_simple_element("meta")
-
-    assert int(year_)
-
-    make_simple_element("booktitle", issue_info, parent=meta)
-    make_simple_element("publisher", "MIT Press", parent=meta)
-    make_simple_element("address", "Cambridge, MA", parent=meta)
-
-    if venue == "cl":
-        month_text = issue_info.split()[-2]  # blah blah blah month year
-        if month_text not in {
-            "January",
-            "February",
-            "March",
-            "April",
-            "May",
-            "June",
-            "July",
-            "August",
-            "September",
-            "October",
-            "November",
-            "December",
-        }:
-            logging.error("Unknown month: " + month_text)
-        make_simple_element("month", month_text, parent=meta)
-
-    make_simple_element("year", str(year_), parent=meta)
-    make_simple_element("venue", venue, parent=meta)
-    make_simple_element("journal-volume", volume, parent=meta)
-
-    return meta
-
-
 def main(args):
-    anthology = Anthology(
-        importdir=os.path.join(args.anthology_dir, "data"), require_bibkeys=False
-    )
+    anthology = Anthology(datadir=os.path.join(args.anthology_dir, "data"))
 
     is_tacl = "tacl" in args.root_dir.stem
     logging.info(f"Looks like a {'TACL' if is_tacl else 'CL'} ingestion")
@@ -363,24 +271,13 @@ def main(args):
         sys.exit(-1)
 
     collection_id = str(year) + "." + venue
-
-    collection_file = os.path.join(
-        args.anthology_dir, "data", "xml", f"{collection_id}.xml"
-    )
-    if os.path.exists(collection_file):
-        collection = etree.parse(collection_file).getroot()
-    else:
-        collection = make_simple_element("collection", attrib={"id": collection_id})
-
-    # volume_info = get_volume_info(list(args.year_root.glob("*.*.*/*.*.*.xml"))[0])
-    # volume.append(volume_info)
-
-    previous_issue_info = None
+    if (collection := anthology.collections.get(collection_id)) is None:
+        collection = anthology.collections.create(collection_id)
 
     papers = []
     for xml in sorted(args.root_dir.glob("*.xml")):
-        papernode, issue_info, issue, volume = process_xml(xml, is_tacl)
-        if papernode is None or papernode.find("title").text.startswith("Erratum: “"):
+        paper_dict, issue_info, issue, volume_num = process_xml(xml, is_tacl)
+        if paper_dict["title"].startswith("Erratum: “"):
             continue
 
         pdf_path = xml.parent / xml.with_suffix(".pdf").name
@@ -388,7 +285,7 @@ def main(args):
            logging.error(f"Missing pdf for {pdf_path}")
            sys.exit(1)
 
-        papers.append((papernode, pdf_path, issue_info, issue))
+        papers.append((paper_dict, pdf_path, issue_info, issue))
 
     pdf_destination = Path(args.pdfs_dir)
     pdf_destination = pdf_destination / "pdf" / venue
@@ -396,89 +293,71 @@
 
     # MIT Press does assign its IDs in page order, so we have to sort by page
     def sort_papers_by_page(paper_tuple):
-        papernode = paper_tuple[0]
-        startpage = int(papernode.find("./pages").text.split("–")[0])
+        startpage = int(paper_tuple[0]["pages"][0])
         return startpage
 
-    paper_id = 1  # Stupid non-enumerate counter because of "Erratum: " papers interleaved with real ones.
-    for papernode, pdf_path, issue_info, issue in sorted(papers, key=sort_papers_by_page):
+    for paper_dict, pdf_path, issue_info, issue in sorted(
+        papers, key=sort_papers_by_page
+    ):
         issue = issue or "1"
 
-        if issue_info != previous_issue_info:
-            # Emit the new volume info before the paper.
-            logging.info("New issue")
-            logging.info(f"{issue_info} vs. {previous_issue_info}")
-            previous_issue_info = issue_info
-
-            # Look for node in tree, else create it
-            volume_xml = collection.find(f'./volume[@id="{issue}"]')
-            if volume_xml is None:
-                # xml volume = journal issue
-                volume_xml = make_simple_element(
-                    "volume", attrib={"id": issue, "type": "journal"}, parent=collection
-                )
-                volume_xml.append(
-                    issue_info_to_node(issue_info, year, issue, venue, volume)
-                )
-                paper_id = 1
-            else:
-                for paper in volume_xml.findall(".//paper"):
-                    paper_id = max(paper_id, int(paper.attrib["id"]))
+        if (volume := collection.get(issue)) is None:
+            logging.info(f"New issue: {issue_info}")
 
-            paper_id += 1
-
-        anth_id = f"{collection_id}-{issue}.{paper_id}"
+            if venue == "cl":
+                month = issue_info.split()[-2]  # blah blah blah month year
+                if month not in MONTHS.values():
+                    logging.error("Unknown month: " + month)
+            else:
+                month = None
+
+            volume = collection.create_volume(
+                issue,
+                title=MarkupText.from_string(issue_info),
+                type="journal",
+                year=str(year),
+                month=month,
+                publisher="MIT Press",
+                address="Cambridge, MA",
+                venue_ids=[venue],
+                journal_volume=volume_num,
+                journal_issue=issue,
+            )
 
         # Check if the paper is already present in the volume
-        doi_text = papernode.find("./doi").text
-        doi_node = collection.xpath(f'.//doi[text()="{doi_text}"]')
-        if len(doi_node):
-            logging.info(
-                f"Skipping existing paper {anth_id}/{doi_text} with title {papernode.find('title').text}"
-            )
+        if any(paper.doi == paper_dict["doi"] for paper in volume.papers()):
+            logging.info(f"Skipping existing paper with DOI {paper_dict['doi']}")
             continue
 
-        papernode.attrib["id"] = f"{paper_id}"
+        paper = volume.create_paper(
+            title=MarkupText.from_latex_maybe(paper_dict["title"]),
+            abstract=MarkupText.from_latex_maybe(paper_dict["abstract"]),
+            doi=paper_dict["doi"],
+            pages="–".join(paper_dict["pages"]),  # bit inelegant?
+            authors=[
+                NameSpec(Name.from_dict(author)) for author in paper_dict["authors"]
+            ],
+        )
+        anth_id = paper.full_id
         destination = pdf_destination / f"{anth_id}.pdf"
-        print(f"Copying {pdf_path} to {destination}")
+        logging.info(f"Copying {pdf_path} to {destination}")
         shutil.copyfile(pdf_path, destination)
-        checksum = compute_hash_from_file(pdf_path)
-
-        url_text = anth_id
-        url = etree.Element("url")
-        url.attrib["hash"] = checksum
-        url.text = url_text
-        papernode.append(url)
-
-        # Generate bibkey
-        volume = Volume.from_xml(
-            volume_xml,
-            collection_id,
-            anthology.venues,
-            anthology.sigs,
-            anthology.formatter,
-        )
-        paper = Paper.from_xml(papernode, volume, anthology.formatter)
-        bibkey = anthology.pindex.create_bibkey(paper, vidx=anthology.venues)
-        make_simple_element("bibkey", bibkey, parent=papernode)
-
-        # Normalize
-        for oldnode in papernode:
-            normalize(oldnode, informat="latex")
-        volume_xml.append(papernode)
+        paper.pdf = PDFReference.from_file(destination)
 
-        paper_id += 1
+        # TODO: fixedcase currently not ported to new library
+        xml_title = paper.title.to_xml("title")
+        protect(xml_title)
+        paper.title = MarkupText.from_xml(xml_title)
 
-    indent(collection)  # from anthology.utils
-    et = etree.ElementTree(collection)
-    et.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
+    # All serialization to XML happens here
+    collection.save()
 
 
 if __name__ == "__main__":
     import sys
 
-    if sys.version_info < (3, 6):
-        sys.stderr.write("Python >=3.6 required.\n")
+    if sys.version_info < (3, 10):
+        sys.stderr.write("Python >=3.10 required.\n")
         sys.exit(1)
 
     import argparse
@@ -519,6 +398,6 @@ def sort_papers_by_page(paper_tuple):
     args = parser.parse_args()
     args.root_dir = args.root_dir.resolve()  # Get absolute path.
 
-    logging.basicConfig(level=args.verbose)
+    setup_rich_logging(level=args.verbose)
 
     main(args)

From cc1d45226a9ae6c2bd1230e567073c30b540a4ae Mon Sep 17 00:00:00 2001
From: Marcel Bollmann
Date: Sun, 8 Jun 2025 17:33:11 +0200
Subject: [PATCH 2/2] Attempt to satisfy type-checker, needs quite a lot of ignores currently

---
 bin/ingest_mitpress.py | 72 ++++++++++++++++++++++++------------------
 justfile               | 12 +++++++
 2 files changed, 53 insertions(+), 31 deletions(-)
 create mode 100644 justfile

diff --git a/bin/ingest_mitpress.py b/bin/ingest_mitpress.py
index c042de18ce..f6887c189f 100755
--- a/bin/ingest_mitpress.py
+++ b/bin/ingest_mitpress.py
@@ -33,13 +33,17 @@
 
 Authors: Arya D. McCarthy, Matt Post, Marcel Bollmann
 """
+# mypy: disable-error-code="union-attr"
+# ^---- Reason for this and most "type: ignore" comments:
+#       code never checks if etree functions return None
+
 import os
 import shutil
 import logging
 import lxml.etree as etree
 
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Any, Optional, cast
 
 from acl_anthology import Anthology
 from acl_anthology.files import PDFReference
@@ -75,7 +79,7 @@ def collapse_spaces(text: str) -> str:
     return " ".join(text.split())
 
 
-def get_volume_info(xml: Path) -> str:
+def get_volume_info(xml: Path) -> etree._Element:
     logging.info("Getting volume info from {}".format(xml))
     # So far, their XML for the volume doesn't play nicely with xml.etree. Thus, we hack.
     paper = etree.Element("paper")
@@ -104,23 +108,23 @@ def get_paperid(xml: Path, count: int, issue_count: int) -> str:
     return f"{issue_count}.{count}"  # after dash in new anth id
 
 
-def get_title(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
+def get_title(xml_front_node: etree._Element) -> str:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     title_group = article_meta.find("title-group", nsmap)
     title = title_group.find("article-title", nsmap)
-    title_text = collapse_spaces("".join(title.itertext()))
+    title_text = collapse_spaces("".join(title.itertext()))  # type: ignore[arg-type]
     return title_text
 
 
-def get_abstract(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
+def get_abstract(xml_front_node: etree._Element) -> Optional[str]:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     abstract = article_meta.find("abstract", nsmap)
     if abstract is not None:
-        abstract_text = collapse_spaces("".join(abstract.itertext()))
+        abstract_text = collapse_spaces("".join(abstract.itertext()))  # type: ignore[arg-type]
         # 2022/June abstracts all started with "Abstract "
         if abstract_text.startswith("Abstract "):
             abstract_text = abstract_text[9:]
@@ -129,48 +133,50 @@ def get_abstract(xml_front_node: etree.Element) -> str:
     return None
 
 
-def get_authors(xml_front_node: etree.Element) -> List[Tuple[str, str]]:
-    nsmap = xml_front_node.nsmap
+def get_authors(xml_front_node: etree._Element) -> list[tuple[str | None, str]]:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     contrib_group = article_meta.find("contrib-group", nsmap)
 
     authors = []
     for author in contrib_group.findall("contrib", nsmap):
         string_name = author.find("name", nsmap)
-        try:
-            given_names = string_name.find("given-names", nsmap).text
-        except AttributeError:
-            given_names = ""  # Special case for Mausam, and potentially Madonna.
-        surname = string_name.find("surname", nsmap).text
-        try:
-            suffix = string_name.find("suffix", nsmap).text
-            surname = surname + " " + suffix
-        except AttributeError:
-            pass
+        if (node := string_name.find("given-names", nsmap)) is not None:
+            given_names = str(node.text) if node.text else None
+        else:
+            given_names = None  # Special case for Mausam, and potentially Madonna.
+        surname = cast(str, string_name.find("surname", nsmap).text)
+        if (node := string_name.find("suffix", nsmap)) is not None:
+            if node.text is not None:
+                surname = f"{surname} {node.text}"
         authors.append((given_names, surname))
     return authors
 
 
-def get_pages(xml_front_node: etree.Element) -> Tuple[str, str]:
-    nsmap = xml_front_node.nsmap
+def get_pages(xml_front_node: etree._Element) -> tuple[str, str]:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     fpage = article_meta.find("fpage", nsmap)
     lpage = article_meta.find("lpage", nsmap)
+    assert fpage.text is not None and lpage.text is not None
     return fpage.text, lpage.text
 
 
-def get_doi(xml_front_node: etree.Element) -> str:
-    nsmap = xml_front_node.nsmap
+def get_doi(xml_front_node: etree._Element) -> str:
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     article_meta = xml_front_node.find("article-meta", nsmap)
     doi_ = article_meta.find("*[@pub-id-type='doi']", nsmap)
+    assert doi_.text is not None
     return doi_.text
 
 
-def get_article_journal_info(xml_front_node: etree.Element, is_tacl: bool) -> str:
+def get_article_journal_info(
+    xml_front_node: etree._Element, is_tacl: bool
+) -> tuple[str, str | None, str]:
     """ """
-    nsmap = xml_front_node.nsmap
+    nsmap = {k: v for k, v in xml_front_node.nsmap.items() if k is not None}
 
     journal_meta = xml_front_node.find("journal-meta", nsmap)
@@ -192,7 +198,8 @@ def get_article_journal_info(xml_front_node: etree.Element, is_tacl: bool) -> st
         "Association of Computational Linguistics",
         "Association for Computational Linguistics",
     )
-    volume_text = volume.text.lstrip("0")  # Sometimes we find "06" instead of "6"
+    assert volume.text is not None
+    volume_text = str(volume.text).lstrip("0")  # Sometimes we find "06" instead of "6"
 
     if is_tacl:
         issue_text = None
@@ -200,7 +207,8 @@
         format_string = "{journal}, Volume {volume}"
     else:
         issue = article_meta.find("issue", nsmap)
-        issue_text = issue.text
+        assert issue.text is not None
+        issue_text = str(issue.text)
 
         string_date_text = None
         for pub_date in article_meta.findall("pub-date", nsmap):
@@ -212,7 +220,7 @@
                 break
 
         if string_date_text is None:
-            print("Fatal: found no year/date", file=sys.stderr)
+            logging.critical("Found no year/date")
             sys.exit(1)
 
         format_string = "{journal}, Volume {volume}, Issue {issue} - {date}"
@@ -227,13 +235,15 @@
     return format_string.format(**data), issue_text, volume_text
 
 
-def process_xml(xml: Path, is_tacl: bool) -> Optional[etree.Element]:
+def process_xml(xml: Path, is_tacl: bool) -> tuple[dict[str, Any], str, str | None, str]:
     """ """
     logging.info("Reading {}".format(xml))
     tree = etree.parse(open(str(xml)))
     root = tree.getroot()
 
-    front = root.find("front", root.nsmap)
+    nsmap = {k: v for k, v in root.nsmap.items() if k is not None}
+    front = root.find("front", nsmap)
+    assert isinstance(front, etree._Element)
 
     info, issue, volume = get_article_journal_info(front, is_tacl)
 
diff --git a/justfile b/justfile
new file mode 100644
index 0000000000..06298516a4
--- /dev/null
+++ b/justfile
@@ -0,0 +1,12 @@
+@_default:
+    just -l
+
+
+# Access commands from the Python library (`just -l python` to list them)
+mod python
+
+
+# Run type-checker on a single file, intended for bin/ files
+[no-cd]
+typecheck FILE:
+    env MYPYPATH={{justfile_directory()}}/python mypy --follow-imports silent {{FILE}}