acl-org · mbollmann · Jul 19, 2025 · Jul 19, 2025 · Jul 19, 2025 · Aug 2, 2025
diff --git a/bin/ingest_pwc.py b/bin/ingest_pwc.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
+# Copyright 2021 Robert Stojnic
+# Copyright 2023–2025 Matt Post, Marcel Bollmann
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,42 +21,17 @@
 Used to import the links to code and data from Papers with Code (paperswithcode.com)
 """
 
-import logging
 import json
-import lxml.etree as etree
 import os
 import logging as log
+from pathlib import Path
 import requests
 import sys
 
-
-def format_str(x):
-    """Format as string if a value is missing or bool."""
-    if x is None:
-        return ""
-    elif isinstance(x, bool):
-        return "true" if x else "false"
-    else:
-        return str(x)
-
-
-def shift_tails(element):
-    """Shift XML children tails to preserve the exact formatting"""
-    children = list(element)
-    children[-1].tail = children[-2].tail
-    children[-2].tail = children[-3].tail
-
-
-def remove_and_shift_tails(element, child):
-    """Remove element and make tails consistent"""
-    children = list(element)
-    inx = children.index(child)
-
-    if inx > 0:
-        children[inx - 1].tail = children[inx].tail
-
-    element.remove(child)
-
+from acl_anthology import Anthology
+from acl_anthology.files import PapersWithCodeReference
+from acl_anthology.utils.ids import parse_id
+from acl_anthology.utils.logging import setup_rich_logging
 
 if __name__ == "__main__":
     import argparse
@@ -64,7 +42,7 @@ def remove_and_shift_tails(element, child):
     )
     args = ap.parse_args()
 
-    logging.basicConfig(level=logging.INFO)
+    setup_rich_logging(level=log.INFO)
 
     if args.infile:
         with open(args.infile, "r") as f:
@@ -78,62 +56,54 @@ def remove_and_shift_tails(element, child):
             log.warning("Couldn't fetch metadata from Papers with Code (server error).")
             sys.exit(1)
 
-    data_base = "data/xml"
+    datadir = Path(os.path.dirname(os.path.abspath(__file__))) / ".." / "data"
+    anthology = Anthology(datadir=datadir)
 
-    for xml_filename in os.listdir(data_base):
-        # skip any non-xml files
-        if not xml_filename.endswith(".xml"):
-            continue
-
-        full_path = os.path.join(data_base, xml_filename)
-
-        # load
-        with open(full_path) as f:
-            tree = etree.parse(f)
-
-        # track if we modified
-        old_content = etree.tostring(
-            tree, encoding="UTF-8", xml_declaration=True, with_tail=True
-        ).decode("utf8")
-
-        for volume in tree.findall("volume"):
-            for paper in volume.findall("paper"):
-                acl_url = paper.find("url")
-                if acl_url is not None:
-                    acl_id = acl_url.text
-                else:
-                    # skip if we cannot construct the id
-                    continue
+    # Iterate over all papers in the JSON response
+    changed_collections = set()
+    ids_with_pwc_reference = set()
 
-                # start by removing any old entries
-                for old in paper.findall("pwccode"):
-                    remove_and_shift_tails(paper, old)
-                for old in paper.findall("pwcdataset"):
-                    remove_and_shift_tails(paper, old)
-
-                if acl_id in pwc_meta:
-                    pwc = pwc_meta[acl_id]
-                    pwc_code = pwc["code"]
-                    if pwc_code["url"] or pwc_code["additional"]:
-                        code = etree.SubElement(paper, "pwccode")
-                        code.set("url", format_str(pwc_code["url"]))
-                        code.set("additional", format_str(pwc_code["additional"]))
-                        if pwc_code["name"]:
-                            code.text = pwc_code["name"]
-                        shift_tails(paper)
-
-                    for pwc_data in pwc["datasets"]:
-                        data = etree.SubElement(paper, "pwcdataset")
-                        data.set("url", pwc_data["url"])
-                        data.text = pwc_data["name"]
-                        shift_tails(paper)
-
-        new_content = etree.tostring(
-            tree, encoding="UTF-8", xml_declaration=True, with_tail=True
-        ).decode("utf8")
-
-        if old_content != new_content:
-            with open(full_path, "w") as outfile:
-                outfile.write(new_content + "\n")  # all files end with newline
+    for full_id, pwc_data in pwc_meta.items():
+        if full_id.endswith(".pdf"):
+            full_id = full_id[:-4]
+        try:
+            parsed_id = parse_id(full_id)
+        except ValueError:
+            log.error(f"Failed to parse Anthology ID: {full_id}")
+            continue
 
-            log.info(f"Modified Papers with Code metadata in {full_path}")
+        if paper := anthology.get_paper(parsed_id):
+            pwc_code = pwc_data["code"]
+            reference = PapersWithCodeReference(
+                code=(pwc_code["name"], pwc_code["url"]) if pwc_code["url"] else None,
+                community_code=bool(pwc_code["additional"]),
+                datasets=[
+                    (pwc_ds["name"], pwc_ds["url"]) for pwc_ds in pwc_data["datasets"]
+                ],
+            )
+            if (
+                reference.code is None
+                and not reference.community_code
+                and not reference.datasets
+            ):
+                reference = None
+            else:
+                ids_with_pwc_reference.add(full_id)
+
+            if paper.paperswithcode != reference:
+                paper.paperswithcode = reference
+                changed_collections.add(parsed_id[0])
+
+    # Clear PwC references from papers that are absent from the ingested PwC data
+    all_pwc = {
+        paper.full_id for paper in anthology.papers() if paper.paperswithcode is not None
+    }
+    for full_id in all_pwc - ids_with_pwc_reference:
+        paper = anthology.get_paper(full_id)
+        paper.paperswithcode = None
+        changed_collections.add(paper.full_id_tuple[0])
+
+    # Save changes
+    for collection_id in changed_collections:
+        log.info(f"Modified Papers with Code metadata in {collection_id}")
+        anthology.get_collection(collection_id).save()
diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [UNRELEASED]
+
+### Removed
+
+- `PapersWithCodeReference` and the `Paper.paperswithcode` attribute have been removed, as Papers with Code is no longer functional.
+
 ## [0.5.3] — 2025-06-22
 
 This release adds more functionality for ingesting new proceedings and modifying existing data.

diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py
@@ -28,7 +28,6 @@
 from ..exceptions import AnthologyInvalidIDError, AnthologyXMLError
 from ..files import (
     AttachmentReference,
-    PapersWithCodeReference,
     PDFReference,
     PDFThumbnailReference,
     VideoReference,
@@ -247,7 +246,6 @@ class Paper:
         language: The language this paper is (mainly) written in.  When given, this should be a ISO 639-2 code (e.g. "eng"), though occasionally IETF is used (e.g. "pt-BR").
         note: A note attached to this paper.  Used very sparingly.
         pages: Page numbers of this paper within its volume.
-        paperswithcode: Links to code implementations and datasets as provided by [Papers with Code](https://paperswithcode.com/).
         pdf: A reference to the paper's PDF.
         type: The paper's type, currently used to mark frontmatter and backmatter.
     """
@@ -300,9 +298,6 @@ class Paper:
     language: Optional[str] = field(default=None, repr=False)
     note: Optional[str] = field(default=None, repr=False)
     pages: Optional[str] = field(default=None, repr=False)
-    paperswithcode: Optional[PapersWithCodeReference] = field(
-        default=None, on_setattr=attrs.setters.frozen, repr=False
-    )
     pdf: Optional[PDFReference] = field(default=None, repr=False)
     type: PaperType = field(default=PaperType.PAPER, repr=False, converter=PaperType)
 
@@ -671,10 +666,6 @@ def from_xml(cls, parent: Volume, paper: etree._Element) -> Paper:
                 if "errata" not in kwargs:
                     kwargs["errata"] = []
                 kwargs["errata"].append(PaperErratum.from_xml(element))
-            elif element.tag in ("pwccode", "pwcdataset"):
-                if "paperswithcode" not in kwargs:
-                    kwargs["paperswithcode"] = PapersWithCodeReference()
-                kwargs["paperswithcode"].append_from_xml(element)
             elif element.tag in ("removed", "retracted"):
                 kwargs["deletion"] = PaperDeletionNotice.from_xml(element)
             elif element.tag == "revision":
@@ -751,6 +742,4 @@ def to_xml(self) -> etree._Element:
         if self.deletion is not None:
             paper.append(self.deletion.to_xml())
         paper.append(E.bibkey(self.bibkey))
-        if self.paperswithcode is not None:
-            paper.extend(self.paperswithcode.to_xml_list())
         return paper
diff --git a/python/acl_anthology/data/schema.rnc b/python/acl_anthology/data/schema.rnc
@@ -88,15 +88,6 @@ Paper = element paper {
      }*
    & element language { xsd:language }?
    & element award { text }*
-   & element pwcdataset {
-       attribute url { xsd:anyURI },
-       text
-     }*
-   & element pwccode {
-       attribute url { xsd:anyURI },
-       attribute additional { xsd:boolean },
-       text
-     }?
    )
 }
 Meta = element meta {

diff --git a/python/acl_anthology/files.py b/python/acl_anthology/files.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 import sys
-from attrs import define, field, validators as v, Factory
+from attrs import define, field, validators as v
 from lxml import etree
 from lxml.builder import E
 from pathlib import Path
@@ -191,51 +191,3 @@ def to_xml(self, tag: str = "video") -> etree._Element:
         if not self.permission:
             elem.set("permission", "false")
         return elem
-
-
-@define
-class PapersWithCodeReference:
-    """Class aggregating [Papers with Code](https://paperswithcode.com/) (PwC) links in a paper.
-
-    Attributes:
-        code: An official code repository, given as a tuple of the form `(name, url)`.
-        community_code: Whether the PwC page of the paper has additional, community-provided code links.
-        datasets: A list of datasets on PwC, given as tuples of the form `(name, url)`.
-    """
-
-    code: Optional[tuple[str | None, str]] = field(default=None)
-    community_code: bool = field(default=False)
-    datasets: list[tuple[str | None, str]] = Factory(list)
-
-    def append_from_xml(self, elem: etree._Element) -> None:
-        """Appends information from a `<pwccode>` or `<pwcdataset>` block to this reference."""
-        pwc_tuple = (elem.text, elem.get("url", ""))
-        if elem.tag == "pwccode":
-            self.community_code = xsd_boolean(elem.get("additional", ""))
-            self.code = pwc_tuple
-        elif elem.tag == "pwcdataset":
-            self.datasets.append(pwc_tuple)
-        else:  # pragma: no cover
-            raise ValueError(
-                f"Unsupported element for PapersWithCodeReference: <{elem.tag}>"
-            )
-
-    def to_xml_list(self) -> list[etree._Element]:
-        """
-        Returns:
-            A serialization of all PapersWithCode information as a list of corresponding XML tags in the Anthology XML format.
-        """
-        elements = []
-        if self.code is not None:
-            args = [self.code[0]] if self.code[0] is not None else []
-            elements.append(
-                E.pwccode(
-                    *args,
-                    url=self.code[1],
-                    additional=str(self.community_code).lower(),
-                )
-            )
-        for dataset in self.datasets:
-            args = [dataset[0]] if dataset[0] is not None else []
-            elements.append(E.pwcdataset(*args, url=dataset[1]))
-        return elements
diff --git a/python/acl_anthology/utils/attrs.py b/python/acl_anthology/utils/attrs.py
@@ -73,7 +73,6 @@ def auto_validate_types(
         AttachmentReference,
         EventFileReference,
         VideoReference,
-        PapersWithCodeReference,
     )
     from ..people import Name, NameSpecification
     from ..text import MarkupText
@@ -89,7 +88,6 @@ def auto_validate_types(
             AttachmentReference,
             EventFileReference,
             VideoReference,
-            PapersWithCodeReference,
             MarkupText,
             Name,
             NameSpecification,

diff --git a/python/acl_anthology/utils/xml.py b/python/acl_anthology/utils/xml.py
@@ -60,7 +60,6 @@
     "attachment",
     "award",
     "video",
-    "pwcdataset",
 }
 """XML tags that may appear multiple times per parent tag, and whose relative order matters even if their parent tag belongs to `TAGS_WITH_UNORDERED_CHILDREN`."""
 

diff --git a/python/benchmarks/bench_xml_parsing.py b/python/benchmarks/bench_xml_parsing.py
@@ -30,7 +30,6 @@
     "revision",
     "erratum",
     "award",
-    "pwcdataset",
     "video",
     "venue",
 )
@@ -65,12 +64,6 @@ def parse_element(xml_element):
                 elif tag == "last":
                     last = subelement.text or ""
             value = (first, last, id_)
-        elif tag == "pwccode":
-            value = {
-                "url": element.get("url"),
-                "additional": element.get("additional"),
-                "name": element.text,
-            }
         else:
             value = element.text
 
@@ -105,12 +98,6 @@ def parse_single_element(element):
             elif tag == "last":
                 last = subelement.text or ""
         value = (first, last, id_)
-    elif tag == "pwccode":
-        value = {
-            "url": element.get("url"),
-            "additional": element.get("additional"),
-            "name": element.text,
-        }
     else:
         value = element.text
 

diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py
@@ -256,8 +256,6 @@ def test_paper_add_author(anthology):
   <award>Most ridiculous entry</award>
   <removed date="2023-09-30">Removed immediately for being fake</removed>
   <bibkey>why-would-you-cite-this</bibkey>
-  <pwccode url="https://github.com/acl-org/fake-repo" additional="false">acl-org/fake-repo</pwccode>
-  <pwcdataset url="https://paperswithcode.com/dataset/fake-dataset">FaKe-DaTaSeT</pwcdataset>
 </paper>
 """,
 )