From ebd4e1ba4b288b5ba2d420d5c755ff16fffbc7f8 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Sat, 19 Jul 2025 17:29:55 +0200 Subject: [PATCH 1/4] Unfreeze Paper.paperswithcode attribute The rationale for freezing was to indicate that this attribute should never be modified manually, since it's (over)written by the PwC integration, but that of course doesn't work if we want the PwC integration itself to go through the library :) --- python/acl_anthology/collections/paper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index b48807fae9..d4f5c46bd6 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -300,9 +300,7 @@ class Paper: language: Optional[str] = field(default=None, repr=False) note: Optional[str] = field(default=None, repr=False) pages: Optional[str] = field(default=None, repr=False) - paperswithcode: Optional[PapersWithCodeReference] = field( - default=None, on_setattr=attrs.setters.frozen, repr=False - ) + paperswithcode: Optional[PapersWithCodeReference] = field(default=None, repr=False) pdf: Optional[PDFReference] = field(default=None, repr=False) type: PaperType = field(default=PaperType.PAPER, repr=False, converter=PaperType) From 36df5a2b157f3cefe28e76200d764ebc131929ba Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Sat, 19 Jul 2025 17:52:57 +0200 Subject: [PATCH 2/4] Fix PapersWithCodeReference.code to be None instead of (None, '') --- python/CHANGELOG.md | 6 ++++++ python/acl_anthology/files.py | 22 +++++++++++++--------- python/tests/files_test.py | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index ff2aadaa04..ae58502abc 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [UNRELEASED] + +### Changed + +- `PapersWithCodeReference.code` will now always correctly be `None` if neither name or URL for a code repository are given, instead of `(None, '')` which was previously possible. + ## [0.5.3] — 2025-06-22 This release adds more functionality for ingesting new proceedings and modifying existing data. diff --git a/python/acl_anthology/files.py b/python/acl_anthology/files.py index ab02b4805a..9d9ea2d2ef 100644 --- a/python/acl_anthology/files.py +++ b/python/acl_anthology/files.py @@ -212,7 +212,7 @@ def append_from_xml(self, elem: etree._Element) -> None: pwc_tuple = (elem.text, elem.get("url", "")) if elem.tag == "pwccode": self.community_code = xsd_boolean(elem.get("additional", "")) - self.code = pwc_tuple + self.code = pwc_tuple if any(pwc_tuple) else None elif elem.tag == "pwcdataset": self.datasets.append(pwc_tuple) else: # pragma: no cover @@ -226,15 +226,19 @@ def to_xml_list(self) -> list[etree._Element]: A serialization of all PapersWithCode information as a list of corresponding XML tags in the Anthology XML format. """ elements = [] - if self.code is not None: - args = [self.code[0]] if self.code[0] is not None else [] - elements.append( - E.pwccode( - *args, - url=self.code[1], - additional=str(self.community_code).lower(), + if self.code or self.community_code: + additional = str(self.community_code).lower() + if self.code is None: + elements.append(E.pwccode(url="", additional=additional)) + else: + args = [self.code[0]] if self.code[0] is not None else [] + elements.append( + E.pwccode( + *args, + url=self.code[1], + additional=additional, + ) ) - ) for dataset in self.datasets: args = [dataset[0]] if dataset[0] is not None else [] elements.append(E.pwcdataset(*args, url=dataset[1])) diff --git a/python/tests/files_test.py b/python/tests/files_test.py index 2cb373fa14..6f2876b889 100644 --- a/python/tests/files_test.py +++ b/python/tests/files_test.py @@ -110,7 +110,7 @@ ( # This happens, so it needs to be handled ('',), - (None, ""), + None, True, [], ), From 2e9f73848815242cf868bdff8ad08273bf3cc677 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Sat, 19 Jul 2025 18:03:30 +0200 Subject: [PATCH 3/4] Adapt ingest_pwc.py to use Python library --- bin/ingest_pwc.py | 144 ++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 87 deletions(-) diff --git a/bin/ingest_pwc.py b/bin/ingest_pwc.py index 55e5dcd082..ecb2af9ea4 100755 --- a/bin/ingest_pwc.py +++ b/bin/ingest_pwc.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # +# Copyright 2021 Robert Stojnic +# Copyright 2023–2025 Matt Post, Marcel Bollmann +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,42 +21,17 @@ Used to import the links to code and data from Papers with Code (paperswithcode.com) """ -import logging import json -import lxml.etree as etree import os import logging as log +from pathlib import Path import requests import sys - -def format_str(x): - """Format as string if a value is missing or bool.""" - if x is None: - return "" - elif isinstance(x, bool): - return "true" if x else "false" - else: - return str(x) - - -def shift_tails(element): - """Shift XML children tails to preserve the exact formatting""" - children = list(element) - children[-1].tail = children[-2].tail - children[-2].tail = children[-3].tail - - -def remove_and_shift_tails(element, child): - """Remove element and make tails consistent""" - children = list(element) - inx = children.index(child) - - if inx > 0: - children[inx - 1].tail = children[inx].tail - - element.remove(child) - +from acl_anthology import Anthology +from acl_anthology.files import PapersWithCodeReference +from acl_anthology.utils.ids import parse_id +from acl_anthology.utils.logging import setup_rich_logging if __name__ == "__main__": import argparse @@ -64,7 +42,7 @@ def remove_and_shift_tails(element, child): ) args = ap.parse_args() - logging.basicConfig(level=logging.INFO) + setup_rich_logging(level=log.INFO) if args.infile: with open(args.infile, "r") as f: @@ -78,62 +56,54 @@ def remove_and_shift_tails(element, child): log.warning("Couldn't fetch metadata from Papers with Code (server error).") sys.exit(1) - data_base = "data/xml" + datadir = Path(os.path.dirname(os.path.abspath(__file__))) / ".." / "data" + anthology = Anthology(datadir=datadir) - for xml_filename in os.listdir(data_base): - # skip any non-xml files - if not xml_filename.endswith(".xml"): - continue - - full_path = os.path.join(data_base, xml_filename) - - # load - with open(full_path) as f: - tree = etree.parse(f) - - # track if we modified - old_content = etree.tostring( - tree, encoding="UTF-8", xml_declaration=True, with_tail=True - ).decode("utf8") - - for volume in tree.findall("volume"): - for paper in volume.findall("paper"): - acl_url = paper.find("url") - if acl_url is not None: - acl_id = acl_url.text - else: - # skip if we cannot construct the id - continue + # Iterate over all papers in the JSON response + changed_collections = set() + ids_with_pwc_reference = set() - # start by removing any old entries - for old in paper.findall("pwccode"): - remove_and_shift_tails(paper, old) - for old in paper.findall("pwcdataset"): - remove_and_shift_tails(paper, old) - - if acl_id in pwc_meta: - pwc = pwc_meta[acl_id] - pwc_code = pwc["code"] - if pwc_code["url"] or pwc_code["additional"]: - code = etree.SubElement(paper, "pwccode") - code.set("url", format_str(pwc_code["url"])) - code.set("additional", format_str(pwc_code["additional"])) - if pwc_code["name"]: - code.text = pwc_code["name"] - shift_tails(paper) - - for pwc_data in pwc["datasets"]: - data = etree.SubElement(paper, "pwcdataset") - data.set("url", pwc_data["url"]) - data.text = pwc_data["name"] - shift_tails(paper) - - new_content = etree.tostring( - tree, encoding="UTF-8", xml_declaration=True, with_tail=True - ).decode("utf8") - - if old_content != new_content: - with open(full_path, "w") as outfile: - outfile.write(new_content + "\n") # all files end with newline + for full_id, pwc_data in pwc_meta.items(): + if full_id.endswith(".pdf"): + full_id = full_id[:-4] + try: + parsed_id = parse_id(full_id) + except ValueError: + log.error(f"Failed to parse Anthology ID: {full_id}") + continue - log.info(f"Modified Papers with Code metadata in {full_path}") + if paper := anthology.get_paper(parsed_id): + pwc_code = pwc_data["code"] + reference = PapersWithCodeReference( + code=(pwc_code["name"], pwc_code["url"]) if pwc_code["url"] else None, + community_code=bool(pwc_code["additional"]), + datasets=[ + (pwc_ds["name"], pwc_ds["url"]) for pwc_ds in pwc_data["datasets"] + ], + ) + if ( + reference.code is None + and not reference.community_code + and not reference.datasets + ): + reference = None + else: + ids_with_pwc_reference.add(full_id) + + if paper.paperswithcode != reference: + paper.paperswithcode = reference + changed_collections.add(parsed_id[0]) + + # Sanity-check that there are no other papers with PwC references + all_pwc = { + paper.full_id for paper in anthology.papers() if paper.paperswithcode is not None + } + for full_id in all_pwc - ids_with_pwc_reference: + paper = anthology.get_paper(full_id) + paper.paperswithcode = None + changed_collections.add(paper.full_id_tuple[0]) + + # Save changes + for collection_id in changed_collections: + log.info(f"Modified Papers with Code metadata in {collection_id}") + anthology.get_collection(collection_id).save() From 0933466e63e1e67e263d4c3ce282faf6c5313215 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Sat, 2 Aug 2025 11:44:05 +0200 Subject: [PATCH 4/4] Remove support for Papers with Code --- python/CHANGELOG.md | 4 +- python/acl_anthology/collections/paper.py | 9 - python/acl_anthology/data/schema.rnc | 9 - python/acl_anthology/files.py | 54 +- python/acl_anthology/utils/attrs.py | 2 - python/acl_anthology/utils/xml.py | 1 - python/benchmarks/bench_xml_parsing.py | 13 - python/tests/collections/paper_test.py | 2 - python/tests/data/anthology/xml/2022.acl.xml | 1744 ------------------ python/tests/files_test.py | 77 - 10 files changed, 3 insertions(+), 1912 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index ae58502abc..95f5dfc462 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -2,9 +2,9 @@ ## [UNRELEASED] -### Changed +### Removed -- `PapersWithCodeReference.code` will now always correctly be `None` if neither name or URL for a code repository are given, instead of `(None, '')` which was previously possible. +- `PapersWithCodeReference` has been removed as Papers with Code is no longer functional. ## [0.5.3] — 2025-06-22 diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index d4f5c46bd6..09246eaac1 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -28,7 +28,6 @@ from ..exceptions import AnthologyInvalidIDError, AnthologyXMLError from ..files import ( AttachmentReference, - PapersWithCodeReference, PDFReference, PDFThumbnailReference, VideoReference, @@ -247,7 +246,6 @@ class Paper: language: The language this paper is (mainly) written in. When given, this should be a ISO 639-2 code (e.g. "eng"), though occasionally IETF is used (e.g. "pt-BR"). note: A note attached to this paper. Used very sparingly. pages: Page numbers of this paper within its volume. - paperswithcode: Links to code implementations and datasets as provided by [Papers with Code](https://paperswithcode.com/). pdf: A reference to the paper's PDF. type: The paper's type, currently used to mark frontmatter and backmatter. """ @@ -300,7 +298,6 @@ class Paper: language: Optional[str] = field(default=None, repr=False) note: Optional[str] = field(default=None, repr=False) pages: Optional[str] = field(default=None, repr=False) - paperswithcode: Optional[PapersWithCodeReference] = field(default=None, repr=False) pdf: Optional[PDFReference] = field(default=None, repr=False) type: PaperType = field(default=PaperType.PAPER, repr=False, converter=PaperType) @@ -669,10 +666,6 @@ def from_xml(cls, parent: Volume, paper: etree._Element) -> Paper: if "errata" not in kwargs: kwargs["errata"] = [] kwargs["errata"].append(PaperErratum.from_xml(element)) - elif element.tag in ("pwccode", "pwcdataset"): - if "paperswithcode" not in kwargs: - kwargs["paperswithcode"] = PapersWithCodeReference() - kwargs["paperswithcode"].append_from_xml(element) elif element.tag in ("removed", "retracted"): kwargs["deletion"] = PaperDeletionNotice.from_xml(element) elif element.tag == "revision": @@ -749,6 +742,4 @@ def to_xml(self) -> etree._Element: if self.deletion is not None: paper.append(self.deletion.to_xml()) paper.append(E.bibkey(self.bibkey)) - if self.paperswithcode is not None: - paper.extend(self.paperswithcode.to_xml_list()) return paper diff --git a/python/acl_anthology/data/schema.rnc b/python/acl_anthology/data/schema.rnc index dabcbd8bed..a071552542 100644 --- a/python/acl_anthology/data/schema.rnc +++ b/python/acl_anthology/data/schema.rnc @@ -88,15 +88,6 @@ Paper = element paper { }* & element language { xsd:language }? & element award { text }* - & element pwcdataset { - attribute url { xsd:anyURI }, - text - }* - & element pwccode { - attribute url { xsd:anyURI }, - attribute additional { xsd:boolean }, - text - }? ) } Meta = element meta { diff --git a/python/acl_anthology/files.py b/python/acl_anthology/files.py index 9d9ea2d2ef..91d05e4400 100644 --- a/python/acl_anthology/files.py +++ b/python/acl_anthology/files.py @@ -17,7 +17,7 @@ from __future__ import annotations import sys -from attrs import define, field, validators as v, Factory +from attrs import define, field, validators as v from lxml import etree from lxml.builder import E from pathlib import Path @@ -191,55 +191,3 @@ def to_xml(self, tag: str = "video") -> etree._Element: if not self.permission: elem.set("permission", "false") return elem - - -@define -class PapersWithCodeReference: - """Class aggregating [Papers with Code](https://paperswithcode.com/) (PwC) links in a paper. - - Attributes: - code: An official code repository, given as a tuple of the form `(name, url)`. - community_code: Whether the PwC page of the paper has additional, community-provided code links. - datasets: A list of datasets on PwC, given as tuples of the form `(name, url)`. - """ - - code: Optional[tuple[str | None, str]] = field(default=None) - community_code: bool = field(default=False) - datasets: list[tuple[str | None, str]] = Factory(list) - - def append_from_xml(self, elem: etree._Element) -> None: - """Appends information from a `` or `` block to this reference.""" - pwc_tuple = (elem.text, elem.get("url", "")) - if elem.tag == "pwccode": - self.community_code = xsd_boolean(elem.get("additional", "")) - self.code = pwc_tuple if any(pwc_tuple) else None - elif elem.tag == "pwcdataset": - self.datasets.append(pwc_tuple) - else: # pragma: no cover - raise ValueError( - f"Unsupported element for PapersWithCodeReference: <{elem.tag}>" - ) - - def to_xml_list(self) -> list[etree._Element]: - """ - Returns: - A serialization of all PapersWithCode information as a list of corresponding XML tags in the Anthology XML format. - """ - elements = [] - if self.code or self.community_code: - additional = str(self.community_code).lower() - if self.code is None: - elements.append(E.pwccode(url="", additional=additional)) - else: - args = [self.code[0]] if self.code[0] is not None else [] - elements.append( - E.pwccode( - *args, - url=self.code[1], - additional=additional, - ) - ) - for dataset in self.datasets: - args = [dataset[0]] if dataset[0] is not None else [] - elements.append(E.pwcdataset(*args, url=dataset[1])) - return elements diff --git a/python/acl_anthology/utils/attrs.py b/python/acl_anthology/utils/attrs.py index a5da86e7d2..755c8f4ac6 100644 --- a/python/acl_anthology/utils/attrs.py +++ b/python/acl_anthology/utils/attrs.py @@ -73,7 +73,6 @@ def auto_validate_types( AttachmentReference, EventFileReference, VideoReference, - PapersWithCodeReference, ) from ..people import Name, NameSpecification from ..text import MarkupText @@ -89,7 +88,6 @@ def auto_validate_types( AttachmentReference, EventFileReference, VideoReference, - PapersWithCodeReference, MarkupText, Name, NameSpecification, diff --git a/python/acl_anthology/utils/xml.py b/python/acl_anthology/utils/xml.py index 3952beb8da..5a111e3d27 100644 --- a/python/acl_anthology/utils/xml.py +++ b/python/acl_anthology/utils/xml.py @@ -60,7 +60,6 @@ "attachment", "award", "video", - "pwcdataset", } """XML tags that may appear multiple times per parent tag, and whose relative order matters even if their parent tag belongs to `TAGS_WITH_UNORDERED_CHILDREN`.""" diff --git a/python/benchmarks/bench_xml_parsing.py b/python/benchmarks/bench_xml_parsing.py index 610dbd1e6e..14795af230 100644 --- a/python/benchmarks/bench_xml_parsing.py +++ b/python/benchmarks/bench_xml_parsing.py @@ -30,7 +30,6 @@ "revision", "erratum", "award", - "pwcdataset", "video", "venue", ) @@ -65,12 +64,6 @@ def parse_element(xml_element): elif tag == "last": last = subelement.text or "" value = (first, last, id_) - elif tag == "pwccode": - value = { - "url": element.get("url"), - "additional": element.get("additional"), - "name": element.text, - } else: value = element.text @@ -105,12 +98,6 @@ def parse_single_element(element): elif tag == "last": last = subelement.text or "" value = (first, last, id_) - elif tag == "pwccode": - value = { - "url": element.get("url"), - "additional": element.get("additional"), - "name": element.text, - } else: value = element.text diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py index d28a0e1f14..a9f1be197a 100644 --- a/python/tests/collections/paper_test.py +++ b/python/tests/collections/paper_test.py @@ -256,8 +256,6 @@ def test_paper_add_author(anthology): Most ridiculous entry Removed immediately for being fake why-would-you-cite-this - acl-org/fake-repo - FaKe-DaTaSeT """, ) diff --git a/python/tests/data/anthology/xml/2022.acl.xml b/python/tests/data/anthology/xml/2022.acl.xml index 3e9ee6f0b5..0a490235fd 100644 --- a/python/tests/data/anthology/xml/2022.acl.xml +++ b/python/tests/data/anthology/xml/2022.acl.xml @@ -29,15 +29,6 @@ modarressi-etal-2022-adapler 10.18653/v1/2022.acl-long.1