diff --git a/bin/ingest_pwc.py b/bin/ingest_pwc.py index 55e5dcd082..ecb2af9ea4 100755 --- a/bin/ingest_pwc.py +++ b/bin/ingest_pwc.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # +# Copyright 2021 Robert Stojnic +# Copyright 2023–2025 Matt Post, Marcel Bollmann +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,42 +21,17 @@ Used to import the links to code and data from Papers with Code (paperswithcode.com) """ -import logging import json -import lxml.etree as etree import os import logging as log +from pathlib import Path import requests import sys - -def format_str(x): - """Format as string if a value is missing or bool.""" - if x is None: - return "" - elif isinstance(x, bool): - return "true" if x else "false" - else: - return str(x) - - -def shift_tails(element): - """Shift XML children tails to preserve the exact formatting""" - children = list(element) - children[-1].tail = children[-2].tail - children[-2].tail = children[-3].tail - - -def remove_and_shift_tails(element, child): - """Remove element and make tails consistent""" - children = list(element) - inx = children.index(child) - - if inx > 0: - children[inx - 1].tail = children[inx].tail - - element.remove(child) - +from acl_anthology import Anthology +from acl_anthology.files import PapersWithCodeReference +from acl_anthology.utils.ids import parse_id +from acl_anthology.utils.logging import setup_rich_logging if __name__ == "__main__": import argparse @@ -64,7 +42,7 @@ def remove_and_shift_tails(element, child): ) args = ap.parse_args() - logging.basicConfig(level=logging.INFO) + setup_rich_logging(level=log.INFO) if args.infile: with open(args.infile, "r") as f: @@ -78,62 +56,54 @@ def remove_and_shift_tails(element, child): log.warning("Couldn't fetch metadata from Papers with Code (server error).") sys.exit(1) - data_base = "data/xml" + datadir = Path(os.path.dirname(os.path.abspath(__file__))) / ".." / "data" + anthology = Anthology(datadir=datadir) - for xml_filename in os.listdir(data_base): - # skip any non-xml files - if not xml_filename.endswith(".xml"): - continue - - full_path = os.path.join(data_base, xml_filename) - - # load - with open(full_path) as f: - tree = etree.parse(f) - - # track if we modified - old_content = etree.tostring( - tree, encoding="UTF-8", xml_declaration=True, with_tail=True - ).decode("utf8") - - for volume in tree.findall("volume"): - for paper in volume.findall("paper"): - acl_url = paper.find("url") - if acl_url is not None: - acl_id = acl_url.text - else: - # skip if we cannot construct the id - continue + # Iterate over all papers in the JSON response + changed_collections = set() + ids_with_pwc_reference = set() - # start by removing any old entries - for old in paper.findall("pwccode"): - remove_and_shift_tails(paper, old) - for old in paper.findall("pwcdataset"): - remove_and_shift_tails(paper, old) - - if acl_id in pwc_meta: - pwc = pwc_meta[acl_id] - pwc_code = pwc["code"] - if pwc_code["url"] or pwc_code["additional"]: - code = etree.SubElement(paper, "pwccode") - code.set("url", format_str(pwc_code["url"])) - code.set("additional", format_str(pwc_code["additional"])) - if pwc_code["name"]: - code.text = pwc_code["name"] - shift_tails(paper) - - for pwc_data in pwc["datasets"]: - data = etree.SubElement(paper, "pwcdataset") - data.set("url", pwc_data["url"]) - data.text = pwc_data["name"] - shift_tails(paper) - - new_content = etree.tostring( - tree, encoding="UTF-8", xml_declaration=True, with_tail=True - ).decode("utf8") - - if old_content != new_content: - with open(full_path, "w") as outfile: - outfile.write(new_content + "\n") # all files end with newline + for full_id, pwc_data in pwc_meta.items(): + if full_id.endswith(".pdf"): + full_id = full_id[:-4] + try: + parsed_id = parse_id(full_id) + except ValueError: + log.error(f"Failed to parse Anthology ID: {full_id}") + continue - log.info(f"Modified Papers with Code metadata in {full_path}") + if paper := anthology.get_paper(parsed_id): + pwc_code = pwc_data["code"] + reference = PapersWithCodeReference( + code=(pwc_code["name"], pwc_code["url"]) if pwc_code["url"] else None, + community_code=bool(pwc_code["additional"]), + datasets=[ + (pwc_ds["name"], pwc_ds["url"]) for pwc_ds in pwc_data["datasets"] + ], + ) + if ( + reference.code is None + and not reference.community_code + and not reference.datasets + ): + reference = None + else: + ids_with_pwc_reference.add(full_id) + + if paper.paperswithcode != reference: + paper.paperswithcode = reference + changed_collections.add(parsed_id[0]) + + # Sanity-check that there are no other papers with PwC references + all_pwc = { + paper.full_id for paper in anthology.papers() if paper.paperswithcode is not None + } + for full_id in all_pwc - ids_with_pwc_reference: + paper = anthology.get_paper(full_id) + paper.paperswithcode = None + changed_collections.add(paper.full_id_tuple[0]) + + # Save changes + for collection_id in changed_collections: + log.info(f"Modified Papers with Code metadata in {collection_id}") + anthology.get_collection(collection_id).save() diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index ff2aadaa04..95f5dfc462 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [UNRELEASED] + +### Removed + +- `PapersWithCodeReference` has been removed as Papers with Code is no longer functional. + ## [0.5.3] — 2025-06-22 This release adds more functionality for ingesting new proceedings and modifying existing data. diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index b48807fae9..09246eaac1 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -28,7 +28,6 @@ from ..exceptions import AnthologyInvalidIDError, AnthologyXMLError from ..files import ( AttachmentReference, - PapersWithCodeReference, PDFReference, PDFThumbnailReference, VideoReference, @@ -247,7 +246,6 @@ class Paper: language: The language this paper is (mainly) written in. When given, this should be a ISO 639-2 code (e.g. "eng"), though occasionally IETF is used (e.g. "pt-BR"). note: A note attached to this paper. Used very sparingly. pages: Page numbers of this paper within its volume. - paperswithcode: Links to code implementations and datasets as provided by [Papers with Code](https://paperswithcode.com/). pdf: A reference to the paper's PDF. type: The paper's type, currently used to mark frontmatter and backmatter. """ @@ -300,9 +298,6 @@ class Paper: language: Optional[str] = field(default=None, repr=False) note: Optional[str] = field(default=None, repr=False) pages: Optional[str] = field(default=None, repr=False) - paperswithcode: Optional[PapersWithCodeReference] = field( - default=None, on_setattr=attrs.setters.frozen, repr=False - ) pdf: Optional[PDFReference] = field(default=None, repr=False) type: PaperType = field(default=PaperType.PAPER, repr=False, converter=PaperType) @@ -671,10 +666,6 @@ def from_xml(cls, parent: Volume, paper: etree._Element) -> Paper: if "errata" not in kwargs: kwargs["errata"] = [] kwargs["errata"].append(PaperErratum.from_xml(element)) - elif element.tag in ("pwccode", "pwcdataset"): - if "paperswithcode" not in kwargs: - kwargs["paperswithcode"] = PapersWithCodeReference() - kwargs["paperswithcode"].append_from_xml(element) elif element.tag in ("removed", "retracted"): kwargs["deletion"] = PaperDeletionNotice.from_xml(element) elif element.tag == "revision": @@ -751,6 +742,4 @@ def to_xml(self) -> etree._Element: if self.deletion is not None: paper.append(self.deletion.to_xml()) paper.append(E.bibkey(self.bibkey)) - if self.paperswithcode is not None: - paper.extend(self.paperswithcode.to_xml_list()) return paper diff --git a/python/acl_anthology/data/schema.rnc b/python/acl_anthology/data/schema.rnc index dabcbd8bed..a071552542 100644 --- a/python/acl_anthology/data/schema.rnc +++ b/python/acl_anthology/data/schema.rnc @@ -88,15 +88,6 @@ Paper = element paper { }* & element language { xsd:language }? & element award { text }* - & element pwcdataset { - attribute url { xsd:anyURI }, - text - }* - & element pwccode { - attribute url { xsd:anyURI }, - attribute additional { xsd:boolean }, - text - }? ) } Meta = element meta { diff --git a/python/acl_anthology/files.py b/python/acl_anthology/files.py index ab02b4805a..91d05e4400 100644 --- a/python/acl_anthology/files.py +++ b/python/acl_anthology/files.py @@ -17,7 +17,7 @@ from __future__ import annotations import sys -from attrs import define, field, validators as v, Factory +from attrs import define, field, validators as v from lxml import etree from lxml.builder import E from pathlib import Path @@ -191,51 +191,3 @@ def to_xml(self, tag: str = "video") -> etree._Element: if not self.permission: elem.set("permission", "false") return elem - - -@define -class PapersWithCodeReference: - """Class aggregating [Papers with Code](https://paperswithcode.com/) (PwC) links in a paper. - - Attributes: - code: An official code repository, given as a tuple of the form `(name, url)`. - community_code: Whether the PwC page of the paper has additional, community-provided code links. - datasets: A list of datasets on PwC, given as tuples of the form `(name, url)`. - """ - - code: Optional[tuple[str | None, str]] = field(default=None) - community_code: bool = field(default=False) - datasets: list[tuple[str | None, str]] = Factory(list) - - def append_from_xml(self, elem: etree._Element) -> None: - """Appends information from a `` or `` block to this reference.""" - pwc_tuple = (elem.text, elem.get("url", "")) - if elem.tag == "pwccode": - self.community_code = xsd_boolean(elem.get("additional", "")) - self.code = pwc_tuple - elif elem.tag == "pwcdataset": - self.datasets.append(pwc_tuple) - else: # pragma: no cover - raise ValueError( - f"Unsupported element for PapersWithCodeReference: <{elem.tag}>" - ) - - def to_xml_list(self) -> list[etree._Element]: - """ - Returns: - A serialization of all PapersWithCode information as a list of corresponding XML tags in the Anthology XML format. - """ - elements = [] - if self.code is not None: - args = [self.code[0]] if self.code[0] is not None else [] - elements.append( - E.pwccode( - *args, - url=self.code[1], - additional=str(self.community_code).lower(), - ) - ) - for dataset in self.datasets: - args = [dataset[0]] if dataset[0] is not None else [] - elements.append(E.pwcdataset(*args, url=dataset[1])) - return elements diff --git a/python/acl_anthology/utils/attrs.py b/python/acl_anthology/utils/attrs.py index a5da86e7d2..755c8f4ac6 100644 --- a/python/acl_anthology/utils/attrs.py +++ b/python/acl_anthology/utils/attrs.py @@ -73,7 +73,6 @@ def auto_validate_types( AttachmentReference, EventFileReference, VideoReference, - PapersWithCodeReference, ) from ..people import Name, NameSpecification from ..text import MarkupText @@ -89,7 +88,6 @@ def auto_validate_types( AttachmentReference, EventFileReference, VideoReference, - PapersWithCodeReference, MarkupText, Name, NameSpecification, diff --git a/python/acl_anthology/utils/xml.py b/python/acl_anthology/utils/xml.py index 3952beb8da..5a111e3d27 100644 --- a/python/acl_anthology/utils/xml.py +++ b/python/acl_anthology/utils/xml.py @@ -60,7 +60,6 @@ "attachment", "award", "video", - "pwcdataset", } """XML tags that may appear multiple times per parent tag, and whose relative order matters even if their parent tag belongs to `TAGS_WITH_UNORDERED_CHILDREN`.""" diff --git a/python/benchmarks/bench_xml_parsing.py b/python/benchmarks/bench_xml_parsing.py index 610dbd1e6e..14795af230 100644 --- a/python/benchmarks/bench_xml_parsing.py +++ b/python/benchmarks/bench_xml_parsing.py @@ -30,7 +30,6 @@ "revision", "erratum", "award", - "pwcdataset", "video", "venue", ) @@ -65,12 +64,6 @@ def parse_element(xml_element): elif tag == "last": last = subelement.text or "" value = (first, last, id_) - elif tag == "pwccode": - value = { - "url": element.get("url"), - "additional": element.get("additional"), - "name": element.text, - } else: value = element.text @@ -105,12 +98,6 @@ def parse_single_element(element): elif tag == "last": last = subelement.text or "" value = (first, last, id_) - elif tag == "pwccode": - value = { - "url": element.get("url"), - "additional": element.get("additional"), - "name": element.text, - } else: value = element.text diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py index d28a0e1f14..a9f1be197a 100644 --- a/python/tests/collections/paper_test.py +++ b/python/tests/collections/paper_test.py @@ -256,8 +256,6 @@ def test_paper_add_author(anthology): Most ridiculous entry Removed immediately for being fake why-would-you-cite-this - acl-org/fake-repo - FaKe-DaTaSeT """, ) diff --git a/python/tests/data/anthology/xml/2022.acl.xml b/python/tests/data/anthology/xml/2022.acl.xml index 3e9ee6f0b5..0a490235fd 100644 --- a/python/tests/data/anthology/xml/2022.acl.xml +++ b/python/tests/data/anthology/xml/2022.acl.xml @@ -29,15 +29,6 @@ modarressi-etal-2022-adapler 10.18653/v1/2022.acl-long.1