Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 57 additions & 87 deletions bin/ingest_pwc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2021 Robert Stojnic
# Copyright 2023–2025 Matt Post, Marcel Bollmann
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -18,42 +21,17 @@
Used to import the links to code and data from Papers with Code (paperswithcode.com)
"""

import logging
import json
import lxml.etree as etree
import os
import logging as log
from pathlib import Path
import requests
import sys


def format_str(x):
    """Convert a value to the string form used for XML attributes.

    ``None`` maps to the empty string, booleans map to the lowercase
    words ``"true"`` / ``"false"``, and anything else goes through ``str``.
    """
    if isinstance(x, bool):
        return "true" if x else "false"
    return "" if x is None else str(x)


def shift_tails(element):
    """Shift XML children tails so a freshly appended last child keeps the formatting.

    Intended to be called right after a new child has been appended to
    ``element``: the new last child takes over the tail of the previous
    last child (the whitespace before the parent's closing tag), and the
    previous last child receives the inter-sibling whitespace.

    Args:
        element: The parent XML element whose last child was just appended.

    Elements with fewer than two children are left untouched instead of
    raising ``IndexError``.
    """
    children = list(element)
    if len(children) < 2:
        # Nothing to borrow a tail from.
        return

    children[-1].tail = children[-2].tail
    if len(children) >= 3:
        children[-2].tail = children[-3].tail
    else:
        # Only one older sibling: the whitespace that precedes the first
        # child lives in the parent's text. Reuse it only when it is pure
        # whitespace so real text content is never duplicated.
        leading = element.text
        if leading is None or not leading.strip():
            children[-2].tail = leading


def remove_and_shift_tails(element, child):
    """Detach ``child`` from ``element`` while keeping the surrounding whitespace intact.

    When the child has a preceding sibling, that sibling inherits the
    child's tail before the child is removed. A first child is removed
    without touching any tail.
    """
    siblings = list(element)
    position = siblings.index(child)

    if position >= 1:
        # Hand the departing node's trailing whitespace to its predecessor.
        siblings[position - 1].tail = siblings[position].tail

    element.remove(child)

from acl_anthology import Anthology
from acl_anthology.files import PapersWithCodeReference
from acl_anthology.utils.ids import parse_id
from acl_anthology.utils.logging import setup_rich_logging

if __name__ == "__main__":
import argparse
Expand All @@ -64,7 +42,7 @@ def remove_and_shift_tails(element, child):
)
args = ap.parse_args()

logging.basicConfig(level=logging.INFO)
setup_rich_logging(level=log.INFO)

if args.infile:
with open(args.infile, "r") as f:
Expand All @@ -78,62 +56,54 @@ def remove_and_shift_tails(element, child):
log.warning("Couldn't fetch metadata from Papers with Code (server error).")
sys.exit(1)

data_base = "data/xml"
datadir = Path(os.path.dirname(os.path.abspath(__file__))) / ".." / "data"
anthology = Anthology(datadir=datadir)

for xml_filename in os.listdir(data_base):
# skip any non-xml files
if not xml_filename.endswith(".xml"):
continue

full_path = os.path.join(data_base, xml_filename)

# load
with open(full_path) as f:
tree = etree.parse(f)

# track if we modified
old_content = etree.tostring(
tree, encoding="UTF-8", xml_declaration=True, with_tail=True
).decode("utf8")

for volume in tree.findall("volume"):
for paper in volume.findall("paper"):
acl_url = paper.find("url")
if acl_url is not None:
acl_id = acl_url.text
else:
# skip if we cannot construct the id
continue
# Iterate over all papers in the JSON response
changed_collections = set()
ids_with_pwc_reference = set()

# start by removing any old entries
for old in paper.findall("pwccode"):
remove_and_shift_tails(paper, old)
for old in paper.findall("pwcdataset"):
remove_and_shift_tails(paper, old)

if acl_id in pwc_meta:
pwc = pwc_meta[acl_id]
pwc_code = pwc["code"]
if pwc_code["url"] or pwc_code["additional"]:
code = etree.SubElement(paper, "pwccode")
code.set("url", format_str(pwc_code["url"]))
code.set("additional", format_str(pwc_code["additional"]))
if pwc_code["name"]:
code.text = pwc_code["name"]
shift_tails(paper)

for pwc_data in pwc["datasets"]:
data = etree.SubElement(paper, "pwcdataset")
data.set("url", pwc_data["url"])
data.text = pwc_data["name"]
shift_tails(paper)

new_content = etree.tostring(
tree, encoding="UTF-8", xml_declaration=True, with_tail=True
).decode("utf8")

if old_content != new_content:
with open(full_path, "w") as outfile:
outfile.write(new_content + "\n") # all files end with newline
for full_id, pwc_data in pwc_meta.items():
if full_id.endswith(".pdf"):
full_id = full_id[:-4]
try:
parsed_id = parse_id(full_id)
except ValueError:
log.error(f"Failed to parse Anthology ID: {full_id}")
continue

log.info(f"Modified Papers with Code metadata in {full_path}")
if paper := anthology.get_paper(parsed_id):
pwc_code = pwc_data["code"]
reference = PapersWithCodeReference(
code=(pwc_code["name"], pwc_code["url"]) if pwc_code["url"] else None,
community_code=bool(pwc_code["additional"]),
datasets=[
(pwc_ds["name"], pwc_ds["url"]) for pwc_ds in pwc_data["datasets"]
],
)
if (
reference.code is None
and not reference.community_code
and not reference.datasets
):
reference = None
else:
ids_with_pwc_reference.add(full_id)

if paper.paperswithcode != reference:
paper.paperswithcode = reference
changed_collections.add(parsed_id[0])

# Sanity-check that there are no other papers with PwC references
all_pwc = {
paper.full_id for paper in anthology.papers() if paper.paperswithcode is not None
}
for full_id in all_pwc - ids_with_pwc_reference:
paper = anthology.get_paper(full_id)
paper.paperswithcode = None
changed_collections.add(paper.full_id_tuple[0])

# Save changes
for collection_id in changed_collections:
log.info(f"Modified Papers with Code metadata in {collection_id}")
anthology.get_collection(collection_id).save()
6 changes: 6 additions & 0 deletions python/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## [UNRELEASED]

### Removed

- `PapersWithCodeReference` has been removed as Papers with Code is no longer functional.

## [0.5.3] — 2025-06-22

This release adds more functionality for ingesting new proceedings and modifying existing data.
Expand Down
11 changes: 0 additions & 11 deletions python/acl_anthology/collections/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from ..exceptions import AnthologyInvalidIDError, AnthologyXMLError
from ..files import (
AttachmentReference,
PapersWithCodeReference,
PDFReference,
PDFThumbnailReference,
VideoReference,
Expand Down Expand Up @@ -247,7 +246,6 @@ class Paper:
language: The language this paper is (mainly) written in. When given, this should be a ISO 639-2 code (e.g. "eng"), though occasionally IETF is used (e.g. "pt-BR").
note: A note attached to this paper. Used very sparingly.
pages: Page numbers of this paper within its volume.
paperswithcode: Links to code implementations and datasets as provided by [Papers with Code](https://paperswithcode.com/).
pdf: A reference to the paper's PDF.
type: The paper's type, currently used to mark frontmatter and backmatter.
"""
Expand Down Expand Up @@ -300,9 +298,6 @@ class Paper:
language: Optional[str] = field(default=None, repr=False)
note: Optional[str] = field(default=None, repr=False)
pages: Optional[str] = field(default=None, repr=False)
paperswithcode: Optional[PapersWithCodeReference] = field(
default=None, on_setattr=attrs.setters.frozen, repr=False
)
pdf: Optional[PDFReference] = field(default=None, repr=False)
type: PaperType = field(default=PaperType.PAPER, repr=False, converter=PaperType)

Expand Down Expand Up @@ -671,10 +666,6 @@ def from_xml(cls, parent: Volume, paper: etree._Element) -> Paper:
if "errata" not in kwargs:
kwargs["errata"] = []
kwargs["errata"].append(PaperErratum.from_xml(element))
elif element.tag in ("pwccode", "pwcdataset"):
if "paperswithcode" not in kwargs:
kwargs["paperswithcode"] = PapersWithCodeReference()
kwargs["paperswithcode"].append_from_xml(element)
elif element.tag in ("removed", "retracted"):
kwargs["deletion"] = PaperDeletionNotice.from_xml(element)
elif element.tag == "revision":
Expand Down Expand Up @@ -751,6 +742,4 @@ def to_xml(self) -> etree._Element:
if self.deletion is not None:
paper.append(self.deletion.to_xml())
paper.append(E.bibkey(self.bibkey))
if self.paperswithcode is not None:
paper.extend(self.paperswithcode.to_xml_list())
return paper
9 changes: 0 additions & 9 deletions python/acl_anthology/data/schema.rnc
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,6 @@ Paper = element paper {
}*
& element language { xsd:language }?
& element award { text }*
& element pwcdataset {
attribute url { xsd:anyURI },
text
}*
& element pwccode {
attribute url { xsd:anyURI },
attribute additional { xsd:boolean },
text
}?
)
}
Meta = element meta {
Expand Down
50 changes: 1 addition & 49 deletions python/acl_anthology/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from __future__ import annotations

import sys
from attrs import define, field, validators as v, Factory
from attrs import define, field, validators as v
from lxml import etree
from lxml.builder import E
from pathlib import Path
Expand Down Expand Up @@ -191,51 +191,3 @@ def to_xml(self, tag: str = "video") -> etree._Element:
if not self.permission:
elem.set("permission", "false")
return elem


@define
class PapersWithCodeReference:
    """Bundle of the [Papers with Code](https://paperswithcode.com/) (PwC) links attached to a paper.

    Attributes:
        code: The official code repository as a `(name, url)` tuple, if any.
        community_code: True if the paper's PwC page lists additional, community-provided code links.
        datasets: Datasets on PwC, each as a `(name, url)` tuple.
    """

    code: Optional[tuple[str | None, str]] = field(default=None)
    community_code: bool = field(default=False)
    datasets: list[tuple[str | None, str]] = Factory(list)

    def append_from_xml(self, elem: etree._Element) -> None:
        """Merges the contents of a `<pwccode>` or `<pwcdataset>` element into this reference."""
        if elem.tag not in ("pwccode", "pwcdataset"):  # pragma: no cover
            raise ValueError(
                f"Unsupported element for PapersWithCodeReference: <{elem.tag}>"
            )

        entry = (elem.text, elem.get("url", ""))
        if elem.tag == "pwccode":
            self.community_code = xsd_boolean(elem.get("additional", ""))
            self.code = entry
        else:
            self.datasets.append(entry)

    def to_xml_list(self) -> list[etree._Element]:
        """
        Returns:
            All PapersWithCode information serialized as a list of XML elements in the Anthology XML format: the `<pwccode>` element first (if there is one), followed by one `<pwcdataset>` per dataset.
        """

        def _build(factory, entry, **attributes):
            # The name becomes the element text; omit it entirely when absent.
            text = [] if entry[0] is None else [entry[0]]
            return factory(*text, url=entry[1], **attributes)

        elements = []
        if self.code is not None:
            elements.append(
                _build(
                    E.pwccode,
                    self.code,
                    additional=str(self.community_code).lower(),
                )
            )
        elements.extend(_build(E.pwcdataset, dataset) for dataset in self.datasets)
        return elements
2 changes: 0 additions & 2 deletions python/acl_anthology/utils/attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def auto_validate_types(
AttachmentReference,
EventFileReference,
VideoReference,
PapersWithCodeReference,
)
from ..people import Name, NameSpecification
from ..text import MarkupText
Expand All @@ -89,7 +88,6 @@ def auto_validate_types(
AttachmentReference,
EventFileReference,
VideoReference,
PapersWithCodeReference,
MarkupText,
Name,
NameSpecification,
Expand Down
1 change: 0 additions & 1 deletion python/acl_anthology/utils/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@
"attachment",
"award",
"video",
"pwcdataset",
}
"""XML tags that may appear multiple times per parent tag, and whose relative order matters even if their parent tag belongs to `TAGS_WITH_UNORDERED_CHILDREN`."""

Expand Down
13 changes: 0 additions & 13 deletions python/benchmarks/bench_xml_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
"revision",
"erratum",
"award",
"pwcdataset",
"video",
"venue",
)
Expand Down Expand Up @@ -65,12 +64,6 @@ def parse_element(xml_element):
elif tag == "last":
last = subelement.text or ""
value = (first, last, id_)
elif tag == "pwccode":
value = {
"url": element.get("url"),
"additional": element.get("additional"),
"name": element.text,
}
else:
value = element.text

Expand Down Expand Up @@ -105,12 +98,6 @@ def parse_single_element(element):
elif tag == "last":
last = subelement.text or ""
value = (first, last, id_)
elif tag == "pwccode":
value = {
"url": element.get("url"),
"additional": element.get("additional"),
"name": element.text,
}
else:
value = element.text

Expand Down
2 changes: 0 additions & 2 deletions python/tests/collections/paper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,6 @@ def test_paper_add_author(anthology):
<award>Most ridiculous entry</award>
<removed date="2023-09-30">Removed immediately for being fake</removed>
<bibkey>why-would-you-cite-this</bibkey>
<pwccode url="https://github.com/acl-org/fake-repo" additional="false">acl-org/fake-repo</pwccode>
<pwcdataset url="https://paperswithcode.com/dataset/fake-dataset">FaKe-DaTaSeT</pwcdataset>
</paper>
""",
)
Expand Down
Loading
Loading