Merged

Changes from all commits
151 changes: 103 additions & 48 deletions bin/add_author_id.py
@@ -1,83 +1,138 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # Copyright 2022 Matt Post <[email protected]>
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: utf-8 -*-
-"""
-Adds an ID tag to all instances of an author in all XML files where there is no ID tag.
-
-First use case was the Bill Byrne separation of July 2022.
-
-2020.gebnlp-1.4 E14-1026 E14-1028 W16-2324 2021.acl-long.55 2021.eancs-1.2 W15-0116 D19-1125 D19-1331 D19-1459 P14-3000 2022.naacl-main.136 W18-1821 W18-5420 W18-6427 2020.nlp4call-1.2 N19-1406 2021.emnlp-main.620 2021.emnlp-main.666 N18-2081 N18-3013 W17-3531 2020.wmt-1.94 D15-1273 2022.nlp4convai-1.7 P16-2049 C14-1195 P19-1022 W19-4417 W19-4424 W19-5340 W19-5421 2020.wat-1.21 E17-2058 2022.ecnlp-1.13 J14-3008 N15-1041 N15-1105 P18-2051 D17-1208 D17-1220 D17-2005 2020.acl-main.690 2020.acl-main.693 N16-1100 2022.findings-acl.223 2022.findings-acl.301
+"""Add an author ID to NameSpecification entries using the acl_anthology module.
+
+This script adds the name ID to all papers matching the first and last name.
+It will use the module to find the list of papers to edit. Alternately, you
+provide it with the list of papers.
 
 Usage:
 
-    ./add_author_id.py bill-byrne --last-name Byrne --first-name Bill
+    ./add_author_id.py <id> "Last name[, First name]" [--paper-ids 2028.acl-main.74 ...]
 """
 
-import argparse
-import os
-
-from pathlib import Path
-from anthology.utils import indent
+from __future__ import annotations
+
+import argparse
+from collections import defaultdict
+from itertools import chain
+from pathlib import Path
+
+from acl_anthology.anthology import Anthology
+
+# old library since we're still editing XML files
+from anthology.utils import indent
 import lxml.etree as ET
 
 
-def main(args):
-    for xml_file in Path(args.data_dir).glob("**/*.xml"):
-        changed_one = False
+def main(args: argparse.Namespace) -> None:
+
+    last_name, first_name = (
+        args.name.split(", ") if ", " in args.name else (args.name, None)
+    )
+
+    anthology = Anthology(args.data_dir, verbose=True)
+
+    # Build a collection of the set of papers to modify within each XML file
+    collection_to_paper_map = defaultdict(list)
+
+    if args.paper_ids:
+        for paper_id in args.paper_ids:
+            paper = anthology.get_paper(paper_id)
+            if paper:
+                collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple)
+
+    else:
+        people = anthology.find_people(args.name)
+        if not people:
+            print(f"No person found matching name {args.name}")
+
+        # find the person with the non-explicit ID
+        for person in people:
+            if not person.is_explicit:
+                break
+
+        if not person:
+            print(f"No person found matching name {args.name} with an explicit ID")
+            return
+
+        for paper in person.papers():
+            collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple)
+
+    if collection_to_paper_map:
+        print("Will edit the following paper IDs:")
+        for paper_id_tuples in collection_to_paper_map.values():
+            for paper_id in paper_id_tuples:
+                print(f" - {paper_id}")
+
+    # Now iterate over those files and the papers within them
+    for collection_id, paper_id_tuples in collection_to_paper_map.items():
+        xml_file = Path(args.data_dir) / "xml" / f"{collection_id}.xml"
 
         tree = ET.parse(xml_file)
-        for paper_xml in chain(
-            tree.getroot().findall(".//paper"), tree.getroot().findall(".//meta")
-        ):
+
+        for paper_tuple in paper_id_tuples:
+            _, volume_id, paper_id = paper_tuple
+
+            # Get the paper
+            paper_xml = tree.getroot().find(
+                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']"
+            )
 
             for author_xml in chain(
                 paper_xml.findall("./author"), paper_xml.findall("./editor")
             ):
                 if "id" in author_xml.attrib:
                     continue
-                last_name = author_xml.find("./last").text
                 try:
-                    first_name = author_xml.find("./first").text
+                    author_first_name = author_xml.find("./first").text
                 except AttributeError:
-                    first_name = ""
-                if last_name == args.last_name and first_name == args.first_name:
-                    paper_id = (
-                        paper_xml.attrib["id"] if paper_xml.text == "paper" else "0"
-                    )
-                    anth_id = f"{xml_file}/{paper_id}"
-                    print(f"Adding {args.id} to {anth_id}...")
+                    author_first_name = None
+                author_last_name = author_xml.find("./last").text
+
+                if author_last_name == last_name and author_first_name == first_name:
+                    paper_id = anthology.get_paper(paper_tuple).full_id
+                    print(
+                        f"Adding {args.id} to {author_first_name} {author_last_name} on paper {paper_id}..."
+                    )
                     author_xml.attrib["id"] = args.id
-                    changed_one = True
 
-        if changed_one:
-            indent(tree.getroot())
-            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
+        indent(tree.getroot())
+        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
+
+
+"""
+Once we have the module published, we should be able to modify this to use
+it to write the changed XML files, instead of the above.
+"""
+# for paper in person.papers():
+#     print("PAPER", paper.full_id)
+#     authors = paper.get_editors() if paper.is_frontmatter else paper.authors
+#     for author in authors:
+#         if author.name in person.names:
+#             print("-> Found", author)
+#             author.id = args.id
+#             # collection_paper_map[paper.collection_id].append(paper.full_id)
+
+# # save the anthology (doesn't currently work)
+# anthology.save_all()
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser("Add an author ID to all of an author's papers")
     parser.add_argument("id", help="Author ID to add")
-    parser.add_argument("--last-name", help="Author's last name")
-    parser.add_argument("--first-name", help="Author's first name")
-    parser.add_argument("--confirm", action="store_true", help="Confirm each instance")
+    parser.add_argument("name", help="Author's name (last[, first])")
+    parser.add_argument("--paper-ids", nargs="*", help="List of paper IDs to modify")
     parser.add_argument(
-        "--data-dir", default=os.path.join(os.path.dirname(__file__), "..", "data", "xml")
+        "--data-dir",
+        default=None,
+        help="Path to anthology data directory (default: ../data relative to repository root)",
    )
     args = parser.parse_args()
+
+    # Normalize data_dir to a Path string used by Anthology
+    # If the user supplies a path, trust it; otherwise compute relative to this script
+    if args.data_dir is None:
+        args.data_dir = str(Path(__file__).parent.parent / "data")
 
     main(args)
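
For reference, hypothetical invocations that would produce the data/xml/ changes below, in the docstring's own Usage style (the actual commands are not recorded in this PR; the IDs are taken from the diffs that follow, and the --paper-ids lists are illustrative and incomplete):

    # Amazon-affiliated Jing Huang (example papers read off the diffs below)
    ./add_author_id.py jing-huang "Huang, Jing" --paper-ids 2021.naacl-main.206 2021.repl4nlp-1.30 2021.sustainlp-1.10 2022.acl-long.552

    # The Stanford Jing Huang gets a distinct ID (paper ID inferred from paper id="24" in 2023.blackboxnlp.xml)
    ./add_author_id.py jing-huang-stanford "Huang, Jing" --paper-ids 2023.blackboxnlp-1.24

Without --paper-ids, the script asks the acl_anthology module for every paper matching the name, which is why the explicit list matters when two people share a name, as here.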
2 changes: 1 addition & 1 deletion data/xml/2020.acl.xml
@@ -3245,7 +3245,7 @@
     <paper id="241">
       <title>Orthogonal Relation Transforms with Graph Context Modeling for Knowledge Graph Embedding</title>
       <author><first>Yun</first><last>Tang</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Xiaodong</first><last>He</last></author>
       <author><first>Bowen</first><last>Zhou</last></author>
4 changes: 2 additions & 2 deletions data/xml/2021.naacl.xml
@@ -2814,7 +2814,7 @@
       <author><first>Kevin</first><last>Huang</last></author>
       <author><first>Tengyu</first><last>Ma</last></author>
       <author><first>Quanquan</first><last>Gu</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>2609–2615</pages>
       <abstract>First-order meta-learning algorithms have been widely used in practice to learn initial model parameters that can be quickly adapted to new tasks due to their efficiency and effectiveness. However, existing studies find that meta-learner can overfit to some specific adaptation when we have heterogeneous tasks, leading to significantly degraded performance. In Natural Language Processing (NLP) applications, datasets are often diverse and each task has its unique characteristics. Therefore, to address the overfitting issue when applying first-order meta-learning to NLP applications, we propose to reduce the variance of the gradient estimator used in task adaptation. To this end, we develop a variance-reduced first-order meta-learning algorithm. The core of our algorithm is to introduce a novel variance reduction term to the gradient estimation when performing the task adaptation. Experiments on two NLP applications: few-shot text classification and multi-domain dialog state tracking demonstrate the superior performance of our proposed method.</abstract>
       <url hash="93468fde">2021.naacl-main.206</url>
@@ -3111,7 +3111,7 @@
       <author><first>Peng</first><last>Qi</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Rex</first><last>Ying</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <author><first>Xiaodong</first><last>He</last></author>
       <author><first>Bowen</first><last>Zhou</last></author>
       <pages>2884–2894</pages>
2 changes: 1 addition & 1 deletion data/xml/2021.repl4nlp.xml
@@ -377,7 +377,7 @@
       <author><first>Peng</first><last>Qi</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Tengyu</first><last>Ma</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>307–315</pages>
       <abstract>Document-level relation extraction is a challenging task, requiring reasoning over multiple sentences to predict a set of relations in a document. In this paper, we propose a novel framework E2GRE (Entity and Evidence Guided Relation Extraction) that jointly extracts relations and the underlying evidence sentences by using large pretrained language model (LM) as input encoder. First, we propose to guide the pretrained LM’s attention mechanism to focus on relevant context by using attention probabilities as additional features for evidence prediction. Furthermore, instead of feeding the whole document into pretrained LMs to obtain entity representation, we concatenate document text with head entities to help LMs concentrate on parts of the document that are more related to the head entity. Our E2GRE jointly learns relation extraction and evidence prediction effectively, showing large gains on both these tasks, which we find are highly correlated.</abstract>
       <url hash="d034db75">2021.repl4nlp-1.30</url>
2 changes: 1 addition & 1 deletion data/xml/2021.sustainlp.xml
@@ -143,7 +143,7 @@
       <author><first>Xiaochen</first><last>Hou</last></author>
       <author><first>Diyi</first><last>Yang</last></author>
       <author><first>Kathleen</first><last>McKeown</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>79–85</pages>
       <abstract>Large pre-trained language models (PLMs) have led to great success on various commonsense question answering (QA) tasks in an end-to-end fashion. However, little attention has been paid to what commonsense knowledge is needed to deeply characterize these QA tasks. In this work, we proposed to categorize the semantics needed for these tasks using the SocialIQA as an example. Building upon our labeled social knowledge categories dataset on top of SocialIQA, we further train neural QA models to incorporate such social knowledge categories and relation information from a knowledge base. Unlike previous work, we observe our models with semantic categorizations of social knowledge can achieve comparable performance with a relatively simple model and smaller size compared to other complex approaches.</abstract>
       <url hash="499d3240">2021.sustainlp-1.10</url>
2 changes: 1 addition & 1 deletion data/xml/2021.textgraphs.xml
@@ -107,7 +107,7 @@
     <paper id="8">
       <title>Selective Attention Based Graph Convolutional Networks for Aspect-Level Sentiment Classification</title>
       <author><first>Xiaochen</first><last>Hou</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Peng</first><last>Qi</last></author>
       <author><first>Xiaodong</first><last>He</last></author>
2 changes: 1 addition & 1 deletion data/xml/2022.acl.xml
@@ -7546,7 +7546,7 @@ in the Case of Unambiguous Gender</title>
       <author><first>Chao</first><last>Shang</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Peng</first><last>Qi</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>8017-8026</pages>
       <abstract>Question answering over temporal knowledge graphs (KGs) efficiently uses facts contained in a temporal KG, which records entity relations and when they occur in time, to answer natural language questions (e.g., “Who was the president of the US before Obama?”). These questions often involve three time-related challenges that previous work fail to adequately address: 1) questions often do not specify exact timestamps of interest (e.g., “Obama” instead of 2000); 2) subtle lexical differences in time relations (e.g., “before” vs “after”); 3) off-the-shelf temporal KG embeddings that previous work builds on ignore the temporal order of timestamps, which is crucial for answering temporal-order related questions. In this paper, we propose a time-sensitive question answering (TSQA) framework to tackle these problems. TSQA features a timestamp estimation module to infer the unwritten timestamp from the question. We also employ a time-sensitive KG encoder to inject ordering information into the temporal KG embeddings that TSQA is based on. With the help of techniques to reduce the search space for potential answers, TSQA significantly outperforms the previous state of the art on a new benchmark for question answering over temporal KGs, especially achieving a 32% (absolute) error reduction on complex questions that require multiple steps of reasoning over facts in the temporal KG.</abstract>
       <url hash="2642c44d">2022.acl-long.552</url>
4 changes: 2 additions & 2 deletions data/xml/2022.emnlp.xml
@@ -4236,7 +4236,7 @@
       <author><first>Shereen</first><last>Oraby</last><affiliation>Amazon Alexa AI</affiliation></author>
       <author><first>Alessandra</first><last>Cervone</last><affiliation>Amazon Alexa AI</affiliation></author>
       <author><first>Tagyoung</first><last>Chung</last><affiliation>Amazon Alexa AI</affiliation></author>
-      <author><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
       <author id="yang-liu"><first>Yang</first><last>Liu</last><affiliation>Amazon</affiliation></author>
       <author><first>Nanyun</first><last>Peng</last><affiliation>University of California, Los Angeles</affiliation></author>
       <pages>4590-4605</pages>
@@ -4264,7 +4264,7 @@
       <author><first>Shereen</first><last>Oraby</last><affiliation>Amazon Alexa AI</affiliation></author>
       <author><first>Shuyang</first><last>Gao</last><affiliation>Amazon.com, Inc.</affiliation></author>
       <author><first>Tagyoung</first><last>Chung</last><affiliation>Amazon Alexa AI</affiliation></author>
-      <author><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
       <author id="yang-liu"><first>Yang</first><last>Liu</last><affiliation>Amazon</affiliation></author>
       <author><first>Nanyun</first><last>Peng</last><affiliation>University of California, Los Angeles</affiliation></author>
       <pages>4635-4648</pages>
2 changes: 1 addition & 1 deletion data/xml/2023.acl.xml
@@ -7341,7 +7341,7 @@
       <author><first>Wenbo</first><last>Zhao</last><affiliation>Amazon</affiliation></author>
       <author><first>Yiwen</first><last>Chen</last><affiliation>University of Cambridge</affiliation></author>
       <author><first>Tagyoung</first><last>Chung</last><affiliation>Amazon Alexa AI</affiliation></author>
-      <author><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
       <author><first>Nanyun</first><last>Peng</last><affiliation>University of California, Los Angeles</affiliation></author>
       <pages>9235-9254</pages>
       <abstract>Automatic melody-to-lyric generation is a task in which song lyrics are generated to go with a given melody. It is of significant practical interest and more challenging than unconstrained lyric generation as the music imposes additional constraints onto the lyrics. The training data is limited as most songs are copyrighted, resulting in models that underfit the complicated cross-modal relationship between melody and lyrics. In this work, we propose a method for generating high-quality lyrics without training on any aligned melody-lyric data. Specifically, we design a hierarchical lyric generation framework that first generates a song outline and second the complete lyrics. The framework enables disentanglement of training (based purely on text) from inference (melody-guided text generation) to circumvent the shortage of parallel data. We leverage the segmentation and rhythm alignment between melody and lyrics to compile the given melody into decoding constraints as guidance during inference. The two-step hierarchical design also enables content control via the lyric outline, a much-desired feature for democratizing collaborative song creation. Experimental results show that our model can generate high-quality lyrics that are more on-topic, singable, intelligible, and coherent than strong baselines, for example SongMASS, a SOTA model trained on a parallel dataset, with a 24% relative overall quality improvement based on human ratings. Our code is available at <url>https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation</url>.</abstract>
2 changes: 1 addition & 1 deletion data/xml/2023.blackboxnlp.xml
@@ -272,7 +272,7 @@
     </paper>
     <paper id="24">
       <title>Rigorously Assessing Natural Language Explanations of Neurons</title>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang-stanford"><first>Jing</first><last>Huang</last></author>
       <author><first>Atticus</first><last>Geiger</last></author>
       <author><first>Karel</first><last>D’Oosterlinck</last></author>
       <author><first>Zhengxuan</first><last>Wu</last></author>