Merged

Changes from all commits
151 changes: 103 additions & 48 deletions bin/add_author_id.py
@@ -1,83 +1,138 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # Copyright 2022 Matt Post <[email protected]>
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: utf-8 -*-
-"""
-Adds an ID tag to all instances of an author in all XML files where there is no ID tag.
-
-First use case was the Bill Byrne separation of July 2022.
-
-2020.gebnlp-1.4 E14-1026 E14-1028 W16-2324 2021.acl-long.55 2021.eancs-1.2 W15-0116 D19-1125 D19-1331 D19-1459 P14-3000 2022.naacl-main.136 W18-1821 W18-5420 W18-6427 2020.nlp4call-1.2 N19-1406 2021.emnlp-main.620 2021.emnlp-main.666 N18-2081 N18-3013 W17-3531 2020.wmt-1.94 D15-1273 2022.nlp4convai-1.7 P16-2049 C14-1195 P19-1022 W19-4417 W19-4424 W19-5340 W19-5421 2020.wat-1.21 E17-2058 2022.ecnlp-1.13 J14-3008 N15-1041 N15-1105 P18-2051 D17-1208 D17-1220 D17-2005 2020.acl-main.690 2020.acl-main.693 N16-1100 2022.findings-acl.223 2022.findings-acl.301
+"""Add an author ID to NameSpecification entries using the acl_anthology module.
+
+This script adds the name ID to all papers matching the first and last name.
+It will use the module to find the list of papers to edit. Alternately, you
+provide it with the list of papers.
 
 Usage:
 
-    ./add_author_id.py bill-byrne --last-name Byrne --first-name Bill
+    ./add_author_id.py <id> "Last name[, First name]" [--paper-ids 2028.acl-main.74 ...]
 """
 
-import argparse
-import os
-
-from pathlib import Path
-from anthology.utils import indent
+from __future__ import annotations
+
+import argparse
+from collections import defaultdict
+from itertools import chain
+from pathlib import Path
+
+from acl_anthology.anthology import Anthology
+
+# old library since we're still editing XML files
+from anthology.utils import indent
 import lxml.etree as ET
 
 
-def main(args):
-    for xml_file in Path(args.data_dir).glob("**/*.xml"):
-        changed_one = False
+def main(args: argparse.Namespace) -> None:
+
+    last_name, first_name = (
+        args.name.split(", ") if ", " in args.name else (args.name, None)
+    )
+
+    anthology = Anthology(args.data_dir, verbose=True)
+
+    # Build a collection of the set of papers to modify within each XML file
+    collection_to_paper_map = defaultdict(list)
+
+    if args.paper_ids:
+        for paper_id in args.paper_ids:
+            paper = anthology.get_paper(paper_id)
+            if paper:
+                collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple)
+
+    else:
+        people = anthology.find_people(args.name)
+        if not people:
+            print(f"No person found matching name {args.name}")
+
+        # find the person with the non-explicit ID
+        for person in people:
+            if not person.is_explicit:
+                break
+
+        if not person:
+            print(f"No person found matching name {args.name} with an explicit ID")
+            return
+
+        for paper in person.papers():
+            collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple)
+
+    if collection_to_paper_map:
+        print("Will edit the following paper IDs:")
+        for paper_id_tuples in collection_to_paper_map.values():
+            for paper_id in paper_id_tuples:
+                print(f" - {paper_id}")
+
+    # Now iterate over those files and the papers within them
+    for collection_id, paper_id_tuples in collection_to_paper_map.items():
+        xml_file = Path(args.data_dir) / "xml" / f"{collection_id}.xml"
 
         tree = ET.parse(xml_file)
-        for paper_xml in chain(
-            tree.getroot().findall(".//paper"), tree.getroot().findall(".//meta")
-        ):
+
+        for paper_tuple in paper_id_tuples:
+            _, volume_id, paper_id = paper_tuple
+
+            # Get the paper
+            paper_xml = tree.getroot().find(
+                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']"
+            )
 
             for author_xml in chain(
                 paper_xml.findall("./author"), paper_xml.findall("./editor")
             ):
                 if "id" in author_xml.attrib:
                     continue
-                last_name = author_xml.find("./last").text
                 try:
-                    first_name = author_xml.find("./first").text
+                    author_first_name = author_xml.find("./first").text
                 except AttributeError:
-                    first_name = ""
-                if last_name == args.last_name and first_name == args.first_name:
-                    paper_id = (
-                        paper_xml.attrib["id"] if paper_xml.text == "paper" else "0"
-                    )
-                    anth_id = f"{xml_file}/{paper_id}"
-                    print(f"Adding {args.id} to {anth_id}...")
+                    author_first_name = None
+                author_last_name = author_xml.find("./last").text
+
+                if author_last_name == last_name and author_first_name == first_name:
+                    paper_id = anthology.get_paper(paper_tuple).full_id
+                    print(
+                        f"Adding {args.id} to {author_first_name} {author_last_name} on paper {paper_id}..."
+                    )
                     author_xml.attrib["id"] = args.id
-                    changed_one = True
 
-        if changed_one:
-            indent(tree.getroot())
-            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
+        indent(tree.getroot())
+        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
+
+
+"""
+Once we have the module published, we should be able to modify this to use
+it to write the changed XML files, instead of the above.
+"""
+# for paper in person.papers():
+#     print("PAPER", paper.full_id)
+#     authors = paper.get_editors() if paper.is_frontmatter else paper.authors
+#     for author in authors:
+#         if author.name in person.names:
+#             print("-> Found", author)
+#             author.id = args.id
+#             # collection_paper_map[paper.collection_id].append(paper.full_id)
+
+# # save the anthology (doesn't currently work)
+# anthology.save_all()
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser("Add an author ID to all of an author's papers")
     parser.add_argument("id", help="Author ID to add")
-    parser.add_argument("--last-name", help="Author's last name")
-    parser.add_argument("--first-name", help="Author's first name")
-    parser.add_argument("--confirm", action="store_true", help="Confirm each instance")
+    parser.add_argument("name", help="Author's name (last[, first])")
+    parser.add_argument("--paper-ids", nargs="*", help="List of paper IDs to modify")
     parser.add_argument(
-        "--data-dir", default=os.path.join(os.path.dirname(__file__), "..", "data", "xml")
+        "--data-dir",
+        default=None,
+        help="Path to anthology data directory (default: ../data relative to repository root)",
    )
     args = parser.parse_args()
+
+    # Normalize data_dir to a Path string used by Anthology
+    # If the user supplies a path, trust it; otherwise compute relative to this script
+    if args.data_dir is None:
+        args.data_dir = str(Path(__file__).parent.parent / "data")
 
     main(args)
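
For reference, hypothetical invocations that would produce the data/xml/ changes below, in the docstring's own Usage style (the actual commands are not recorded in this PR; the IDs are taken from the diffs that follow, and the --paper-ids lists are illustrative and incomplete):

    # Amazon-affiliated Jing Huang (example papers read off the diffs below)
    ./add_author_id.py jing-huang "Huang, Jing" --paper-ids 2021.naacl-main.206 2021.repl4nlp-1.30 2021.sustainlp-1.10 2022.acl-long.552

    # The Stanford Jing Huang gets a distinct ID (paper ID inferred from paper id="24" in 2023.blackboxnlp.xml)
    ./add_author_id.py jing-huang-stanford "Huang, Jing" --paper-ids 2023.blackboxnlp-1.24

Without --paper-ids, the script asks the acl_anthology module for every paper matching the name, which is why the explicit list matters when two people share a name, as here.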
2 changes: 1 addition & 1 deletion data/xml/2020.acl.xml
@@ -3245,7 +3245,7 @@
     <paper id="241">
       <title>Orthogonal Relation Transforms with Graph Context Modeling for Knowledge Graph Embedding</title>
       <author><first>Yun</first><last>Tang</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Xiaodong</first><last>He</last></author>
       <author><first>Bowen</first><last>Zhou</last></author>
4 changes: 2 additions & 2 deletions data/xml/2021.naacl.xml
@@ -2814,7 +2814,7 @@
       <author><first>Kevin</first><last>Huang</last></author>
       <author><first>Tengyu</first><last>Ma</last></author>
       <author><first>Quanquan</first><last>Gu</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>2609–2615</pages>
       <abstract>First-order meta-learning algorithms have been widely used in practice to learn initial model parameters that can be quickly adapted to new tasks due to their efficiency and effectiveness. However, existing studies find that meta-learner can overfit to some specific adaptation when we have heterogeneous tasks, leading to significantly degraded performance. In Natural Language Processing (NLP) applications, datasets are often diverse and each task has its unique characteristics. Therefore, to address the overfitting issue when applying first-order meta-learning to NLP applications, we propose to reduce the variance of the gradient estimator used in task adaptation. To this end, we develop a variance-reduced first-order meta-learning algorithm. The core of our algorithm is to introduce a novel variance reduction term to the gradient estimation when performing the task adaptation. Experiments on two NLP applications: few-shot text classification and multi-domain dialog state tracking demonstrate the superior performance of our proposed method.</abstract>
       <url hash="93468fde">2021.naacl-main.206</url>
@@ -3111,7 +3111,7 @@
       <author><first>Peng</first><last>Qi</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Rex</first><last>Ying</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <author><first>Xiaodong</first><last>He</last></author>
       <author><first>Bowen</first><last>Zhou</last></author>
       <pages>2884–2894</pages>
2 changes: 1 addition & 1 deletion data/xml/2021.repl4nlp.xml
@@ -377,7 +377,7 @@
       <author><first>Peng</first><last>Qi</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Tengyu</first><last>Ma</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>307–315</pages>
       <abstract>Document-level relation extraction is a challenging task, requiring reasoning over multiple sentences to predict a set of relations in a document. In this paper, we propose a novel framework E2GRE (Entity and Evidence Guided Relation Extraction) that jointly extracts relations and the underlying evidence sentences by using large pretrained language model (LM) as input encoder. First, we propose to guide the pretrained LM’s attention mechanism to focus on relevant context by using attention probabilities as additional features for evidence prediction. Furthermore, instead of feeding the whole document into pretrained LMs to obtain entity representation, we concatenate document text with head entities to help LMs concentrate on parts of the document that are more related to the head entity. Our E2GRE jointly learns relation extraction and evidence prediction effectively, showing large gains on both these tasks, which we find are highly correlated.</abstract>
       <url hash="d034db75">2021.repl4nlp-1.30</url>
2 changes: 1 addition & 1 deletion data/xml/2021.sustainlp.xml
@@ -143,7 +143,7 @@
       <author><first>Xiaochen</first><last>Hou</last></author>
       <author><first>Diyi</first><last>Yang</last></author>
       <author><first>Kathleen</first><last>McKeown</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>79–85</pages>
       <abstract>Large pre-trained language models (PLMs) have led to great success on various commonsense question answering (QA) tasks in an end-to-end fashion. However, little attention has been paid to what commonsense knowledge is needed to deeply characterize these QA tasks. In this work, we proposed to categorize the semantics needed for these tasks using the SocialIQA as an example. Building upon our labeled social knowledge categories dataset on top of SocialIQA, we further train neural QA models to incorporate such social knowledge categories and relation information from a knowledge base. Unlike previous work, we observe our models with semantic categorizations of social knowledge can achieve comparable performance with a relatively simple model and smaller size compared to other complex approaches.</abstract>
       <url hash="499d3240">2021.sustainlp-1.10</url>
2 changes: 1 addition & 1 deletion data/xml/2021.textgraphs.xml
@@ -107,7 +107,7 @@
     <paper id="8">
       <title>Selective Attention Based Graph Convolutional Networks for Aspect-Level Sentiment Classification</title>
       <author><first>Xiaochen</first><last>Hou</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Peng</first><last>Qi</last></author>
       <author><first>Xiaodong</first><last>He</last></author>
2 changes: 1 addition & 1 deletion data/xml/2022.acl.xml
@@ -7546,7 +7546,7 @@ in the Case of Unambiguous Gender</title>
       <author><first>Chao</first><last>Shang</last></author>
       <author><first>Guangtao</first><last>Wang</last></author>
       <author><first>Peng</first><last>Qi</last></author>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last></author>
       <pages>8017-8026</pages>
       <abstract>Question answering over temporal knowledge graphs (KGs) efficiently uses facts contained in a temporal KG, which records entity relations and when they occur in time, to answer natural language questions (e.g., “Who was the president of the US before Obama?”). These questions often involve three time-related challenges that previous work fail to adequately address: 1) questions often do not specify exact timestamps of interest (e.g., “Obama” instead of 2000); 2) subtle lexical differences in time relations (e.g., “before” vs “after”); 3) off-the-shelf temporal KG embeddings that previous work builds on ignore the temporal order of timestamps, which is crucial for answering temporal-order related questions. In this paper, we propose a time-sensitive question answering (TSQA) framework to tackle these problems. TSQA features a timestamp estimation module to infer the unwritten timestamp from the question. We also employ a time-sensitive KG encoder to inject ordering information into the temporal KG embeddings that TSQA is based on. With the help of techniques to reduce the search space for potential answers, TSQA significantly outperforms the previous state of the art on a new benchmark for question answering over temporal KGs, especially achieving a 32% (absolute) error reduction on complex questions that require multiple steps of reasoning over facts in the temporal KG.</abstract>
       <url hash="2642c44d">2022.acl-long.552</url>
4 changes: 2 additions & 2 deletions data/xml/2022.emnlp.xml
@@ -4236,7 +4236,7 @@
       <author><first>Shereen</first><last>Oraby</last><affiliation>Amazon Alexa AI</affiliation></author>
       <author><first>Alessandra</first><last>Cervone</last><affiliation>Amazon Alexa AI</affiliation></author>
       <author><first>Tagyoung</first><last>Chung</last><affiliation>Amazon Alexa AI</affiliation></author>
-      <author><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
       <author id="yang-liu"><first>Yang</first><last>Liu</last><affiliation>Amazon</affiliation></author>
       <author><first>Nanyun</first><last>Peng</last><affiliation>University of California, Los Angeles</affiliation></author>
       <pages>4590-4605</pages>
@@ -4264,7 +4264,7 @@
       <author><first>Shereen</first><last>Oraby</last><affiliation>Amazon Alexa AI</affiliation></author>
       <author><first>Shuyang</first><last>Gao</last><affiliation>Amazon.com, Inc.</affiliation></author>
       <author><first>Tagyoung</first><last>Chung</last><affiliation>Amazon Alexa AI</affiliation></author>
-      <author><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
       <author id="yang-liu"><first>Yang</first><last>Liu</last><affiliation>Amazon</affiliation></author>
       <author><first>Nanyun</first><last>Peng</last><affiliation>University of California, Los Angeles</affiliation></author>
       <pages>4635-4648</pages>
2 changes: 1 addition & 1 deletion data/xml/2023.acl.xml
@@ -7341,7 +7341,7 @@
       <author><first>Wenbo</first><last>Zhao</last><affiliation>Amazon</affiliation></author>
       <author><first>Yiwen</first><last>Chen</last><affiliation>University of Cambridge</affiliation></author>
       <author><first>Tagyoung</first><last>Chung</last><affiliation>Amazon Alexa AI</affiliation></author>
-      <author><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
+      <author id="jing-huang"><first>Jing</first><last>Huang</last><affiliation>Amazon</affiliation></author>
       <author><first>Nanyun</first><last>Peng</last><affiliation>University of California, Los Angeles</affiliation></author>
       <pages>9235-9254</pages>
       <abstract>Automatic melody-to-lyric generation is a task in which song lyrics are generated to go with a given melody. It is of significant practical interest and more challenging than unconstrained lyric generation as the music imposes additional constraints onto the lyrics. The training data is limited as most songs are copyrighted, resulting in models that underfit the complicated cross-modal relationship between melody and lyrics. In this work, we propose a method for generating high-quality lyrics without training on any aligned melody-lyric data. Specifically, we design a hierarchical lyric generation framework that first generates a song outline and second the complete lyrics. The framework enables disentanglement of training (based purely on text) from inference (melody-guided text generation) to circumvent the shortage of parallel data. We leverage the segmentation and rhythm alignment between melody and lyrics to compile the given melody into decoding constraints as guidance during inference. The two-step hierarchical design also enables content control via the lyric outline, a much-desired feature for democratizing collaborative song creation. Experimental results show that our model can generate high-quality lyrics that are more on-topic, singable, intelligible, and coherent than strong baselines, for example SongMASS, a SOTA model trained on a parallel dataset, with a 24% relative overall quality improvement based on human ratings. Our code is available at <url>https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation</url>.</abstract>
2 changes: 1 addition & 1 deletion data/xml/2023.blackboxnlp.xml
@@ -272,7 +272,7 @@
     </paper>
     <paper id="24">
       <title>Rigorously Assessing Natural Language Explanations of Neurons</title>
-      <author><first>Jing</first><last>Huang</last></author>
+      <author id="jing-huang-stanford"><first>Jing</first><last>Huang</last></author>
       <author><first>Atticus</first><last>Geiger</last></author>
       <author><first>Karel</first><last>D’Oosterlinck</last></author>
       <author><first>Zhengxuan</first><last>Wu</last></author>