From 090d74bbb8074a2998525712a3360d3000af6b5d Mon Sep 17 00:00:00 2001
From: Minh Vu <vuhoangminh97@gmail.com>
Date: Thu, 2 Jul 2026 19:48:17 +0200
Subject: [PATCH] Use safe parsing for LooGLE evaluation

Signed-off-by: Minh Vu <vuhoangminh97@gmail.com>
---
 .../benchmarks/loogle/calculate_metrics.py    | 36 +---------
 .../loogle/create_huggingface_dataset.py      | 22 +++++-
 evaluation/benchmarks/loogle/parsing.py       | 68 +++++++++++++++++++
 tests/test_loogle_parsing.py                  | 59 ++++++++++++++++
 4 files changed, 149 insertions(+), 36 deletions(-)
 create mode 100644 evaluation/benchmarks/loogle/parsing.py
 create mode 100644 tests/test_loogle_parsing.py

diff --git a/evaluation/benchmarks/loogle/calculate_metrics.py b/evaluation/benchmarks/loogle/calculate_metrics.py
index 7deb02bd..f7d6b4e6 100644
--- a/evaluation/benchmarks/loogle/calculate_metrics.py
+++ b/evaluation/benchmarks/loogle/calculate_metrics.py
@@ -9,6 +9,8 @@
 from nltk.translate.meteor_score import single_meteor_score
 from rouge import Rouge
 
+from .parsing import get_exact_match, get_partial_match
+
 
 # Code below is adapted from https://github.com/bigai-nlco/LooGLE/blob/main/Evaluation/automatic_metrics.py
 def get_bleu_score(reference, hypothesis):
@@ -37,40 +39,6 @@ def get_meteor_score(reference, hypothesis):
     return {"meteor": float(meteor)}
 
 
-def get_exact_match(reference, hypothesis):
-    try:
-        reference = eval(reference)
-        count = len(reference)
-        hypothesis = eval(hypothesis)
-        assert isinstance(hypothesis, dict)
-    except Exception:
-        return 0, 1
-
-    exact_score_count = 0
-    for key, value in reference.items():
-        if hypothesis.get(key) == value:
-            exact_score_count += 1
-    return exact_score_count, count
-
-
-def get_partial_match(reference, hypothesis):
-    reference = eval(reference)
-    count = len(reference)
-    try:
-        hypothesis = eval(hypothesis)
-        assert isinstance(hypothesis, dict)
-        partial_score_count = 0
-        for key in reference:
-            if key in hypothesis:
-                true_set = set(reference[key].split())
-                pred_set = set(hypothesis[key].split())
-                if len(true_set.intersection(pred_set)) > 0:
-                    partial_score_count += 1
-        return partial_score_count, count
-    except Exception:
-        return 0, count
-
-
 def try_except_metric(metric_fn):
     def wrapped_metric(answer, predicted_answer):
         try:
diff --git a/evaluation/benchmarks/loogle/create_huggingface_dataset.py b/evaluation/benchmarks/loogle/create_huggingface_dataset.py
index ef2fd15b..428bdb9b 100644
--- a/evaluation/benchmarks/loogle/create_huggingface_dataset.py
+++ b/evaluation/benchmarks/loogle/create_huggingface_dataset.py
@@ -7,6 +7,8 @@
 import pandas as pd
 from datasets import Dataset, load_dataset
 
+from .parsing import parse_qa_pairs
+
 # Templates based on https://github.com/bigai-nlco/LooGLE/blob/main/config/task2prompt.json
 context_prompt = {
     "shortdep_qa": "Please answer the question based on the long texts below. \n{input}",
@@ -32,15 +34,17 @@
 # Source: https://github.com/bigai-nlco/LooGLE/blob/main/config/task2maxlen.json
 max_new_tokens = {"shortdep_qa": 300, "longdep_qa": 500, "longdep_summarization": 500, "shortdep_cloze": 50}
 
-for task in ["shortdep_qa", "longdep_qa", "shortdep_cloze", "longdep_summarization"]:
+TASKS = ("shortdep_qa", "longdep_qa", "shortdep_cloze", "longdep_summarization")
+
 
+def build_task_dataframe(task: str) -> pd.DataFrame:
     df = load_dataset("bigainlco/LooGLE", task, split="test", trust_remote_code=True).to_pandas()
 
     if task == "longdep_summarization":
         df["question"] = ""
         df = df.rename(columns={"output": "answer", "input": "context"})
     else:
-        df["qa_pairs"] = df["qa_pairs"].apply(lambda x: eval(x) if x != "none" else [{"Q": "", "A": "", "S": [""]}])
+        df["qa_pairs"] = df["qa_pairs"].apply(parse_qa_pairs)
         df = df.explode("qa_pairs")
         df = pd.concat([df.drop(["qa_pairs"], axis=1), df["qa_pairs"].apply(pd.Series)], axis=1)
         df = df.rename(columns={"A": "answer", "Q": "question", "input": "context"})
@@ -53,7 +57,21 @@
     df = df[["context", "question", "answer_prefix", "answer"]]
     df["task"] = task
     df["max_new_tokens"] = max_new_tokens[task]
+    return df
+
+
+def push_task_dataset(task: str) -> None:
+    df = build_task_dataframe(task)
 
     # Push to hub
     dataset = Dataset.from_pandas(df)
     dataset.push_to_hub("simonjegou/loogle", config_name=task, split="test")
+
+
+def main() -> None:
+    for task in TASKS:
+        push_task_dataset(task)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluation/benchmarks/loogle/parsing.py b/evaluation/benchmarks/loogle/parsing.py
new file mode 100644
index 00000000..199259f1
--- /dev/null
+++ b/evaluation/benchmarks/loogle/parsing.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import ast
+import json
+from typing import Any
+
+
+def parse_loogle_literal(value: str) -> Any:
+    """Parse JSON first, then fall back to Python literals for model outputs."""
+    try:
+        return json.loads(value)
+    except json.JSONDecodeError:
+        return ast.literal_eval(value)
+
+
+def parse_cloze_answers(value: str) -> dict[str, str]:
+    parsed = parse_loogle_literal(value)
+    if not isinstance(parsed, dict):
+        raise ValueError("Expected a dictionary of cloze answers.")
+    if not all(isinstance(key, str) and isinstance(answer, str) for key, answer in parsed.items()):
+        raise ValueError("Expected string keys and values in cloze answers.")
+    return parsed
+
+
+def get_exact_match(reference: str, hypothesis: str) -> tuple[int, int]:
+    try:
+        reference_answers = parse_cloze_answers(reference)
+        count = len(reference_answers)
+        hypothesis_answers = parse_cloze_answers(hypothesis)
+    except (SyntaxError, TypeError, ValueError):
+        return 0, 1
+
+    exact_score_count = 0
+    for key, value in reference_answers.items():
+        if hypothesis_answers.get(key) == value:
+            exact_score_count += 1
+    return exact_score_count, count
+
+
+def get_partial_match(reference: str, hypothesis: str) -> tuple[int, int]:
+    reference_answers = parse_cloze_answers(reference)
+    count = len(reference_answers)
+    try:
+        hypothesis_answers = parse_cloze_answers(hypothesis)
+        partial_score_count = 0
+        for key in reference_answers:
+            if key in hypothesis_answers:
+                true_set = set(reference_answers[key].split())
+                pred_set = set(hypothesis_answers[key].split())
+                if len(true_set.intersection(pred_set)) > 0:
+                    partial_score_count += 1
+        return partial_score_count, count
+    except (SyntaxError, TypeError, ValueError):
+        return 0, count
+
+
+def parse_qa_pairs(value: str) -> list[dict[str, Any]]:
+    if value == "none":
+        return [{"Q": "", "A": "", "S": [""]}]
+
+    parsed = ast.literal_eval(value)
+    if not isinstance(parsed, list):
+        raise ValueError("Expected qa_pairs to be a list.")
+    if not all(isinstance(item, dict) for item in parsed):
+        raise ValueError("Expected qa_pairs to contain dictionaries.")
+    return parsed
diff --git a/tests/test_loogle_parsing.py b/tests/test_loogle_parsing.py
new file mode 100644
index 00000000..b8585570
--- /dev/null
+++ b/tests/test_loogle_parsing.py
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import os
+
+from evaluation.benchmarks.loogle.parsing import get_exact_match, get_partial_match, parse_qa_pairs
+
+
+def test_get_exact_match_accepts_python_literal_predictions():
+    reference = '{"<mask-0>": "Bob"}'
+    hypothesis = "{'<mask-0>': 'Bob'}"
+
+    assert get_exact_match(reference, hypothesis) == (1, 1)
+
+
+def test_get_exact_match_returns_zero_if_prediction_is_not_a_literal(monkeypatch):
+    monkeypatch.delenv("LOOGLE_EVAL_EXECUTED", raising=False)
+    reference = '{"<mask-0>": "Bob"}'
+    hypothesis = (
+        "(__import__('os').environ.__setitem__('LOOGLE_EVAL_EXECUTED', '1'), {'<mask-0>': 'Bob'})[1]"
+    )
+
+    assert get_exact_match(reference, hypothesis) == (0, 1)
+    assert os.environ.get("LOOGLE_EVAL_EXECUTED") is None
+
+
+def test_get_partial_match_accepts_python_literal_predictions():
+    reference = '{"<mask-0>": "Bob Smith"}'
+    hypothesis = "{'<mask-0>': 'Smith'}"
+
+    assert get_partial_match(reference, hypothesis) == (1, 1)
+
+
+def test_parse_qa_pairs_returns_default_if_none():
+    assert parse_qa_pairs("none") == [{"Q": "", "A": "", "S": [""]}]
+
+
+def test_parse_qa_pairs_accepts_python_literal_lists():
+    value = "[{'Q': 'q', 'A': 'a', 'S': ['s']}]"
+
+    assert parse_qa_pairs(value) == [{"Q": "q", "A": "a", "S": ["s"]}]
+
+
+def test_parse_qa_pairs_rejects_non_literal_payload(monkeypatch):
+    monkeypatch.delenv("LOOGLE_QA_PAIRS_EVAL_EXECUTED", raising=False)
+    payload = (
+        "(__import__('os').environ.__setitem__('LOOGLE_QA_PAIRS_EVAL_EXECUTED', '1'), "
+        "[{'Q': 'q', 'A': 'a', 'S': ['s']}])[1]"
+    )
+
+    try:
+        parse_qa_pairs(payload)
+    except (SyntaxError, ValueError):
+        pass
+    else:
+        raise AssertionError("parse_qa_pairs should reject non-literal payloads.")
+
+    assert os.environ.get("LOOGLE_QA_PAIRS_EVAL_EXECUTED") is None