diff --git a/scripts/validity_check.py b/scripts/validity_check.py
new file mode 100644
index 00000000000..de64847372d
--- /dev/null
+++ b/scripts/validity_check.py
@@ -0,0 +1,346 @@
+# flake8: noqa
+# type: ignore
+# fmt: off
+
+import io
+import pickle
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+import torch
+from huggingface_hub import HfApi, hf_hub_download, login
+from pyrelimri.tetrachoric_correlation import tetrachoric_corr
+from torch.distributions import Bernoulli
+from tqdm import tqdm
+
+
+def trainer(
+    parameters: List[torch.Tensor],
+    optim: torch.optim.Optimizer,
+    closure: Callable[[], torch.Tensor],
+) -> List[torch.Tensor]:
+    """Run up to 100 optimizer steps, stopping early once the loss, the
+    parameters, and the gradient norm have all stopped changing."""
+    pbar = tqdm(range(100))
+    loss: torch.Tensor
+    for iteration in pbar:
+        if iteration > 0:
+            previous_parameters = [p.clone() for p in parameters]
+            previous_loss = loss.clone()
+        loss = optim.step(closure)
+        if iteration > 0:
+            d_loss = (previous_loss - loss).item()
+            d_parameters = sum(
+                torch.norm(prev - curr, p=2).item()
+                for prev, curr in zip(previous_parameters, parameters)
+            )
+            grad_norm = sum(
+                torch.norm(p.grad, p=2).item() for p in parameters if p.grad is not None
+            )
+            pbar.set_postfix(
+                {"grad_norm": grad_norm, "d_parameter": d_parameters, "d_loss": d_loss}
+            )
+            if d_loss < 1e-5 and d_parameters < 1e-5 and grad_norm < 1e-5:
+                break
+    return parameters
+
+
+# The following function is adapted (with a fix to the zero-variance handling,
+# noted inline) from:
+# https://github.com/hardy-education/pymokken/blob/main/scalability_coefs.py
+# under the MIT license:
+
+# Copyright (c) 2025, Michael Hardy. All rights reserved.
+
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the
+# “Software”), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
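+# Background: Loevinger's H for an item pair is the ratio of the observed
+# covariance to the maximum covariance attainable given the items' marginal
+# distributions (obtained below by covarying the independently sorted columns):
+#   H_ij = cov(X_i, X_j) / cov_max(X_i, X_j).
+# Item pairs that form a near-perfect Guttman scale have H_ij close to 1.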
+def scalability_coefs(X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, Any]:
+    """
+    Compute item-level scalability coefficients (Hi and Zi) using a simplified
+    approach that does not include standard errors or confidence intervals.
+    (Loevinger, 1948; Mokken, 1971; Molenaar and Sijtsma, 2000; Sijtsma and Molenaar, 2002)
+
+    This function computes:
+    - Hi: Item-level H coefficients (scalability of each item with rest of scale)
+    - Zi: Item-level Z-scores (standardized Hi coefficients)
+    - H: Overall scale H coefficient (scalar)
+    - Z: Overall scale Z-score (scalar)
+    - Hij: Item-pair H coefficients (matrix of shape (n_items, n_items))
+    - Zij: Item-pair Z-scores (matrix of shape (n_items, n_items))
+
+    Parameters
+    ----------
+    X : array-like of shape (n_subjects, n_items)
+        Data matrix containing item responses. Should be integer-valued.
+        Missing values are handled by listwise deletion.
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - 'Hi': Item-level H coefficients (array of length n_items)
+        - 'Zi': Item-level Z-scores (array of length n_items)
+        - 'H': Overall scale H coefficient (scalar)
+        - 'Z': Overall scale Z-score (scalar)
+        - 'Hij': Item-pair H coefficients (matrix of shape (n_items, n_items))
+        - 'Zij': Item-pair Z-scores (matrix of shape (n_items, n_items))
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> X = np.random.randint(0, 4, (100, 5))
+    >>> result = scalability_coefs(X)
+    >>> print(f"Item coefficients: {result['Hi']}")
+    >>> print(f"Overall coefficient: {result['H']:.3f}")
+    """
+    # Convert input to numpy array
+    if isinstance(X, pd.DataFrame):
+        X = X.values
+    X = np.asarray(X, dtype=float)
+
+    # Handle missing data with listwise deletion
+    if np.any(np.isnan(X)):
+        complete_cases = ~np.any(np.isnan(X), axis=1)
+        X = X[complete_cases]
+        if X.shape[0] < 5:
+            raise ValueError(
+                "Insufficient complete cases after removing missing data")
+
+    # Convert to integers
+    X = X.astype(int)
+
+    # Validate input
+    if X.ndim != 2:
+        raise ValueError("X must be a 2D array")
+    if X.shape[1] < 2:
+        raise ValueError("X must have at least 2 items")
+    if X.shape[0] < 5:
+        raise ValueError("X must have at least 5 subjects")
+
+    # Drop zero-variance items (columns). Fixed relative to the upstream
+    # version, which built an item mask but indexed rows with it (and crashed
+    # on np.any(..., axis=1) applied to a 1-D array).
+    zero_variance = np.var(X, axis=0) == 0
+    if np.any(zero_variance):
+        X = X[:, ~zero_variance]
+        if X.shape[1] < 2:
+            raise ValueError(
+                "Insufficient items after removing zero-variance items")
+
+    n_subjects, n_items = X.shape
+
+    # Compute H scaling (Loevinger, 1948; Mokken, 1971) using the simple method
+    # Compute covariance matrices
+    S = np.cov(X, rowvar=False)  # Item covariance matrix
+    X_sorted = np.sort(X, axis=0)  # Sort each item independently
+    Smax = np.cov(X_sorted, rowvar=False)  # Maximum possible covariance
+
+    # Off-diagonal covariance entries for the item-pair and item-level coefficients
+    S_offdiag = S.copy()
+    Smax_offdiag = Smax.copy()
+    np.fill_diagonal(S_offdiag, 0)
+    np.fill_diagonal(Smax_offdiag, 0)
+
+    # Compute Hij matrix (item-pair coefficients); safe elementwise division
+    # that leaves zeros wherever Smax is zero
+    Hij = np.divide(
+        S_offdiag,
+        Smax_offdiag,
+        out=np.zeros_like(S_offdiag),
+        where=Smax_offdiag != 0)
+    # Compute Hi coefficients (item-level)
+    Hi = np.sum(Hij, axis=1)
+
+    # Compute overall H coefficient
+    H = np.sum(S_offdiag) / np.sum(Smax_offdiag)
+
+    # Compute Z-standardized scaling using the simple method
+    # (Mokken, 1971; Molenaar and Sijtsma, 2000; Sijtsma and Molenaar, 2002).
+    # Only appropriate for testing lower bound = 0.
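+    # For an item pair (i, j) the statistic computed below reduces to
+    #   Z_ij = S_ij * sqrt(n_subjects - 1) / sqrt(var_i * var_j),
+    # i.e. the observed covariance standardized by the product of the item
+    # variances, so Z_ij > 0 is evidence that H_ij > 0.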
+    # Item variances, unweighted and unbiased
+    var_vec = np.var(X, axis=0, ddof=1)
+    Sij = np.outer(var_vec, var_vec)  # Outer product of variances
+
+    # Item-pair Z-standardized scaling coefficients
+    Zij = np.divide(S * np.sqrt(n_subjects - 1), np.sqrt(Sij),
+                    out=np.zeros_like(S_offdiag), where=Sij != 0)
+    np.fill_diagonal(Zij, 0)  # Zero diagonal
+
+    # Item-level Z-standardized scaling
+    Sij_for_z = Sij.copy()
+    np.fill_diagonal(Sij_for_z, 0)
+
+    Zi = np.divide(
+        np.sum(S_offdiag, axis=1) * np.sqrt(n_subjects - 1),
+        np.sqrt(np.sum(Sij_for_z, axis=1)),
+        out=np.zeros(n_items),
+        where=np.sum(Sij_for_z, axis=1) != 0,
+    )
+
+    # Overall Z-standardized scaling (sums are halved because the symmetric
+    # matrices count each item pair twice)
+    sum_S = np.sum(S_offdiag) / 2.0
+    sum_Sij = np.sum(Sij_for_z) / 2.0
+    Z = (sum_S * np.sqrt(n_subjects - 1)) / np.sqrt(sum_Sij) if sum_Sij != 0 else 0.0
+
+    return {"Hi": Hi, "Zi": Zi, "H": H, "Z": Z, "Hij": Hij, "Zij": Zij}
+
+
+def raw_item_total_correlations(X: np.ndarray) -> List[float]:
+    """Pearson correlation of each item with the (uncorrected) total score."""
+    total = X.sum(axis=1)
+    Xc = X - X.mean(axis=0)
+    Tc = total - total.mean()
+    numer = (Xc * Tc[:, None]).sum(axis=0)
+    denom = np.sqrt((Xc**2).sum(axis=0) * (Tc**2).sum())
+    raw_r = numer / denom
+    return raw_r.tolist()
+
+
+if __name__ == "__main__":
+    benchmark: str = "lite"
+    scenario: str = "gsm"
+
+    # load information from the long table
+    long_path: str = hf_hub_download(
+        repo_id="stair-lab/reeval_data_public",
+        repo_type="dataset",
+        filename="long.pkl")
+    with open(long_path, "rb") as f:
+        long: Any = pickle.load(f)
+    sub_long: pd.DataFrame = long[
+        (long["benchmark"] == benchmark) & (long["scenario"] == scenario)
+    ].copy()
+    sub_long = sub_long.drop_duplicates(
+        subset=["instance_id", "train_trial_index", "perturbation.name"]
+    ).reset_index(drop=True)
+    sub_long = sub_long[
+        ["instance_id", "train_trial_index", "perturbation.name", "input.text"]
+    ]
+
+    # load the response matrix (resmat)
+    resmat_path: str = hf_hub_download(
+        repo_id="stair-lab/reeval_data_public",
+        repo_type="dataset",
+        filename="resmat.pkl")
+    with open(resmat_path, "rb") as f:
+        resmat: pd.DataFrame = pickle.load(f)
+    sub_mask: pd.Series = (
+        resmat.columns.get_level_values("benchmark") == benchmark
+    ) & (resmat.columns.get_level_values("scenario") == scenario)
+    sub_resmat: pd.DataFrame = resmat.loc[:, sub_mask]
+    sub_resmat = sub_resmat.dropna(axis=0, how="all")
+    questions: pd.Index = sub_resmat.columns.get_level_values("input.text")
+    data: np.ndarray = sub_resmat.values
+    n_test_takers: int
+    n_questions: int
+    n_test_takers, n_questions = data.shape
+
+    # 1. tetrachoric correlation
+    print("1. tetrachoric correlation")
+    corr_matrix = np.zeros((n_questions, n_questions))
+    for i in tqdm(range(n_questions)):
+        for j in range(i, n_questions):  # fill the symmetric matrix
+            r = tetrachoric_corr(data[:, i], data[:, j])
+            corr_matrix[i, j] = corr_matrix[j, i] = r
+    tetrachoric = np.nanmean(corr_matrix, axis=1)  # nan-aware per-item average
+
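+    # The discrimination parameters below come from a two-parameter logistic
+    # (2PL) IRT model, P(correct) = sigmoid(a_j * (theta_i + z_j)), where
+    # theta_i is test-taker ability, z_j is item easiness, and a_j is item
+    # discrimination. The loss averages the Bernoulli log-likelihood over 150
+    # fixed standard-normal ability draws, with a small ridge penalty pulling
+    # the discriminations toward 1.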
2PL IRT discriminant") + device: str = "cuda" if torch.cuda.is_available() else "cpu" + data_tensor: torch.Tensor = torch.tensor(data, device=device) + z: torch.Tensor = torch.zeros( + n_questions, + requires_grad=True, + device=device) + a: torch.Tensor = torch.ones(n_questions, requires_grad=True, device=device) + optim: torch.optim.Optimizer = torch.optim.LBFGS( + [z, a], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe" + ) + thetas: torch.Tensor = torch.randn(150, n_test_takers, device=device) + + def closure(): + optim.zero_grad() + probs = torch.sigmoid( + (thetas[:, :, None] + z[None, None, :]) * a[None, None, :]) + loss = -(Bernoulli(probs=probs).log_prob(data_tensor) + ).mean() + 0.01 * (a - 1).pow(2).mean() + loss.backward() + return loss + + z, a = trainer([z, a], optim, closure) + a = a.detach().cpu().numpy() + + # 3. scalability coefficients + print("3. scalability coefficients") + scalability_coeff_results: Dict[str, Any] = scalability_coefs(data) + scalability_coeff: np.ndarray = scalability_coeff_results["Zij"].mean(0) + + # 4. item-total correlation + print("4. item-total correlation") + item_total_corr: List[float] = raw_item_total_correlations(data) + + # merge the two data + validity_metrics: pd.DataFrame = pd.DataFrame( + { + "input.text": questions, + "tetrachoric": tetrachoric, + "2pl_irt_discriminant": a, + "scalability_coeff": scalability_coeff, + "item_total_corr": item_total_corr, + } + ) + merged: pd.DataFrame = validity_metrics.merge( + sub_long, on="input.text", how="inner") + merged = merged.where(merged.notna(), None) + + # create dict, upload to HF + validity_dict: Dict[Tuple[str, Optional[str], int], Dict[str, float]] = { + (row["instance_id"], row["perturbation.name"], row["train_trial_index"]): { + "tetrachoric": row["tetrachoric"], + "2pl_irt_discriminant": row["2pl_irt_discriminant"], + "scalability_coeff": row["scalability_coeff"], + "item_total_corr": row["item_total_corr"], + } + for _, row in merged.iterrows() + } + cleaned_validity_dict: Dict[Tuple[str, Optional[str], int], Dict[str, float]] = { + (inst_id, None if pd.isna(perturb) else perturb, trial_idx): valid + for (inst_id, perturb, trial_idx), valid in validity_dict.items() + } + validity_df = ( + pd.DataFrame.from_dict(cleaned_validity_dict, orient="index") + .rename_axis(index=["instance_id", "perturbation", "train_trial_index"]) + .reset_index() + ) + buffer: io.BytesIO = io.BytesIO() + validity_df.to_parquet(buffer, index=False) + buffer.seek(0) + + login() + api: HfApi = HfApi() + api.upload_file( + path_or_fileobj=buffer, + path_in_repo="validity.parquet", + repo_id="stair-lab/helm_display_validity", + repo_type="dataset", + ) diff --git a/src/helm/benchmark/presentation/run_display.py b/src/helm/benchmark/presentation/run_display.py index e7a43290a11..f9755b2082e 100644 --- a/src/helm/benchmark/presentation/run_display.py +++ b/src/helm/benchmark/presentation/run_display.py @@ -1,8 +1,11 @@ from collections import OrderedDict, defaultdict from dataclasses import dataclass import os +import math from typing import Dict, Iterable, List, Optional, Set, Tuple, Any +import pandas as pd + from helm.benchmark.adaptation.adapter_spec import ( ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, @@ -21,6 +24,7 @@ from helm.common.images_utils import encode_base64 from helm.common.request import Request from helm.common.codec import from_json, to_json +from huggingface_hub import hf_hub_download @dataclass(frozen=True) @@ -154,7 +158,9 @@ def 
diff --git a/src/helm/benchmark/presentation/run_display.py b/src/helm/benchmark/presentation/run_display.py
index e7a43290a11..f9755b2082e 100644
--- a/src/helm/benchmark/presentation/run_display.py
+++ b/src/helm/benchmark/presentation/run_display.py
@@ -1,8 +1,11 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
+import math
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
+import pandas as pd
+
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
@@ -21,6 +24,7 @@
 from helm.common.images_utils import encode_base64
 from helm.common.request import Request
 from helm.common.codec import from_json, to_json
+from huggingface_hub import hf_hub_download
 
 
 @dataclass(frozen=True)
@@ -154,7 +158,9 @@ def _get_metric_names_for_groups(run_group_names: Iterable[str], schema: Schema)
 
 
 @htrack(None)
-def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, skip_completed: bool) -> None:
+def write_run_display_json(
+    run_path: str, run_spec: RunSpec, schema: Schema, skip_completed: bool, validity_check: bool = False
+) -> None:
     """Write run JSON files that are used by the web frontend.
 
     The derived JSON files that are used by the web frontend are much more compact than
@@ -229,6 +235,21 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
     predictions: List[DisplayPrediction] = []
     requests: List[DisplayRequest] = []
 
+    if validity_check:
+        validity_path: str = hf_hub_download(
+            repo_id="stair-lab/helm_display_validity", repo_type="dataset", filename="validity.parquet"
+        )
+        validity_df: pd.DataFrame = pd.read_parquet(validity_path)
+        # The table stores perturbation *names* (strings), and the column name
+        # "2pl_irt_discriminant" is not a valid Python identifier (itertuples()
+        # would rename it positionally), so the rows are read with iterrows().
+        validity_dict: Dict[Tuple[str, Optional[str], int], Dict[str, float]] = {
+            (row["instance_id"], row["perturbation"], row["train_trial_index"]): {
+                "tetrachoric": row["tetrachoric"],
+                "2pl_irt_discriminant": row["2pl_irt_discriminant"],
+                "scalability_coeff": row["scalability_coeff"],
+                "item_total_corr": row["item_total_corr"],
+            }
+            for _, row in validity_df.iterrows()
+        }
+
 for request_state in scenario_state.request_states:
     assert request_state.instance.id is not None
     if request_state.result is None:
@@ -248,6 +269,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.train_trial_index,
         )
         trial_stats: Dict[str, float] = stats_by_trial[stats_key]
+
+        if validity_check:
+            # Look up by perturbation *name*, since the parquet table stores
+            # names rather than PerturbationDescription objects
+            perturbation = request_state.instance.perturbation
+            validity_key = (
+                request_state.instance.id,
+                perturbation.name if perturbation is not None else None,
+                request_state.train_trial_index,
+            )
+            validity_metrics: Optional[Dict[str, float]] = validity_dict.get(validity_key)
+
+            def get_metric(name: str) -> float:
+                return validity_metrics.get(name, math.nan) if validity_metrics is not None else math.nan
+
+            trial_stats["tetrachoric"] = get_metric("tetrachoric")
+            trial_stats["2pl_irt_discriminant"] = get_metric("2pl_irt_discriminant")
+            trial_stats["scalability_coeff"] = get_metric("scalability_coeff")
+            trial_stats["item_total_corr"] = get_metric("item_total_corr")
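+
+            # Instances missing from the validity table (e.g. those from other
+            # benchmarks or scenarios) surface as NaN rather than being dropped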
 
         # For the multiple_choice_separate_* adapter methods,
         # only keep the prediction for the chosen reference and discard the rest.
         if (
diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 47961a4a83b..f2a5178bb81 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -347,6 +347,7 @@ def __init__(
         verbose: bool,
         num_threads: int,
         allow_unknown_models: bool,
+        validity_check: bool,
     ):
         """
         A note on the relation between `release`, `suites`, and `suite`:
@@ -377,6 +378,7 @@ def __init__(
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
+        self.validity_check: bool = validity_check
 
         ensure_directory_exists(self.run_release_path)
 
@@ -1194,7 +1196,7 @@ def write_groups(self):
 
     def write_run_display_json(self, skip_completed: bool) -> None:
         def process(run: Run) -> None:
-            write_run_display_json(run.run_path, run.run_spec, self.schema, skip_completed)
+            write_run_display_json(run.run_path, run.run_spec, self.schema, skip_completed, self.validity_check)
 
         parallel_map(process, self.runs, parallelism=self.num_threads)
 
@@ -1270,6 +1272,7 @@ def summarize(args):
         verbose=args.debug,
         num_threads=args.num_threads,
         allow_unknown_models=args.allow_unknown_models,
+        validity_check=args.validity_check,
     )
     summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
     hlog("Done.")
@@ -1340,6 +1343,12 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
+    parser.add_argument(
+        "--validity-check",
+        action="store_true",
+        default=False,
+        help="EXPERIMENTAL: Load the validity signals as extra metrics to display.",
+    )
     args = parser.parse_args()
     setup_default_logging()
     summarize(args)
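Note on the `--validity-check` flag: it is declared with `action="store_true"` rather than `type=bool`, because argparse applies `type` to the raw string and `bool("False")` is `True`, so a boolean-typed option silently treats any non-empty value as enabled. A minimal standalone sketch of the difference (the `--validity-check-bool` option is hypothetical, for illustration only):

```python
import argparse

parser = argparse.ArgumentParser()
# Pitfall: argparse calls bool() on the raw string, and bool("False") is True.
parser.add_argument("--validity-check-bool", type=bool, default=False)
# Fix: a zero-argument flag that is False unless explicitly passed.
parser.add_argument("--validity-check", action="store_true")

args = parser.parse_args(["--validity-check-bool", "False"])
print(args.validity_check_bool)  # True -- not what the caller intended

args = parser.parse_args(["--validity-check"])
print(args.validity_check)  # True; omitting the flag yields False
```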