diff --git a/examples/evaluate_bluebench.sh b/examples/evaluate_bluebench.sh
index 181d6068a0..d05d4ccadd 100644
--- a/examples/evaluate_bluebench.sh
+++ b/examples/evaluate_bluebench.sh
@@ -28,4 +28,4 @@ unitxt-evaluate \
     --batch_size 8 \
     --verbosity ERROR
 
-unitxt-summarize ./results/bluebench
+unitxt-summarize --folder ./results/bluebench
diff --git a/src/unitxt/evaluate_cli.py b/src/unitxt/evaluate_cli.py
index 2e416a3bd6..b16b7b01f3 100644
--- a/src/unitxt/evaluate_cli.py
+++ b/src/unitxt/evaluate_cli.py
@@ -7,7 +7,7 @@
 import platform
 import subprocess
 import sys
-from datetime import datetime
+from datetime import datetime, timezone
 from functools import partial
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -691,9 +691,8 @@ def _save_results_to_disk(
         "results": global_scores,
     }
 
-    # prepend to the results_path name the time in a wat like this: 2025-04-04T11:37:32
-
-    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+    # prepend the timestamp in UTC (e.g., 2025-01-18T11-37-32) to the file names
+    timestamp = datetime.now().astimezone(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
 
     results_path = prepend_timestamp_to_path(results_path, timestamp)
     samples_path = prepend_timestamp_to_path(samples_path, timestamp)
@@ -836,48 +835,129 @@ def main():
     logger.info("Unitxt Evaluation CLI finished successfully.")
 
 
-def extract_scores(directory):  # pragma: no cover
+def extract_scores(folder: str, subset: str, group: str):  # pragma: no cover
     import pandas as pd
 
-    data = []
+    def safe_score(d: dict, key="score"):
+        na = "N/A"
+        return d.get(key, na) if isinstance(d, dict) else na
 
-    for filename in sorted(os.listdir(directory)):
-        if filename.endswith("evaluation_results.json"):
-            file_path = os.path.join(directory, filename)
-            try:
-                with open(file_path, encoding="utf-8") as f:
-                    content = json.load(f)
+    def extract_subset(results: dict, subset: str, group: str):
+        subset_results = results.get(subset, {})
+        row = {subset: safe_score(subset_results)}
+
+        groups = subset_results.get("groups", {})
+
+        if not groups:
+            return row
+
+        group_results = groups.get(group) if group else next(iter(groups.values()), {})
 
-                env_info = content.get("environment_info", {})
-                timestamp = env_info.get("timestamp_utc", "N/A")
-                model = env_info.get("parsed_arguments", {}).get("model", "N/A")
-                results = content.get("results", {})
+        if not isinstance(group_results, dict):
+            return row
 
-                row = {}
-                row["Model"] = model
-                row["Timestamp"] = timestamp
-                row["Average"] = results.get("score", "N/A")
+        row.update(
+            {k: safe_score(v) for k, v in group_results.items() if isinstance(v, dict)}
+        )
+        return row
+
+    def extract_all(results: dict):
+        row = {"Average": safe_score(results)}
+        row.update(
+            {k: safe_score(v) for k, v in results.items() if isinstance(v, dict)}
+        )
+        return row
+
+    data = []
 
-                for key in results.keys():
-                    if isinstance(results[key], dict):
-                        score = results[key].get("score", "N/A")
-                        row[key] = score
+    for filename in sorted(os.listdir(folder)):
+        if not filename.endswith("evaluation_results.json"):
+            continue
 
-                data.append(row)
-            except Exception as e:
-                logger.error(f"Error parsing results file {filename}: {e}.")
+        file_path = os.path.join(folder, filename)
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                content = json.load(f)
+
+            env_info = content.get("environment_info", {})
+            row = {
+                "Model": safe_score(env_info.get("parsed_arguments", {}), "model"),
+                "Timestamp": safe_score(env_info, "timestamp_utc"),
+            }
+
+            results = content.get("results", {})
+
+            extra = (
+                extract_subset(results, subset, group)
+                if subset
+                else extract_all(results)
+            )
+            row.update(extra)
+            data.append(row)
+        except Exception as e:
+            logger.error(f"Error parsing results file {filename}: {e}.")
 
     return pd.DataFrame(data).sort_values(by="Timestamp", ascending=True)
 
 
+def setup_summarization_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter,
+        description="CLI utility for summarizing evaluation results.",
+    )
+
+    parser.add_argument(
+        "--folder",
+        "-f",
+        dest="folder",
+        type=str,
+        default=".",
+        help="Directory containing evaluation results json files. Default: current folder.\n",
+    )
+
+    parser.add_argument(
+        "--subset",
+        "-s",
+        type=str,
+        dest="subset",
+        default=None,
+        help="Subset to filter results by. Default: none.",
+    )
+
+    parser.add_argument(
+        "--group",
+        "-g",
+        type=str,
+        dest="group",
+        default=None,
+        help="Group to filter results to. Requires specifying a subset. Default: first group.",
+    )
+
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        choices=["markdown", "csv"],
+        dest="output",
+        default="markdown",
+        help="Output format. Can be markdown or csv. Default: markdown",
+    )
+
+    return parser
+
+
 def summarize_cli():
-    if len(sys.argv) != 2:
-        logger.error("Usage: python summarize_cli_results.py ")
-        sys.exit(1)
-    directory = sys.argv[1]
-    df = extract_scores(directory)
+    parser = setup_summarization_parser()
+    args = parser.parse_args()
 
-    logger.info(df.to_markdown(index=False))
+    df = extract_scores(args.folder, args.subset, args.group)
+
+    if args.output == "markdown":
+        logger.info(df.to_markdown(index=False))
+    elif args.output == "csv":
+        logger.info(df.to_csv(index=False))
+    else:
+        logger.error(f"Unsupported output format: {args.output}")
 
 
 if __name__ == "__main__":
diff --git a/tests/library/test_cli.py b/tests/library/test_cli.py
index 75c2a86378..dc5e922d9b 100644
--- a/tests/library/test_cli.py
+++ b/tests/library/test_cli.py
@@ -751,8 +751,12 @@ def test_save_results_to_disk_summary_only(
         # --- Arrange ---
         # (Arrange section remains the same as previous version)
         mock_timestamp = "2025-04-14T10:00:00"
+        mock_timestamp_utc = "2025-04-14T08:00:00"
         mock_now = MagicMock()
         mock_now.strftime.return_value = mock_timestamp
+        mock_astimezone = MagicMock()
+        mock_astimezone.strftime.return_value = mock_timestamp_utc
+        mock_now.astimezone.return_value = mock_astimezone
         mock_datetime.now.return_value = mock_now
         mock_utcnow = MagicMock()
         mock_utcnow.isoformat.return_value = "2025-04-14T08:00:00"
@@ -784,7 +788,9 @@
         }
         base_results_path = "/out/results_prefix.json"
         base_samples_path = "/out/results_prefix_samples.json"
-        expected_timestamped_results_path = f"/out/{mock_timestamp}_results_prefix.json"
+        expected_timestamped_results_path = (
+            f"/out/{mock_timestamp_utc}_results_prefix.json"
+        )
 
         # --- Act ---
         cli._save_results_to_disk(
@@ -844,7 +850,7 @@
         )
         log_calls = [call[0][0] for call in mock_logger.info.call_args_list]
         expected_timestamped_samples_path = (
-            f"/out/{mock_timestamp}_results_prefix_samples.json"
+            f"/out/{mock_timestamp_utc}_results_prefix_samples.json"
         )
         self.assertNotIn(
             f"Saving detailed samples to: {expected_timestamped_samples_path}",
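
For illustration, a minimal usage sketch of the extended summarize CLI introduced above, assuming an evaluation run has already written *evaluation_results.json files under ./results/bluebench; the subset and group names are placeholders, not values taken from this change:

    # Summarize every results file in the folder as a markdown table (the default output).
    unitxt-summarize --folder ./results/bluebench

    # Restrict the summary to one subset and group, and emit CSV instead.
    unitxt-summarize --folder ./results/bluebench --subset <subset_name> --group <group_name> --output csv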