diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b67f6fc7b..9a2c16247 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -56,7 +56,7 @@ jobs: - name: Run benchmark if: steps.changed-files.outputs.any_changed == 'true' || github.event_name == 'workflow_dispatch' run: | - tests/bench-aggregator.py + python3 tests/bench.py --json --quiet - name: Store benchmark results if: steps.changed-files.outputs.any_changed == 'true' || github.event_name == 'workflow_dispatch' diff --git a/src/main.c b/src/main.c index 1dcf97c85..509bd548b 100644 --- a/src/main.c +++ b/src/main.c @@ -292,7 +292,7 @@ int main(int argc, char **args) .args_offset_size = ARGS_OFFSET_SIZE, .argc = prog_argc, .argv = prog_args, - .log_level = LOG_TRACE, + .log_level = LOG_WARN, .run_flag = run_flag, .profile_output_file = prof_out_file, .cycle_per_step = CYCLE_PER_STEP, diff --git a/tests/bench-aggregator.py b/tests/bench-aggregator.py deleted file mode 100755 index c82426711..000000000 --- a/tests/bench-aggregator.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 - -import json -import subprocess - - -def run_benchmark(b): - interp = None - if "sh" in b: - interp = "bash" - elif "py" in b: - interp = "python3" - - subprocess.run(args=[interp, b], shell=False, check=True) - - -def load_benchmark(file): - f = open(file, "r") - return json.load(f) - - -# run benchmarks -benchmarks = ["tests/dhrystone.sh", "tests/coremark.py"] -for b in benchmarks: - run_benchmark(b) - -# combine benchmarks output data -benchmarks_output = ["dhrystone_output.json", "coremark_output.json"] -benchmark_data = [load_benchmark(bo) for bo in benchmarks_output] - -benchmark_output = "benchmark_output.json" -f = open(benchmark_output, "w") -f.write(json.dumps(benchmark_data, indent=4)) -f.close() diff --git a/tests/bench.py b/tests/bench.py new file mode 100755 index 000000000..c089440e7 --- /dev/null +++ b/tests/bench.py @@ -0,0 +1,601 @@ +#!/usr/bin/env python3 +""" +Unified benchmark runner for rv32emu. + +Benchmarks are registered via the @register_benchmark decorator. +Supports parallel execution while preserving user-specified output order. 
+""" + +import subprocess +import re +import statistics +import os +import sys +import json +import argparse +import threading +import time +from abc import ABC, abstractmethod +from concurrent.futures import ThreadPoolExecutor, as_completed +from subprocess import TimeoutExpired +from typing import ClassVar, Dict, List, Optional, Tuple, Type + +# Configuration +EMU_PATH = "build/rv32emu" +DEFAULT_RUNS = 5 # Balance, providing reasonable statistics +TIMEOUT_SECONDS = 600 # 10 min timeout per run (safety limit) +SLOW_THRESHOLD_SECONDS = 300 # If single run > 5 min, use only 1 run +MAX_BENCHMARK_SECONDS = 600 # 10 min max total time per benchmark + +# Benchmark registry +_BENCHMARK_REGISTRY: Dict[str, Type["Benchmark"]] = {} + + +class ProgressIndicator: + """Thread-safe progress indicator with spinner animation.""" + + SPINNER = ["◐", "◓", "◑", "◒"] # Rotating circle animation + + def __init__(self, benchmarks: List[str], n_runs: int, quiet: bool = False): + self.benchmarks = benchmarks + self.n_runs = n_runs + # Disable indicator if not a TTY to avoid log clutter + self.quiet = quiet or not sys.stdout.isatty() + self.lock = threading.Lock() + # Track status: {bench_name: status} + self.status: Dict[str, str] = {name: "pending" for name in benchmarks} + self.start_time = time.monotonic() + self.last_render = 0.0 + self._stop_event = threading.Event() + self._spinner_thread: Optional[threading.Thread] = None + + def start(self) -> None: + """Start the background spinner thread.""" + if self.quiet: + return + # Reserve terminal space to avoid overwriting history + # (1 line for elapsed + 1 line per benchmark) + sys.stdout.write("\n" * (len(self.benchmarks) + 1)) + sys.stdout.flush() + self._stop_event.clear() + self._spinner_thread = threading.Thread( + target=self._spinner_loop, daemon=True + ) + self._spinner_thread.start() + + def _spinner_loop(self) -> None: + """Background loop to update spinner every 1 second.""" + while not self._stop_event.is_set(): + with self.lock: + self._render() + self._stop_event.wait(1.0) + + def update( + self, bench_name: str, run: int, status: str = "running" + ) -> None: + """Update status for a benchmark.""" + with self.lock: + self.status[bench_name] = status + + def _render(self) -> None: + """Render status for all benchmarks.""" + if self.quiet: + return + elapsed = time.monotonic() - self.start_time + spinner_idx = int(elapsed) % len(self.SPINNER) + spinner = self.SPINNER[spinner_idx] + + lines = [f"\033[2K Elapsed: {elapsed:.1f}s\n"] + + for name in self.benchmarks: + status = self.status[name] + if status == "pending": + indicator = "⏳" + state = "" + elif status == "done": + indicator = "✓" + state = "" + elif status == "failed": + indicator = "✗" + state = " (failed)" + else: # running + indicator = spinner + state = " (running)" + + lines.append(f"\033[2K {indicator} {name}{state}\n") + + # Move cursor up to overwrite + sys.stdout.write(f"\033[{len(lines)}A") + sys.stdout.write("".join(lines)) + sys.stdout.flush() + + def finish(self) -> None: + """Stop spinner and show final state (preserving failed status).""" + self._stop_event.set() + if self._spinner_thread: + self._spinner_thread.join(timeout=1.0) + if self.quiet: + return + with self.lock: + # Only mark pending/running as done, preserve failed status + for name in self.benchmarks: + if self.status[name] not in ("done", "failed"): + self.status[name] = "done" + self._render() + # Move past display + print("\n" * (len(self.benchmarks) + 1)) + + +def register_benchmark(name: str): + 
"""Decorator to register a benchmark class.""" + + def decorator(cls: Type["Benchmark"]) -> Type["Benchmark"]: + _BENCHMARK_REGISTRY[name.lower()] = cls + return cls + + return decorator + + +def get_registered_benchmarks() -> Dict[str, Type["Benchmark"]]: + """Return all registered benchmarks.""" + return _BENCHMARK_REGISTRY.copy() + + +class Benchmark(ABC): + """Abstract base class for all benchmarks.""" + + name: ClassVar[str] + unit: ClassVar[str] + BIN_PATH: ClassVar[str] + + def __init__( + self, n_runs: int, progress: Optional[ProgressIndicator] = None + ): + self.n_runs = n_runs + self.progress = progress + self.logs: List[str] = [] + + def log(self, msg: str) -> None: + """Buffer log messages to avoid interleaving in parallel mode.""" + self.logs.append(msg) + + def get_logs(self) -> str: + """Return buffered logs as a single string.""" + return "\n".join(self.logs) + + @classmethod + def prepare(cls) -> None: + """Ensure dependencies are built. Run BEFORE parallel execution.""" + if hasattr(cls, "BIN_PATH") and not os.path.exists(cls.BIN_PATH): + print(f"Building {cls.name}...") + result = subprocess.run( + ["make", "artifact"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError( + f"Failed to build {cls.name}\n" + f"stdout: {result.stdout[:500]}\nstderr: {result.stderr[:500]}" + ) + if not os.path.exists(cls.BIN_PATH): + raise RuntimeError(f"{cls.name} not found at {cls.BIN_PATH}") + + @abstractmethod + def run_single(self) -> float: + """Run a single benchmark iteration and return the result.""" + raise NotImplementedError + + def validate(self) -> float: + """Run validation before benchmark. Returns the result for reuse.""" + return self.run_single() + + def run(self) -> Tuple[float, float, List[float], int]: + """Run the full benchmark suite. Returns (mean, stdev, filtered_values, actual_runs).""" + bench_key = self.name.lower() + bench_start = time.monotonic() + + # Validation run (also serves as timing reference) + self.log(f"Validating {self.name}...") + if self.progress: + self.progress.update(bench_key, 0, "running") + run_start = time.monotonic() + first_value = self.validate() + run_elapsed = time.monotonic() - run_start + self.log(f"{self.name} validation passed ({run_elapsed:.1f}s)") + + # Adaptive run count based on single run time + actual_runs = self.n_runs + if run_elapsed > SLOW_THRESHOLD_SECONDS: + self.log( + f"Warning: {self.name} took {run_elapsed:.1f}s (>{SLOW_THRESHOLD_SECONDS}s), " + "using single run only" + ) + actual_runs = 1 + + values = [first_value] # Include validation result + for i in range(1, actual_runs): + # Check time budget before starting next run. + # Note: uses validation run time as estimate; assumes runs are similar. 
+ total_elapsed = time.monotonic() - bench_start + remaining = MAX_BENCHMARK_SECONDS - total_elapsed + if remaining < run_elapsed: + self.log( + f"Time budget: {total_elapsed:.0f}s elapsed, " + f"stopping after {len(values)} runs" + ) + break + self.log(f"Running {self.name} benchmark - Run #{i + 1}") + if self.progress: + self.progress.update(bench_key, i + 1, "running") + values.append(self.run_single()) + + if self.progress: + self.progress.update(bench_key, len(values), "done") + + avg, stdev, filtered = self.calculate_stats(values) + self.log("-" * 40) + self.log( + f"{self.name}: {avg:.3f} ± {stdev:.3f} {self.unit} " + f"({len(filtered)}/{len(values)} valid runs)" + ) + self.log("-" * 40) + + return avg, stdev, filtered, len(values) + + def calculate_stats( + self, values: List[float] + ) -> Tuple[float, float, List[float]]: + """Filter outliers using median-based 2-sigma rule. Returns (mean, stdev, filtered).""" + if not values: + return 0.0, 0.0, [] + + n = len(values) + median = statistics.median(values) + stdev_val = statistics.stdev(values) if n > 1 else 0.0 + + # Filter values within 2 standard deviations of median + filtered = [x for x in values if abs(x - median) <= 2.0 * stdev_val] + + if len(filtered) < 2: + self.log("Warning: Too many outliers filtered, using all results") + filtered = values + + final_mean = statistics.mean(filtered) + final_stdev = statistics.stdev(filtered) if len(filtered) > 1 else 0.0 + + return final_mean, final_stdev, filtered + + +@register_benchmark("dhrystone") +class DhrystoneBenchmark(Benchmark): + """Dhrystone benchmark measuring DMIPS.""" + + name = "Dhrystone" + unit = "DMIPS" + BIN_PATH = "build/riscv32/dhrystone" + + def run_single(self) -> float: + proc = subprocess.Popen( + [EMU_PATH, "-q", self.BIN_PATH], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + try: + stdout, stderr = proc.communicate(timeout=TIMEOUT_SECONDS) + except TimeoutExpired: + proc.kill() + proc.communicate() # Clean up buffers + raise RuntimeError( + f"Dhrystone timed out after {TIMEOUT_SECONDS} seconds" + ) + + if proc.returncode != 0: + raise RuntimeError( + f"Dhrystone failed (exit {proc.returncode})\n" + f"stdout: {stdout[:500]}\nstderr: {stderr[:500]}" + ) + + match = re.search(r"([0-9]+(?:\.[0-9]+)?) 
DMIPS", stdout) + if not match: + raise RuntimeError(f"Failed to parse DMIPS:\n{stdout[:500]}") + + return float(match.group(1)) + + def validate(self) -> float: + dmips = self.run_single() + if dmips <= 0: + raise RuntimeError(f"Invalid DMIPS value: {dmips}") + return dmips + + +@register_benchmark("coremark") +class CoreMarkBenchmark(Benchmark): + """CoreMark benchmark measuring iterations/sec.""" + + name = "CoreMark" + unit = "iterations/sec" + BIN_PATH = "build/riscv32/coremark" + + ITERATIONS = 30000 + + def run_single(self) -> float: + cmd = [ + EMU_PATH, + "-q", + self.BIN_PATH, + "0x0", + "0x0", + "0x66", + str(self.ITERATIONS), + "7", + "1", + "2000", + ] + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + try: + stdout, stderr = proc.communicate(timeout=TIMEOUT_SECONDS) + except TimeoutExpired: + proc.kill() + proc.communicate() # Clean up buffers + raise RuntimeError( + f"CoreMark timed out after {TIMEOUT_SECONDS} seconds" + ) + + if proc.returncode != 0: + raise RuntimeError( + f"CoreMark failed (exit {proc.returncode})\n" + f"stdout: {stdout[:500]}\nstderr: {stderr[:500]}" + ) + + match = re.search(r"Iterations/Sec\s*:\s*([0-9]+(?:\.[0-9]+)?)", stdout) + if not match: + raise RuntimeError( + f"Failed to parse Iterations/Sec:\n{stdout[:500]}" + ) + + return float(match.group(1)) + + +def run_benchmark_task( + bench_name: str, n_runs: int, progress: Optional[ProgressIndicator] = None +) -> Tuple[str, dict, List[str], Optional[Exception]]: + """Run a single benchmark. Returns (name, result, logs, error).""" + bench = None + try: + bench_cls = _BENCHMARK_REGISTRY[bench_name] + bench = bench_cls(n_runs, progress) + avg, stdev, _, actual_runs = bench.run() + result = { + "name": bench.name, + "unit": bench.unit, # Store raw unit for proper formatting + "value": round(avg, 3), + "stdev": round(stdev, 3), + "runs": actual_runs, # Actual number of runs completed + } + return bench_name, result, bench.logs, None + except Exception as e: + if progress: + progress.update(bench_name, 0, "failed") + # Preserve logs even on failure for debugging + logs = bench.logs if bench else [] + return bench_name, {}, logs, e + + +def run_benchmarks( + selected: List[str], + output_json: bool, + n_runs: int, + parallel: int = 0, + quiet: bool = False, +) -> None: + """Run selected benchmarks, optionally in parallel.""" + if not os.path.exists(EMU_PATH): + print( + f"Error: {EMU_PATH} not found. 
Please compile first", + file=sys.stderr, + ) + sys.exit(1) + + # Validate selections + registry = get_registered_benchmarks() + for name in selected: + if name not in registry: + print(f"Error: Unknown benchmark '{name}'", file=sys.stderr) + print( + f"Available: {', '.join(sorted(registry.keys()))}", + file=sys.stderr, + ) + sys.exit(1) + + # Prepare phase: build all binaries sequentially before running benchmarks + if not quiet: + print("Preparing benchmarks...") + try: + for name in selected: + registry[name].prepare() + except RuntimeError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + if not quiet: + print("Preparation complete.\n") + + # Create and start progress indicator + progress = ProgressIndicator(selected, n_runs, quiet=quiet) + progress.start() + + results: Dict[str, dict] = {} + all_logs: Dict[str, List[str]] = {} + errors: Dict[str, Exception] = {} + + start_time = time.monotonic() + + if parallel and parallel > 0 and len(selected) > 1: + workers = min(parallel, len(selected)) + if not quiet: + print( + f">>> Running {len(selected)} benchmarks in parallel ({workers} workers) <<<" + ) + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + run_benchmark_task, name, n_runs, progress + ): name + for name in selected + } + for future in as_completed(futures): + name, result, logs, error = future.result() + all_logs[name] = logs + if error: + errors[name] = error + else: + results[name] = result + + progress.finish() + + # Print logs in user-specified order after all complete (only if not quiet) + if not quiet: + for name in selected: + if name in all_logs and all_logs[name]: + print(f"\n[{name}]") + for line in all_logs[name]: + print(f" {line}") + else: + for name in selected: + name, result, logs, error = run_benchmark_task( + name, n_runs, progress + ) + all_logs[name] = logs + if error: + errors[name] = error + else: + results[name] = result + + progress.finish() + + # Print logs after spinner finishes to avoid garbled output + if not quiet: + for name in selected: + if name in all_logs and all_logs[name]: + print(f"\n[{name}]") + for line in all_logs[name]: + print(f" {line}") + + elapsed = time.monotonic() - start_time + + # Report errors + for name, error in errors.items(): + print(f"\nError in {name}: {error}", file=sys.stderr) + + if errors: + sys.exit(1) + + # Output results in user-specified order + print("\n" + "=" * 50) + print("Benchmark results") + print("=" * 50) + ordered_results = [] + for name in selected: + if name in results: + r = results[name] + ordered_results.append( + { + "name": r["name"], + "unit": r["unit"], + "value": r["value"], + "runs": r["runs"], + } + ) + print( + f" {r['name']}: {r['value']} ± {r['stdev']} {r['unit']} ({r['runs']} runs)" + ) + print("=" * 50) + print(f" Total time: {elapsed:.1f}s") + + if output_json: + combined_file = "benchmark_output.json" + with open(combined_file, "w") as f: + json.dump(ordered_results, f, indent=4) + if not quiet: + print(f"Saved: {combined_file}") + + +def parse_benchmarks(args: List[str]) -> List[str]: + """Parse benchmark arguments, preserving order.""" + if not args: + # Default: all registered benchmarks in registration order + return list(_BENCHMARK_REGISTRY.keys()) + + # Handle comma-separated and space-separated inputs + result = [] + for arg in args: + for part in arg.split(","): + name = part.strip().lower() + if name and name not in result: # Preserve order, no duplicates + result.append(name) + return result + + +def main(): + parser = 
argparse.ArgumentParser( + description="Run benchmarks for rv32emu", + epilog=f"Available benchmarks: {', '.join(sorted(_BENCHMARK_REGISTRY.keys()))}", + ) + parser.add_argument( + "--json", + action="store_true", + help="Output results to JSON files", + ) + parser.add_argument( + "--parallel", + type=int, + metavar="N", + help="Run benchmarks in parallel with N workers (default: sequential)", + ) + parser.add_argument( + "--quiet", + action="store_true", + help="Quiet mode for CI (no progress indicator)", + ) + parser.add_argument( + "--runs", + type=int, + default=DEFAULT_RUNS, + help=f"Number of runs per benchmark (default: {DEFAULT_RUNS})", + ) + parser.add_argument( + "benchmarks", + nargs="*", + metavar="BENCH", + help="Benchmarks to run (comma or space-separated)", + ) + + args = parser.parse_args() + + # Validate --runs + if args.runs < 1: + parser.error("--runs must be at least 1") + + selected = parse_benchmarks(args.benchmarks) + if not selected: + print("Error: No benchmarks specified", file=sys.stderr) + sys.exit(1) + + run_benchmarks( + selected, + args.json, + args.runs, + parallel=args.parallel or 0, + quiet=args.quiet, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/coremark.py b/tests/coremark.py deleted file mode 100755 index 6c75a8d8d..000000000 --- a/tests/coremark.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import re -import numpy -import os -import json - -iter = 1 -coremark_param = "0x0 0x0 0x66 30000 7 1 2000" -res = [] -file_exist = os.path.exists("build/rv32emu") -if not file_exist: - print("Please compile before running test") - exit(1) -print("Start Test CoreMark benchmark") -comp_proc = subprocess.check_output( - "build/rv32emu build/riscv32/coremark {}".format(coremark_param), shell=True -).decode("utf-8") -if not comp_proc or comp_proc.find("Error") != -1: - print("Test Error") - exit(1) -else: - print("Test Pass") - -for i in range(iter): - print("Running CoreMark benchmark - Run #{}".format(i + 1)) - comp_proc = subprocess.check_output( - "build/rv32emu build/riscv32/coremark {}".format(coremark_param), - shell=True, - ).decode("utf-8") - if not comp_proc: - print("Fail\n") - exit(1) - else: - res.append( - float( - re.findall(r"Iterations/Sec : [0-9]+.[0-9]+", comp_proc)[0][ - 19: - ] - ) - ) - -mean = numpy.mean(res, dtype=numpy.float64) -deviation = numpy.std(res, dtype=numpy.float64) -for n in res: - if abs(n - mean) > (deviation * 2): - res.remove(n) - -print("{:.3f}".format(numpy.mean(res, dtype=numpy.float64))) - -# save Average Iterations/Sec in JSON format for benchmark action workflow -benchmark_output = "coremark_output.json" -benchmark_data = { - "name": "Coremark", - "unit": "Average iterations/sec over 10 runs", - "value": float("{:.3f}".format(numpy.mean(res, dtype=numpy.float64))), -} -f = open(benchmark_output, "w") -f.write(json.dumps(benchmark_data)) -f.close() diff --git a/tests/dhrystone.sh b/tests/dhrystone.sh deleted file mode 100755 index 4fdf80d57..000000000 --- a/tests/dhrystone.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env bash - -source tests/common.sh - -# Set the number of runs for the Dhrystone benchmark -N_RUNS=1 - -function sanity_check() -{ - if test ! -f $O/riscv32/dhrystone; then - make artifact || exit 1 - fi -} - -function run_dhrystone() -{ - # Run Dhrystone and extract the DMIPS value - output=$($RUN $O/riscv32/dhrystone 2>&1) - local exit_code=$? 
- [ $exit_code -ne 0 ] && fail - dmips=$(echo "$output" | grep -Po '[0-9]+(?= DMIPS)' | awk '{print}') - echo "$dmips" -} - -sanity_check -# Run Dhrystone benchmark and collect DMIPS values -dmips_values=() -for ((i = 1; i <= $N_RUNS; i++)); do - echo "Running Dhrystone benchmark - Run #$i" - dmips=$(run_dhrystone) - exit_code=$? - [ $exit_code -ne 0 ] && fail - dmips_values+=("$dmips") -done - -# Sort DMIPS values -sorted_dmips=($(printf "%s\n" "${dmips_values[@]}" | sort -n)) - -# Calculate Median Absolute Deviation (MAD) -num_dmips=${#sorted_dmips[@]} -median_index=$((num_dmips / 2)) -if ((num_dmips % 2 == 0)); then - median=$(echo "scale=2; (${sorted_dmips[median_index - 1]} + ${sorted_dmips[median_index]}) / 2" | bc -l) -else - median=${sorted_dmips[median_index]} -fi - -deviation=0 -for dmips in "${sorted_dmips[@]}"; do - if (($(echo "$dmips > $median" | bc -l))); then - diff=$(echo "$dmips - $median" | bc -l) - else - diff=$(echo "$median - $dmips" | bc -l) - fi - deviation=$(echo "scale=2; $deviation + $diff" | bc -l) -done - -mad=$(echo "scale=2; $deviation / $num_dmips" | bc -l) - -# Filter outliers based on MAD -filtered_dmips=() -for dmips in "${sorted_dmips[@]}"; do - if (($(echo "$dmips > 0" | bc -l))); then - if (($(echo "$dmips > $median" | bc -l))); then - diff=$(echo "$dmips - $median" | bc -l) - else - diff=$(echo "$median - $dmips" | bc -l) - fi - if (($(echo "$diff <= $mad * 2" | bc -l))); then - filtered_dmips+=("$dmips") - fi - fi -done - -#dhrystone benchmark output file -benchmark_output=dhrystone_output.json -# empty the file -echo -n "" > $benchmark_output - -# Calculate average DMIPS excluding outliers -num_filtered=${#filtered_dmips[@]} -if ((num_filtered > 0)); then - total_dmips=0 - for dmips in "${filtered_dmips[@]}"; do - total_dmips=$(echo "scale=2; $total_dmips + $dmips" | bc -l) - done - - average_dmips=$(echo "scale=2; $total_dmips / $num_filtered" | bc -l) - echo "--------------------------" - echo "Average DMIPS : $average_dmips" - echo "--------------------------" - - #save Average DMIPS in JSON format for benchmark action workflow - echo -n '{' >> $benchmark_output - echo -n '"name": "Dhrystone",' >> $benchmark_output - echo -n '"unit": "Average DMIPS over 10 runs",' >> $benchmark_output - echo -n '"value": ' >> $benchmark_output - echo -n $average_dmips >> $benchmark_output - echo -n '}' >> $benchmark_output -else - fail -fi
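
Note (illustrative, not part of the patch): a short usage sketch of the new runner, based on the argparse options defined in tests/bench.py above. The CI workflow change at the top of this diff invokes the second form.

    # Run every registered benchmark (dhrystone, coremark) sequentially, 5 runs each
    python3 tests/bench.py

    # CI invocation: JSON results written to benchmark_output.json, no progress spinner
    python3 tests/bench.py --json --quiet

    # Run a single benchmark with a custom run count
    python3 tests/bench.py coremark --runs 3

    # Comma-separated selection; --parallel only takes effect when more than
    # one benchmark is selected
    python3 tests/bench.py dhrystone,coremark --parallel 2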
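
For reference, a minimal sketch of how an additional benchmark could plug into the registry above, assuming the class is placed in tests/bench.py. The "memcpy" name, binary path, and "MB/s" output format are hypothetical and not provided by this change; only register_benchmark, Benchmark, EMU_PATH, and TIMEOUT_SECONDS come from the patch.

    @register_benchmark("memcpy")
    class MemcpyBenchmark(Benchmark):
        """Hypothetical throughput benchmark (illustrative only)."""

        name = "Memcpy"
        unit = "MB/s"
        BIN_PATH = "build/riscv32/memcpy"  # hypothetical artifact path

        def run_single(self) -> float:
            # Run the emulator quietly; check=True raises on a non-zero exit,
            # and the timeout mirrors the limit used by the other benchmarks.
            result = subprocess.run(
                [EMU_PATH, "-q", self.BIN_PATH],
                capture_output=True,
                text=True,
                timeout=TIMEOUT_SECONDS,
                check=True,
            )
            match = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*MB/s", result.stdout)
            if not match:
                raise RuntimeError(f"Failed to parse MB/s:\n{result.stdout[:500]}")
            return float(match.group(1))

The base class already supplies prepare() (which builds missing artifacts via make artifact), the adaptive run loop, and outlier filtering, so a subclass only needs name, unit, BIN_PATH, and run_single().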
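
To make the outlier rule in calculate_stats() concrete, a tiny worked example with made-up numbers: a run is dropped when it lies more than two sample standard deviations from the median, and if fewer than two runs survive the filter, all runs are kept instead.

    import statistics

    values = [100.0, 101.0, 99.0, 250.0, 100.5]   # made-up run results
    median = statistics.median(values)             # 100.5
    stdev = statistics.stdev(values)               # ~67.0
    kept = [v for v in values if abs(v - median) <= 2.0 * stdev]
    # kept == [100.0, 101.0, 99.0, 100.5]; reported mean ~100.1 instead of ~130.1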