WIP: Tests and Benchmarks #146

Merged: 26 commits into main from ngc92/test-and-benchmark, Feb 8, 2025

Commits (26):
0747253  individual test cases and benchmarks (ngc92, Feb 3, 2025)
9e2de7f  wip (ngc92, Feb 2, 2025)
d73d733  fixes (ngc92, Feb 2, 2025)
68bb0df  increase timeout for testing (ngc92, Feb 2, 2025)
149191c  more convenient create local (ngc92, Feb 3, 2025)
c81df40  wip (ngc92, Feb 4, 2025)
edb6169  fixups (ngc92, Feb 4, 2025)
71f429d  updates (ngc92, Feb 5, 2025)
7ebf305  functionally correct saving score to leaderboard db (using mean of al… (alexzhang13, Feb 6, 2025)
bc3b8b8  Merge branch 'main' into ngc92/test-and-benchmark (alexzhang13, Feb 6, 2025)
3e41b7e  fix linting complaints (alexzhang13, Feb 6, 2025)
04ec5f0  Merge remote-tracking branch 'refs/remotes/origin/ngc92/test-and-benc… (alexzhang13, Feb 6, 2025)
b064927  change thread names based on benchmark (alexzhang13, Feb 6, 2025)
971a36d  Feat: add eval leaderboard command for python (S1ro1, Feb 6, 2025)
99a9c5f  fix identity cuda example (alexzhang13, Feb 6, 2025)
d341952  Merge remote-tracking branch 'refs/remotes/origin/ngc92/test-and-benc… (alexzhang13, Feb 6, 2025)
eb63daf  fix create-local single gpu (alexzhang13, Feb 6, 2025)
2d30a26  feat: update python examples (S1ro1, Feb 6, 2025)
6ceceb2  Merge branch 'ngc92/test-and-benchmark' of github.com:gpu-mode/discor… (S1ro1, Feb 6, 2025)
6439d98  fix cuda examples (move semantic bug) and TK (alexzhang13, Feb 6, 2025)
e9d97f1  fix copy/move semantics for test cases (alexzhang13, Feb 6, 2025)
bf6c6bc  Feat: matmul example (S1ro1, Feb 6, 2025)
3e35c1f  Add comments to eval.cu for clarity (alexzhang13, Feb 7, 2025)
a5e5256  add docstrings for Python examples (alexzhang13, Feb 7, 2025)
c95e94d  update score to be leaderboard average (alexzhang13, Feb 7, 2025)
ec49b5b  Leaderboard message updated (alexzhang13, Feb 8, 2025)
464 changes: 391 additions & 73 deletions examples/eval.cu

Large diffs are not rendered by default.

260 changes: 220 additions & 40 deletions examples/eval.py
@@ -1,11 +1,22 @@
import dataclasses
import re
import time
import os
import sys
import math
from pathlib import Path
from typing import Any

import torch.cuda

from utils import set_seed
try:
    from task import TestSpec
except ImportError:
    TestSpec = dict

from submission import custom_kernel
from reference import ref_kernel, check_implementation, generate_input
from reference import check_implementation, generate_input

WARMUP_RUNS = 10
TIMED_RUNS = 100
@@ -28,66 +39,235 @@ def log(self, key, value):
        self.print(f"{key}: {value}")


def measure_runtime(logger: PopcornOutput):
    print("warming up...")
@dataclasses.dataclass
class TestCase:
    args: dict
    spec: str

    warmup_data = generate_input()
    for _ in range(WARMUP_RUNS):
        custom_kernel(warmup_data)

    durations = []

    for _ in range(TIMED_RUNS):
        data = generate_input()
        start = time.time()
        submission_output = custom_kernel(data)
        end = time.time()
        durations.append((end - start) * 1e9)
def get_test_cases(file_name: str) -> list[TestCase]:
    try:
        content = Path(file_name).read_text()
    except Exception as E:
        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
        exit(113)

        reference_output = ref_kernel(data)
        if not check_implementation(submission_output, reference_output):
            logger.log("check", "fail")
            sys.exit(112)

    total_duration = sum(durations)
    tests = []
    lines = content.splitlines()
    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
    for line in lines:
        parts = line.split(";")
        case = {}
        for part in parts:
            matched = re.match(match, part)
            if not re.fullmatch(match, part):
                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
                exit(113)
            key = matched[1]
            val = matched[2]
            try:
                val = int(val)
            except ValueError:
                pass

            case[key] = val
        tests.append(TestCase(spec=line, args=case))

    return tests


def warm_up(test: TestCase):
    data = generate_input(**test.args)
    start = time.perf_counter()
    while time.perf_counter() - start < 0.2:
        custom_kernel(data)
    torch.cuda.synchronize()


@dataclasses.dataclass
class Stats:
    runs: int
    mean: float
    std: float
    err: float
    best: float
    worst: float


def calculate_stats(durations: list[int]):
    """
    Calculate statistical data from a list of durations.

    @param durations: A list of durations in nanoseconds.
    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
    """
    runs = len(durations)
    total = sum(durations)
    best = min(durations)
    worst = max(durations)
    average_duration = total_duration / TIMED_RUNS

    variance = sum([(d - average_duration) ** 2 for d in durations])
    standard_deviation = math.sqrt(variance / (TIMED_RUNS - 1))
    standard_error = standard_deviation / math.sqrt(TIMED_RUNS)
    avg = total / runs
    variance = sum(map(lambda x: (x - avg)**2, durations))
    std = math.sqrt(variance / (runs - 1))
    err = std / math.sqrt(runs)

    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
                 worst=float(worst))

    logger.log("check", "pass")
    logger.log("duration.mean", average_duration)
    logger.log("duration.std", standard_deviation)
    logger.log("duration.err", standard_error)
    logger.log("duration.best", best)
    logger.log("duration.worst", worst)

    print(f"average kernel runtime: {average_duration / 1e6} ± {standard_error / 1e6} µs")
def run_testing(logger: PopcornOutput, tests: list[TestCase]):
    """
    Executes the actual test case code and checks for correctness.

    @param logger: A PopcornOutput object used for logging test results.
    @param tests: A list of TestCase objects representing the test cases to be executed.
    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
    """
    passed = True
    logger.log("test-count", len(tests))
    for idx, test in enumerate(tests):
        logger.log(f"test.{idx}.spec", test.spec)

        data = generate_input(**test.args)
        torch.cuda.synchronize()
        submission_output = custom_kernel(data)
        torch.cuda.synchronize()
        error = check_implementation(data, submission_output)
        if error:
            logger.log(f"test.{idx}.status", "fail")
            logger.log(f"test.{idx}.error", error)
            passed = False
        else:
            logger.log(f"test.{idx}.status", "pass")

    if passed:
        logger.log("check", "pass")
        return 0
    else:
        logger.log("check", "fail")
        return 112


def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
    """
    For a particular test case, check correctness (if applicable) and grab runtime results.

    @param test: TestCase object.
    @param recheck: Flag for whether to explicitly check functional correctness.
    @param max_repeats: Number of trials to repeat.
    @param max_time_ns: Timeout time in nanoseconds.
    @return: A Stats object for this particular benchmark case or an error if the test fails.
    """
    durations = []
    # generate input data once
    data = generate_input(**test.args)
    # first, one obligatory correctness check
    output = custom_kernel(data)
    error = check_implementation(data, output)
    if error:
        return error

    # now, do multiple timing runs without further correctness testing
    # there is an upper bound of 100 runs, and a lower bound of 3 runs;
    # otherwise, we repeat until we either measure at least 10 full seconds,
    # or the relative error of the mean is below 1%.

    for i in range(max_repeats):
        if recheck:
            data = generate_input(**test.args)
        torch.cuda.synchronize()
        start = time.perf_counter_ns()
        output = custom_kernel(data)
        torch.cuda.synchronize()
        end = time.perf_counter_ns()

        if recheck:
            error = check_implementation(data, output)
            if error:
                return error

        del output
        durations.append(end-start)

        if i > 1:
            stats = calculate_stats(durations)
            if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
                break

    return calculate_stats(durations)


def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
    """
    Executes benchmarking code for a CUDA Kernel and logs runtimes.

    @param logger: A PopcornOutput object used for logging benchmark results.
    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
    """
    warm_up(tests[0])
    passed = True
    logger.log("benchmark-count", len(tests))
    for idx, test in enumerate(tests):
        logger.log(f"benchmark.{idx}.spec", test.spec)
        result = benchmark(test, False, 100, 10e9)
        if isinstance(result, Stats):
            for field in dataclasses.fields(Stats):
                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
        else:
            passed = False
            logger.log(f"benchmark.{idx}.status", "fail")
            logger.log(f"benchmark.{idx}.error", result)

    if passed:
        logger.log("check", "pass")
        return 0
    else:
        logger.log("check", "fail")
        return 112


def main():
    fd = os.getenv("POPCORN_FD")
    if not fd:
        return 111

    if len(sys.argv) < 3:
        return 2

    mode = sys.argv[1]
    tests = get_test_cases(sys.argv[2])

    with PopcornOutput(int(fd)) as logger:
        seed = os.getenv("POPCORN_SEED")
        seed = int(seed) if seed else 42

        set_seed(seed)
        data = generate_input()
        reference_output = ref_kernel(data)
        submission_output = custom_kernel(data)

        if not check_implementation(submission_output, reference_output):
            logger.log("check", "fail")
            return 112
        if mode == "test":
            return run_testing(logger, tests)

        measure_runtime(logger)
        return 0
        if mode == "benchmark":
            return run_benchmarking(logger, tests)

        if mode == "leaderboard":
            warm_up(tests[0])
            result = benchmark(tests[-1], True, 100, 30e9)
            if isinstance(result, Stats):
                logger.log("benchmark-count", 1)
                logger.log(f"benchmark.0.spec", tests[-1].spec)
                logger.log(f"benchmark.0.runs", result.runs)
                logger.log(f"benchmark.0.mean", result.mean)
                logger.log(f"benchmark.0.std", result.std)
                logger.log(f"benchmark.0.err", result.err)
                logger.log("check", "pass")
            else:
                logger.log("test-count", 1)
                logger.log("test.0.status", "fail")
                logger.log("test.0.error", str(result)) #TODO: Make sure result implements __str__?

        else:
            # TODO: Implement script and profile mode
            return 2


if __name__ == "__main__":
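
For context, a short sketch of the test-spec format the new harness consumes: each line of the file passed as sys.argv[2] is a semicolon-separated list of key: value pairs, parsed with the same regex used in get_test_cases above. The file name, spec values, and helper name below are illustrative, not taken from the PR.

import re

# hypothetical spec file, one test case per line:
#     size: 256; seed: 42
#     size: 1048576; seed: 1337
PATTERN = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"

def parse_spec_line(line: str) -> dict:
    """Mirrors the per-line parsing loop of get_test_cases for a single line."""
    args = {}
    for part in line.split(";"):
        m = re.fullmatch(PATTERN, part)
        key, val = m[1], m[2]
        args[key] = int(val) if val.lstrip("+-").isdigit() else val
    return args

print(parse_spec_line("size: 1024; seed: 42"))  # -> {'size': 1024, 'seed': 42}
# eval.py forwards these keys as generate_input(**args); the harness itself is run as
# `python eval.py <test|benchmark|leaderboard> <spec file>` with POPCORN_FD set to an
# open file descriptor that receives the result log.
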
36 changes: 14 additions & 22 deletions examples/identity_cuda/reference.cuh
@@ -11,17 +11,15 @@

#include "task.h"

static input_t generate_input(int seed) {
static input_t generate_input(int seed, int size) {
    std::mt19937 rng(seed);
    input_t data;

    std::uniform_real_distribution<float> dist(0, 1);

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = dist(rng);
        }
    data.resize(size);
    for (int j = 0; j < size; ++j) {
        data[j] = dist(rng);
    }

    return data;
@@ -32,28 +30,22 @@ static output_t ref_kernel(input_t data) {
    return (output_t) data;
}

static bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
static void check_implementation(TestReporter& reporter, input_t data, output_t out, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];
    output_t ref = ref_kernel(data);

        if(out[i].size() != Ns[i]) {
            std::cerr << "SIZE MISMATCH at " << i << ": " << Ns[i] << " " << out[i].size() << std::endl;
            return false;
        }
    if(out.size() != ref.size()) {
        if(!reporter.check_equal("size mismatch", out.size(), ref.size())) return;
    }

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                std::cerr << "ERROR AT " << i << ", "<< j << ": " << ref_ptr[j] << " " << out_ptr[j] << std::endl;
                return false;
            }
    for (int j = 0; j < ref.size(); ++j) {
        if (std::fabs(ref[j] - out[j]) > epsilon) {
            reporter.fail() << "error at " << j << ": " << ref[j] << " " << std::to_string(out[j]);
            return;
        }
    }

    return true;
    reporter.pass();
}

#endif
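
The Python example tasks are expected to expose the same shape of interface as this CUDA reference: generate_input takes the spec keys as keyword arguments, and check_implementation compares a submission's output against the reference and returns an error message (or something falsy on success) that eval.py logs. A rough sketch for the identity task, with the signatures, defaults, and tolerance inferred from how eval.py calls these functions rather than copied from the PR:

import torch

def generate_input(size: int = 1024, seed: int = 42) -> torch.Tensor:
    # deterministic input so test and benchmark runs are reproducible
    gen = torch.Generator(device="cuda").manual_seed(seed)
    return torch.rand(size, device="cuda", generator=gen)

def check_implementation(data: torch.Tensor, output: torch.Tensor) -> str:
    # eval.py treats any truthy return value as a failure message
    if output.shape != data.shape:
        return f"size mismatch: {tuple(output.shape)} vs {tuple(data.shape)}"
    if not torch.allclose(output, data, atol=1e-5):
        return "value mismatch"
    return ""
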
32 changes: 4 additions & 28 deletions examples/identity_cuda/submission.cu
@@ -14,33 +14,9 @@ __global__ void copy_kernel(float *input, float *output, int N)

output_t custom_kernel(input_t data)
{
    output_t result;

    for (int i = 0; i < N_SIZES; ++i)
    {
        int N = Ns[i];
        result[i].resize(N);

        // Allocate device memory
        float *d_input, *d_output;
        CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(float)));

        // Copy input to device
        CUDA_CHECK(cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice));

        // Launch kernel
        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

        // Copy result back to host
        CUDA_CHECK(cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost));

        // Free device memory
        CUDA_CHECK(cudaFree(d_input));
        CUDA_CHECK(cudaFree(d_output));
    /* if(data.size() > 256) {
        data[0] = -1;
    }

    return result;
    */
    return data;
}
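
The commented-out block above appears to be a quick way to corrupt the output on purpose and exercise the failure path of the new checker. On the Python side, the matching submission module only needs to expose custom_kernel; a hypothetical minimal submission.py for the identity task (the body is illustrative, not part of the PR):

import torch

def custom_kernel(data: torch.Tensor) -> torch.Tensor:
    # identity task: hand the input back unchanged; a real submission would
    # launch its own, hopefully faster, kernel here
    return data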