WIP: Tests and Benchmarks (#146)
* individual test cases and benchmarks

* wip

* fixes

* increase timeout for testing

* more convenient create local

* wip

* fixups

* updates

* functionally correct saving score to leaderboard db (using mean of all shapes)

* fix linting complaints

* change thread names based on benchmark

* Feat: add eval leaderboard command for python

* fix identity cuda example

* fix create-local single gpu

* feat: update python examples

* fix cuda examples (move semantic bug) and TK

* fix copy/move semantics for test cases

* Feat: matmul example

* Add comments to eval.cu for clarity

* add docstrings for Python examples

* update score to be leaderboard average

* Leaderboard message updated

---------

Co-authored-by: Alex Zhang <[email protected]>
Co-authored-by: S1ro1 <[email protected]>
3 people authored Feb 8, 2025
1 parent ff9c47b commit 3e6e766
Showing 51 changed files with 1,658 additions and 686 deletions.
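
For orientation while reading the `examples/eval.py` diff below: the new `calculate_stats` helper reports, for n timed durations d_i (in nanoseconds),

\[
\text{mean} = \frac{1}{n}\sum_{i=1}^{n} d_i, \qquad
\text{std} = \sqrt{\frac{\sum_{i=1}^{n} \left(d_i - \text{mean}\right)^2}{n-1}}, \qquad
\text{err} = \frac{\text{std}}{\sqrt{n}},
\]

and the timing loop in `benchmark` keeps measuring until either err/mean < 0.01 or n · mean exceeds the time budget (10 s per benchmark case, 30 s for the leaderboard run), with a minimum of three timed runs and a cap of 100.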
464 changes: 391 additions & 73 deletions examples/eval.cu

Large diffs are not rendered by default.

260 changes: 220 additions & 40 deletions examples/eval.py
@@ -1,11 +1,22 @@
import dataclasses
import re
import time
import os
import sys
import math
from pathlib import Path
from typing import Any

import torch.cuda

from utils import set_seed
try:
from task import TestSpec
except ImportError:
TestSpec = dict

from submission import custom_kernel
from reference import ref_kernel, check_implementation, generate_input
from reference import check_implementation, generate_input

WARMUP_RUNS = 10
TIMED_RUNS = 100
@@ -28,66 +39,235 @@ def log(self, key, value):
self.print(f"{key}: {value}")


def measure_runtime(logger: PopcornOutput):
print("warming up...")
@dataclasses.dataclass
class TestCase:
args: dict
spec: str

warmup_data = generate_input()
for _ in range(WARMUP_RUNS):
custom_kernel(warmup_data)

durations = []

for _ in range(TIMED_RUNS):
data = generate_input()
start = time.time()
submission_output = custom_kernel(data)
end = time.time()
durations.append((end - start) * 1e9)
def get_test_cases(file_name: str) -> list[TestCase]:
try:
content = Path(file_name).read_text()
except Exception as E:
print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
exit(113)

reference_output = ref_kernel(data)
if not check_implementation(submission_output, reference_output):
logger.log("check", "fail")
sys.exit(112)

total_duration = sum(durations)
tests = []
lines = content.splitlines()
match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
for line in lines:
parts = line.split(";")
case = {}
for part in parts:
matched = re.match(match, part)
if not re.fullmatch(match, part):
print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
exit(113)
key = matched[1]
val = matched[2]
try:
val = int(val)
except ValueError:
pass

case[key] = val
tests.append(TestCase(spec=line, args=case))

return tests


def warm_up(test: TestCase):
data = generate_input(**test.args)
start = time.perf_counter()
while time.perf_counter() - start < 0.2:
custom_kernel(data)
torch.cuda.synchronize()


@dataclasses.dataclass
class Stats:
runs: int
mean: float
std: float
err: float
best: float
worst: float


def calculate_stats(durations: list[int]):
"""
Calculate statistical data from a list of durations.
@param durations: A list of durations in nanoseconds.
@return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
"""
runs = len(durations)
total = sum(durations)
best = min(durations)
worst = max(durations)
average_duration = total_duration / TIMED_RUNS

variance = sum([(d - average_duration) ** 2 for d in durations])
standard_deviation = math.sqrt(variance / (TIMED_RUNS - 1))
standard_error = standard_deviation / math.sqrt(TIMED_RUNS)
avg = total / runs
variance = sum(map(lambda x: (x - avg)**2, durations))
std = math.sqrt(variance / (runs - 1))
err = std / math.sqrt(runs)

return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
worst=float(worst))

logger.log("check", "pass")
logger.log("duration.mean", average_duration)
logger.log("duration.std", standard_deviation)
logger.log("duration.err", standard_error)
logger.log("duration.best", best)
logger.log("duration.worst", worst)

print(f"average kernel runtime: {average_duration / 1e6} ± {standard_error / 1e6} µs")
def run_testing(logger: PopcornOutput, tests: list[TestCase]):
"""
Executes the actual test case code and checks for correctness.
@param logger: A PopcornOutput object used for logging test results.
@param tests: A list of TestCase objects representing the test cases to be executed.
@return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
"""
passed = True
logger.log("test-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"test.{idx}.spec", test.spec)

data = generate_input(**test.args)
torch.cuda.synchronize()
submission_output = custom_kernel(data)
torch.cuda.synchronize()
error = check_implementation(data, submission_output)
if error:
logger.log(f"test.{idx}.status", "fail")
logger.log(f"test.{idx}.error", error)
passed = False
else:
logger.log(f"test.{idx}.status", "pass")

if passed:
logger.log("check", "pass")
return 0
else:
logger.log("check", "fail")
return 112


def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
"""
For a particular test case, check correctness (if applicable) and grab runtime results.
@param test: TestCase object.
@param recheck: Flag for whether to explicitly check functional correctness.
@param max_repeats: Number of trials to repeat.
@param max_time_ns: Timeout time in nanoseconds.
@return: A Stats object for this particular benchmark case or an error if the test fails.
"""
durations = []
# generate input data once
data = generate_input(**test.args)
# first, one obligatory correctness check
output = custom_kernel(data)
error = check_implementation(data, output)
if error:
return error

# now, do multiple timing runs without further correctness testing
# there is an upper bound of 100 runs, and a lower bound of 3 runs;
# otherwise, we repeat until we either measure at least 10 full seconds,
# or the relative error of the mean is below 1%.

for i in range(max_repeats):
if recheck:
data = generate_input(**test.args)
torch.cuda.synchronize()
start = time.perf_counter_ns()
output = custom_kernel(data)
torch.cuda.synchronize()
end = time.perf_counter_ns()

if recheck:
error = check_implementation(data, output)
if error:
return error

del output
durations.append(end-start)

if i > 1:
stats = calculate_stats(durations)
if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
break

return calculate_stats(durations)


def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
"""
Executes benchmarking code for a CUDA Kernel and logs runtimes.
@param logger: A PopcornOutput object used for logging benchmark results.
@param tests: A list of TestCase objects representing the test cases to be benchmarked.
@return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
"""
warm_up(tests[0])
passed = True
logger.log("benchmark-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"benchmark.{idx}.spec", test.spec)
result = benchmark(test, False, 100, 10e9)
if isinstance(result, Stats):
for field in dataclasses.fields(Stats):
logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
else:
passed = False
logger.log(f"benchmark.{idx}.status", "fail")
logger.log(f"benchmark.{idx}.error", result)

if passed:
logger.log("check", "pass")
return 0
else:
logger.log("check", "fail")
return 112


def main():
fd = os.getenv("POPCORN_FD")
if not fd:
return 111

if len(sys.argv) < 3:
return 2

mode = sys.argv[1]
tests = get_test_cases(sys.argv[2])

with PopcornOutput(int(fd)) as logger:
seed = os.getenv("POPCORN_SEED")
seed = int(seed) if seed else 42

set_seed(seed)
data = generate_input()
reference_output = ref_kernel(data)
submission_output = custom_kernel(data)

if not check_implementation(submission_output, reference_output):
logger.log("check", "fail")
return 112
if mode == "test":
return run_testing(logger, tests)

measure_runtime(logger)
return 0
if mode == "benchmark":
return run_benchmarking(logger, tests)

if mode == "leaderboard":
warm_up(tests[0])
result = benchmark(tests[-1], True, 100, 30e9)
if isinstance(result, Stats):
logger.log("benchmark-count", 1)
logger.log(f"benchmark.0.spec", tests[-1].spec)
logger.log(f"benchmark.0.runs", result.runs)
logger.log(f"benchmark.0.mean", result.mean)
logger.log(f"benchmark.0.std", result.std)
logger.log(f"benchmark.0.err", result.err)
logger.log("check", "pass")
else:
logger.log("test-count", 1)
logger.log("test.0.status", "fail")
logger.log("test.0.error", str(result)) #TODO: Make sure result implements __str__?

else:
# TODO: Implement script and profile mode
return 2


if __name__ == "__main__":
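
A note on the spec-file format that `get_test_cases` above parses: each line is split on `;`, and every part must match `key: value`, where the value is either a word or an integer. The short standalone sketch below (not part of the commit; the keys `size` and `seed` are illustrative and depend on the task's `generate_input` signature) shows how one spec line becomes the keyword arguments passed to `generate_input`.

# Hypothetical sketch, not part of the commit: parsing a single test-spec line.
import re

line = "size: 1024; seed: 4242"  # assumed example; real keys depend on the task
pattern = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"

args = {}
for part in line.split(";"):
    matched = re.fullmatch(pattern, part)
    if not matched:
        raise ValueError(f"invalid test case part: {part!r}")
    key, val = matched[1], matched[2]
    try:
        val = int(val)  # integer values are converted; word values stay strings
    except ValueError:
        pass
    args[key] = val

print(args)  # {'size': 1024, 'seed': 4242}

In the updated harness these keyword arguments are forwarded to `generate_input`; the run mode (`test`, `benchmark`, or `leaderboard`) comes from `sys.argv[1]`, the spec file path from `sys.argv[2]`, and results are logged to the file descriptor given by the `POPCORN_FD` environment variable.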
36 changes: 14 additions & 22 deletions examples/identity_cuda/reference.cuh
@@ -11,17 +11,15 @@

#include "task.h"

static input_t generate_input(int seed) {
static input_t generate_input(int seed, int size) {
std::mt19937 rng(seed);
input_t data;

std::uniform_real_distribution<float> dist(0, 1);

for (int i = 0; i < N_SIZES; ++i) {
data[i].resize(Ns[i]);
for (int j = 0; j < Ns[i]; ++j) {
data[i][j] = dist(rng);
}
data.resize(size);
for (int j = 0; j < size; ++j) {
data[j] = dist(rng);
}

return data;
@@ -32,28 +30,22 @@ static output_t ref_kernel(input_t data) {
return (output_t) data;
}

static bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
static void check_implementation(TestReporter& reporter, input_t data, output_t out, float epsilon = 1e-5) {
// input_t data = generate_input();
// output_t reference_out = reference(data);

for (int i = 0; i < N_SIZES; ++i) {
auto ref_ptr = ref[i];
auto out_ptr = out[i];
output_t ref = ref_kernel(data);

if(out[i].size() != Ns[i]) {
std::cerr << "SIZE MISMATCH at " << i << ": " << Ns[i] << " " << out[i].size() << std::endl;
return false;
}
if(out.size() != ref.size()) {
if(!reporter.check_equal("size mismatch", out.size(), ref.size())) return;
}

for (int j = 0; j < Ns[i]; ++j) {
if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
std::cerr << "ERROR AT " << i << ", "<< j << ": " << ref_ptr[j] << " " << out_ptr[j] << std::endl;
return false;
}
for (int j = 0; j < ref.size(); ++j) {
if (std::fabs(ref[j] - out[j]) > epsilon) {
reporter.fail() << "error at " << j << ": " << ref[j] << " " << std::to_string(out[j]);
return;
}
}

return true;
reporter.pass();
}

#endif
32 changes: 4 additions & 28 deletions examples/identity_cuda/submission.cu
@@ -14,33 +14,9 @@ __global__ void copy_kernel(float *input, float *output, int N)

output_t custom_kernel(input_t data)
{
output_t result;

for (int i = 0; i < N_SIZES; ++i)
{
int N = Ns[i];
result[i].resize(N);

// Allocate device memory
float *d_input, *d_output;
CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(float)));

// Copy input to device
CUDA_CHECK(cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice));

// Launch kernel
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

// Copy result back to host
CUDA_CHECK(cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost));

// Free device memory
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_output));
/* if(data.size() > 256) {
data[0] = -1;
}

return result;
*/
return data;
}
