Merged

69 commits
0d220a5
Replace auto-tuning with hard-coding
voltjia Jan 10, 2025
5ee0bb9
Use the half-precision floating-point format as the data type for arg…
voltjia Jan 10, 2025
7951329
Improve the performance of the 2D convolution compute kernel written …
voltjia Jan 11, 2025
e2e0112
Update the benchmark input dimensions to vary based on batch size ins…
voltjia Jan 11, 2025
b904705
Remove the `other=0.0` argument from the `tl.load` calls in the `trit…
voltjia Jan 11, 2025
ec7bc2d
Add `if __name__ == "__main__"` to `add.py`
voltjia Jan 11, 2025
a8bfbd6
Add `if __name__ == "__main__"` to `softmax.py`
voltjia Jan 11, 2025
f07b748
Add code size comparison
voltjia Jan 11, 2025
33a29fe
Add an example for Root Mean Square Layer Normalization
voltjia Jan 11, 2025
9275994
Remove the unused `n_rows` parameter in the `triton_softmax_kernel` f…
voltjia Jan 11, 2025
83f7596
Add `rms_norm` data into `code_size_comparison.py`
voltjia Jan 11, 2025
9d006c9
Add performance comparison
voltjia Jan 11, 2025
0b12eb8
Fix a precision issue by casting the loaded data to `tl.float32`
voltjia Jan 12, 2025
be4f8c2
Use `atol=0` and `rtol=0` in `when comparing NineToothed and Triton o…
voltjia Jan 12, 2025
ee28ccb
Add `requirements.txt`
voltjia Jan 12, 2025
11d09dd
Add correctness verification during performance testing
voltjia Jan 12, 2025
9d5c731
Update benchmark `x_vals` ranges and use log scaling for performance …
voltjia Jan 12, 2025
60c9a6e
Add statistics into `performance_comparison.py`
voltjia Jan 13, 2025
69b626d
Add statistics into `code_size_comparison.py`
voltjia Jan 13, 2025
13dc6be
Add overall comparison into `code_size_comparison.py`
voltjia Jan 13, 2025
b5674ee
Remove the stream setting in `softmax.py`
voltjia Jan 14, 2025
aafa34a
Use `torch.randn` instead of `torch.rand` in `add.py`
voltjia Jan 14, 2025
0dfaeb0
Use time as the metric for measuring performance
voltjia Jan 14, 2025
bec9334
Add PyTorch data into performance comparison
voltjia Jan 14, 2025
ed92844
Merge branch 'master' of github.com:InfiniTensor/ninetoothed-examples…
voltjia May 13, 2025
0cb93e5
Separate `add` kernels into modular packages
voltjia May 13, 2025
d278433
Rename `matmul` to `mm` and separate the kernels into modular packages
voltjia May 13, 2025
932a1d5
Separate `addmm` kernels into modular packages
voltjia May 13, 2025
97b2842
Separate `conv2d` kernels into modular packages
voltjia May 13, 2025
4db4e5f
Separate `softmax` kernels into modular packages
voltjia May 13, 2025
236add4
Separate `rms_norm` kernels into modular packages
voltjia May 14, 2025
a33a14c
Rename `ops.triton.torch.triton_conv2d` to `ops.triton.torch.conv2d`
voltjia May 14, 2025
be1f694
Rename `attention` to `scaled_dot_product_attention` and separate the…
voltjia May 14, 2025
6952672
Use `dtype` access instead of hardcoding
voltjia May 14, 2025
fc45cea
Fix the boundary issues
voltjia May 14, 2025
db86285
Improve the Triton `conv2d` implementation
voltjia May 14, 2025
29bf2f9
Refactor `import mm` to `import ops.ninetoothed.kernels.mm as mm` in …
voltjia May 14, 2025
f87582f
Add `compare_code_metrics.py`
voltjia May 14, 2025
406c18e
Separate `bmm` kernels into modular packages
Ziminli May 15, 2025
114f12f
Separate `fused_rms_norm` kernels into modular packages
Ziminli May 15, 2025
da3e0b5
Update the `bmm` function call in `linear.py`
Ziminli May 15, 2025
dbc429a
Separate `silu` kernels into modular packages
Ziminli May 15, 2025
ebe1681
Separate `swiglu` kernels into modular packages
Ziminli May 15, 2025
596455f
Fix the Triton implementation in `scaled_dot_product_attention.py`
Ziminli May 15, 2025
3585cfa
Add inference profiling support
Ziminli May 15, 2025
db05592
Add Triton and PyTorch implementations of non-interleaved RoPE
voltjia May 15, 2025
9db2eeb
Separate `rope` kernels into modular packages
voltjia May 15, 2025
8086ccc
Add context managers to select the backend to use for inference
voltjia May 15, 2025
d711d57
Relax tolerance in `scaled_dot_product_attention.py`
voltjia May 16, 2025
379ca83
Improve inference logging with JSON output and token-level performanc…
voltjia May 16, 2025
14b8582
Extract the backslash character into a constant
voltjia May 16, 2025
a5319d2
Update `.gitignore` to exclude evaluation result files
voltjia May 19, 2025
72d552c
Remove upper bound constraint in `conv2d.py`
voltjia May 19, 2025
c8ae164
Rename `rope` to `rotary_position_embedding`
voltjia May 19, 2025
4ccc2bb
Generate a single table instead of multiple tables
voltjia May 19, 2025
f3a8db1
Replace the manually specified lists of Triton `Config` objects with …
voltjia May 21, 2025
a9cefab
Replace `torch.rand` with `torch.randn`
voltjia May 21, 2025
276c5ab
Standardize `plot_name` values across scripts
voltjia May 21, 2025
f75f841
Add `compare_performance_metrics.py`
voltjia May 21, 2025
123bd37
Add `run_experiments.py`
voltjia May 21, 2025
0f8477c
Run tasks in `run_experiments.py` instead of `compare_performance_met…
voltjia May 21, 2025
27ae4b9
Add end-to-end model inference throughput comparison plot
voltjia May 21, 2025
0517108
Add `transformers` and `radon` to `requirements.txt`
voltjia May 22, 2025
bd6a730
Refactor CSV export to occur within task-processing loop
voltjia May 23, 2025
772cae8
Rename the output CSV file from `performance-metrics.csv` to `microbe…
voltjia May 23, 2025
287fc78
Rename output image files
voltjia May 23, 2025
b04acf8
Rename evaluation scripts and output files for consistency and clarit…
voltjia May 23, 2025
e8a0d1a
Use `torch.utils.collect_env` in `run_experiments.py`
voltjia May 23, 2025
7798e39
Update `README.md`
voltjia Jul 2, 2025
7 changes: 7 additions & 0 deletions .gitignore
@@ -160,3 +160,10 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# Evaluation results
+*.csv
+*.html
+*.json
+*.png
+*.tex
10 changes: 7 additions & 3 deletions README.md
@@ -1,18 +1,20 @@
 # NineToothed Examples
 
-This repository contains examples for [NineToothed](https://github.com/InfiniTensor/ninetoothed), including implementations of several common compute kernels written using NineToothed.
+This repository contains examples of [NineToothed](https://github.com/InfiniTensor/ninetoothed), including implementations of several common compute kernels written using NineToothed.
 
 ## Usage
 
 After cloning this repository, you can run any of the examples using Python. For instance, to run the matrix multiplication example, execute the following command:
 
 ```bash
-python matmul.py
+python mm.py
 ```
 
 ### Autotuning Behavior
 
-By default, the examples apply autotuning, which may take several minutes or longer to complete for complex kernels. If you wish to disable autotuning, you can replace symbol definitions with concrete values. Consider the following example:
+Some examples apply autotuning, which may take several minutes or longer to complete for complex kernels. If you wish to disable autotuning, you can replace symbol definitions with concrete values.
+
+Consider the following example:
 
 ```python
 BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
@@ -29,6 +31,8 @@ BLOCK_SIZE = 1024

 These approaches allow you to obtain results in seconds. However, selecting optimal values is crucial for good performance. Experiment with different values to determine the best configuration.
 
+Note: Please don't forget to also disable the autotuning of the corresponding Triton compute kernels.
+
 ## Third-Party Code and Licenses
 
 This project includes code modified or inspired from the following open-source repositories:
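As an aside on the README's block-size discussion above: the tiled vector-add pattern these kernels implement can be sketched in plain NumPy, using ceiling-division grid sizing and a mask that guards the final partial block. NumPy here is only a stand-in for the GPU kernel, and `cdiv`/`blockwise_add` are illustrative names, not part of the NineToothed or Triton APIs:

```python
import numpy as np


def cdiv(a, b):
    # Ceiling division: how many blocks of size b are needed to cover a elements.
    return (a + b - 1) // b


def blockwise_add(lhs, rhs, block_size):
    # Emulates a 1D tiled add kernel: each "program" pid handles one
    # block of block_size contiguous elements.
    n = lhs.size
    out = np.empty_like(lhs)
    for pid in range(cdiv(n, block_size)):
        offsets = pid * block_size + np.arange(block_size)
        mask = offsets < n  # guard the final, possibly partial block
        idx = offsets[mask]
        out[idx] = lhs[idx] + rhs[idx]
    return out


x = np.arange(10, dtype=np.float32)
y = np.ones(10, dtype=np.float32)
print(blockwise_add(x, y, block_size=4))  # same values as x + y
```

Whatever block size is chosen, the result is identical; only how the work is partitioned changes, which is why the value can be autotuned or hard-coded purely for performance.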
170 changes: 61 additions & 109 deletions add.py
@@ -1,118 +1,70 @@
-import ninetoothed
 import torch
 import triton
-import triton.language as tl
-from ninetoothed import Symbol, Tensor
 
-BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
-
-
-@ninetoothed.jit
-def add_kernel(
-    lhs: Tensor(1).tile((BLOCK_SIZE,)),
-    rhs: Tensor(1).tile((BLOCK_SIZE,)),
-    output: Tensor(1).tile((BLOCK_SIZE,)),
-):
-    output = lhs + rhs  # noqa: F841
-
-
-def add(lhs, rhs):
-    output = torch.empty_like(lhs)
-
-    add_kernel(lhs, rhs, output)
-
-    return output
-
-
-@triton.jit
-def triton_add_kernel(
-    lhs_ptr,
-    rhs_ptr,
-    output_ptr,
-    n_elements,
-    BLOCK_SIZE: tl.constexpr,
-):
-    pid = tl.program_id(0)
-
-    block_start = pid * BLOCK_SIZE
-    offsets = block_start + tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_elements
-
-    lhs = tl.load(lhs_ptr + offsets, mask=mask)
-    rhs = tl.load(rhs_ptr + offsets, mask=mask)
-    output = lhs + rhs
-
-    tl.store(output_ptr + offsets, output, mask=mask)
-
-
-def triton_add(lhs, rhs):
-    output = torch.empty_like(lhs)
-    n_elements = output.numel()
-
-    def grid(meta):
-        return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
-
-    triton_add_kernel[grid](lhs, rhs, output, n_elements, BLOCK_SIZE=1024)
-
-    return output
-
-
-torch.manual_seed(0)
-size = 98432
-lhs = torch.rand(size, device="cuda")
-rhs = torch.rand(size, device="cuda")
-ninetoothed_output = add(lhs, rhs)
-torch_output = lhs + rhs
-triton_output = triton_add(lhs, rhs)
-print(ninetoothed_output)
-print(torch_output)
-print(triton_output)
-if torch.allclose(ninetoothed_output, torch_output):
-    print("✅ NineToothed and PyTorch match.")
-else:
-    print("❌ NineToothed and PyTorch differ.")
-if torch.allclose(ninetoothed_output, triton_output):
-    print("✅ NineToothed and Triton match.")
-else:
-    print("❌ NineToothed and Triton differ.")
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["size"],
-        x_vals=[2**i for i in range(12, 28, 1)],
-        x_log=True,
-        line_arg="provider",
-        line_vals=["ninetoothed", "torch", "triton"],
-        line_names=["NineToothed", "PyTorch", "Triton"],
-        styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-        ylabel="GB/s",
-        plot_name="vector-addition-performance",
-        args={},
+import ops.ninetoothed.torch
+import ops.triton.torch
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+
+    size = 98432
+    dtype = torch.float16
+    device = "cuda"
+
+    input = torch.randn(size, dtype=dtype, device=device)
+    other = torch.randn(size, dtype=dtype, device=device)
+
+    ninetoothed_output = ops.ninetoothed.torch.add(input, other)
+    torch_output = input + other
+    triton_output = ops.triton.torch.add(input, other)
+
+    print(ninetoothed_output)
+    print(torch_output)
+    print(triton_output)
+
+    if torch.allclose(ninetoothed_output, torch_output):
+        print("✅ NineToothed and PyTorch match.")
+    else:
+        print("❌ NineToothed and PyTorch differ.")
+    if torch.allclose(ninetoothed_output, triton_output, atol=0, rtol=0):
+        print("✅ NineToothed and Triton match.")
+    else:
+        print("❌ NineToothed and Triton differ.")
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["size"],
+            x_vals=[2**i for i in range(18, 28)],
+            x_log=True,
+            line_arg="provider",
+            line_vals=["ninetoothed", "torch", "triton"],
+            line_names=["NineToothed", "PyTorch", "Triton"],
+            styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
+            ylabel="ms",
+            plot_name="add-performance",
+            args={},
+        )
     )
-)
-def benchmark(size, provider):
-    lhs = torch.rand(size, device="cuda", dtype=torch.float32)
-    rhs = torch.rand(size, device="cuda", dtype=torch.float32)
-    quantiles = [0.5, 0.2, 0.8]
+    def benchmark(size, provider):
+        input = torch.randn(size, dtype=dtype, device=device)
+        other = torch.randn(size, dtype=dtype, device=device)
 
-    if provider == "ninetoothed":
-        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: add(lhs, rhs), quantiles=quantiles
-        )
-    elif provider == "torch":
-        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: lhs + rhs, quantiles=quantiles
-        )
-    elif provider == "triton":
-        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: triton_add(lhs, rhs), quantiles=quantiles
-        )
+        ninetoothed_output = ops.ninetoothed.torch.add(input, other)
+        torch_output = torch.add(input, other)
+        triton_output = ops.triton.torch.add(input, other)
 
-    def gbps(ms):
-        return 3 * lhs.numel() * lhs.element_size() / ms * 1e-6
+        assert torch.allclose(ninetoothed_output, torch_output)
+        assert torch.allclose(ninetoothed_output, triton_output, atol=0, rtol=0)
 
-    return gbps(ms), gbps(max_ms), gbps(min_ms)
+        if provider == "ninetoothed":
+            ms = triton.testing.do_bench(
+                lambda: ops.ninetoothed.torch.add(input, other)
+            )
+        elif provider == "torch":
+            ms = triton.testing.do_bench(lambda: torch.add(input, other))
+        elif provider == "triton":
+            ms = triton.testing.do_bench(lambda: ops.triton.torch.add(input, other))
 
+        return ms
 
-benchmark.run(print_data=True, show_plots=True, save_path=".")
+    benchmark.run(print_data=True, show_plots=True, save_path=".")
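One detail worth noting in the diff above: the Triton comparison passes `atol=0` and `rtol=0`, which turns `allclose` into an exact, element-wise equality check, appropriate when two kernels are expected to produce identical results. A small NumPy sketch of the same idea, with illustrative values:

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = a.copy()  # identical copy
c = a + 1e-9  # tiny perturbation, below the default tolerances

# With both tolerances zeroed, allclose accepts only exact equality.
assert np.allclose(a, b, atol=0, rtol=0)
assert not np.allclose(a, c, atol=0, rtol=0)

# Default tolerances (rtol=1e-5, atol=1e-8) still accept the perturbation.
assert np.allclose(a, c)
print("tolerance checks passed")
```

The PyTorch comparison keeps the default tolerances, presumably because `torch.add` may take a different rounding path than the custom kernels, whereas the NineToothed and Triton kernels are expected to match exactly.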