Skip to content

Commit 531f4e3

Browse files
committed
Merge branch 'mk/mulitprocess_build' into 'main'
Multi-process Warp library build. See merge request omniverse/warp!1547
2 parents 1c95862 + 988d29c commit 531f4e3

File tree

13 files changed

+163
-70
lines changed

13 files changed

+163
-70
lines changed

build_lib.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ def main(argv: list[str] | None = None) -> int:
207207
parser.add_argument("--quick", action="store_true", help="Only generate PTX code")
208208
parser.set_defaults(quick=False)
209209

210+
parser.add_argument("-j", "--jobs", type=int, default=4, help="Number of concurrent build tasks.")
211+
210212
group_clang_llvm = parser.add_argument_group("Clang/LLVM Options")
211213
group_clang_llvm.add_argument("--llvm_path", type=str, help="Path to an existing LLVM installation")
212214
group_clang_llvm.add_argument(
@@ -302,15 +304,28 @@ def main(argv: list[str] | None = None) -> int:
302304

303305
if args.cuda_path is None:
304306
print("Warning: CUDA toolchain not found, building without CUDA support")
305-
warp_cu_path = None
307+
warp_cu_paths = None
306308
else:
307-
warp_cu_path = os.path.join(build_path, "native/warp.cu")
309+
cuda_sources = [
310+
"native/bvh.cu",
311+
"native/mesh.cu",
312+
"native/sort.cu",
313+
"native/hashgrid.cu",
314+
"native/reduce.cu",
315+
"native/runlength_encode.cu",
316+
"native/scan.cu",
317+
"native/sparse.cu",
318+
"native/volume.cu",
319+
"native/volume_builder.cu",
320+
"native/warp.cu",
321+
]
322+
warp_cu_paths = [os.path.join(build_path, cu) for cu in cuda_sources]
308323

309324
if args.libmathdx and args.libmathdx_path is None:
310325
print("Warning: libmathdx not found, building without MathDx support")
311326

312327
warp_dll_path = os.path.join(build_path, f"bin/{lib_name('warp')}")
313-
build_dll.build_dll(args, dll_path=warp_dll_path, cpp_paths=warp_cpp_paths, cu_path=warp_cu_path)
328+
build_dll.build_dll(args, dll_path=warp_dll_path, cpp_paths=warp_cpp_paths, cu_paths=warp_cu_paths)
314329

315330
# build warp-clang.dll
316331
if args.standalone:

build_llvm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ def build_warp_clang_for_arch(args, lib_name: str, arch: str) -> None:
386386
args,
387387
dll_path=clang_dll_path,
388388
cpp_paths=clang_cpp_paths,
389-
cu_path=None,
389+
cu_paths=None,
390390
arch=arch,
391391
libs=libs,
392392
mode=args.mode if args.build_llvm else "release",

warp/build_dll.py

Lines changed: 109 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515

1616
from __future__ import annotations
1717

18+
import concurrent.futures
1819
import os
1920
import platform
2021
import subprocess
2122
import sys
23+
import time
2224

2325
from warp.utils import ScopedTimer
2426

@@ -174,15 +176,15 @@ def add_llvm_bin_to_path(args):
174176
return True
175177

176178

177-
def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str] | None = None, mode=None):
179+
def build_dll_for_arch(args, dll_path, cpp_paths, cu_paths, arch, libs: list[str] | None = None, mode=None):
178180
mode = args.mode if (mode is None) else mode
179181
cuda_home = args.cuda_path
180182
cuda_cmd = None
181183

182184
# Add LLVM bin directory to PATH
183185
add_llvm_bin_to_path(args)
184186

185-
if args.quick or cu_path is None:
187+
if args.quick or cu_paths is None:
186188
cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
187189
else:
188190
cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
@@ -200,7 +202,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
200202

201203
native_dir = os.path.join(warp_home, "native")
202204

203-
if cu_path:
205+
if cu_paths:
204206
# check CUDA Toolkit version
205207
ctk_version = get_cuda_toolkit_version(cuda_home)
206208
if ctk_version < MIN_CTK_VERSION:
@@ -298,15 +300,15 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
298300

299301
if args.compile_time_trace:
300302
if ctk_version >= (12, 8):
301-
nvcc_opts.append("--fdevice-time-trace=build_lib_compile-time-trace")
303+
nvcc_opts.append("--fdevice-time-trace=_build/build_lib_@filename@_compile-time-trace")
302304
else:
303305
print("Warp warning: CUDA version is less than 12.8, compile_time_trace is not supported")
304306

305307
if args.fast_math:
306308
nvcc_opts.append("--use_fast_math")
307309

308310
# is the library being built with CUDA enabled?
309-
cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
311+
cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_paths is not None) else "WP_ENABLE_CUDA=0"
310312

311313
if args.libmathdx_path:
312314
libmathdx_includes = f' -I"{args.libmathdx_path}/include"'
@@ -323,11 +325,11 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
323325

324326
cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
325327
cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
326-
cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
328+
cuda_includes = f' /I"{cuda_home}/include"' if cu_paths else ""
327329
includes = cpp_includes + cuda_includes
328330

329331
# nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
330-
if cu_path or mode != "debug":
332+
if cu_paths or mode != "debug":
331333
runtime = "/MT"
332334
iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
333335
debug = "NDEBUG"
@@ -353,33 +355,65 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
353355
if args.fast_math:
354356
cpp_flags += " /fp:fast"
355357

356-
with ScopedTimer("build", active=args.verbose):
358+
with concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs) as executor:
359+
futures, wall_clock = [], time.perf_counter_ns()
360+
361+
cpp_cmds = []
357362
for cpp_path in cpp_paths:
358363
cpp_out = cpp_path + ".obj"
359364
linkopts.append(quote(cpp_out))
360-
361365
cpp_cmd = f'"{args.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
362-
run_cmd(cpp_cmd)
366+
cpp_cmds.append(cpp_cmd)
363367

364-
if cu_path:
365-
cu_out = cu_path + ".o"
368+
if args.jobs <= 1:
369+
with ScopedTimer("build", active=args.verbose):
370+
for cpp_cmd in cpp_cmds:
371+
run_cmd(cpp_cmd)
372+
else:
373+
futures = [executor.submit(run_cmd, cmd=cpp_cmd) for cpp_cmd in cpp_cmds]
366374

367-
if mode == "debug":
368-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
375+
cuda_cmds = []
376+
if cu_paths:
377+
for cu_path in cu_paths:
378+
cu_out = cu_path + ".o"
379+
380+
_nvcc_opts = [
381+
opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts
382+
]
369383

370-
elif mode == "release":
371-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
384+
if mode == "debug":
385+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(_nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
386+
elif mode == "release":
387+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(_nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
388+
389+
cuda_cmds.append(cuda_cmd)
390+
391+
linkopts.append(quote(cu_out))
372392

373-
with ScopedTimer("build_cuda", active=args.verbose):
374-
run_cmd(cuda_cmd)
375-
linkopts.append(quote(cu_out))
376393
linkopts.append(
377394
f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
378395
)
379396

380397
if args.libmathdx_path:
381398
linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib/x64" mathdx_static.lib')
382399

400+
if args.jobs <= 1:
401+
with ScopedTimer("build_cuda", active=args.verbose):
402+
for cuda_cmd in cuda_cmds:
403+
run_cmd(cuda_cmd)
404+
else:
405+
futures.extend([executor.submit(run_cmd, cmd=cuda_cmd) for cuda_cmd in cuda_cmds])
406+
407+
if futures:
408+
done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
409+
for d in done:
410+
if e := d.exception():
411+
for f in pending:
412+
f.cancel()
413+
raise e
414+
elapsed = (time.perf_counter_ns() - wall_clock) / 1000000.0
415+
print(f"build took {elapsed:.2f} ms ({args.jobs:d} workers)")
416+
383417
with ScopedTimer("link", active=args.verbose):
384418
link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
385419
run_cmd(link_cmd)
@@ -391,7 +425,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
391425

392426
cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
393427
cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
394-
cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
428+
cuda_includes = f' -I"{cuda_home}/include"' if cu_paths else ""
395429
includes = cpp_includes + cuda_includes
396430

397431
if sys.platform == "darwin":
@@ -418,40 +452,72 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
418452

419453
ld_inputs = []
420454

421-
with ScopedTimer("build", active=args.verbose):
455+
with concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs) as executor:
456+
futures, wall_clock = [], time.perf_counter_ns()
457+
458+
cpp_cmds = []
422459
for cpp_path in cpp_paths:
423460
cpp_out = cpp_path + ".o"
424461
ld_inputs.append(quote(cpp_out))
462+
cpp_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
463+
cpp_cmds.append(cpp_cmd)
425464

426-
build_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
427-
run_cmd(build_cmd)
465+
if args.jobs <= 1:
466+
with ScopedTimer("build", active=args.verbose):
467+
for cpp_cmd in cpp_cmds:
468+
run_cmd(cpp_cmd)
469+
else:
470+
futures = [executor.submit(run_cmd, cmd=cpp_cmd) for cpp_cmd in cpp_cmds]
428471

429-
if cu_path:
430-
cu_out = cu_path + ".o"
472+
cuda_cmds = []
473+
if cu_paths:
474+
for cu_path in cu_paths:
475+
cu_out = cu_path + ".o"
431476

432-
if cuda_compiler == "nvcc":
433-
if mode == "debug":
434-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
435-
elif mode == "release":
436-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
437-
else:
438-
# Use Clang compiler
439-
if mode == "debug":
440-
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
441-
elif mode == "release":
442-
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
477+
_nvcc_opts = [
478+
opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts
479+
]
443480

444-
with ScopedTimer("build_cuda", active=args.verbose):
445-
run_cmd(cuda_cmd)
481+
if cuda_compiler == "nvcc":
482+
if mode == "debug":
483+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(_nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
484+
elif mode == "release":
485+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(_nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
486+
else:
487+
# Use Clang compiler
488+
if mode == "debug":
489+
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
490+
elif mode == "release":
491+
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
492+
493+
cuda_cmds.append(cuda_cmd)
494+
495+
ld_inputs.append(quote(cu_out))
446496

447-
ld_inputs.append(quote(cu_out))
448497
ld_inputs.append(
449498
f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
450499
)
451500

452501
if args.libmathdx_path:
453502
ld_inputs.append(f"-lnvJitLink_static -L{args.libmathdx_path}/lib -lmathdx_static")
454503

504+
if args.jobs <= 1:
505+
with ScopedTimer("build_cuda", active=args.verbose):
506+
for cuda_cmd in cuda_cmds:
507+
run_cmd(cuda_cmd)
508+
else:
509+
futures.extend([executor.submit(run_cmd, cmd=cuda_cmd) for cuda_cmd in cuda_cmds])
510+
511+
if futures:
512+
done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
513+
for d in done:
514+
if e := d.exception():
515+
for f in pending:
516+
f.cancel()
517+
raise e
518+
elapsed = (time.perf_counter_ns() - wall_clock) / 1000000.0
519+
print(f"build took {elapsed:.2f} ms ({args.jobs:d} workers)")
520+
455521
if sys.platform == "darwin":
456522
opt_no_undefined = "-Wl,-undefined,error"
457523
opt_exclude_libs = ""
@@ -475,15 +541,15 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
475541
)
476542

477543

478-
def build_dll(args, dll_path, cpp_paths, cu_path, libs=None):
544+
def build_dll(args, dll_path, cpp_paths, cu_paths, libs=None):
479545
if sys.platform == "darwin":
480546
# create a universal binary by combining x86-64 and AArch64 builds
481-
build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_path, "x86_64", libs)
482-
build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_path, "aarch64", libs)
547+
build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_paths, "x86_64", libs)
548+
build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_paths, "aarch64", libs)
483549

484550
run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
485551
os.remove(f"{dll_path}-x86_64")
486552
os.remove(f"{dll_path}-aarch64")
487553

488554
else:
489-
build_dll_for_arch(args, dll_path, cpp_paths, cu_path, machine_architecture(), libs)
555+
build_dll_for_arch(args, dll_path, cpp_paths, cu_paths, machine_architecture(), libs)

warp/native/builtin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
#define DEG_TO_RAD 0.01745329251994329577
5050

5151
#if defined(__CUDACC__) && !defined(_MSC_VER)
52-
__device__ void __debugbreak() { __brkpt(); }
52+
__device__ inline void __debugbreak() { __brkpt(); }
5353
#endif
5454

5555
#if defined(__clang__) && defined(__CUDA__) && defined(__CUDA_ARCH__)

warp/native/bvh.cu

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,22 @@
3131

3232
#include <cub/cub.cuh>
3333

34+
extern CUcontext get_current_context();
3435

3536
namespace wp
3637
{
37-
void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
38-
void bvh_destroy_host(BVH& bvh);
38+
void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
39+
void bvh_destroy_host(BVH& bvh);
40+
41+
__global__ void memset_kernel(int* dest, int value, size_t n)
42+
{
43+
const size_t tid = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
44+
45+
if (tid < n)
46+
{
47+
dest[tid] = value;
48+
}
49+
}
3950

4051
// for LBVH: this will start with some muted leaf nodes, but that is okay, we can still trace up because their parent information is still valid
4152
// the only thing worth mentioning is that when the parent of a leaf node is also a leaf node, we need to recompute its bounds, since its child information is lost
@@ -503,7 +514,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
503514
}
504515
else
505516
{
506-
// IEEE-754 bit patterns for ±FLT_MAX
517+
// IEEE-754 bit patterns for +/- FLT_MAX
507518
constexpr int FLT_MAX_BITS = 0x7f7fffff;
508519
constexpr int NEG_FLT_MAX_BITS = 0xff7fffff;
509520

warp/native/bvh.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -498,15 +498,16 @@ CUDA_CALLABLE bool bvh_get_descriptor(uint64_t id, BVH& bvh);
498498
CUDA_CALLABLE void bvh_add_descriptor(uint64_t id, const BVH& bvh);
499499
CUDA_CALLABLE void bvh_rem_descriptor(uint64_t id);
500500

501-
#if !__CUDA_ARCH__
502-
503501
void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
504502
void bvh_destroy_host(wp::BVH& bvh);
505503
void bvh_refit_host(wp::BVH& bvh);
506504

507-
void bvh_destroy_device(wp::BVH& bvh);
508-
void bvh_refit_device(uint64_t id);
505+
#if WP_ENABLE_CUDA
506+
507+
void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh_device_on_host);
508+
void bvh_destroy_device(BVH& bvh);
509+
void bvh_refit_device(BVH& bvh);
509510

510-
#endif
511+
#endif // WP_ENABLE_CUDA
511512

512513
} // namespace wp

warp/native/hashgrid.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include "hashgrid.h"
2121
#include "sort.h"
2222

23+
extern CUcontext get_current_context();
24+
2325
namespace wp
2426
{
2527

0 commit comments

Comments
 (0)