Skip to content

Commit 531f4e3

Browse files
committed
Merge branch 'mk/mulitprocess_build' into 'main'
Multi-process Warp library build. See merge request omniverse/warp!1547
2 parents 1c95862 + 988d29c commit 531f4e3

File tree

13 files changed

+163
-70
lines changed

13 files changed

+163
-70
lines changed

build_lib.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ def main(argv: list[str] | None = None) -> int:
207207
parser.add_argument("--quick", action="store_true", help="Only generate PTX code")
208208
parser.set_defaults(quick=False)
209209

210+
parser.add_argument("-j", "--jobs", type=int, default=4, help="Number of concurrent build tasks.")
211+
210212
group_clang_llvm = parser.add_argument_group("Clang/LLVM Options")
211213
group_clang_llvm.add_argument("--llvm_path", type=str, help="Path to an existing LLVM installation")
212214
group_clang_llvm.add_argument(
@@ -302,15 +304,28 @@ def main(argv: list[str] | None = None) -> int:
302304

303305
if args.cuda_path is None:
304306
print("Warning: CUDA toolchain not found, building without CUDA support")
305-
warp_cu_path = None
307+
warp_cu_paths = None
306308
else:
307-
warp_cu_path = os.path.join(build_path, "native/warp.cu")
309+
cuda_sources = [
310+
"native/bvh.cu",
311+
"native/mesh.cu",
312+
"native/sort.cu",
313+
"native/hashgrid.cu",
314+
"native/reduce.cu",
315+
"native/runlength_encode.cu",
316+
"native/scan.cu",
317+
"native/sparse.cu",
318+
"native/volume.cu",
319+
"native/volume_builder.cu",
320+
"native/warp.cu",
321+
]
322+
warp_cu_paths = [os.path.join(build_path, cu) for cu in cuda_sources]
308323

309324
if args.libmathdx and args.libmathdx_path is None:
310325
print("Warning: libmathdx not found, building without MathDx support")
311326

312327
warp_dll_path = os.path.join(build_path, f"bin/{lib_name('warp')}")
313-
build_dll.build_dll(args, dll_path=warp_dll_path, cpp_paths=warp_cpp_paths, cu_path=warp_cu_path)
328+
build_dll.build_dll(args, dll_path=warp_dll_path, cpp_paths=warp_cpp_paths, cu_paths=warp_cu_paths)
314329

315330
# build warp-clang.dll
316331
if args.standalone:

build_llvm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ def build_warp_clang_for_arch(args, lib_name: str, arch: str) -> None:
386386
args,
387387
dll_path=clang_dll_path,
388388
cpp_paths=clang_cpp_paths,
389-
cu_path=None,
389+
cu_paths=None,
390390
arch=arch,
391391
libs=libs,
392392
mode=args.mode if args.build_llvm else "release",

warp/build_dll.py

Lines changed: 109 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515

1616
from __future__ import annotations
1717

18+
import concurrent.futures
1819
import os
1920
import platform
2021
import subprocess
2122
import sys
23+
import time
2224

2325
from warp.utils import ScopedTimer
2426

@@ -174,15 +176,15 @@ def add_llvm_bin_to_path(args):
174176
return True
175177

176178

177-
def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str] | None = None, mode=None):
179+
def build_dll_for_arch(args, dll_path, cpp_paths, cu_paths, arch, libs: list[str] | None = None, mode=None):
178180
mode = args.mode if (mode is None) else mode
179181
cuda_home = args.cuda_path
180182
cuda_cmd = None
181183

182184
# Add LLVM bin directory to PATH
183185
add_llvm_bin_to_path(args)
184186

185-
if args.quick or cu_path is None:
187+
if args.quick or cu_paths is None:
186188
cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
187189
else:
188190
cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
@@ -200,7 +202,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
200202

201203
native_dir = os.path.join(warp_home, "native")
202204

203-
if cu_path:
205+
if cu_paths:
204206
# check CUDA Toolkit version
205207
ctk_version = get_cuda_toolkit_version(cuda_home)
206208
if ctk_version < MIN_CTK_VERSION:
@@ -298,15 +300,15 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
298300

299301
if args.compile_time_trace:
300302
if ctk_version >= (12, 8):
301-
nvcc_opts.append("--fdevice-time-trace=build_lib_compile-time-trace")
303+
nvcc_opts.append("--fdevice-time-trace=_build/build_lib_@filename@_compile-time-trace")
302304
else:
303305
print("Warp warning: CUDA version is less than 12.8, compile_time_trace is not supported")
304306

305307
if args.fast_math:
306308
nvcc_opts.append("--use_fast_math")
307309

308310
# is the library being built with CUDA enabled?
309-
cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
311+
cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_paths is not None) else "WP_ENABLE_CUDA=0"
310312

311313
if args.libmathdx_path:
312314
libmathdx_includes = f' -I"{args.libmathdx_path}/include"'
@@ -323,11 +325,11 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
323325

324326
cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
325327
cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
326-
cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
328+
cuda_includes = f' /I"{cuda_home}/include"' if cu_paths else ""
327329
includes = cpp_includes + cuda_includes
328330

329331
# nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
330-
if cu_path or mode != "debug":
332+
if cu_paths or mode != "debug":
331333
runtime = "/MT"
332334
iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
333335
debug = "NDEBUG"
@@ -353,33 +355,65 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
353355
if args.fast_math:
354356
cpp_flags += " /fp:fast"
355357

356-
with ScopedTimer("build", active=args.verbose):
358+
with concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs) as executor:
359+
futures, wall_clock = [], time.perf_counter_ns()
360+
361+
cpp_cmds = []
357362
for cpp_path in cpp_paths:
358363
cpp_out = cpp_path + ".obj"
359364
linkopts.append(quote(cpp_out))
360-
361365
cpp_cmd = f'"{args.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
362-
run_cmd(cpp_cmd)
366+
cpp_cmds.append(cpp_cmd)
363367

364-
if cu_path:
365-
cu_out = cu_path + ".o"
368+
if args.jobs <= 1:
369+
with ScopedTimer("build", active=args.verbose):
370+
for cpp_cmd in cpp_cmds:
371+
run_cmd(cpp_cmd)
372+
else:
373+
futures = [executor.submit(run_cmd, cmd=cpp_cmd) for cpp_cmd in cpp_cmds]
366374

367-
if mode == "debug":
368-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
375+
cuda_cmds = []
376+
if cu_paths:
377+
for cu_path in cu_paths:
378+
cu_out = cu_path + ".o"
379+
380+
_nvcc_opts = [
381+
opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts
382+
]
369383

370-
elif mode == "release":
371-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
384+
if mode == "debug":
385+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(_nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
386+
elif mode == "release":
387+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(_nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
388+
389+
cuda_cmds.append(cuda_cmd)
390+
391+
linkopts.append(quote(cu_out))
372392

373-
with ScopedTimer("build_cuda", active=args.verbose):
374-
run_cmd(cuda_cmd)
375-
linkopts.append(quote(cu_out))
376393
linkopts.append(
377394
f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
378395
)
379396

380397
if args.libmathdx_path:
381398
linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib/x64" mathdx_static.lib')
382399

400+
if args.jobs <= 1:
401+
with ScopedTimer("build_cuda", active=args.verbose):
402+
for cuda_cmd in cuda_cmds:
403+
run_cmd(cuda_cmd)
404+
else:
405+
futures.extend([executor.submit(run_cmd, cmd=cuda_cmd) for cuda_cmd in cuda_cmds])
406+
407+
if futures:
408+
done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
409+
for d in done:
410+
if e := d.exception():
411+
for f in pending:
412+
f.cancel()
413+
raise e
414+
elapsed = (time.perf_counter_ns() - wall_clock) / 1000000.0
415+
print(f"build took {elapsed:.2f} ms ({args.jobs:d} workers)")
416+
383417
with ScopedTimer("link", active=args.verbose):
384418
link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
385419
run_cmd(link_cmd)
@@ -391,7 +425,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
391425

392426
cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
393427
cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
394-
cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
428+
cuda_includes = f' -I"{cuda_home}/include"' if cu_paths else ""
395429
includes = cpp_includes + cuda_includes
396430

397431
if sys.platform == "darwin":
@@ -418,40 +452,72 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
418452

419453
ld_inputs = []
420454

421-
with ScopedTimer("build", active=args.verbose):
455+
with concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs) as executor:
456+
futures, wall_clock = [], time.perf_counter_ns()
457+
458+
cpp_cmds = []
422459
for cpp_path in cpp_paths:
423460
cpp_out = cpp_path + ".o"
424461
ld_inputs.append(quote(cpp_out))
462+
cpp_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
463+
cpp_cmds.append(cpp_cmd)
425464

426-
build_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
427-
run_cmd(build_cmd)
465+
if args.jobs <= 1:
466+
with ScopedTimer("build", active=args.verbose):
467+
for cpp_cmd in cpp_cmds:
468+
run_cmd(cpp_cmd)
469+
else:
470+
futures = [executor.submit(run_cmd, cmd=cpp_cmd) for cpp_cmd in cpp_cmds]
428471

429-
if cu_path:
430-
cu_out = cu_path + ".o"
472+
cuda_cmds = []
473+
if cu_paths:
474+
for cu_path in cu_paths:
475+
cu_out = cu_path + ".o"
431476

432-
if cuda_compiler == "nvcc":
433-
if mode == "debug":
434-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
435-
elif mode == "release":
436-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
437-
else:
438-
# Use Clang compiler
439-
if mode == "debug":
440-
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
441-
elif mode == "release":
442-
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
477+
_nvcc_opts = [
478+
opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts
479+
]
443480

444-
with ScopedTimer("build_cuda", active=args.verbose):
445-
run_cmd(cuda_cmd)
481+
if cuda_compiler == "nvcc":
482+
if mode == "debug":
483+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(_nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
484+
elif mode == "release":
485+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(_nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
486+
else:
487+
# Use Clang compiler
488+
if mode == "debug":
489+
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
490+
elif mode == "release":
491+
cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
492+
493+
cuda_cmds.append(cuda_cmd)
494+
495+
ld_inputs.append(quote(cu_out))
446496

447-
ld_inputs.append(quote(cu_out))
448497
ld_inputs.append(
449498
f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
450499
)
451500

452501
if args.libmathdx_path:
453502
ld_inputs.append(f"-lnvJitLink_static -L{args.libmathdx_path}/lib -lmathdx_static")
454503

504+
if args.jobs <= 1:
505+
with ScopedTimer("build_cuda", active=args.verbose):
506+
for cuda_cmd in cuda_cmds:
507+
run_cmd(cuda_cmd)
508+
else:
509+
futures.extend([executor.submit(run_cmd, cmd=cuda_cmd) for cuda_cmd in cuda_cmds])
510+
511+
if futures:
512+
done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
513+
for d in done:
514+
if e := d.exception():
515+
for f in pending:
516+
f.cancel()
517+
raise e
518+
elapsed = (time.perf_counter_ns() - wall_clock) / 1000000.0
519+
print(f"build took {elapsed:.2f} ms ({args.jobs:d} workers)")
520+
455521
if sys.platform == "darwin":
456522
opt_no_undefined = "-Wl,-undefined,error"
457523
opt_exclude_libs = ""
@@ -475,15 +541,15 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
475541
)
476542

477543

478-
def build_dll(args, dll_path, cpp_paths, cu_path, libs=None):
544+
def build_dll(args, dll_path, cpp_paths, cu_paths, libs=None):
479545
if sys.platform == "darwin":
480546
# create a universal binary by combining x86-64 and AArch64 builds
481-
build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_path, "x86_64", libs)
482-
build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_path, "aarch64", libs)
547+
build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_paths, "x86_64", libs)
548+
build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_paths, "aarch64", libs)
483549

484550
run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
485551
os.remove(f"{dll_path}-x86_64")
486552
os.remove(f"{dll_path}-aarch64")
487553

488554
else:
489-
build_dll_for_arch(args, dll_path, cpp_paths, cu_path, machine_architecture(), libs)
555+
build_dll_for_arch(args, dll_path, cpp_paths, cu_paths, machine_architecture(), libs)

warp/native/builtin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
#define DEG_TO_RAD 0.01745329251994329577
5050

5151
#if defined(__CUDACC__) && !defined(_MSC_VER)
52-
__device__ void __debugbreak() { __brkpt(); }
52+
__device__ inline void __debugbreak() { __brkpt(); }
5353
#endif
5454

5555
#if defined(__clang__) && defined(__CUDA__) && defined(__CUDA_ARCH__)

warp/native/bvh.cu

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,22 @@
3131

3232
#include <cub/cub.cuh>
3333

34+
extern CUcontext get_current_context();
3435

3536
namespace wp
3637
{
37-
void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
38-
void bvh_destroy_host(BVH& bvh);
38+
void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
39+
void bvh_destroy_host(BVH& bvh);
40+
41+
__global__ void memset_kernel(int* dest, int value, size_t n)
42+
{
43+
const size_t tid = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
44+
45+
if (tid < n)
46+
{
47+
dest[tid] = value;
48+
}
49+
}
3950

4051
// for LBVH: this will start with some muted leaf nodes, but that is okay, we can still trace up because their parent information is still valid
4152
// the only thing worth mentioning is that when the parent of a leaf node is also a leaf node, we need to recompute its bounds, since its child information is lost
@@ -503,7 +514,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
503514
}
504515
else
505516
{
506-
// IEEE-754 bit patterns for ±FLT_MAX
517+
// IEEE-754 bit patterns for +/- FLT_MAX
507518
constexpr int FLT_MAX_BITS = 0x7f7fffff;
508519
constexpr int NEG_FLT_MAX_BITS = 0xff7fffff;
509520

warp/native/bvh.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -498,15 +498,16 @@ CUDA_CALLABLE bool bvh_get_descriptor(uint64_t id, BVH& bvh);
498498
CUDA_CALLABLE void bvh_add_descriptor(uint64_t id, const BVH& bvh);
499499
CUDA_CALLABLE void bvh_rem_descriptor(uint64_t id);
500500

501-
#if !__CUDA_ARCH__
502-
503501
void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
504502
void bvh_destroy_host(wp::BVH& bvh);
505503
void bvh_refit_host(wp::BVH& bvh);
506504

507-
void bvh_destroy_device(wp::BVH& bvh);
508-
void bvh_refit_device(uint64_t id);
505+
#if WP_ENABLE_CUDA
506+
507+
void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh_device_on_host);
508+
void bvh_destroy_device(BVH& bvh);
509+
void bvh_refit_device(BVH& bvh);
509510

510-
#endif
511+
#endif // WP_ENABLE_CUDA
511512

512513
} // namespace wp

warp/native/hashgrid.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include "hashgrid.h"
2121
#include "sort.h"
2222

23+
extern CUcontext get_current_context();
24+
2325
namespace wp
2426
{
2527

0 commit comments

Comments
 (0)