from __future__ import annotations

+import concurrent.futures
import os
import platform
import subprocess
import sys
+import time

from warp.utils import ScopedTimer
@@ -174,15 +176,15 @@ def add_llvm_bin_to_path(args):
    return True


-def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str] | None = None, mode=None):
+def build_dll_for_arch(args, dll_path, cpp_paths, cu_paths, arch, libs: list[str] | None = None, mode=None):
    mode = args.mode if (mode is None) else mode
    cuda_home = args.cuda_path
    cuda_cmd = None

    # Add LLVM bin directory to PATH
    add_llvm_bin_to_path(args)

-    if args.quick or cu_path is None:
+    if args.quick or cu_paths is None:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
    else:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"
@@ -200,7 +202,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]

    native_dir = os.path.join(warp_home, "native")

-    if cu_path:
+    if cu_paths:
        # check CUDA Toolkit version
        ctk_version = get_cuda_toolkit_version(cuda_home)
        if ctk_version < MIN_CTK_VERSION:
@@ -298,15 +300,15 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]

        if args.compile_time_trace:
            if ctk_version >= (12, 8):
-                nvcc_opts.append("--fdevice-time-trace=build_lib_compile-time-trace")
+                nvcc_opts.append("--fdevice-time-trace=_build/build_lib_@filename@_compile-time-trace")
            else:
                print("Warp warning: CUDA version is less than 12.8, compile_time_trace is not supported")

        if args.fast_math:
            nvcc_opts.append("--use_fast_math")

    # is the library being built with CUDA enabled?
-    cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
+    cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_paths is not None) else "WP_ENABLE_CUDA=0"

    if args.libmathdx_path:
        libmathdx_includes = f' -I"{args.libmathdx_path}/include"'
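
Note: the new `--fdevice-time-trace` value contains an `@filename@` placeholder that only becomes a real path once the per-file option list (`_nvcc_opts`, built later in this diff) substitutes the CUDA source name into it. A small illustration of that substitution, using a hypothetical source file:

```python
import os

# placeholder form added above (assumed to be the only templated option)
nvcc_opts = ["--fdevice-time-trace=_build/build_lib_@filename@_compile-time-trace"]

cu_path = "warp/native/warp.cu"  # hypothetical CUDA source
_nvcc_opts = [opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts]

print(_nvcc_opts[0])
# --fdevice-time-trace=_build/build_lib_warp_cu_compile-time-trace
```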
@@ -323,11 +325,11 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]

        cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
-        cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
+        cuda_includes = f' /I"{cuda_home}/include"' if cu_paths else ""
        includes = cpp_includes + cuda_includes

        # nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
-        if cu_path or mode != "debug":
+        if cu_paths or mode != "debug":
            runtime = "/MT"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
            debug = "NDEBUG"
@@ -353,33 +355,65 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
        if args.fast_math:
            cpp_flags += " /fp:fast"

-        with ScopedTimer("build", active=args.verbose):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs) as executor:
+            futures, wall_clock = [], time.perf_counter_ns()
+
+            cpp_cmds = []
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".obj"
                linkopts.append(quote(cpp_out))
-
                cpp_cmd = f'"{args.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
-                run_cmd(cpp_cmd)
+                cpp_cmds.append(cpp_cmd)

-        if cu_path:
-            cu_out = cu_path + ".o"
+            if args.jobs <= 1:
+                with ScopedTimer("build", active=args.verbose):
+                    for cpp_cmd in cpp_cmds:
+                        run_cmd(cpp_cmd)
+            else:
+                futures = [executor.submit(run_cmd, cmd=cpp_cmd) for cpp_cmd in cpp_cmds]

-            if mode == "debug":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+            cuda_cmds = []
+            if cu_paths:
+                for cu_path in cu_paths:
+                    cu_out = cu_path + ".o"
+
+                    _nvcc_opts = [
+                        opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts
+                    ]

-            elif mode == "release":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                    if mode == "debug":
+                        cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(_nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                    elif mode == "release":
+                        cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(_nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+
+                    cuda_cmds.append(cuda_cmd)
+
+                    linkopts.append(quote(cu_out))

-            with ScopedTimer("build_cuda", active=args.verbose):
-                run_cmd(cuda_cmd)
-                linkopts.append(quote(cu_out))
                linkopts.append(
                    f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                )

                if args.libmathdx_path:
                    linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib/x64" mathdx_static.lib')

+            if args.jobs <= 1:
+                with ScopedTimer("build_cuda", active=args.verbose):
+                    for cuda_cmd in cuda_cmds:
+                        run_cmd(cuda_cmd)
+            else:
+                futures.extend([executor.submit(run_cmd, cmd=cuda_cmd) for cuda_cmd in cuda_cmds])
+
+            if futures:
+                done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
+                for d in done:
+                    if e := d.exception():
+                        for f in pending:
+                            f.cancel()
+                        raise e
+            elapsed = (time.perf_counter_ns() - wall_clock) / 1000000.0
+            print(f"build took {elapsed:.2f} ms ({args.jobs:d} workers)")
+
        with ScopedTimer("link", active=args.verbose):
            link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
            run_cmd(link_cmd)
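
The hunk above swaps the single `ScopedTimer("build")` block for a thread pool: compile commands are collected first, then either run serially (`args.jobs <= 1`) or submitted to the executor, and the futures are drained with `FIRST_EXCEPTION` so one failing compile cancels the not-yet-started jobs. A minimal standalone sketch of that pattern (not part of the diff), with placeholder commands and a simplified `run_cmd`:

```python
import concurrent.futures
import subprocess


def run_cmd(cmd):
    # stand-in for the build script's run_cmd(); raises CalledProcessError on failure
    subprocess.check_call(cmd, shell=True)


def run_all(cmds, jobs):
    if jobs <= 1:
        for cmd in cmds:
            run_cmd(cmd)
        return
    with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor:
        futures = [executor.submit(run_cmd, cmd) for cmd in cmds]
        # stop waiting as soon as any command raises
        done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
        for d in done:
            if (e := d.exception()) is not None:
                for f in pending:
                    f.cancel()  # skip jobs that have not started yet
                raise e


run_all(['echo "compile a"', 'echo "compile b"', 'echo "compile c"'], jobs=2)
```

A `ThreadPoolExecutor` is sufficient here because the worker threads spend their time blocked on compiler subprocesses, so the GIL is not the bottleneck.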
@@ -391,7 +425,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]

        cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
-        cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
+        cuda_includes = f' -I"{cuda_home}/include"' if cu_paths else ""
        includes = cpp_includes + cuda_includes

        if sys.platform == "darwin":
@@ -418,40 +452,72 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]

        ld_inputs = []

-        with ScopedTimer("build", active=args.verbose):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs) as executor:
+            futures, wall_clock = [], time.perf_counter_ns()
+
+            cpp_cmds = []
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".o"
                ld_inputs.append(quote(cpp_out))
+                cpp_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
+                cpp_cmds.append(cpp_cmd)

-                build_cmd = f'{cpp_compiler} {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
-                run_cmd(build_cmd)
+            if args.jobs <= 1:
+                with ScopedTimer("build", active=args.verbose):
+                    for cpp_cmd in cpp_cmds:
+                        run_cmd(cpp_cmd)
+            else:
+                futures = [executor.submit(run_cmd, cmd=cpp_cmd) for cpp_cmd in cpp_cmds]

-        if cu_path:
-            cu_out = cu_path + ".o"
+            cuda_cmds = []
+            if cu_paths:
+                for cu_path in cu_paths:
+                    cu_out = cu_path + ".o"

-            if cuda_compiler == "nvcc":
-                if mode == "debug":
-                    cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
-                elif mode == "release":
-                    cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
-            else:
-                # Use Clang compiler
-                if mode == "debug":
-                    cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
-                elif mode == "release":
-                    cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                    _nvcc_opts = [
+                        opt.replace("@filename@", os.path.basename(cu_path).replace(".", "_")) for opt in nvcc_opts
+                    ]

-            with ScopedTimer("build_cuda", active=args.verbose):
-                run_cmd(cuda_cmd)
+                    if cuda_compiler == "nvcc":
+                        if mode == "debug":
+                            cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(_nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                        elif mode == "release":
+                            cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(_nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                    else:
+                        # Use Clang compiler
+                        if mode == "debug":
+                            cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -g -O0 -fPIC -fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+                        elif mode == "release":
+                            cuda_cmd = f'clang++ -Werror -Wuninitialized -Wno-unknown-cuda-version {" ".join(clang_opts)} -O3 -fPIC -fvisibility=hidden -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
+
+                    cuda_cmds.append(cuda_cmd)
+
+                    ld_inputs.append(quote(cu_out))

-            ld_inputs.append(quote(cu_out))
            ld_inputs.append(
                f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
            )

            if args.libmathdx_path:
                ld_inputs.append(f"-lnvJitLink_static -L{args.libmathdx_path}/lib -lmathdx_static")

+            if args.jobs <= 1:
+                with ScopedTimer("build_cuda", active=args.verbose):
+                    for cuda_cmd in cuda_cmds:
+                        run_cmd(cuda_cmd)
+            else:
+                futures.extend([executor.submit(run_cmd, cmd=cuda_cmd) for cuda_cmd in cuda_cmds])
+
+            if futures:
+                done, pending = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
+                for d in done:
+                    if e := d.exception():
+                        for f in pending:
+                            f.cancel()
+                        raise e
+            elapsed = (time.perf_counter_ns() - wall_clock) / 1000000.0
+            print(f"build took {elapsed:.2f} ms ({args.jobs:d} workers)")
+
        if sys.platform == "darwin":
            opt_no_undefined = "-Wl,-undefined,error"
            opt_exclude_libs = ""
@@ -475,15 +541,15 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: list[str]
        )


-def build_dll(args, dll_path, cpp_paths, cu_path, libs=None):
+def build_dll(args, dll_path, cpp_paths, cu_paths, libs=None):
    if sys.platform == "darwin":
        # create a universal binary by combining x86-64 and AArch64 builds
-        build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_path, "x86_64", libs)
-        build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_path, "aarch64", libs)
+        build_dll_for_arch(args, dll_path + "-x86_64", cpp_paths, cu_paths, "x86_64", libs)
+        build_dll_for_arch(args, dll_path + "-aarch64", cpp_paths, cu_paths, "aarch64", libs)

        run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
        os.remove(f"{dll_path}-x86_64")
        os.remove(f"{dll_path}-aarch64")

    else:
-        build_dll_for_arch(args, dll_path, cpp_paths, cu_path, machine_architecture(), libs)
+        build_dll_for_arch(args, dll_path, cpp_paths, cu_paths, machine_architecture(), libs)
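
Since `cu_path` becomes `cu_paths` throughout, call sites (not shown in this diff) now pass a list of CUDA sources, or `None` for CPU-only builds. A hypothetical example of the updated call, with made-up paths:

```python
# hypothetical call site; the real paths and args come from the build scripts, which are outside this diff
build_dll(
    args,
    dll_path="warp/bin/warp.dll",
    cpp_paths=["warp/native/warp.cpp"],
    cu_paths=["warp/native/warp.cu"],  # now a list; pass None to build without CUDA
)
```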