diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ff552ad3..4a00e9814 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,17 @@ jobs: pybind11_ver: v2.5.0 simd: sse4.2 setenvs: export CONAN_LLVM_VERSION=10.0.1 + - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2 + nametag: linux-vfx2021 + runner: ubuntu-latest + container: aswftesting/ci-osl:2021-clang11 + vfxyear: 2021 + cxx_std: 17 + openimageio_ver: v2.4.13.0 + python_ver: 3.7 + pybind11_ver: v2.7.0 + simd: sse2 + batched: b4_SSE2 - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2 nametag: linux-vfx2021 runner: ubuntu-latest diff --git a/INSTALL.md b/INSTALL.md index 25ec63d52..58ba76337 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as well, but we don't officially support or test other than these platforms. Shader execution is supported on the native architectures of those x86_64 and -aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode -requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs -using Cuda+OptiX. +aarch64 platforms, a special batched 4-, 8- or 16-wide SIMD execution mode +requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on +NVIDIA GPUs using Cuda+OptiX. Dependencies ------------ diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index 81ac2092f..4e86cc055 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -329,7 +329,7 @@ endif () # # The USE_BATCHED option may be set to indicate that support for batched # SIMD shader execution be compiled along with targe specific libraries -set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") +set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF) set (BATCHED_SUPPORT_DEFINES "") set (BATCHED_TARGET_LIBS "") diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h index c720e9bed..787664472 100644 --- a/src/include/OSL/batched_texture.h +++ b/src/include/OSL/batched_texture.h @@ -49,6 +49,9 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); template struct BatchedTextureOptions { VaryingTextureOptions varying; @@ -90,11 +93,15 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH // Code here is to validate our OSL BatchedTextureOptions is binary compatible // and safe to reinterpret_cast -static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8), +static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) + || (OIIO::Tex::BatchWidth == 4), "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16"); namespace 
validate_offsets { diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h index 7f112ccf5..49df62891 100644 --- a/src/include/OSL/llvm_util.h +++ b/src/include/OSL/llvm_util.h @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Constant* constant(uint32_t i); /// Return an llvm::Constant holding the given integer constant. + llvm::Constant* constant4(int8_t i); + llvm::Constant* constant4(uint8_t i); llvm::Constant* constant8(int8_t i); llvm::Constant* constant8(uint8_t i); llvm::Constant* constant16(int16_t i); @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index); llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index); + llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index); std::array op_split_16x(llvm::Value* vector_val); std::array op_split_8x(llvm::Value* vector_val); std::array op_quarter_16x(llvm::Value* vector_val); diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h index 04e5269ae..62a6b6179 100644 --- a/src/include/OSL/rendererservices.h +++ b/src/include/OSL/rendererservices.h @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices { /// Unless overridden, a nullptr is returned. virtual BatchedRendererServices<16>* batched(WidthOf<16>); virtual BatchedRendererServices<8>* batched(WidthOf<8>); + virtual BatchedRendererServices<4>* batched(WidthOf<4>); protected: TextureSystem* m_texturesys; // A place to hold a TextureSystem diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index 328565af6..2a2ea5c1b 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=core-avx2") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=corei7-avx") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=x86-64") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () @@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=haswell") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=sandybridge") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=x86-64") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp index 888f19874..9f76c1acf 100644 --- a/src/liboslexec/batched_analysis.cpp +++ b/src/liboslexec/batched_analysis.cpp @@ -1813,10 +1813,16 @@ struct Analyzer { // specific BatchedRendererServices. 
// Right here we don't know which width will be used, // so we will just require all widths provide the same answer + auto rs4 = m_ba.renderer()->batched(WidthOf<4>()); auto rs8 = m_ba.renderer()->batched(WidthOf<8>()); auto rs16 = m_ba.renderer()->batched(WidthOf<16>()); - if (rs8 || rs16) { + if (rs4 || rs8 || rs16) { get_attr_is_uniform = true; + if (rs4) { + get_attr_is_uniform + &= rs4->is_attribute_uniform(obj_name, + attr_name); + } if (rs8) { get_attr_is_uniform &= rs8->is_attribute_uniform(obj_name, diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp index e94122ef4..79f87ca90 100644 --- a/src/liboslexec/batched_backendllvm.cpp +++ b/src/liboslexec/batched_backendllvm.cpp @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys, switch (vector_width()) { case 16: m_true_mask_value = Mask<16>(true).value(); break; case 8: m_true_mask_value = Mask<8>(true).value(); break; + case 4: m_true_mask_value = Mask<4>(true).value(); break; default: OSL_ASSERT(0 && "unsupported vector width"); } ll.dumpasm(shadingsys.m_llvm_dumpasm); diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp index 8e6ff0a76..218063786 100644 --- a/src/liboslexec/batched_llvm_instance.cpp +++ b/src/liboslexec/batched_llvm_instance.cpp @@ -537,6 +537,33 @@ const char* = "b8_AVX_"; #endif +#ifdef __OSL_SUPPORTS_b4_SSE2 +template<> +const NameAndSignature + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[] + = { +# define DECL_INDIRECT(name, signature) \ + NameAndSignature { #name, signature }, +# define DECL(name, signature) DECL_INDIRECT(name, signature) +# define __OSL_WIDTH 4 +# define __OSL_TARGET_ISA SSE2 +// Don't allow order of xmacro includes be rearranged +// clang-format off +# include "wide/define_opname_macros.h" +# include "builtindecl_wide_xmacro.h" +# include "wide/undef_opname_macros.h" +// clang-format on +# undef __OSL_TARGET_ISA +# undef __OSL_WIDTH +# undef DECL +# undef DECL_INDIRECT + }; +template<> +const char* + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string + = "b4_SSE2_"; +#endif + std::unique_ptr @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context, default: break; } break; + case 4: + switch (target_isa) { +#ifdef __OSL_SUPPORTS_b4_SSE2 + case TargetISA::x64: + return RetType( + new ConcreteTargetLibraryHelper<4, TargetISA::x64>()); +#endif + default: break; + } + break; + default: OSL_ASSERT(0 && "unsupported vector width"); } std::cerr << "Build is not configured to support TargetISA of " @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedTextureOptions<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedTextureOptions<8>(offset_by_index); break; @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedShaderGlobals<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedShaderGlobals<8>(offset_by_index); break; diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp index fbff377b2..1c5fcaa4a 100644 --- a/src/liboslexec/batched_rendservices.cpp +++ b/src/liboslexec/batched_rendservices.cpp @@ -328,5 +328,6 @@ BatchedRendererServices::getmessage(BatchedShaderGlobals* bsg, // Explicitly instantiate BatchedRendererServices template template class 
OSLEXECPUBLIC BatchedRendererServices<16>; template class OSLEXECPUBLIC BatchedRendererServices<8>; +template class OSLEXECPUBLIC BatchedRendererServices<4>; OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp index a97b427e1..b001315a8 100644 --- a/src/liboslexec/context.cpp +++ b/src/liboslexec/context.cpp @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg) // Explicit template instantiation for supported batch sizes template class ShadingContext::Batched<16>; template class ShadingContext::Batched<8>; +template class ShadingContext::Batched<4>; #endif diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h index 852ec82f9..43c7a7289 100644 --- a/src/liboslexec/llvm_passes.h +++ b/src/liboslexec/llvm_passes.h @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final // including this file will need its own static members defined. LLVM will // assign IDs when they get registered, so this initialization value is not // important. +template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0; + template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0; template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0; diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 3dd888cab..2a434a0b8 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM() #ifndef OSL_LLVM_NEW_PASS_MANAGER // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks + static llvm::RegisterPass< + LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>> + sRegCustomPass2( + "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>", + "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass", + false /* Only looks at CFG */, false /* Analysis Pass */); static llvm::RegisterPass< LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>> sRegCustomPass0( @@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host) break; } case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! + m_new_pass_manager->module_pass_manager.addPass( + createModuleToFunctionPassAdaptor( + NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>( + context()))); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host) new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>()); break; case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! 
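            // Editorial note (not part of the patch): per the pass registration
            // earlier in this file, this pass prevents <4 x i1> bit masks from
            // being live-ins to basic blocks, mirroring the 8- and 16-wide
            // cases above; presumably any pass scheduled after it could
            // reintroduce such live-ins, hence "MUST BE THE FINAL PASS!".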
+ mpm.add( + new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>()); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -3393,6 +3405,19 @@ LLVM_Util::constant(uint32_t i) return llvm::ConstantInt::get(context(), llvm::APInt(32, i)); } +llvm::Constant* +LLVM_Util::constant4(int8_t i) +{ + return llvm::ConstantInt::get(context(), + llvm::APInt(4, i, true /*signed*/)); +} + +llvm::Constant* +LLVM_Util::constant4(uint8_t i) +{ + return llvm::ConstantInt::get(context(), llvm::APInt(4, i)); +} + llvm::Constant* LLVM_Util::constant8(int8_t i) { @@ -3592,6 +3617,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 4 bit mask to a 8 bit integer + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3659,6 +3689,25 @@ LLVM_Util::mask_as_int(llvm::Value* mask) int8_mask = builder().CreateCall(func, toArrayRef(args)); return int8_mask; } + case 4: { + // We need to do more than a simple cast to an int. Since we + // know vectorized comparison for SSE2 ends up setting 4 + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to + // do more than a simple cast to an int. + + // Convert <4 x i1> -> <4 x i32> + llvm::Value* w4_int_mask = builder().CreateSExt(mask, + type_wide_int()); + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. + llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + + llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* int8_mask; + int8_mask = builder().CreateCall(func, toArrayRef(args)); + return int8_mask; + } default: { OSL_ASSERT(0 && "unsupported native bit mask width"); return mask; @@ -3678,28 +3727,18 @@ LLVM_Util::mask_as_int(llvm::Value* mask) auto w4_int_masks = op_quarter_16x(wide_int_mask); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. However the only 128bit - // version works on floats, so we will cast from int32 to - // float beforehand - llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - std::array w4_float_masks = { - { builder().CreateBitCast(w4_int_masks[0], w4_float_type), - builder().CreateBitCast(w4_int_masks[1], w4_float_type), - builder().CreateBitCast(w4_int_masks[2], w4_float_type), - builder().CreateBitCast(w4_int_masks[3], w4_float_type) } - }; - + // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse_movmsk_ps); + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); - llvm::Value* args[1] = { w4_float_masks[0] }; + llvm::Value* args[1] = { w4_int_masks[0] }; std::array int4_masks; int4_masks[0] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[1]; + args[0] = w4_int_masks[1]; int4_masks[1] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[2]; + args[0] = w4_int_masks[2]; int4_masks[2] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[3]; + args[0] = w4_int_masks[3]; int4_masks[3] = builder().CreateCall(func, toArrayRef(args)); llvm::Value* bits12_15 = op_shl(int4_masks[3], constant(12)); @@ -3720,22 +3759,14 @@ LLVM_Util::mask_as_int(llvm::Value* mask) auto w4_int_masks = op_split_8x(wide_int_mask); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. 
However the only 128bit - // version works on floats, so we will cast from int32 to - // float beforehand - llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - std::array w4_float_masks = { - { builder().CreateBitCast(w4_int_masks[0], w4_float_type), - builder().CreateBitCast(w4_int_masks[1], w4_float_type) } - }; - + // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse_movmsk_ps); + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); - llvm::Value* args[1] = { w4_float_masks[0] }; + llvm::Value* args[1] = { w4_int_masks[0] }; std::array int4_masks; int4_masks[0] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[1]; + args[0] = w4_int_masks[1]; int4_masks[1] = builder().CreateCall(func, toArrayRef(args)); llvm::Value* bits4_7 = op_shl(int4_masks[1], constant(4)); @@ -3748,21 +3779,15 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // do more than a simple cast to an int. // Convert <4 x i1> -> <4 x i32> - llvm::Value* wide_int_mask = builder().CreateSExt(mask, - type_wide_int()); + llvm::Value* w4_int_mask = builder().CreateSExt(mask, + type_wide_int()); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. However the only 128bit - // version works on floats, so we will cast from int32 to - // float beforehand - llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - llvm::Value* w4_float_mask = builder().CreateBitCast(wide_int_mask, - w4_float_type); - + // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse_movmsk_ps); + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); - llvm::Value* args[1] = { w4_float_mask }; + llvm::Value* args[1] = { w4_int_mask }; llvm::Value* int4_mask = builder().CreateCall(func, toArrayRef(args)); @@ -3797,13 +3822,28 @@ LLVM_Util::mask_as_int8(llvm::Value* mask) llvm::Value* LLVM_Util::mask4_as_int8(llvm::Value* mask) { - OSL_ASSERT(m_supports_llvm_bit_masks_natively); - // combine <4xi1> mask with <4xi1> zero init to get <8xi1> and cast it - // to i8 - llvm::Value* zero_mask4 - = llvm::ConstantDataVector::getSplat(4, constant_bool(false)); - return builder().CreateBitCast(op_combine_4x_vectors(mask, zero_mask4), - type_int8()); + if (m_supports_llvm_bit_masks_natively) { + // combine <4xi1> mask with <4xi1> zero init to get <8xi1> and cast it + // to i8 + llvm::Value* zero_mask4 + = llvm::ConstantDataVector::getSplat(4, constant_bool(false)); + return builder().CreateBitCast(op_combine_4x_vectors(mask, zero_mask4), + type_int8()); + } else { + // Convert <4 x i1> -> <4 x i32> + llvm::Value* w4_int_mask = builder().CreateSExt(mask, type_wide_int()); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. 
+ llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + + llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* int32 = builder().CreateCall(func, toArrayRef(args)); + llvm::Value* i8 = builder().CreateIntCast(int32, type_int8(), true); + + return i8; + } } @@ -3828,14 +3868,19 @@ LLVM_Util::int_as_mask(llvm::Value* value) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 8 bit integer to a 4 bit mask + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; llvm::Value* intMask = builder().CreateTrunc(value, intMaskType); result = builder().CreateBitCast(intMask, type_wide_bool()); } else { - // Since we know vectorized comparisons for AVX&AVX2 end up setting - // 8 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more + // Since we know vectorized comparisons for SSE2&AVX&AVX2 end up setting + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more // than a simple cast to an int. // Broadcast out the int32 mask to all data lanes @@ -3950,23 +3995,25 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; -#if 0 // WIP - case 4: - { - // We can just reinterpret cast a 8 bit mask to a 8 bit integer - // and all types are happy - intMaskType = type_int8(); + case 4: { + intMaskType = type_int8(); -// extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); -// llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); -// -// int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); -// zeroConstant = constant128(0); -// -// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); - break; - } -#endif + llvm::Value* mask_as_int = mask4_as_int8(mask); + + // Count trailing zeros, least significant + llvm::Type* types[] = { intMaskType }; + llvm::Function* func_cttz + = llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::cttz, + toArrayRef(types)); + + llvm::Value* args[2] = { mask_as_int, constant_bool(true) }; + + llvm::Value* firstNonZeroIndex = builder().CreateCall(func_cttz, + toArrayRef(args)); + return firstNonZeroIndex; + + break; + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4455,6 +4502,19 @@ LLVM_Util::op_linearize_8x_indices(llvm::Value* wide_index) } +llvm::Value* +LLVM_Util::op_linearize_4x_indices(llvm::Value* wide_index) +{ + llvm::Value* strided_indices = op_mul(wide_index, wide_constant(4, 4)); + llvm::Constant* offsets_to_lane[4] = { constant(0), constant(1), + constant(2), constant(3) }; + llvm::Value* const_vec_offsets = llvm::ConstantVector::get( + llvm::ArrayRef(&offsets_to_lane[0], 4)); + + return op_add(strided_indices, const_vec_offsets); +} + + std::array LLVM_Util::op_split_16x(llvm::Value* vector_val) { @@ -4613,6 +4673,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); @@ -4663,6 +4724,16 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Value* args[] = { 
avx2_unmasked_value, void_ptr(src_ptr), + wide_index, wide_int_mask, + constant4((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } + default: OSL_ASSERT(0 && "unsupported width"); }; } else { @@ -4680,6 +4751,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); @@ -4739,6 +4811,17 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather; } + case 4: { + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), wide_index, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant4((uint8_t)4) + }; + llvm::Value* gather = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather; + } } } else { return clamped_gather_from_uniform(type_wide_float()); @@ -4805,6 +4888,29 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } + case 4: { + // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring + llvm::Function* func_avx512_gather_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv4_di); + OSL_ASSERT(func_avx512_gather_dpq); + + auto w4_bit_masks = current_mask(); + auto w4_int_indices = wide_index; + + llvm::Value* unmasked_value + = builder().CreateVectorSplat(4, constant64((uint64_t)0)); + llvm::Value* args[] + = { unmasked_value, void_ptr(src_ptr), w4_int_indices, + mask4_as_int8(w4_bit_masks), constant(4) }; + llvm::Value* gather1 + = builder().CreateCall(func_avx512_gather_dpq, + toArrayRef(args)); + args[2] = w4_int_indices; + args[3] = mask4_as_int8(w4_bit_masks); + + return builder().CreateIntToPtr(gather1, type_wide_ustring()); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else { @@ -4841,6 +4947,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_ps, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); + OSL_ASSERT(func_avx512_gather_ps); + + llvm::Value* unmasked_value = wide_constant(0.0f); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_ps, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4889,6 +5009,19 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), int_indices, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant8((uint8_t)4) + }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -4926,6 +5059,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_pi, toArrayRef(args)); } + case 4: { + 
llvm::Function* func_avx512_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); + OSL_ASSERT(func_avx512_gather_pi); + + llvm::Value* unmasked_value = wide_constant(0); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_pi, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else if (m_supports_avx2) { @@ -4975,6 +5122,26 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Function* func_avx2_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx2_gather_d_d_256); + OSL_ASSERT(func_avx2_gather_pi); + + llvm::Constant* avx2_unmasked_value = wide_constant(8, 0); + + // Convert <16 x i1> -> <16 x i32> -> to <2 x< 8 x i32>> + llvm::Value* wide_int_mask + = builder().CreateSExt(current_mask(), type_wide_int()); + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + int_indices, wide_int_mask, + constant8((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -5017,7 +5184,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } - case 8: { + case 8: + case 4: { // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring llvm::Function* func_avx512_gather_dpq = llvm::Intrinsic::getDeclaration( @@ -5093,6 +5261,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, linear_indices = op_linearize_16x_indices(wide_index); break; case 8: linear_indices = op_linearize_8x_indices(wide_index); break; + case 4: linear_indices = op_linearize_4x_indices(wide_index); break; default: OSL_ASSERT(0 && "unsupported vector width for scatter"); }; } else { @@ -5150,6 +5319,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); @@ -5182,6 +5352,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); @@ -5256,6 +5427,25 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = wide_index; + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } 
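                // Editorial note (not part of the patch): as in the 8-wide case
                // above, this 4-wide path reuses the 512-bit dpq scatter
                // intrinsic rather than a narrower variant; only the low 4 bits
                // of the i8 mask argument correspond to real lanes, the intent
                // being that the unused upper lanes stay masked off and are
                // never written.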
default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5295,6 +5485,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); + OSL_ASSERT(func_avx512_scatter_ps); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5338,6 +5541,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); + OSL_ASSERT(func_avx512_scatter_pi); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5407,6 +5623,26 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = op_linearize_4x_indices( + wide_index); + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp index c0c84b03d..b3bd5c898 100644 --- a/src/liboslexec/rendservices.cpp +++ b/src/liboslexec/rendservices.cpp @@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>) return nullptr; } +BatchedRendererServices<4>* +RendererServices::batched(WidthOf<4>) +{ + // No default implementation for batched services + return nullptr; +} + OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp index 7efc5b13d..c0afc98f1 100644 --- a/src/liboslexec/shadingsys.cpp +++ b/src/liboslexec/shadingsys.cpp @@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width) m_impl->attribute("llvm_jit_fma", 0); return true; } +# endif + if (target_requested) { + break; + } + // fallthrough + default: return false; + }; + return false; + case 4: + switch (requestedISA) { + case TargetISA::UNKNOWN: + // fallthrough + case TargetISA::x64: +# ifdef __OSL_SUPPORTS_b4_SSE2 + if (LLVM_Util::supports_isa(TargetISA::x64)) { + if (!target_requested) + m_impl->attribute("llvm_jit_target", + LLVM_Util::target_isa_name( + TargetISA::x64)); + // SSE2 doesn't support FMA + m_impl->attribute("llvm_jit_fma", 0); + return true; + } # endif if (target_requested) { break; @@ -885,6 +908,7 @@ 
ShadingSystem::BatchedExecutor::jit_all_groups(int nthreads) // Explicitly instantiate template class ShadingSystem::BatchedExecutor<16>; template class ShadingSystem::BatchedExecutor<8>; +template class ShadingSystem::BatchedExecutor<4>; #endif @@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer, , m_opt_groupdata(true) #if OSL_USE_BATCHED , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr) - || (renderer->batched(WidthOf<8>()) != nullptr)) + || (renderer->batched(WidthOf<8>()) != nullptr) + || (renderer->batched(WidthOf<4>()) != nullptr)) #else , m_opt_batched_analysis(false) #endif @@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx, // the batch jit has already happened, // as it requires the ops so we can't delete them yet! if (((renderer()->batched(WidthOf<16>()) == nullptr) - && (renderer()->batched(WidthOf<8>()) == nullptr)) + && (renderer()->batched(WidthOf<8>()) == nullptr) + && (renderer()->batched(WidthOf<4>()) == nullptr)) || group.batch_jitted()) { group_post_jit_cleanup(group); } @@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched::jit_all_groups(int nthreads, int mythread, // machine as well, start with just the batch size template class pvt::ShadingSystemImpl::Batched<16>; template class pvt::ShadingSystemImpl::Batched<8>; +template class pvt::ShadingSystemImpl::Batched<4>; #endif int diff --git a/src/testshade/batched_simplerend.cpp b/src/testshade/batched_simplerend.cpp index 937655af4..ea2acbdf9 100644 --- a/src/testshade/batched_simplerend.cpp +++ b/src/testshade/batched_simplerend.cpp @@ -1001,6 +1001,7 @@ BatchedSimpleRenderer::get_camera_screen_window(ustringhash /*object*/, // Explicitly instantiate BatchedSimpleRenderer template template class BatchedSimpleRenderer<16>; template class BatchedSimpleRenderer<8>; +template class BatchedSimpleRenderer<4>; OSL_NAMESPACE_EXIT diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp index 65862c2db..3582c9cc4 100644 --- a/src/testshade/simplerend.cpp +++ b/src/testshade/simplerend.cpp @@ -218,7 +218,9 @@ register_closures(OSL::ShadingSystem* shadingsys) SimpleRenderer::SimpleRenderer() #if OSL_USE_BATCHED - : m_batch_16_simple_renderer(*this), m_batch_8_simple_renderer(*this) + : m_batch_16_simple_renderer(*this) + , m_batch_8_simple_renderer(*this) + , m_batch_4_simple_renderer(*this) #endif { Matrix44 M; diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h index 87d0b96dd..8ebe1c1fc 100644 --- a/src/testshade/simplerend.h +++ b/src/testshade/simplerend.h @@ -177,12 +177,17 @@ class SimpleRenderer : public RendererServices { { return &m_batch_8_simple_renderer; } + BatchedRendererServices<4>* batched(WidthOf<4>) override + { + return &m_batch_4_simple_renderer; + } #endif protected: #if OSL_USE_BATCHED BatchedSimpleRenderer<16> m_batch_16_simple_renderer; BatchedSimpleRenderer<8> m_batch_8_simple_renderer; + BatchedSimpleRenderer<4> m_batch_4_simple_renderer; #endif // Camera parameters diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp index db5bac164..d8b6f749c 100644 --- a/src/testshade/testshade.cpp +++ b/src/testshade/testshade.cpp @@ -306,6 +306,9 @@ set_shadingsys_options() } else if ((!batch_size_requested || batch_size == 8) && shadingsys->configure_batch_execution_at(8)) { batch_size = 8; + } else if ((!batch_size_requested || batch_size == 4) + && shadingsys->configure_batch_execution_at(4)) { + batch_size = 4; } else { OSL::print( "WARNING: Hardware or library 
requirements to utilize batched execution");
@@ -1194,9 +1197,11 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys,
         // jit_group will optimize the group if necesssary
         if (batch_size == 16) {
             shadingsys->batched<16>().jit_group(shadergroup.get(), ctx);
-        } else {
-            ASSERT((batch_size == 8) && "Unsupported batch size");
+        } else if (batch_size == 8) {
             shadingsys->batched<8>().jit_group(shadergroup.get(), ctx);
+        } else {
+            ASSERT((batch_size == 4) && "Unsupported batch size");
+            shadingsys->batched<4>().jit_group(shadergroup.get(), ctx);
         }
     } else
 #endif
@@ -2195,13 +2200,19 @@ test_shade(int argc, const char* argv[])
                     batched_shade_region<16>(rend, shadergroup.get(), sub_roi,
                                              save);
                 });
-        } else {
-            ASSERT((batch_size == 8) && "Unsupported batch size");
+        } else if (batch_size == 8) {
             OIIO::ImageBufAlgo::parallel_image(
                 roi, num_threads, [&](OIIO::ROI sub_roi) -> void {
                     batched_shade_region<8>(rend, shadergroup.get(), sub_roi,
                                             save);
                 });
+        } else {
+            ASSERT((batch_size == 4) && "Unsupported batch size");
+            OIIO::ImageBufAlgo::parallel_image(
+                roi, num_threads, [&](OIIO::ROI sub_roi) -> void {
+                    batched_shade_region<4>(rend, shadergroup.get(),
+                                            sub_roi, save);
+                });
         }
     } else
 #    endif
diff --git a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp
index 0b7af16e4..449f06f59 100644
--- a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp
+++ b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp
@@ -182,10 +182,15 @@ class MyRendererServices final : public OSL::RendererServices {
     {
         return &m_batch_8_rs;
     }
+    OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override
+    {
+        return &m_batch_4_rs;
+    }
 
 private:
     MyBatchedRendererServices<16> m_batch_16_rs;
     MyBatchedRendererServices<8> m_batch_8_rs;
+    MyBatchedRendererServices<4> m_batch_4_rs;
 };
 
 
@@ -232,11 +237,13 @@ main(int argc, char* argv[])
         batch_width = 16;
     } else if (shadsys->configure_batch_execution_at(8)) {
         batch_width = 8;
+    } else if (shadsys->configure_batch_execution_at(4)) {
+        batch_width = 4;
     } else {
         std::cout
-            << "Error: Hardware doesn't support 8 or 16 wide SIMD or the OSL has not been configured and built with a proper USE_BATCHED."
+            << "Error: Hardware doesn't support 4, 8 or 16 wide SIMD or the OSL has not been configured and built with a proper USE_BATCHED."
             << std::endl;
-        std::cout << "Error: e.g.: USE_BATCHED=b8_AVX2,b8_AVX512,b16_AVX512"
+        std::cout << "Error: e.g.: USE_BATCHED=b4_SSE2,b8_AVX2,b8_AVX512,b16_AVX512"
                   << std::endl;
         return -1;
     }
@@ -432,8 +439,11 @@ main(int argc, char* argv[])
 
     if (batch_width == 16) {
         batched_shadepoints(std::integral_constant<int, 16> {});
-    } else {
+    }
+    else if (batch_width == 8) {
         batched_shadepoints(std::integral_constant<int, 8> {});
+    } else {
+        batched_shadepoints(std::integral_constant<int, 4> {});
     }
 
     // Print some results to prove that we generated an expected Pout.
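
Taken together, the patch adds a third batched target alongside the existing 8- and 16-wide ones: build liboslexec with USE_BATCHED=b4_SSE2 (compiled with plain -march=x86-64, so it runs on any x86_64 CPU), have the renderer expose a BatchedRendererServices<4> through batched(WidthOf<4>), and ask the shading system at startup which width it can actually execute. A minimal sketch of that host-side wiring, assuming a hypothetical width-templated MyBatchedRendererServices like the one in the example above:

    // Sketch only -- mirrors the testshade/oslbatcheddeformer pattern in this
    // patch; MyBatchedRendererServices is a hypothetical renderer-side class.
    #include <OSL/oslexec.h>
    #include <OSL/rendererservices.h>

    class MyRendererServices final : public OSL::RendererServices {
    public:
        OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override
        {
            return &m_batch_4_rs;  // opt in to the new 4-wide SSE2 batches
        }
        // ... batched(OSL::WidthOf<8>) and batched(OSL::WidthOf<16>) as before ...
    private:
        MyBatchedRendererServices<4> m_batch_4_rs;  // hypothetical
    };

    // Pick the widest batch the USE_BATCHED build and the CPU both support,
    // falling back to 4-wide SSE2 before giving up on batched execution.
    int choose_batch_width(OSL::ShadingSystem* shadsys)
    {
        if (shadsys->configure_batch_execution_at(16)) return 16;
        if (shadsys->configure_batch_execution_at(8))  return 8;
        if (shadsys->configure_batch_execution_at(4))  return 4;
        return 0;  // no batched support; fall back to scalar shading
    }

testshade follows the same fallback order (16, then 8, then 4), and the CMake change above maps the SSE2 target to the baseline -march=x86-64 when building the width-4 library.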