From 6ffdff84028af424c0c61e23c09c2558ae0c0a62 Mon Sep 17 00:00:00 2001 From: Tuomas Tonteri Date: Fri, 7 Jun 2024 17:22:54 +0300 Subject: [PATCH] Add support for b4_SSE2 batched mode. --- src/cmake/compiler.cmake | 2 +- src/include/OSL/rendererservices.h | 1 + src/liboslexec/CMakeLists.txt | 2 ++ src/liboslexec/batched_analysis.cpp | 8 ++++- src/liboslexec/batched_backendllvm.cpp | 1 + src/liboslexec/batched_llvm_instance.cpp | 44 ++++++++++++++++++++++++ src/liboslexec/batched_rendservices.cpp | 1 + src/liboslexec/context.cpp | 1 + src/liboslexec/llvm_passes.h | 2 ++ src/liboslexec/llvm_util.cpp | 17 +++++++-- src/liboslexec/rendservices.cpp | 7 ++++ src/liboslexec/shadingsys.cpp | 25 ++++++++++++-- 12 files changed, 104 insertions(+), 7 deletions(-) diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index c97316681c..172bf1a818 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -329,7 +329,7 @@ endif () # # The USE_BATCHED option may be set to indicate that support for batched # SIMD shader execution be compiled along with targe specific libraries -set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") +set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF) set (BATCHED_SUPPORT_DEFINES "") set (BATCHED_TARGET_LIBS "") diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h index 04e5269ae0..62a6b61793 100644 --- a/src/include/OSL/rendererservices.h +++ b/src/include/OSL/rendererservices.h @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices { /// Unless overridden, a nullptr is returned. 
virtual BatchedRendererServices<16>* batched(WidthOf<16>); virtual BatchedRendererServices<8>* batched(WidthOf<8>); + virtual BatchedRendererServices<4>* batched(WidthOf<4>); protected: TextureSystem* m_texturesys; // A place to hold a TextureSystem diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index 328565af68..9c316b6849 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=core-avx2") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=corei7-avx") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=core2") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp index 888f198741..9f76c1acf6 100644 --- a/src/liboslexec/batched_analysis.cpp +++ b/src/liboslexec/batched_analysis.cpp @@ -1813,10 +1813,16 @@ struct Analyzer { // specific BatchedRendererServices. 
// Right here we don't know which width will be used, // so we will just require all widths provide the same answer + auto rs4 = m_ba.renderer()->batched(WidthOf<4>()); auto rs8 = m_ba.renderer()->batched(WidthOf<8>()); auto rs16 = m_ba.renderer()->batched(WidthOf<16>()); - if (rs8 || rs16) { + if (rs4 || rs8 || rs16) { get_attr_is_uniform = true; + if (rs4) { + get_attr_is_uniform + &= rs4->is_attribute_uniform(obj_name, + attr_name); + } if (rs8) { get_attr_is_uniform &= rs8->is_attribute_uniform(obj_name, diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp index e94122ef43..79f87ca900 100644 --- a/src/liboslexec/batched_backendllvm.cpp +++ b/src/liboslexec/batched_backendllvm.cpp @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys, switch (vector_width()) { case 16: m_true_mask_value = Mask<16>(true).value(); break; case 8: m_true_mask_value = Mask<8>(true).value(); break; + case 4: m_true_mask_value = Mask<4>(true).value(); break; default: OSL_ASSERT(0 && "unsupported vector width"); } ll.dumpasm(shadingsys.m_llvm_dumpasm); diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp index 8e6ff0a76d..2180637861 100644 --- a/src/liboslexec/batched_llvm_instance.cpp +++ b/src/liboslexec/batched_llvm_instance.cpp @@ -537,6 +537,33 @@ const char* = "b8_AVX_"; #endif +#ifdef __OSL_SUPPORTS_b4_SSE2 +template<> +const NameAndSignature + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[] + = { +# define DECL_INDIRECT(name, signature) \ + NameAndSignature { #name, signature }, +# define DECL(name, signature) DECL_INDIRECT(name, signature) +# define __OSL_WIDTH 4 +# define __OSL_TARGET_ISA SSE2 +// Don't allow order of xmacro includes be rearranged +// clang-format off +# include "wide/define_opname_macros.h" +# include "builtindecl_wide_xmacro.h" +# include "wide/undef_opname_macros.h" +// clang-format on +# undef __OSL_TARGET_ISA +# undef 
__OSL_WIDTH +# undef DECL +# undef DECL_INDIRECT + }; +template<> +const char* + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string + = "b4_SSE2_"; +#endif + std::unique_ptr @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context, default: break; } break; + case 4: + switch (target_isa) { +#ifdef __OSL_SUPPORTS_b4_SSE2 + case TargetISA::x64: + return RetType( + new ConcreteTargetLibraryHelper<4, TargetISA::x64>()); +#endif + default: break; + } + break; + default: OSL_ASSERT(0 && "unsupported vector width"); } std::cerr << "Build is not configured to support TargetISA of " @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedTextureOptions<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedTextureOptions<8>(offset_by_index); break; @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedShaderGlobals<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedShaderGlobals<8>(offset_by_index); break; diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp index fbff377b25..1c5fcaa4a6 100644 --- a/src/liboslexec/batched_rendservices.cpp +++ b/src/liboslexec/batched_rendservices.cpp @@ -328,5 +328,6 @@ BatchedRendererServices::getmessage(BatchedShaderGlobals* bsg, // Explicitly instantiate BatchedRendererServices template template class OSLEXECPUBLIC BatchedRendererServices<16>; template class OSLEXECPUBLIC BatchedRendererServices<8>; +template class OSLEXECPUBLIC BatchedRendererServices<4>; OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp index a97b427e1b..b001315a8e 100644 --- a/src/liboslexec/context.cpp +++ b/src/liboslexec/context.cpp @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg) // Explicit template instantiation 
for supported batch sizes template class ShadingContext::Batched<16>; template class ShadingContext::Batched<8>; +template class ShadingContext::Batched<4>; #endif diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h index 852ec82f94..43c7a72894 100644 --- a/src/liboslexec/llvm_passes.h +++ b/src/liboslexec/llvm_passes.h @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final // including this file will need its own static members defined. LLVM will // assign IDs when they get registered, so this initialization value is not // important. +template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0; + template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0; template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0; diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 3dd888cab0..ac3a8d0284 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM() #ifndef OSL_LLVM_NEW_PASS_MANAGER // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks + static llvm::RegisterPass< + LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>> + sRegCustomPass2( + "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>", + "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass", + false /* Only looks at CFG */, false /* Analysis Pass */); static llvm::RegisterPass< LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>> sRegCustomPass0( @@ -3592,6 +3598,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 4 bit mask to an 8 bit integer + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3950,10 +3961,10 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); 
break; -#if 0 // WIP +//#if 0 // WIP case 4: { - // We can just reinterpret cast a 8 bit mask to a 8 bit integer + // We can just reinterpret cast a 4 bit mask to an 8 bit integer // and all types are happy intMaskType = type_int8(); @@ -3966,7 +3977,7 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); break; } -#endif +//#endif default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp index c0c84b03d6..b3bd5c8989 100644 --- a/src/liboslexec/rendservices.cpp +++ b/src/liboslexec/rendservices.cpp @@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>) return nullptr; } +BatchedRendererServices<4>* +RendererServices::batched(WidthOf<4>) +{ + // No default implementation for batched services + return nullptr; +} + OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp index 307d57355e..e1fbf49781 100644 --- a/src/liboslexec/shadingsys.cpp +++ b/src/liboslexec/shadingsys.cpp @@ -622,6 +622,23 @@ ShadingSystem::configure_batch_execution_at(int width) if (target_requested) { break; } + // fallthrough + case TargetISA::x64: +# ifdef __OSL_SUPPORTS_b4_SSE2 + if (LLVM_Util::supports_isa(TargetISA::x64)) { + if (!target_requested) + m_impl->attribute("llvm_jit_target", + LLVM_Util::target_isa_name( + TargetISA::x64)); + // SSE2 doesn't support FMA + m_impl->attribute("llvm_jit_fma", 0); + return true; + } +# endif + if (target_requested) { + break; + } + // fallthrough default: return false; }; @@ -885,6 +902,7 @@ ShadingSystem::BatchedExecutor::jit_all_groups(int nthreads) // Explicitly instantiate template class ShadingSystem::BatchedExecutor<16>; template class ShadingSystem::BatchedExecutor<8>; +template class ShadingSystem::BatchedExecutor<4>; #endif @@ -1079,7 +1097,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer, 
, m_opt_groupdata(true) #if OSL_USE_BATCHED , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr) - || (renderer->batched(WidthOf<8>()) != nullptr)) + || (renderer->batched(WidthOf<8>()) != nullptr) + || (renderer->batched(WidthOf<4>()) != nullptr)) #else , m_opt_batched_analysis(false) #endif @@ -3794,7 +3813,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx, // the batch jit has already happened, // as it requires the ops so we can't delete them yet! if (((renderer()->batched(WidthOf<16>()) == nullptr) - && (renderer()->batched(WidthOf<8>()) == nullptr)) + && (renderer()->batched(WidthOf<8>()) == nullptr) + && (renderer()->batched(WidthOf<4>()) == nullptr)) || group.batch_jitted()) { group_post_jit_cleanup(group); } @@ -4015,6 +4035,7 @@ ShadingSystemImpl::Batched::jit_all_groups(int nthreads, int mythread, // machine as well, start with just the batch size template class pvt::ShadingSystemImpl::Batched<16>; template class pvt::ShadingSystemImpl::Batched<8>; +template class pvt::ShadingSystemImpl::Batched<4>; #endif int