From 6ffdff84028af424c0c61e23c09c2558ae0c0a62 Mon Sep 17 00:00:00 2001
From: Tuomas Tonteri <e90@tuomastonteri.fi>
Date: Fri, 7 Jun 2024 17:22:54 +0300
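Subject: [PATCH] Add support for b4_SSE2 batched mode.

Add a 4-wide batched execution target, b4_SSE2, alongside the existing
8- and 16-wide targets:

* Build: accept b4_SSE2 in USE_BATCHED and compile its target library
  with -march=core2.
* RendererServices: add a virtual batched(WidthOf<4>) that returns
  nullptr unless overridden.
* Explicitly instantiate the width-4 BatchedRendererServices,
  ShadingContext::Batched, ShadingSystem::BatchedExecutor and
  ShadingSystemImpl::Batched templates.
* Query the 4-wide renderer services in the attribute-uniformity
  analysis, in the m_opt_batched_analysis default and in the post-JIT
  cleanup check of optimize_group().
* BatchedBackendLLVM: handle width 4 for the true-mask value, the target
  library helper (TargetISA::x64, "b4_SSE2_" selector) and the
  BatchedTextureOptions / BatchedShaderGlobals offset tables.
* LLVM_Util: register the width-4 instantiation of
  LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks, handle width 4 in
  mask_as_int() and enable the width-4 case of op_1st_active_lane_of()
  that was previously compiled out with "#if 0".
* configure_batch_execution_at(): add a TargetISA::x64 case that sets
  the x64 JIT target when none was requested and disables llvm_jit_fma.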
---
 src/cmake/compiler.cmake                 |  2 +-
 src/include/OSL/rendererservices.h       |  1 +
 src/liboslexec/CMakeLists.txt            |  2 ++
 src/liboslexec/batched_analysis.cpp      |  8 ++++-
 src/liboslexec/batched_backendllvm.cpp   |  1 +
 src/liboslexec/batched_llvm_instance.cpp | 44 ++++++++++++++++++++++++
 src/liboslexec/batched_rendservices.cpp  |  1 +
 src/liboslexec/context.cpp               |  1 +
 src/liboslexec/llvm_passes.h             |  2 ++
 src/liboslexec/llvm_util.cpp             | 17 +++++++--
 src/liboslexec/rendservices.cpp          |  7 ++++
 src/liboslexec/shadingsys.cpp            | 25 ++++++++++++--
 12 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index c97316681c..172bf1a818 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -329,7 +329,7 @@ endif ()
 #
 # The USE_BATCHED option may be set to indicate that support for batched
 # SIMD shader execution be compiled along with targe specific libraries
-set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
+set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
 option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
 set (BATCHED_SUPPORT_DEFINES "")
 set (BATCHED_TARGET_LIBS "")
diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h
index 04e5269ae0..62a6b61793 100644
--- a/src/include/OSL/rendererservices.h
+++ b/src/include/OSL/rendererservices.h
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
     /// Unless overridden, a nullptr is returned.
     virtual BatchedRendererServices<16>* batched(WidthOf<16>);
     virtual BatchedRendererServices<8>* batched(WidthOf<8>);
+    virtual BatchedRendererServices<4>* batched(WidthOf<4>);
 
 protected:
     TextureSystem* m_texturesys;  // A place to hold a TextureSystem
diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt
index 328565af68..9c316b6849 100644
--- a/src/liboslexec/CMakeLists.txt
+++ b/src/liboslexec/CMakeLists.txt
@@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=core2")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()
diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp
index 888f198741..9f76c1acf6 100644
--- a/src/liboslexec/batched_analysis.cpp
+++ b/src/liboslexec/batched_analysis.cpp
@@ -1813,10 +1813,16 @@ struct Analyzer {
                     // specific BatchedRendererServices.
                     // Right here we don't know which width will be used,
                     // so we will just require all widths provide the same answer
+                    auto rs4  = m_ba.renderer()->batched(WidthOf<4>());
                     auto rs8  = m_ba.renderer()->batched(WidthOf<8>());
                     auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
-                    if (rs8 || rs16) {
+                    if (rs4 || rs8 || rs16) {
                         get_attr_is_uniform = true;
+                        if (rs4) {
+                            get_attr_is_uniform
+                                &= rs4->is_attribute_uniform(obj_name,
+                                                             attr_name);
+                        }
                         if (rs8) {
                             get_attr_is_uniform
                                 &= rs8->is_attribute_uniform(obj_name,
diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp
index e94122ef43..79f87ca900 100644
--- a/src/liboslexec/batched_backendllvm.cpp
+++ b/src/liboslexec/batched_backendllvm.cpp
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
     switch (vector_width()) {
     case 16: m_true_mask_value = Mask<16>(true).value(); break;
     case 8: m_true_mask_value = Mask<8>(true).value(); break;
+    case 4: m_true_mask_value = Mask<4>(true).value(); break;
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     ll.dumpasm(shadingsys.m_llvm_dumpasm);
diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp
index 8e6ff0a76d..2180637861 100644
--- a/src/liboslexec/batched_llvm_instance.cpp
+++ b/src/liboslexec/batched_llvm_instance.cpp
@@ -537,6 +537,33 @@ const char*
     = "b8_AVX_";
 #endif
 
+#ifdef __OSL_SUPPORTS_b4_SSE2
+template<>
+const NameAndSignature
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
+    = {
+#    define DECL_INDIRECT(name, signature) \
+        NameAndSignature { #name, signature },
+#    define DECL(name, signature) DECL_INDIRECT(name, signature)
+#    define __OSL_WIDTH           4
+#    define __OSL_TARGET_ISA      SSE2
+// Keep these xmacro includes in this exact order (no clang-format sorting)
+// clang-format off
+#    include "wide/define_opname_macros.h"
+#    include "builtindecl_wide_xmacro.h"
+#    include "wide/undef_opname_macros.h"
+// clang-format on
+#    undef __OSL_TARGET_ISA
+#    undef __OSL_WIDTH
+#    undef DECL
+#    undef DECL_INDIRECT
+      };
+template<>
+const char*
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
+    = "b4_SSE2_";
+#endif  // __OSL_SUPPORTS_b4_SSE2
+
 
 
 std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
         default: break;
         }
         break;
+    case 4:
+        switch (target_isa) {
+#ifdef __OSL_SUPPORTS_b4_SSE2
+        case TargetISA::x64:
+            return RetType(
+                new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
+#endif
+        default: break;
+        }
+        break;
+
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
             break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
             break;
diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp
index fbff377b25..1c5fcaa4a6 100644
--- a/src/liboslexec/batched_rendservices.cpp
+++ b/src/liboslexec/batched_rendservices.cpp
@@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
 // Explicitly instantiate BatchedRendererServices template
 template class OSLEXECPUBLIC BatchedRendererServices<16>;
 template class OSLEXECPUBLIC BatchedRendererServices<8>;
+template class OSLEXECPUBLIC BatchedRendererServices<4>;
 
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp
index a97b427e1b..b001315a8e 100644
--- a/src/liboslexec/context.cpp
+++ b/src/liboslexec/context.cpp
@@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
 // Explicit template instantiation for supported batch sizes
 template class ShadingContext::Batched<16>;
 template class ShadingContext::Batched<8>;
+template class ShadingContext::Batched<4>;
 #endif
 
 
diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h
index 852ec82f94..43c7a72894 100644
--- a/src/liboslexec/llvm_passes.h
+++ b/src/liboslexec/llvm_passes.h
@@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
 // including this file will need its own static members defined. LLVM will
 // assign IDs when they get registered, so this initialization value is not
 // important.
+template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;
+
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;
 
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;
diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp
index 3dd888cab0..ac3a8d0284 100644
--- a/src/liboslexec/llvm_util.cpp
+++ b/src/liboslexec/llvm_util.cpp
@@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM()
 
 #ifndef OSL_LLVM_NEW_PASS_MANAGER
     // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks
+    static llvm::RegisterPass<
+        LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>>
+        sRegCustomPass2(
+            "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>",
+            "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass",
+            false /* Only looks at CFG */, false /* Analysis Pass */);
     static llvm::RegisterPass<
         LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>>
         sRegCustomPass0(
@@ -3592,6 +3598,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
             // and all types are happy
             intMaskType = type_int8();
             break;
+        case 4:
+            // We can just reinterpret cast a 4 bit mask to a 8 bit integer
+            // and all types are happy
+            intMaskType = type_int8();
+            break;
         default: OSL_ASSERT(0 && "unsupported native bit mask width");
         };
 
@@ -3950,10 +3961,10 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
         // and all types are happy
         intMaskType = type_int8();
         break;
-#if 0  // WIP
+//#if 0  // WIP
         case 4:
         {
-            // We can just reinterpret cast a 8 bit mask to a 8 bit integer
+            // We can just reinterpret cast a 4 bit mask to a 8 bit integer
             // and all types are happy
             intMaskType = type_int8();
 
@@ -3966,7 +3977,7 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
 //            llvm::Value * mask_as_int =  builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type);
             break;
         }
-#endif
+//#endif
     default: OSL_ASSERT(0 && "unsupported native bit mask width");
     };
 
diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp
index c0c84b03d6..b3bd5c8989 100644
--- a/src/liboslexec/rendservices.cpp
+++ b/src/liboslexec/rendservices.cpp
@@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>)
     return nullptr;
 }
 
+BatchedRendererServices<4>*
+RendererServices::batched(WidthOf<4>)
+{
+    // Base class offers no 4-wide batched services; overrides may supply one.
+    return nullptr;
+}
+
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp
index 307d57355e..e1fbf49781 100644
--- a/src/liboslexec/shadingsys.cpp
+++ b/src/liboslexec/shadingsys.cpp
@@ -622,6 +622,23 @@ ShadingSystem::configure_batch_execution_at(int width)
             if (target_requested) {
                 break;
             }
+            // fallthrough
+        case TargetISA::x64:
+#    ifdef __OSL_SUPPORTS_b4_SSE2
+            if (LLVM_Util::supports_isa(TargetISA::x64)) {
+                if (!target_requested)
+                    m_impl->attribute("llvm_jit_target",
+                                      LLVM_Util::target_isa_name(
+                                          TargetISA::x64));
+                // SSE2 doesn't support FMA
+                m_impl->attribute("llvm_jit_fma", 0);
+                return true;
+            }
+#    endif
+            if (target_requested) {
+                break;
+            }
+
             // fallthrough
         default: return false;
         };
@@ -885,6 +902,7 @@ ShadingSystem::BatchedExecutor<WidthT>::jit_all_groups(int nthreads)
 // Explicitly instantiate
 template class ShadingSystem::BatchedExecutor<16>;
 template class ShadingSystem::BatchedExecutor<8>;
+template class ShadingSystem::BatchedExecutor<4>;
 #endif
 
 
@@ -1079,7 +1097,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer,
     , m_opt_groupdata(true)
 #if OSL_USE_BATCHED
     , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr)
-                             || (renderer->batched(WidthOf<8>()) != nullptr))
+                             || (renderer->batched(WidthOf<8>()) != nullptr)
+                             || (renderer->batched(WidthOf<4>()) != nullptr))
 #else
     , m_opt_batched_analysis(false)
 #endif
@@ -3794,7 +3813,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
         // the batch jit has already happened,
         // as it requires the ops so we can't delete them yet!
         if (((renderer()->batched(WidthOf<16>()) == nullptr)
-             && (renderer()->batched(WidthOf<8>()) == nullptr))
+             && (renderer()->batched(WidthOf<8>()) == nullptr)
+             && (renderer()->batched(WidthOf<4>()) == nullptr))
             || group.batch_jitted()) {
             group_post_jit_cleanup(group);
         }
@@ -4015,6 +4035,7 @@ ShadingSystemImpl::Batched<WidthT>::jit_all_groups(int nthreads, int mythread,
 // machine as well, start with just the batch size
 template class pvt::ShadingSystemImpl::Batched<16>;
 template class pvt::ShadingSystemImpl::Batched<8>;
+template class pvt::ShadingSystemImpl::Batched<4>;
 #endif
 
 int