AcademySoftwareFoundation · lgritz · Sep 4, 2024 · Jul 4, 2024 · Aug 16, 2024 · Aug 16, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -49,6 +49,17 @@ jobs:
             pybind11_ver: v2.7.0
             simd: sse4.2
             batched: b8_AVX2_noFMA
+          - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.4 sse2 batch-b4sse2
+            nametag: linux-vfx2021
+            runner: ubuntu-latest
+            container: aswftesting/ci-osl:2021-clang11
+            vfxyear: 2021
+            cxx_std: 17
+            openimageio_ver: v2.4.13.0
+            python_ver: 3.7
+            pybind11_ver: v2.7.0
+            simd: sse2
+            batched: b4_SSE2
           - desc: gcc9/C++17 llvm13 py3.9 exr3.1 oiio-rel avx2
             nametag: linux-vfx2022
             runner: ubuntu-latest

diff --git a/INSTALL.md b/INSTALL.md
@@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as
 well, but we don't officially support or test other than these platforms.
 
 Shader execution is supported on the native architectures of those x86_64 and
-aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode
-requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs
-using Cuda+OptiX.
+aarch64 platforms, in a special batched 4-, 8- or 16-wide SIMD execution mode
+requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on
+NVIDIA GPUs using Cuda+OptiX.
 
 Dependencies
 ------------

diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
@@ -338,7 +338,7 @@ endif ()
 #
 # The USE_BATCHED option may be set to indicate that support for batched
-# SIMD shader execution be compiled along with targe specific libraries
+# SIMD shader execution be compiled along with target specific libraries
-set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
+set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
 option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
 set (BATCHED_SUPPORT_DEFINES "")
 set (BATCHED_TARGET_LIBS "")

diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h
@@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
 static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
                   == VecReg<8>::alignment,
               "Expect alignment of data member to set alignment of struct");
+static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
+                  == VecReg<4>::alignment,
+              "Expect alignment of data member to set alignment of struct");
 
 template<int WidthT> struct BatchedTextureOptions {
     VaryingTextureOptions<WidthT> varying;
@@ -90,11 +93,15 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
 static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
                   == VecReg<8>::alignment,
               "Expect alignment of data member to set alignment of struct");
+static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
+                  == VecReg<4>::alignment,
+              "Expect alignment of data member to set alignment of struct");
 
 #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
 // Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
 // and safe to reinterpret_cast<TextureOptBatch*>
-static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
-              "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16");
+static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8)
+                  || (OIIO::Tex::BatchWidth == 4),
+              "Requires OIIO_TEXTURE_SIMD_BATCH_WIDTH of 4, 8, or 16");
 
 namespace validate_offsets {

diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h
@@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util {
     llvm::Constant* constant(uint32_t i);
 
     /// Return an llvm::Constant holding the given integer constant.
+    llvm::Constant* constant4(int8_t i);
+    llvm::Constant* constant4(uint8_t i);
     llvm::Constant* constant8(int8_t i);
     llvm::Constant* constant8(uint8_t i);
     llvm::Constant* constant16(int16_t i);
@@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util {
 
     llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index);
     llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index);
+    llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index);
     std::array<llvm::Value*, 2> op_split_16x(llvm::Value* vector_val);
     std::array<llvm::Value*, 2> op_split_8x(llvm::Value* vector_val);
     std::array<llvm::Value*, 4> op_quarter_16x(llvm::Value* vector_val);

diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
     /// Unless overridden, a nullptr is returned.
     virtual BatchedRendererServices<16>* batched(WidthOf<16>);
     virtual BatchedRendererServices<8>* batched(WidthOf<8>);
+    virtual BatchedRendererServices<4>* batched(WidthOf<4>);
 
 protected:
     TextureSystem* m_texturesys;  // A place to hold a TextureSystem

diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt
@@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=x86-64")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()
@@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
             list (APPEND TARGET_CXX_OPTS "-march=haswell")
         elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
             list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
+        elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
+            list (APPEND TARGET_CXX_OPTS "-march=x86-64")
         else ()
             message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
         endif ()

diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp
@@ -1813,10 +1813,16 @@ struct Analyzer {
                     // specific BatchedRendererServices.
                     // Right here we don't know which width will be used,
                     // so we will just require all widths provide the same answer
+                    auto rs4  = m_ba.renderer()->batched(WidthOf<4>());
                     auto rs8  = m_ba.renderer()->batched(WidthOf<8>());
                     auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
-                    if (rs8 || rs16) {
+                    if (rs4 || rs8 || rs16) {
                         get_attr_is_uniform = true;
+                        if (rs4) {
+                            get_attr_is_uniform
+                                &= rs4->is_attribute_uniform(obj_name,
+                                                             attr_name);
+                        }
                         if (rs8) {
                             get_attr_is_uniform
                                 &= rs8->is_attribute_uniform(obj_name,

diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
     switch (vector_width()) {
     case 16: m_true_mask_value = Mask<16>(true).value(); break;
     case 8: m_true_mask_value = Mask<8>(true).value(); break;
+    case 4: m_true_mask_value = Mask<4>(true).value(); break;
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     ll.dumpasm(shadingsys.m_llvm_dumpasm);

diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp
@@ -537,6 +537,33 @@ const char*
     = "b8_AVX_";
 #endif
 
+#ifdef __OSL_SUPPORTS_b4_SSE2
+template<>
+const NameAndSignature
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
+    = {
+#    define DECL_INDIRECT(name, signature) \
+        NameAndSignature { #name, signature },
+#    define DECL(name, signature) DECL_INDIRECT(name, signature)
+#    define __OSL_WIDTH           4
+#    define __OSL_TARGET_ISA      SSE2
+// The xmacro includes below are order-sensitive; do not let them be rearranged
+// clang-format off
+#    include "wide/define_opname_macros.h"
+#    include "builtindecl_wide_xmacro.h"
+#    include "wide/undef_opname_macros.h"
+// clang-format on
+#    undef __OSL_TARGET_ISA
+#    undef __OSL_WIDTH
+#    undef DECL
+#    undef DECL_INDIRECT
+      };
+template<>
+const char*
+    ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
+    = "b4_SSE2_";
+#endif  // __OSL_SUPPORTS_b4_SSE2
+
 
 
 std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
         default: break;
         }
         break;
+    case 4:
+        switch (target_isa) {
+#ifdef __OSL_SUPPORTS_b4_SSE2
+        case TargetISA::x64:
+            return RetType(
+                new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
+#endif
+        default: break;
+        }
+        break;
+
     default: OSL_ASSERT(0 && "unsupported vector width");
     }
     std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
             break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
     {
         std::vector<unsigned int> offset_by_index;
         switch (m_width) {
+        case 4:
+            build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
+            break;
         case 8:
             build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
             break;

diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp
@@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
 // Explicitly instantiate BatchedRendererServices template
 template class OSLEXECPUBLIC BatchedRendererServices<16>;
 template class OSLEXECPUBLIC BatchedRendererServices<8>;
+template class OSLEXECPUBLIC BatchedRendererServices<4>;
 
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp
@@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
 // Explicit template instantiation for supported batch sizes
 template class ShadingContext::Batched<16>;
 template class ShadingContext::Batched<8>;
+template class ShadingContext::Batched<4>;
 #endif
 
 

diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h
@@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
 // including this file will need its own static members defined. LLVM will
 // assign IDs when they get registered, so this initialization value is not
 // important.
+template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;
+
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;
 
 template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;