Skip to content

Commit

Permalink
Add support for b4_SSE2 batched mode.
Browse files Browse the repository at this point in the history
Signed-off-by: Tuomas Tonteri <[email protected]>
  • Loading branch information
johnfea committed Jul 12, 2024
1 parent 321c803 commit ae76938
Show file tree
Hide file tree
Showing 21 changed files with 463 additions and 83 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ jobs:
pybind11_ver: v2.5.0
simd: sse4.2
setenvs: export CONAN_LLVM_VERSION=10.0.1
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2
nametag: linux-vfx2021
runner: ubuntu-latest
container: aswftesting/ci-osl:2021-clang11
vfxyear: 2021
cxx_std: 17
openimageio_ver: v2.4.13.0
python_ver: 3.7
pybind11_ver: v2.7.0
simd: sse2
batched: b4_SSE2
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2
nametag: linux-vfx2021
runner: ubuntu-latest
Expand Down
6 changes: 3 additions & 3 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as
well, but we don't officially support or test other than these platforms.

Shader execution is supported on the native architectures of those x86_64 and
aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode
requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs
using Cuda+OptiX.
aarch64 platforms, a special batched 4-, 8- or 16-wide SIMD execution mode
requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on
NVIDIA GPUs using Cuda+OptiX.

Dependencies
------------
Expand Down
2 changes: 1 addition & 1 deletion src/cmake/compiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ endif ()
#
# The USE_BATCHED option may be set to indicate that support for batched
# SIMD shader execution be compiled along with target specific libraries
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
set (BATCHED_SUPPORT_DEFINES "")
set (BATCHED_TARGET_LIBS "")
Expand Down
9 changes: 8 additions & 1 deletion src/include/OSL/batched_texture.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
== VecReg<8>::alignment,
"Expect alignment of data member to set alignment of struct");
static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
== VecReg<4>::alignment,
"Expect alignment of data member to set alignment of struct");

template<int WidthT> struct BatchedTextureOptions {
VaryingTextureOptions<WidthT> varying;
Expand Down Expand Up @@ -90,11 +93,15 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
== VecReg<8>::alignment,
"Expect alignment of data member to set alignment of struct");
static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
== VecReg<4>::alignment,
"Expect alignment of data member to set alignment of struct");

#ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
// Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
// and safe to reinterpret_cast<TextureOptBatch*>
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
// Reject any OIIO texture batch width other than the ones accepted here
// (4, 8 or 16); the message names every accepted width.
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8)
|| (OIIO::Tex::BatchWidth == 4),
"This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH to be 4, 8, or 16");

namespace validate_offsets {
Expand Down
3 changes: 3 additions & 0 deletions src/include/OSL/llvm_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util {
llvm::Constant* constant(uint32_t i);

/// Return an llvm::Constant holding the given integer constant.
llvm::Constant* constant4(int8_t i);
llvm::Constant* constant4(uint8_t i);
llvm::Constant* constant8(int8_t i);
llvm::Constant* constant8(uint8_t i);
llvm::Constant* constant16(int16_t i);
Expand Down Expand Up @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util {

llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index);
llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index);
llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index);
std::array<llvm::Value*, 2> op_split_16x(llvm::Value* vector_val);
std::array<llvm::Value*, 2> op_split_8x(llvm::Value* vector_val);
std::array<llvm::Value*, 4> op_quarter_16x(llvm::Value* vector_val);
Expand Down
1 change: 1 addition & 0 deletions src/include/OSL/rendererservices.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
/// Unless overridden, a nullptr is returned.
virtual BatchedRendererServices<16>* batched(WidthOf<16>);
virtual BatchedRendererServices<8>* batched(WidthOf<8>);
virtual BatchedRendererServices<4>* batched(WidthOf<4>);

protected:
TextureSystem* m_texturesys; // A place to hold a TextureSystem
Expand Down
4 changes: 4 additions & 0 deletions src/liboslexec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
list (APPEND TARGET_CXX_OPTS "-march=x86-64")
else ()
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
endif ()
Expand Down Expand Up @@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
list (APPEND TARGET_CXX_OPTS "-march=haswell")
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
list (APPEND TARGET_CXX_OPTS "-march=x86-64")
else ()
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
endif ()
Expand Down
8 changes: 7 additions & 1 deletion src/liboslexec/batched_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1813,10 +1813,16 @@ struct Analyzer {
// specific BatchedRendererServices.
// Right here we don't know which width will be used,
// so we will just require all widths provide the same answer
auto rs4 = m_ba.renderer()->batched(WidthOf<4>());
auto rs8 = m_ba.renderer()->batched(WidthOf<8>());
auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
if (rs8 || rs16) {
if (rs4 || rs8 || rs16) {
get_attr_is_uniform = true;
if (rs4) {
get_attr_is_uniform
&= rs4->is_attribute_uniform(obj_name,
attr_name);
}
if (rs8) {
get_attr_is_uniform
&= rs8->is_attribute_uniform(obj_name,
Expand Down
1 change: 1 addition & 0 deletions src/liboslexec/batched_backendllvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
switch (vector_width()) {
case 16: m_true_mask_value = Mask<16>(true).value(); break;
case 8: m_true_mask_value = Mask<8>(true).value(); break;
case 4: m_true_mask_value = Mask<4>(true).value(); break;
default: OSL_ASSERT(0 && "unsupported vector width");
}
ll.dumpasm(shadingsys.m_llvm_dumpasm);
Expand Down
44 changes: 44 additions & 0 deletions src/liboslexec/batched_llvm_instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,33 @@ const char*
= "b8_AVX_";
#endif

#ifdef __OSL_SUPPORTS_b4_SSE2
// Static member definitions for the width-4 / TargetISA::x64 specialization
// of ConcreteTargetLibraryHelper. Compiled only when __OSL_SUPPORTS_b4_SSE2
// is defined.
//
// library_functions[] is populated by the xmacro includes below: every
// DECL(name, signature) expands to one NameAndSignature entry holding the
// stringized name and its signature.
template<>
const NameAndSignature
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
= {
// DECL forwards to DECL_INDIRECT so that a macro passed as `name` is fully
// expanded before `#name` turns it into a string.
# define DECL_INDIRECT(name, signature) \
NameAndSignature { #name, signature },
# define DECL(name, signature) DECL_INDIRECT(name, signature)
// NOTE(review): __OSL_WIDTH and __OSL_TARGET_ISA are presumably consumed by
// the included opname macro headers to build per-target names -- confirm.
# define __OSL_WIDTH 4
# define __OSL_TARGET_ISA SSE2
// Don't allow the order of the xmacro includes to be rearranged
// clang-format off
# include "wide/define_opname_macros.h"
# include "builtindecl_wide_xmacro.h"
# include "wide/undef_opname_macros.h"
// clang-format on
// Remove the helper macros so they do not leak past this table.
# undef __OSL_TARGET_ISA
# undef __OSL_WIDTH
# undef DECL
# undef DECL_INDIRECT
};
// String identifying this target library.
// NOTE(review): presumably used as a name prefix when selecting the b4_SSE2
// library functions -- confirm against the users of library_selector_string.
template<>
const char*
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
= "b4_SSE2_";
#endif



std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
Expand Down Expand Up @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
default: break;
}
break;
case 4:
switch (target_isa) {
#ifdef __OSL_SUPPORTS_b4_SSE2
case TargetISA::x64:
return RetType(
new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
#endif
default: break;
}
break;

default: OSL_ASSERT(0 && "unsupported vector width");
}
std::cerr << "Build is not configured to support TargetISA of "
Expand Down Expand Up @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
{
std::vector<unsigned int> offset_by_index;
switch (m_width) {
case 4:
build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
break;
case 8:
build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
break;
Expand Down Expand Up @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
{
std::vector<unsigned int> offset_by_index;
switch (m_width) {
case 4:
build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
break;
case 8:
build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
break;
Expand Down
1 change: 1 addition & 0 deletions src/liboslexec/batched_rendservices.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
// Explicitly instantiate BatchedRendererServices template
template class OSLEXECPUBLIC BatchedRendererServices<16>;
template class OSLEXECPUBLIC BatchedRendererServices<8>;
template class OSLEXECPUBLIC BatchedRendererServices<4>;

OSL_NAMESPACE_EXIT
1 change: 1 addition & 0 deletions src/liboslexec/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
// Explicit template instantiation for supported batch sizes
template class ShadingContext::Batched<16>;
template class ShadingContext::Batched<8>;
template class ShadingContext::Batched<4>;
#endif


Expand Down
2 changes: 2 additions & 0 deletions src/liboslexec/llvm_passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
// including this file will need its own static members defined. LLVM will
// assign IDs when they get registered, so this initialization value is not
// important.
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;

template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;

template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;
Expand Down
Loading

0 comments on commit ae76938

Please sign in to comment.