Skip to content
This repository was archived by the owner on Aug 2, 2024. It is now read-only.

Commit 6613aee

Browse files
committed
Add support for SSE3 (new default) and option to target SSE4.2
1 parent f300b12 commit 6613aee

34 files changed

+351
-147
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,5 +123,5 @@ before_script:
123123
- ${CMAKE} --build .
124124

125125
script:
126-
- if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then ./klein_test; fi
126+
- if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then ./klein_test && ./klein_test_sse42; fi
127127
- if [ "${ENABLE_GCOV}" = 1 ]; then bash <(curl -s https://codecov.io/bash) -x gcov-9 -a "-s `pwd`"; fi

CMakeLists.txt

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,20 @@ endif()
2727
option(KLEIN_BUILD_SYM "Enable compilation of symbolic Klein utility" ON)
2828
option(KLEIN_BUILD_C_BINDINGS "Enable compilation of the Klein C bindings" ON)
2929

30+
# The default platform and instruction set is x86 SSE3
3031
add_library(klein INTERFACE)
3132
add_library(klein::klein ALIAS klein)
3233
target_include_directories(klein INTERFACE public)
3334
target_compile_features(klein INTERFACE cxx_std_17)
34-
# SSE4.2 has > 96% market penetration according to the Steam hardware survey
35+
if(NOT MSVC)
36+
target_compile_options(klein INTERFACE -msse3)
37+
endif()
38+
39+
add_library(klein_sse42 INTERFACE)
40+
add_library(klein::klein_sse42 ALIAS klein_sse42)
41+
target_include_directories(klein_sse42 INTERFACE public)
42+
target_compile_features(klein_sse42 INTERFACE cxx_std_17)
43+
# SSE4.1 has > 97% market penetration according to the Steam hardware survey
3544
# queried as of December 2019 while AVX2 is around 70%. Thus, we can assume
3645
# FMA support is at least 70%, but perhaps not much more beyond that.
3746
# TODO: Optionally support FMA
@@ -40,7 +49,8 @@ if(MSVC)
4049
# AVX extensions). This is on by default.
4150
else()
4251
# Unlike MSVC, FMA instructions are enabled with a separate feature flag
43-
target_compile_options(klein INTERFACE -msse4.2)
52+
target_compile_options(klein_sse42 INTERFACE -msse4.1)
53+
target_compile_definitions(klein_sse42 INTERFACE KLEIN_SSE_4_1)
4454
endif()
4555

4656
if(KLEIN_ENABLE_PERF)
@@ -54,7 +64,7 @@ if(KLEIN_ENABLE_TESTS)
5464
endif()
5565

5666
if(KLEIN_BUILD_SYM)
57-
add_subdirectory(src)
67+
add_subdirectory(sym)
5868
endif()
5969

6070
if(KLEIN_BUILD_C_BINDINGS)

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ SIMD Extensions) for maximum throughput.
1919

2020
## Requirements
2121

22-
- Machine with a processor that supports SSE4.2 or later (has ~97% market penetration)
22+
- Machine with a processor that supports SSE3 or later (Steam hardware survey reports 100% market penetration)
2323
- C++17 compliant compiler (tested with GCC 9.2.1, Clang 9.0.1, and Visual Studio 2019)
24+
- Optional SSE4.1 support
2425

2526
## Usage
2627

appveyor.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@ build_script:
3030
- cmake --build .
3131
- dir
3232
- C:\projects\klein\%configuration%\klein_test.exe
33+
- C:\projects\klein\%configuration%\klein_test_sse42.exe

bench/rtm_bench.cpp

Lines changed: 0 additions & 22 deletions
This file was deleted.

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ art kinematic and math libraries built with traditional vector and quaternion fo
2828

2929
- Geometric computing library suitable for use with realtime graphics and animation applications
3030
- Header-only core library with an optional lightweight symbolic computer algebra system
31-
- SSE4.2-optimized implementation (SSE2 fallback on the way)
31+
- SSE3 or SSE4.1-optimized implementations
3232
- Tested on Linux, MacOS, and Windows
3333
- Requires no third-party dependencies
3434
- Permissively licensed

docs/quickstart.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ FetchContent_Declare(
1717
FetchContent_MakeAvailable(klein)
1818
1919
# Now, you can use target_link_libraries(your_lib PUBLIC klein::klein)
20+
# If you can target SSE4.1 (~97% market penetration), you can link against
21+
# the target klein::klein_sse42 instead.
2022
```
2123

2224
The primary "catch-all" header provided can be included using `#include <klein/klein.hpp>`.

perf/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ FetchContent_MakeAvailable(mc_ruler)
3131

3232
include(MCRuler)
3333

34-
if(${CMAKE_BUILD_TYPE} STREQUAL "Release")
34+
if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
3535
add_library(klein_perf klein_perf.cpp)
3636
target_link_libraries(klein_perf PRIVATE mc_ruler::mc_ruler klein)
3737
mc_ruler(klein_perf SOURCES klein_perf.cpp)

public/klein/detail/x86/x86_exp_log.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
namespace kln
1111
{
12-
inline namespace detail
12+
namespace detail
1313
{
1414
// Partition memory layouts
1515
// LSB --> MSB
@@ -48,8 +48,8 @@ inline namespace detail
4848

4949
// Broadcast dot(a, a) ignoring the scalar component to all components
5050
// of a2
51-
__m128 a2 = _mm_dp_ps(a, a, 0b11101111);
52-
__m128 ab = _mm_dp_ps(a, b, 0b11101111);
51+
__m128 a2 = hi_dp_bc(a, a);
52+
__m128 ab = hi_dp_bc(a, b);
5353

5454
// Next, we need the sqrt of that quantity. Since e0123 squares to 0,
5555
// this has a closed form solution.
@@ -144,9 +144,9 @@ inline namespace detail
144144
__m128 b = _mm_mul_ps(bv_mask, p2);
145145

146146
// Next, we need to compute the norm as in the exponential.
147-
__m128 a2 = _mm_dp_ps(a, a, 0b11101111);
147+
__m128 a2 = hi_dp_bc(a, a);
148148
// TODO: handle case when a2 is 0
149-
__m128 ab = _mm_dp_ps(a, b, 0b11101111);
149+
__m128 ab = hi_dp_bc(a, b);
150150
__m128 s = _mm_sqrt_ps(a2);
151151
__m128 a2_sqrt_rcp = _mm_rcp_ps(s);
152152
__m128 minus_t = _mm_mul_ps(ab, a2_sqrt_rcp);

public/klein/detail/x86/x86_exterior_product.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
namespace kln
66
{
7-
inline namespace detail
7+
namespace detail
88
{
99
// Partition memory layouts
1010
// LSB --> MSB
@@ -60,7 +60,7 @@ inline namespace detail
6060
p3_out = _mm_mul_ps(_mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 1), b),
6161
_mm_set_ps(-1.f, -1.f, -1.f, 0.f));
6262

63-
p3_out = _mm_add_ss(p3_out, _mm_dp_ps(a, b, 0b11100001));
63+
p3_out = _mm_add_ss(p3_out, hi_dp(a, b));
6464
}
6565

6666
// p0 ^ p2 = p2 ^ p0
@@ -86,7 +86,7 @@ inline namespace detail
8686
__m128& p2_out) noexcept
8787
{
8888
// (a0 b0 + a1 b1 + a2 b2 + a3 b3) e0123
89-
p2_out = _mm_dp_ps(a, b, 0b11110001);
89+
p2_out = dp(a, b);
9090
if constexpr (Flip)
9191
{
9292
p2_out = _mm_xor_ps(p2_out, _mm_set_ss(-0.f));
@@ -117,7 +117,7 @@ inline namespace detail
117117
// (a0 b2) e02 +
118118
// (a0 b3) e03
119119
p2_out = _mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 0), b);
120-
p2_out = _mm_add_ps(p2_out, _mm_dp_ps(a, b, 0b11100001));
120+
p2_out = _mm_add_ps(p2_out, hi_dp(a, b));
121121
}
122122

123123
// p1 ^ p3 = p3 ^ p1

0 commit comments

Comments
 (0)