Skip to content

Commit

Permalink
AK+LibCrypto: Replace ad-hoc target clones with a generic mechanism
Browse files Browse the repository at this point in the history
- Checked AVX, AVX2, loop unrolling -- no or negligible (<0.5%) effect
- ifdef soup -> well-defined AK_CAN_CODEGEN_FOR_<FEATURE>
- ifunc resolvers -> statically initialized function pointers

TODO: Write proper description
  • Loading branch information
DanShaders committed Jul 5, 2024
1 parent 2ac61cd commit ee3a6d9
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 61 deletions.
81 changes: 81 additions & 0 deletions AK/CPUFeature.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2024, Dan Klishch <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/

#pragma once

#include <AK/EnumBits.h>
#include <AK/Format.h>
#include <AK/Platform.h>
#include <AK/Types.h>

#if ARCH(X86_64)
#    include <cpuid.h>
#endif

namespace AK {

// Bit set of CPU features that code can be specialized for and selected between at run time.
//
// For every feature there is a matching AK_CAN_CODEGEN_FOR_<FEATURE> macro, defined to 1
// when this build may contain code generated for that feature and to 0 otherwise. In the
// latter case the enumerator aliases Invalid, so any combination that contains it has the
// Invalid bit set.
enum class CPUFeature : u64 {
    None = 0ULL,
    // Marker bit carried by every feature that is unavailable in this build configuration.
    Invalid = 1ULL << 63,

    // NOTE: Feature bits are written as 1ULL << n, like Invalid, so that the shifts are
    //       performed in the 64-bit underlying type no matter how many features get added.
#if !defined(KERNEL) && ARCH(X86_64)
#    define AK_CAN_CODEGEN_FOR_X86_SSE42 1
    X86_SSE42 = 1ULL << 0,
#    define AK_CAN_CODEGEN_FOR_X86_AVX 1
    X86_AVX = 1ULL << 1,
#    define AK_CAN_CODEGEN_FOR_X86_SHA 1
    X86_SHA = 1ULL << 2,
#else
#    define AK_CAN_CODEGEN_FOR_X86_SSE42 0
    X86_SSE42 = Invalid,
#    define AK_CAN_CODEGEN_FOR_X86_AVX 0
    X86_AVX = Invalid,
#    define AK_CAN_CODEGEN_FOR_X86_SHA 0
    X86_SHA = Invalid,
#endif
};

AK_ENUM_BITWISE_OPERATORS(CPUFeature);

// Returns the set of CPU features that are enabled for code generation in this build
// (AK_CAN_CODEGEN_FOR_<FEATURE> is 1) and that the CPU we are running on reports as
// available. The probing is done once, on the first call; every later call returns the
// cached value. When no feature can be compiled in, the result is CPUFeature::None.
inline CPUFeature detect_cpu_features()
{
    static CPUFeature const s_cached_features = [] {
        CPUFeature result = CPUFeature::None;

#if AK_CAN_CODEGEN_FOR_X86_SSE42 || AK_CAN_CODEGEN_FOR_X86_AVX
        // This may be reached from a static initializer, so make sure the compiler's CPU
        // model data is set up before querying it. The call is only compiled in when one
        // of the __builtin_cpu_supports() checks below is compiled in as well; extend the
        // condition when adding another such check.
        __builtin_cpu_init();
#endif

#if AK_CAN_CODEGEN_FOR_X86_SSE42
        if (__builtin_cpu_supports("sse4.2"))
            result |= CPUFeature::X86_SSE42;
#endif

#if AK_CAN_CODEGEN_FOR_X86_AVX
        if (__builtin_cpu_supports("avx"))
            result |= CPUFeature::X86_AVX;
#endif

#if AK_CAN_CODEGEN_FOR_X86_SHA
        // FIXME: Use __builtin_cpu_supports("sha") when compilers support it
        // The SHA extensions are reported in CPUID leaf 7, subleaf 0, EBX bit 29.
        constexpr u32 cpuid_sha_ebx = 1 << 29;
        u32 eax = 0;
        u32 ebx = 0;
        u32 ecx = 0;
        u32 edx = 0;
        // Unlike the raw __cpuid_count(), __get_cpuid_count() first checks that the CPU
        // implements leaf 7 and returns 0 without touching the outputs if it does not.
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & cpuid_sha_ebx))
            result |= CPUFeature::X86_SHA;
#endif

        return result;
    }();
    return s_cached_features;
}

// Dispatch helper. If `feature` does not contain CPUFeature::Invalid (checked at compile
// time) and detect_cpu_features() reports every bit of it (checked at run time), assigns
// `template_function<feature>` to a variable named `result`, which must be in scope where
// the macro is expanded. Otherwise `result` is left untouched.
//
// The expansion is wrapped in do { } while (0) so that it behaves as one statement and
// must be terminated with a semicolon. CPUFeature and detect_cpu_features are spelled with
// their full qualification so that the expansion does not rely on USING_AK_GLOBALLY.
#define AK_UPDATE_SELECTION_IF_FEATURE_SUPPORTED(feature, template_function)               \
    do {                                                                                   \
        if constexpr (((feature) & ::AK::CPUFeature::Invalid) == ::AK::CPUFeature::None) { \
            if (has_flag(::AK::detect_cpu_features(), (feature)))                          \
                result = template_function<(feature)>;                                     \
        }                                                                                  \
    } while (0)

}

#ifdef USING_AK_GLOBALLY
using AK::CPUFeature;
using AK::detect_cpu_features;
#endif
53 changes: 18 additions & 35 deletions Userland/Libraries/LibCrypto/Hash/SHA1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@
* SPDX-License-Identifier: BSD-2-Clause
*/

#include <AK/CPUFeature.h>
#include <AK/Endian.h>
#include <AK/Memory.h>
#include <AK/Platform.h>
#include <AK/SIMD.h>
#include <AK/SIMDExtras.h>
#include <AK/Types.h>
#include <LibCrypto/Hash/SHA1.h>

#if (ARCH(I386) || ARCH(X86_64))
# include <AK/SIMD.h>
# include <AK/SIMDExtras.h>
# include <cpuid.h>
#endif

namespace Crypto::Hash {

// Rotates the 32-bit `value` left by `bits` positions.
// The rotation count is taken modulo 32, so a count of 0 (or any multiple of 32) returns
// `value` unchanged instead of shifting by the full width of the type, which is undefined.
static constexpr auto ROTATE_LEFT(u32 value, size_t bits)
{
    bits %= 32;
    // The second modulo turns the complementary shift of 32 (when bits == 0) into 0.
    return (value << bits) | (value >> ((32 - bits) % 32));
}

static void transform_impl_base(u32 (&state)[5], u8 const (&data)[64])
template<CPUFeature feature>
static void transform_impl(u32 (&state)[5], u8 const (&data)[64]);

template<>
void transform_impl<CPUFeature::None>(u32 (&state)[5], u8 const (&data)[64])
{
constexpr static auto Rounds = 80;

Expand Down Expand Up @@ -76,12 +77,13 @@ static void transform_impl_base(u32 (&state)[5], u8 const (&data)[64])
secure_zero(blocks, 16 * sizeof(u32));
}

#if (ARCH(I386) || ARCH(X86_64))
// Note: The SHA extension was introduced with
// Intel Goldmont (SSE4.2), Ice Lake (AVX512), Rocket Lake (AVX512), and AMD Zen (AVX2)
// So it's safe to assume that if we have SHA we have at least SSE4.2
// ~https://en.wikipedia.org/wiki/Intel_SHA_extensions
[[gnu::target("sha,sse4.2")]] static void transform_impl_sha1(u32 (&state)[5], u8 const (&data)[64])
#if AK_CAN_CODEGEN_FOR_X86_SHA && AK_CAN_CODEGEN_FOR_X86_SSE42
template<>
[[gnu::target("sha,sse4.2")]] void transform_impl<CPUFeature::X86_SSE42 | CPUFeature::X86_SHA>(u32 (&state)[5], u8 const (&data)[64])
{
# define SAME_TARGET gnu::target("sha"), gnu::always_inline

Expand All @@ -104,10 +106,6 @@ static void transform_impl_base(u32 (&state)[5], u8 const (&data)[64])
auto sha_rnds4 = []<int i> [[SAME_TARGET]] (u32x4 a, u32x4 b) { return bit_cast<u32x4>(__builtin_ia32_sha1rnds4(bit_cast<i32x4>(a), bit_cast<i32x4>(b), i)); };

auto group = [&]<int i_group> [[SAME_TARGET]] () {
//" // FIXME: Trailing quote to fix syntax highlighting; something's off with function-like attributes and templated lambdas in VS Code
// FIXME: Test if unrolling the loop is worth it
// GCC: #pragma GCC unroll(5)
// Clang: #pragma unroll
for (size_t i_pack = 0; i_pack != 5; ++i_pack) {
size_t i_msg = i_group * 5 + i_pack;
if (i_msg < 4) {
Expand Down Expand Up @@ -143,32 +141,17 @@ static void transform_impl_base(u32 (&state)[5], u8 const (&data)[64])

# undef SAME_TARGET
}

// FIXME: We need a custom resolver as Clang and GCC either refuse or silently ignore the `sha` target
// for function multiversioning
[[gnu::ifunc("resolve_transform_impl")]] static void transform_impl(u32 (&state)[5], u8 const (&data)[64]);
namespace {
extern "C" [[gnu::used]] decltype(&transform_impl) resolve_transform_impl()
{
// FIXME: Use __builtin_cpu_supports("sha") when compilers support it
constexpr u32 cpuid_sha_ebx = 1 << 29;
u32 eax, ebx, ecx, edx;
__cpuid_count(7, 0, eax, ebx, ecx, edx);
if (ebx & cpuid_sha_ebx)
return transform_impl_sha1;

// FIXME: Investigate if more target clones (avx) make sense

return transform_impl_base;
}
}
#else
# define transform_impl transform_impl_base
#endif

// The SHA-1 block transform that SHA1::transform calls. It is selected once, while this
// variable is initialized: the generic transform_impl<CPUFeature::None> is the default and
// is replaced by the SSE4.2 + SHA specialization if
// AK_UPDATE_SELECTION_IF_FEATURE_SUPPORTED accepts that feature set.
// The pointer itself is const: nothing is supposed to retarget it after the selection.
static decltype(transform_impl<CPUFeature::None>)* const transform_dispatched = [] {
    // NOTE: The selection macro assigns to a local variable that has to be called `result`.
    auto result = transform_impl<CPUFeature::None>;
    AK_UPDATE_SELECTION_IF_FEATURE_SUPPORTED(CPUFeature::X86_SSE42 | CPUFeature::X86_SHA, transform_impl);
    return result;
}();

inline void SHA1::transform(u8 const (&data)[BlockSize])
{
transform_impl(m_state, data);
transform_dispatched(m_state, data);
}

void SHA1::update(u8 const* message, size_t length)
Expand Down
43 changes: 17 additions & 26 deletions Userland/Libraries/LibCrypto/Hash/SHA2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
* SPDX-License-Identifier: BSD-2-Clause
*/

#include <AK/CPUFeature.h>
#include <AK/Platform.h>
#include <AK/Types.h>
#include <LibCrypto/Hash/SHA2.h>

#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
#if !defined(KERNEL)
# include <AK/SIMD.h>
# include <AK/SIMDExtras.h>
# include <cpuid.h>
#endif

namespace Crypto::Hash {
Expand All @@ -32,7 +32,11 @@ constexpr static auto EP1(u64 x) { return ROTRIGHT(x, 14) ^ ROTRIGHT(x, 18) ^ RO
// XORs ROTRIGHT(x, 1), ROTRIGHT(x, 8) and x shifted right by 7.
// NOTE(review): ROTRIGHT is defined outside this view and is presumably a right rotation;
//               with that reading the constants match the sigma-0 function of SHA-384/512.
constexpr static auto SIGN0(u64 x)
{
    auto const rotated_by_1 = ROTRIGHT(x, 1);
    auto const rotated_by_8 = ROTRIGHT(x, 8);
    return rotated_by_1 ^ rotated_by_8 ^ (x >> 7);
}
// XORs ROTRIGHT(x, 19), ROTRIGHT(x, 61) and x shifted right by 6.
// NOTE(review): ROTRIGHT is defined outside this view and is presumably a right rotation;
//               with that reading the constants match the sigma-1 function of SHA-384/512.
constexpr static auto SIGN1(u64 x)
{
    auto const rotated_by_19 = ROTRIGHT(x, 19);
    auto const rotated_by_61 = ROTRIGHT(x, 61);
    return rotated_by_19 ^ rotated_by_61 ^ (x >> 6);
}

static void SHA256_transform_impl_base(u32 (&state)[8], u8 const (&data)[64])
template<CPUFeature feature>
static void SHA256_transform_impl(u32 (&state)[8], u8 const (&data)[64]);

template<>
void SHA256_transform_impl<CPUFeature::None>(u32 (&state)[8], u8 const (&data)[64])
{
constexpr static auto BlockSize = 64;
constexpr static auto Rounds = 64;
Expand Down Expand Up @@ -76,12 +80,13 @@ static void SHA256_transform_impl_base(u32 (&state)[8], u8 const (&data)[64])
state[7] += h;
}

#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
// Note: The SHA extension was introduced with
// Intel Goldmont (SSE4.2), Ice Lake (AVX512), Rocket Lake (AVX512), and AMD Zen (AVX2)
// So it's safe to assume that if we have SHA we have at least SSE4.2
// ~https://en.wikipedia.org/wiki/Intel_SHA_extensions
[[gnu::target("sha,sse4.2")]] static void SHA256_transform_impl_sha(u32 (&state)[8], u8 const (&data)[64])
#if AK_CAN_CODEGEN_FOR_X86_SHA && AK_CAN_CODEGEN_FOR_X86_SSE42
template<>
[[gnu::target("sha,sse4.2")]] void SHA256_transform_impl<CPUFeature::X86_SSE42 | CPUFeature::X86_SHA>(u32 (&state)[8], u8 const (&data)[64])
{
using AK::SIMD::i32x4, AK::SIMD::u32x4;

Expand Down Expand Up @@ -124,31 +129,17 @@ static void SHA256_transform_impl_base(u32 (&state)[8], u8 const (&data)[64])
AK::SIMD::store_unaligned(&state[0], states[0]);
AK::SIMD::store_unaligned(&state[4], states[1]);
}
// FIXME: We need a custom resolver as Clang and GCC either refuse or silently ignore the `sha` target
// for function multiversioning
[[gnu::ifunc("resolve_SHA256_transform_impl")]] static void SHA256_transform_impl(u32 (&state)[8], u8 const (&data)[64]);
namespace {
extern "C" [[gnu::used]] decltype(&SHA256_transform_impl) resolve_SHA256_transform_impl()
{
// FIXME: Use __builtin_cpu_supports("sha") when compilers support it
constexpr u32 cpuid_sha_ebx = 1 << 29;
u32 eax, ebx, ecx, edx;
__cpuid_count(7, 0, eax, ebx, ecx, edx);
if (ebx & cpuid_sha_ebx)
return SHA256_transform_impl_sha;

// FIXME: Investigate if more target clones (avx) make sense

return SHA256_transform_impl_base;
}
}
#else
# define SHA256_transform_impl SHA256_transform_impl_base
#endif

// The SHA-256 block transform that SHA256::transform calls. It is selected once, while
// this variable is initialized: the generic SHA256_transform_impl<CPUFeature::None> is the
// default and is replaced by the SSE4.2 + SHA specialization if
// AK_UPDATE_SELECTION_IF_FEATURE_SUPPORTED accepts that feature set.
// The pointer itself is const: nothing is supposed to retarget it after the selection.
static decltype(SHA256_transform_impl<CPUFeature::None>)* const SHA256_transform_dispatched = [] {
    // NOTE: The selection macro assigns to a local variable that has to be called `result`.
    auto result = SHA256_transform_impl<CPUFeature::None>;
    AK_UPDATE_SELECTION_IF_FEATURE_SUPPORTED(CPUFeature::X86_SSE42 | CPUFeature::X86_SHA, SHA256_transform_impl);
    return result;
}();

inline void SHA256::transform(u8 const (&data)[BlockSize])
{
SHA256_transform_impl(m_state, data);
SHA256_transform_dispatched(m_state, data);
}

template<size_t BlockSize, typename Callback>
Expand Down

0 comments on commit ee3a6d9

Please sign in to comment.