Skip to content

Commit

Permalink
Add fast_matrix_mul_4x4_lsx function for LoongArch64
Browse files Browse the repository at this point in the history
  • Loading branch information
KatyushaScarlet committed Feb 8, 2025
1 parent d5bd7d2 commit a9c7fe3
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ if(CMAKE_SYSTEM_PROCESSOR)
set(RISCV64_DEVICE ON)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^loongarch64")
# Mark the build as targeting a LoongArch64 device.
set(LOONGARCH64_DEVICE ON)
# Pass -mlsx to the compiler at directory scope so the LSX intrinsics
# (lsxintrin.h) can be compiled.
add_compile_options(-mlsx)
# NOTE(review): -mlasx is also added at directory scope. Presumably this lets
# the compiler emit 256-bit LASX instructions anywhere in the build, which
# would fault on LoongArch64 CPUs that only implement LSX — confirm that every
# supported device has LASX, or gate this flag behind an option.
add_compile_options(-mlasx)
else()
message("Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
Expand Down
6 changes: 6 additions & 0 deletions Common/Math/SIMDHeaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@
#endif
#endif

// LoongArch64: include the LSX intrinsics header, but only when the LSX
// feature macro is enabled for this build.
// NOTE(review): presumably further LoongArch SIMD headers would be added as
// siblings inside the outer check — confirm before flattening the nesting.
#if PPSSPP_ARCH(LOONGARCH64)
#if PPSSPP_ARCH(LOONGARCH64_LSX)
#include <lsxintrin.h>
#endif
#endif

// Basic types

#if PPSSPP_ARCH(ARM64_NEON)
Expand Down
29 changes: 29 additions & 0 deletions Common/Math/fast/fast_matrix.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,35 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
}
}

#elif PPSSPP_ARCH(LOONGARCH64_LSX)

// Broadcasts a single float into all four lanes of an LSX vector.
//
// Deliberately NOT named with the "__lsx_" prefix: identifiers starting with a
// double underscore are reserved for the implementation, and that prefix is the
// namespace used by the toolchain's lsxintrin.h, so a same-named intrinsic or
// macro there would clash with a local definition.
static inline __m128 lsx_splat_f32(const float val) {
	v4f32 res = {val, val, val, val};
	return (__m128)res;
}

// 4x4 float matrix multiply using LoongArch LSX (128-bit) intrinsics.
//
// dest, a and b each point to 16 floats, handled as four groups of four
// consecutive floats. For every group start i in {0, 4, 8, 12}:
//
//   dest[i..i+3] = a[0..3]   * b[i]
//                + a[4..7]   * b[i + 1]
//                + a[8..11]  * b[i + 2]
//                + a[12..15] * b[i + 3]
//
// All of `a` is loaded before the loop, and each group of `b` is read before
// the matching group of `dest` is stored, so `dest` may be the same array as
// `a` or `b`.
void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b) {
	// The four groups of `a` are loop-invariant, so load them once up front.
	// __lsx_vld yields an integer vector type; the cast only reinterprets it.
	__m128 a_col_1 = (__m128)__lsx_vld(a, 0);
	__m128 a_col_2 = (__m128)__lsx_vld(a + 4, 0);
	__m128 a_col_3 = (__m128)__lsx_vld(a + 8, 0);
	__m128 a_col_4 = (__m128)__lsx_vld(a + 12, 0);

	for (int i = 0; i < 16; i += 4) {
		// Splat each scalar of the current group of `b` across a full vector.
		__m128 b1 = lsx_splat_f32(b[i]);
		__m128 b2 = lsx_splat_f32(b[i + 1]);
		__m128 b3 = lsx_splat_f32(b[i + 2]);
		__m128 b4 = lsx_splat_f32(b[i + 3]);

		// First term is a plain multiply; the remaining three are accumulated
		// with fused multiply-add: __lsx_vfmadd_s(x, y, acc) == x * y + acc.
		__m128 result = __lsx_vfmul_s(a_col_1, b1);
		result = __lsx_vfmadd_s(a_col_2, b2, result);
		result = __lsx_vfmadd_s(a_col_3, b3, result);
		result = __lsx_vfmadd_s(a_col_4, b4, result);

		__lsx_vst(result, &dest[i], 0);
	}
}

#elif PPSSPP_ARCH(ARM_NEON)

// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
Expand Down
3 changes: 3 additions & 0 deletions Common/Math/fast/fast_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ extern "C" {
// Per-architecture 4x4 float matrix multiply entry points. They all share the
// same (dest, a, b) signature so the dispatch macro below can select any of them.
// NOTE(review): every variant is declared unconditionally; presumably only the
// ones matching the build architecture are actually defined — confirm in
// fast_matrix.c before calling a specific variant directly.
extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
extern void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b);

// Compile-time dispatch: fast_matrix_mul_4x4 is defined as an alias for the
// first architecture-specific implementation whose condition matches, falling
// back to the plain C version when none does.
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
// Hard link to SSE implementations on x86/amd64
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
#elif PPSSPP_ARCH(ARM_NEON)
// NEON implementation on ARM builds with NEON enabled
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
// LSX implementation on LoongArch64 builds with LSX enabled
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_lsx
#else
// Portable fallback
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
#endif
Expand Down
1 change: 1 addition & 0 deletions ppsspp_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
//https://github.com/gcc-mirror/gcc/blob/master/gcc/config/loongarch/loongarch-c.cc
#define PPSSPP_ARCH_LOONGARCH64 1
#define PPSSPP_ARCH_64BIT 1
// Only advertise LSX when the compiler is actually targeting it.
// __loongarch_sx is predefined by the compiler when LSX code generation is
// enabled (e.g. via -mlsx). Without this check, builds that do not pass -mlsx
// would still select the LSX code paths and fail on <lsxintrin.h> and the
// intrinsics.
#if defined(__loongarch_sx)
#define PPSSPP_ARCH_LOONGARCH64_LSX 1
#endif
#endif

// PLATFORM defines
Expand Down

0 comments on commit a9c7fe3

Please sign in to comment.