From 42153716be3517ee578e244e92eface91a07fdfa Mon Sep 17 00:00:00 2001 From: Aurumaker72 <48759429+Aurumaker72@users.noreply.github.com> Date: Thu, 19 Jun 2025 15:19:49 +0200 Subject: [PATCH 1/3] rewrite UnswapCopy to cpp uses SSE2 & 3 for better throughput, but this isn't a hot path anyway so whatever --- src/GBI.cpp | 2 +- src/Textures.cpp | 2 +- src/convert.h | 123 ++++++++++++++++++++++++----------------------- src/gDP.cpp | 6 +-- src/stdafx.h | 2 + 5 files changed, 70 insertions(+), 65 deletions(-) diff --git a/src/GBI.cpp b/src/GBI.cpp index 7c6224e..d189c1a 100644 --- a/src/GBI.cpp +++ b/src/GBI.cpp @@ -217,7 +217,7 @@ MicrocodeInfo* GBI_DetectMicrocode(u32 uc_start, u32 uc_dstart, u16 uc_dsize) // See if we can identify it by text char uc_data[2048]; - UnswapCopy(&RDRAM[uc_dstart & 0x1FFFFFFF], uc_data, 2048); + unswap_copy(&RDRAM[uc_dstart & 0x1FFFFFFF], (uint8_t*)uc_data, 2048); strcpy(uc_str, "Not Found"); for (u32 i = 0; i < 2048; i++) diff --git a/src/Textures.cpp b/src/Textures.cpp index d53e24c..0749afb 100644 --- a/src/Textures.cpp +++ b/src/Textures.cpp @@ -544,7 +544,7 @@ void TextureCache_LoadBackground(CachedTexture* texInfo) bpl = gSP.bgImage.width << gSP.bgImage.size >> 1; numBytes = bpl * gSP.bgImage.height; swapped = (u8*)malloc(numBytes); - UnswapCopy(&RDRAM[gSP.bgImage.address], swapped, numBytes); + unswap_copy(&RDRAM[gSP.bgImage.address], swapped, numBytes); dest = (u32*)malloc(texInfo->textureBytes); clampSClamp = texInfo->width - 1; diff --git a/src/convert.h b/src/convert.h index c77d7a5..7fb7e81 100644 --- a/src/convert.h +++ b/src/convert.h @@ -1,3 +1,5 @@ +#include + #ifndef CONVERT_H #define CONVERT_H @@ -170,69 +172,70 @@ const unsigned char One2Eight[2] = } }*/ -inline void UnswapCopy(void* src, void* dest, u32 numBytes) +inline void bswap_4_x32_sse2(__m128i& vec) { - __asm + __m128i tmp1 = _mm_srli_epi32(vec, 24); + __m128i tmp2 = _mm_slli_epi32(vec, 24); + + __m128i t1 = _mm_and_si128(_mm_srli_epi32(vec, 8), 
_mm_set1_epi32(0x0000FF00)); + __m128i t2 = _mm_and_si128(_mm_slli_epi32(vec, 8), _mm_set1_epi32(0x00FF0000)); + + vec = _mm_or_si128(tmp2, tmp1); + vec = _mm_or_si128(vec, t1); + vec = _mm_or_si128(vec, t2); +} + +/** + * \brief Copies data from a source buffer to a destination buffer while performing a byteswap within 4-byte groups. + * \param src The source buffer. + * \param dest The destination buffer. + * \param num_bytes The number of bytes to copy. + */ +inline void unswap_copy(uint8_t* p_src, uint8_t* p_dest, u32 num_bytes) +{ + const uintptr_t src_addr = reinterpret_cast(p_src); + u32 leading_bytes = src_addr & 3; + + if (leading_bytes != 0) { - mov ecx, 0 - mov esi, dword ptr [src] - mov edi, dword ptr [dest] - - mov ebx, esi - and ebx, 3 // ebx = number of leading bytes - - cmp ebx, 0 - jz StartDWordLoop - neg ebx - add ebx, 4 - - cmp ebx, [numBytes] - jle NotGreater - mov ebx, [numBytes] - NotGreater: - mov ecx, ebx - xor esi, 3 - LeadingLoop: // Copies leading bytes, in reverse order (un-swaps) - mov al, byte ptr [esi] - mov byte ptr [edi], al - sub esi, 1 - add edi, 1 - loop LeadingLoop - add esi, 5 - - StartDWordLoop: - mov ecx, dword ptr [numBytes] - sub ecx, ebx // Don't copy what's already been copied - - mov ebx, ecx - and ebx, 3 - // add ecx, 3 // Round up to nearest dword - shr ecx, 2 - - cmp ecx, 0 // If there's nothing to do, don't do it - jle StartTrailingLoop + leading_bytes = 4 - leading_bytes; + leading_bytes = min(leading_bytes, num_bytes); - // Copies from source to destination, bswap-ing first - DWordLoop: - mov eax, dword ptr [esi] - bswap eax - mov dword ptr [edi], eax - add esi, 4 - add edi, 4 - loop DWordLoop - StartTrailingLoop: - cmp ebx, 0 - jz Done - mov ecx, ebx - xor esi, 3 - - TrailingLoop: - mov al, byte ptr [esi] - mov byte ptr [edi], al - sub esi, 1 - add edi, 1 - loop TrailingLoop - Done: + for (u32 i = 0; i < leading_bytes; ++i) + p_dest[i] = p_src[3 - i]; + + p_src += leading_bytes; + p_dest += leading_bytes; + 
num_bytes -= leading_bytes; + } + + const u32 sse_block_size = (num_bytes / 16) * 16; + for (u32 i = 0; i < sse_block_size; i += 16) + { + __m128i data = _mm_loadu_si128(reinterpret_cast(p_src)); + bswap_4_x32_sse2(data); + _mm_storeu_si128(reinterpret_cast<__m128i*>(p_dest), data); + p_src += 16; + p_dest += 16; + } + + num_bytes -= sse_block_size; + + for (u32 i = 0; i < num_bytes / 4; ++i) + { + uint32_t val{}; + std::memcpy(&val, p_src, sizeof(uint32_t)); + val = (val >> 24 & 0x000000FF) | (val >> 8) & 0x0000FF00 | (val << 8) & 0x00FF0000 | (val << 24) & 0xFF000000; + std::memcpy(p_dest, &val, sizeof(uint32_t)); + p_src += 4; + p_dest += 4; + } + + const u32 trailing_bytes = num_bytes % 4; + if (trailing_bytes > 0) + { + for (u32 i = 0; i < trailing_bytes; ++i) + p_dest[i] = p_src[3 - i]; } } diff --git a/src/gDP.cpp b/src/gDP.cpp index e0a236c..0632867 100644 --- a/src/gDP.cpp +++ b/src/gDP.cpp @@ -592,7 +592,7 @@ void gDPLoadTile(u32 tile, u32 uls, u32 ult, u32 lrs, u32 lrt) for (y = 0; y < height; y++) { - UnswapCopy(src, dest, bpl); + unswap_copy(src, (uint8_t*)dest, bpl); if (y & 1) Interleave(dest, line); @@ -643,7 +643,7 @@ void gDPLoadBlock(u32 tile, u32 uls, u32 ult, u32 lrs, u32 dxt) for (u32 y = 0; y < height; y++) { - UnswapCopy(src, dest, bpl); + unswap_copy((uint8_t*)src, (uint8_t*)dest, bpl); if (y & 1) Interleave(dest, line); @@ -652,7 +652,7 @@ void gDPLoadBlock(u32 tile, u32 uls, u32 ult, u32 lrs, u32 dxt) } } else - UnswapCopy(src, dest, bytes); + unswap_copy((uint8_t*)src, (uint8_t*)dest, bytes); gDP.textureMode = TEXTUREMODE_NORMAL; gDP.loadType = LOADTYPE_BLOCK; diff --git a/src/stdafx.h b/src/stdafx.h index d1f36b1..9f1bebc 100644 --- a/src/stdafx.h +++ b/src/stdafx.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "Types.h" #define EXPORT __declspec(dllexport) From 9b0aac954747cdb079e8a4d199f3348c1ad1763c Mon Sep 17 00:00:00 2001 From: Aurumaker72 <48759429+Aurumaker72@users.noreply.github.com> Date: Thu, 19 Jun 
2025 15:20:28 +0200 Subject: [PATCH 2/3] remove dead code from convert.h --- src/convert.h | 78 ++------------------------------------------------- 1 file changed, 2 insertions(+), 76 deletions(-) diff --git a/src/convert.h b/src/convert.h index 7fb7e81..7377521 100644 --- a/src/convert.h +++ b/src/convert.h @@ -1,7 +1,4 @@ -#include - -#ifndef CONVERT_H -#define CONVERT_H +#pragma once const unsigned char Five2Eight[32] = { @@ -102,76 +99,6 @@ const unsigned char One2Eight[2] = 255, // 1 = 11111111 }; -// Un-swaps on the dword, works with non-dword aligned addresses -/*inline void UnswapCopy( void *src, void *dest, u32 numBytes ) -{ - __asm - { - mov ecx, 0 - mov esi, dword ptr [src] - mov edi, dword ptr [dest] - - mov ebx, esi - and ebx, 3 // ebx = number of leading bytes - - cmp ebx, 0 - jz StartDWordLoop - - neg ebx - add ebx, 4 - cmp ebx, [numBytes] - jle NotGreater - mov ebx, [numBytes] -NotGreater: - mov ecx, ebx - - xor esi, 3 - -LeadingLoop: // Copies leading bytes, in reverse order (un-swaps) - mov al, byte ptr [esi] - mov byte ptr [edi], al - sub esi, 1 - add edi, 1 - loop LeadingLoop - add esi, 5 - -StartDWordLoop: - mov ecx, dword ptr [numBytes] - sub ecx, ebx // Don't copy what's already been copied - - mov ebx, ecx - and ebx, 3 // ebx = number of trailing bytes - - shr ecx, 2 // ecx = number of dwords - - cmp ecx, 0 // If there's nothing to do, don't do it - jz StartTrailingLoop - - // Copies from source to destination, bswap-ing first -DWordLoop: - mov eax, dword ptr [esi] - bswap eax - mov dword ptr [edi], eax - add esi, 4 - add edi, 4 - loop DWordLoop - -StartTrailingLoop: - cmp ebx, 0 - jz Done - mov ecx, ebx - add esi, 3 - -TrailingLoop: - mov al, byte ptr [esi] - mov byte ptr [esi], al - sub esi, 1 - add edi, 1 - loop TrailingLoop -Done: - } -}*/ - inline void bswap_4_x32_sse2(__m128i& vec) { __m128i tmp1 = _mm_srli_epi32(vec, 24); @@ -542,5 +469,4 @@ inline u32 I4_RGBA8888(u8 color) mov ah, cl mov al, cl } -} -#endif +} \ No newline at end of 
file From 454c2be7aee63707c618b01efc21b8a3e87506da Mon Sep 17 00:00:00 2001 From: Aurumaker72 <48759429+Aurumaker72@users.noreply.github.com> Date: Thu, 19 Jun 2025 15:20:59 +0200 Subject: [PATCH 3/3] clean up unswap_copy --- src/convert.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/convert.h b/src/convert.h index 7377521..0920a8d 100644 --- a/src/convert.h +++ b/src/convert.h @@ -115,12 +115,12 @@ inline void bswap_4_x32_sse2(__m128i& vec) /** * \brief Copies data from a source buffer to a destination buffer while performing a byteswap within 4-byte groups. * \param src The source buffer. - * \param dest The destination buffer. + * \param dst The destination buffer. * \param num_bytes The number of bytes to copy. */ -inline void unswap_copy(uint8_t* p_src, uint8_t* p_dest, u32 num_bytes) +inline void unswap_copy(uint8_t* src, uint8_t* dst, u32 num_bytes) { - const uintptr_t src_addr = reinterpret_cast(p_src); + const uintptr_t src_addr = reinterpret_cast(src); u32 leading_bytes = src_addr & 3; if (leading_bytes != 0) @@ -129,21 +129,21 @@ inline void unswap_copy(uint8_t* p_src, uint8_t* p_dest, u32 num_bytes) leading_bytes = min(leading_bytes, num_bytes); for (u32 i = 0; i < leading_bytes; ++i) - p_dest[i] = p_src[3 - i]; + dst[i] = src[3 - i]; - p_src += leading_bytes; - p_dest += leading_bytes; + src += leading_bytes; + dst += leading_bytes; num_bytes -= leading_bytes; } const u32 sse_block_size = (num_bytes / 16) * 16; for (u32 i = 0; i < sse_block_size; i += 16) { - __m128i data = _mm_loadu_si128(reinterpret_cast(p_src)); + __m128i data = _mm_loadu_si128(reinterpret_cast(src)); bswap_4_x32_sse2(data); - _mm_storeu_si128(reinterpret_cast<__m128i*>(p_dest), data); - p_src += 16; - p_dest += 16; + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), data); + src += 16; + dst += 16; } num_bytes -= sse_block_size; @@ -151,18 +151,18 @@ inline void unswap_copy(uint8_t* p_src, uint8_t* p_dest, u32 num_bytes) 
for (u32 i = 0; i < num_bytes / 4; ++i) { uint32_t val{}; - std::memcpy(&val, p_src, sizeof(uint32_t)); + std::memcpy(&val, src, sizeof(uint32_t)); val = (val >> 24 & 0x000000FF) | (val >> 8) & 0x0000FF00 | (val << 8) & 0x00FF0000 | (val << 24) & 0xFF000000; - std::memcpy(p_dest, &val, sizeof(uint32_t)); - p_src += 4; - p_dest += 4; + std::memcpy(dst, &val, sizeof(uint32_t)); + src += 4; + dst += 4; } const u32 trailing_bytes = num_bytes % 4; if (trailing_bytes > 0) { for (u32 i = 0; i < trailing_bytes; ++i) - p_dest[i] = p_src[3 - i]; + dst[i] = src[3 - i]; } } @@ -469,4 +469,4 @@ inline u32 I4_RGBA8888(u8 color) mov ah, cl mov al, cl } -} \ No newline at end of file +}