Skip to content

Commit

Permalink
Merge pull request #19963 from hrydgard/crosssimd-fallback-fix
Browse files Browse the repository at this point in the history
CrossSIMD: Fix more no-simd fallbacks. The depth rasterizer now works in TEST_FALLBACK mode.
  • Loading branch information
hrydgard authored Feb 10, 2025
2 parents 2e5f901 + fd88f79 commit c0a4917
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 36 deletions.
54 changes: 26 additions & 28 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -765,18 +765,18 @@ struct Mat4F32 {
mat.m[1] = src[1];
mat.m[2] = src[2];
mat.m[3] = 0.0f;
mat.m[0] = src[3];
mat.m[1] = src[4];
mat.m[2] = src[5];
mat.m[3] = 0.0f;
mat.m[0] = src[6];
mat.m[1] = src[7];
mat.m[2] = src[8];
mat.m[3] = 0.0f;
mat.m[0] = src[9];
mat.m[1] = src[10];
mat.m[2] = src[11];
mat.m[3] = 1.0f;
mat.m[4] = src[3];
mat.m[5] = src[4];
mat.m[6] = src[5];
mat.m[7] = 0.0f;
mat.m[8] = src[6];
mat.m[9] = src[7];
mat.m[10] = src[8];
mat.m[11] = 0.0f;
mat.m[12] = src[9];
mat.m[13] = src[10];
mat.m[14] = src[11];
mat.m[15] = 1.0f;
return mat;
}

Expand Down Expand Up @@ -1083,23 +1083,21 @@ struct Vec4F32 {
return temp;
}

// In-place transpose of the 4x4 matrix formed by the four vectors.
//
// Treating col0..col3 as the columns of a matrix, on return colN.v[i] holds the
// value that was in coli.v[N] on entry. Diagonal elements are left untouched.
//
// The elements are first gathered into a scratch array and then scattered back,
// rather than swapped pairwise: a sequence of swaps that visits both (a, b) and
// (b, a) exchanges each pair twice and ends up doing nothing at all.
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
	// Gather: m[4 * c + i] is element i of column c.
	float m[16];
	for (int i = 0; i < 4; i++) {
		m[0 + i] = col0.v[i];
		m[4 + i] = col1.v[i];
		m[8 + i] = col2.v[i];
		m[12 + i] = col3.v[i];
	}
	// Scatter with the indices exchanged: element i of column c now comes
	// from element c of the original column i.
	for (int i = 0; i < 4; i++) {
		col0.v[i] = m[i * 4 + 0];
		col1.v[i] = m[i * 4 + 1];
		col2.v[i] = m[i * 4 + 2];
		col3.v[i] = m[i * 4 + 3];
	}
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
Expand Down
3 changes: 0 additions & 3 deletions Common/UI/Root.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,6 @@ bool IsFocusMovementEnabled() {
}

void LayoutViewHierarchy(const UIContext &dc, ViewGroup *root, bool ignoreInsets) {
_assert_(root);
_assert_(&dc);

Bounds rootBounds = ignoreInsets ? dc.GetBounds() : dc.GetLayoutBounds();

MeasureSpec horiz(EXACTLY, rootBounds.w);
Expand Down
6 changes: 6 additions & 0 deletions Core/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "Common/File/FileUtil.h"
#include "Common/File/VFS/VFS.h"
#include "Common/Log/LogManager.h"
#include "Common/Math/CrossSIMD.h"
#include "Common/OSVersion.h"
#include "Common/System/Display.h"
#include "Common/System/System.h"
Expand Down Expand Up @@ -140,6 +141,11 @@ std::string DefaultLangRegion() {
}

static int DefaultDepthRaster() {
#ifdef CROSSSIMD_SLOW
// No SIMD acceleration for the depth rasterizer.
// Default to off.
return (int)DepthRasterMode::OFF;
#endif

// For 64-bit ARM and x86 with SIMD, enable depth raster.
#if PPSSPP_ARCH(ARM64_NEON) || PPSSPP_ARCH(SSE2)
Expand Down
1 change: 1 addition & 0 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *t
}

// These names are wrong until we transpose: x and y initially hold verts[0] and verts[1].
// TODO: Maybe combine two rects here at a time. But hardly relevant for performance.
Vec4F32 x = Vec4F32::Load(verts[0]);
Vec4F32 y = Vec4F32::Load(verts[1]);
Vec4F32 z = Vec4F32::Zero();
Expand Down
10 changes: 5 additions & 5 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,6 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
indexGen.Setup(decIndex_);

#ifdef CROSSSIMD_SLOW
useDepthRaster_ = false;
#else
switch ((DepthRasterMode)g_Config.iDepthRasterMode) {
case DepthRasterMode::DEFAULT:
case DepthRasterMode::LOW_QUALITY:
Expand All @@ -70,7 +67,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
case DepthRasterMode::OFF:
useDepthRaster_ = false;
}
#endif

if (useDepthRaster_) {
depthDraws_.reserve(256);
}
Expand Down Expand Up @@ -933,12 +930,15 @@ Mat4F32 ComputeFinalProjMatrix() {
gstate.getViewportXCenter() - gstate.getOffsetX(),
gstate.getViewportYCenter() - gstate.getOffsetY(),
gstate.getViewportZCenter(),
0.0f,
};

Mat4F32 wv = Mul4x3By4x4(Mat4x3F32(gstate.worldMatrix), Mat4F32::Load4x3(gstate.viewMatrix));
Mat4F32 m = Mul4x4By4x4(wv, Mat4F32(gstate.projMatrix));
// NOTE: Applying the translation actually works pre-divide, since W is also affected.
TranslateAndScaleInplace(m, Vec4F32::LoadF24x3_One(&gstate.viewportxscale), Vec4F32::Load(viewportTranslate));
Vec4F32 scale = Vec4F32::LoadF24x3_One(&gstate.viewportxscale);
Vec4F32 translate = Vec4F32::Load(viewportTranslate);
TranslateAndScaleInplace(m, scale, translate);
return m;
}

Expand Down

0 comments on commit c0a4917

Please sign in to comment.