GS/HW: Allow blending on normal shuffles

refractionpcsx2 · refractionpcsx2 · commit 01842a3c6bb4 · 2024-01-09T13:17:52.000Z
diff --git a/bin/resources/shaders/dx11/tfx.fx b/bin/resources/shaders/dx11/tfx.fx
@@ -742,6 +742,25 @@ float4 ps_color(PS_INPUT input)
 	float4 T = sample_color(st, input.t.w);
 #endif
 
+	if (PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC)
+	{
+		uint4 denorm_c_before = uint4(T);
+		if (PS_READ_BA)
+		{
+			T.r = float((denorm_c_before.b << 3) & 0xF8);
+			T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
+			T.b = float((denorm_c_before.a << 1) & 0xF8);
+			T.a = float(denorm_c_before.a & 0x80);
+		}
+		else
+		{
+			T.r = float((denorm_c_before.r << 3) & 0xF8);
+			T.g = float(((denorm_c_before.r >> 2) & 0x38) | ((denorm_c_before.g << 6) & 0xC0));
+			T.b = float((denorm_c_before.g << 1) & 0xF8);
+			T.a = float(denorm_c_before.g & 0x80);
+		}
+	}
+
 	float4 C = tfx(T, input.c);
 
 	atst(C);
@@ -925,48 +944,6 @@ PS_OUTPUT ps_main(PS_INPUT input)
 			discard;
 	}
 
-	if (PS_SHUFFLE)
-	{
-		uint4 denorm_c = uint4(C);
-		uint2 denorm_TA = uint2(float2(TA.xy) * 255.0f + 0.5f);
-
-		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
-		if (PS_SHUFFLE_SAME)
-		{
-			if (PS_READ_BA)
-				C = (float4)(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
-			else
-				C.ga = C.rg;
-		}
-		// Copy of a 16bit source in to this target
-		else if (PS_READ16_SRC)
-		{
-			C.rb = (float2)float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5));
-			if (denorm_c.a & 0x80u)
-				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u));
-			else
-				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u));
-		}
-		// Write RB part. Mask will take care of the correct destination
-		else if (PS_READ_BA)
-		{
-			C.rb = C.bb;
-			if (denorm_c.a & 0x80u)
-				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		}
-		else
-		{
-			C.rb = C.rr;
-			if (denorm_c.g & 0x80u)
-				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
-
-			else
-				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		}
-	}
-
 	// Must be done before alpha correction
 
 	// AA (Fixed one) will output a coverage of 1.0 as alpha
@@ -1023,6 +1000,63 @@ PS_OUTPUT ps_main(PS_INPUT input)
 
 	ps_blend(C, alpha_blend, input.p.xy);
 
+	if (PS_SHUFFLE)
+	{
+		if (!PS_SHUFFLE_SAME && !PS_READ16_SRC)
+		{
+			uint4 denorm_c_after = uint4(C);
+			if (PS_READ_BA)
+			{
+				C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+				C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+			}
+			else
+			{
+				C.r = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+				C.g = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+			}
+		}
+
+		uint4 denorm_c = uint4(C);
+		uint2 denorm_TA = uint2(float2(TA.xy) * 255.0f + 0.5f);
+
+		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
+		if (PS_SHUFFLE_SAME)
+		{
+			if (PS_READ_BA)
+				C = (float4)(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
+			else
+				C.ga = C.rg;
+		}
+		// Copy of a 16bit source in to this target
+		else if (PS_READ16_SRC)
+		{
+			C.rb = (float2)float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5));
+			if (denorm_c.a & 0x80u)
+				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u));
+			else
+				C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u));
+		}
+		// Write RB part. Mask will take care of the correct destination
+		else if (PS_READ_BA)
+		{
+			C.rb = C.bb;
+			if (denorm_c.a & 0x80u)
+				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		}
+		else
+		{
+			C.rb = C.rr;
+			if (denorm_c.g & 0x80u)
+				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
+
+			else
+				C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		}
+	}
+
 	ps_dither(C.rgb, input.p.xy);
 
 	// Color clamp/wrap needs to be done after sw blending and dithering
diff --git a/bin/resources/shaders/opengl/tfx_fs.glsl b/bin/resources/shaders/opengl/tfx_fs.glsl
@@ -687,6 +687,21 @@ vec4 ps_color()
 	vec4 T = sample_color(st);
 #endif
 
+	#if PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC
+		uvec4 denorm_c_before = uvec4(T);
+		#if PS_READ_BA
+			T.r = float((denorm_c_before.b << 3) & 0xF8);
+			T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
+			T.b = float((denorm_c_before.a << 1) & 0xF8);
+			T.a = float(denorm_c_before.a & 0x80);
+		#else
+			T.r = float((denorm_c_before.r << 3) & 0xF8);
+			T.g = float(((denorm_c_before.r >> 2) & 0x38) | ((denorm_c_before.g << 6) & 0xC0));
+			T.b = float((denorm_c_before.g << 1) & 0xF8);
+			T.a = float(denorm_c_before.g & 0x80);
+		#endif
+	#endif
+	
 	vec4 C = tfx(T, PSin.c);
 
 	atst(C);
@@ -937,7 +952,56 @@ void ps_main()
 
 	vec4 C = ps_color();
 
+	// Must be done before alpha correction
+
+	// AA (Fixed one) will output a coverage of 1.0 as alpha
+#if PS_FIXED_ONE_A
+	C.a = 128.0f;
+#endif
+
+#if SW_AD_TO_HW
+	vec4 RT = trunc(fetch_rt() * 255.0f + 0.1f);
+	vec4 alpha_blend = vec4(RT.a / 128.0f);
+#else
+	vec4 alpha_blend = vec4(C.a / 128.0f);
+#endif
+
+	// Correct the ALPHA value based on the output format
+#if (PS_DST_FMT == FMT_16)
+	float A_one = 128.0f; // alpha output will be 0x80
+	C.a = (PS_FBA != 0) ? A_one : step(128.0f, C.a) * A_one;
+#elif (PS_DST_FMT == FMT_32) && (PS_FBA != 0)
+	if(C.a < 128.0f) C.a += 128.0f;
+#endif
+
+	// Get first primitive that will write a failing alpha value
+#if PS_DATE == 1
+	// DATM == 0
+	// Pixel with alpha equal to 1 will fail (128-255)
+	SV_Target0 = (C.a > 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
+	return;
+#elif PS_DATE == 2
+	// DATM == 1
+	// Pixel with alpha equal to 0 will fail (0-127)
+	SV_Target0 = (C.a < 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
+	return;
+#endif
+
+	ps_blend(C, alpha_blend);
+
+
 #if PS_SHUFFLE
+	#if !PS_SHUFFLE_SAME && !PS_READ16_SRC
+		uvec4 denorm_c_after = uvec4(C);
+		#if PS_READ_BA
+			C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+			C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+		#else
+			C.r = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+			C.g = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+		#endif
+	#endif
+	
 	uvec4 denorm_c = uvec4(C);
 	uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);
 
@@ -991,43 +1055,6 @@ void ps_main()
 #endif // PS_SHUFFLE_SAME
 #endif // PS_SHUFFLE
 
-	// Must be done before alpha correction
-
-	// AA (Fixed one) will output a coverage of 1.0 as alpha
-#if PS_FIXED_ONE_A
-	C.a = 128.0f;
-#endif
-
-#if SW_AD_TO_HW
-	vec4 RT = trunc(fetch_rt() * 255.0f + 0.1f);
-	vec4 alpha_blend = vec4(RT.a / 128.0f);
-#else
-	vec4 alpha_blend = vec4(C.a / 128.0f);
-#endif
-
-	// Correct the ALPHA value based on the output format
-#if (PS_DST_FMT == FMT_16)
-	float A_one = 128.0f; // alpha output will be 0x80
-	C.a = (PS_FBA != 0) ? A_one : step(128.0f, C.a) * A_one;
-#elif (PS_DST_FMT == FMT_32) && (PS_FBA != 0)
-	if(C.a < 128.0f) C.a += 128.0f;
-#endif
-
-	// Get first primitive that will write a failling alpha value
-#if PS_DATE == 1
-	// DATM == 0
-	// Pixel with alpha equal to 1 will failed (128-255)
-	SV_Target0 = (C.a > 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
-	return;
-#elif PS_DATE == 2
-	// DATM == 1
-	// Pixel with alpha equal to 0 will failed (0-127)
-	SV_Target0 = (C.a < 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
-	return;
-#endif
-
-	ps_blend(C, alpha_blend);
-
 	ps_dither(C.rgb);
 
 	// Color clamp/wrap needs to be done after sw blending and dithering
diff --git a/bin/resources/shaders/vulkan/tfx.glsl b/bin/resources/shaders/vulkan/tfx.glsl
@@ -933,6 +933,21 @@ vec4 ps_color()
 	vec4 T = sample_color(st);
 #endif
 
+	#if PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC
+		uvec4 denorm_c_before = uvec4(T);
+		#if PS_READ_BA
+			T.r = float((denorm_c_before.b << 3) & 0xF8);
+			T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
+			T.b = float((denorm_c_before.a << 1) & 0xF8);
+			T.a = float(denorm_c_before.a & 0x80);
+		#else
+			T.r = float((denorm_c_before.r << 3) & 0xF8);
+			T.g = float(((denorm_c_before.r >> 2) & 0x38) | ((denorm_c_before.g << 6) & 0xC0));
+			T.b = float((denorm_c_before.g << 1) & 0xF8);
+			T.a = float(denorm_c_before.g & 0x80);
+		#endif
+	#endif
+	
 	vec4 C = tfx(T, vsIn.c);
 
 	atst(C);
@@ -1184,40 +1199,6 @@ void main()
 
 	vec4 C = ps_color();
 
-	#if PS_SHUFFLE
-		uvec4 denorm_c = uvec4(C);
-		uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);
-		
-		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
-		#if PS_SHUFFLE_SAME
-			#if (PS_READ_BA)
-				C = vec4(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
-			#else
-				C.ga = C.rg;
-			#endif
-		// Copy of a 16bit source in to this target
-		#elif PS_READ16_SRC
-			C.rb = vec2(float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5)));
-			if ((denorm_c.a & 0x80u) != 0u)
-				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u)));
-		// Write RB part. Mask will take care of the correct destination
-		#elif PS_READ_BA
-			C.rb = C.bb;
-			if ((denorm_c.a & 0x80u) != 0u)
-				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		#else
-			C.rb = C.rr;
-			if ((denorm_c.g & 0x80u) != 0u)
-				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
-			else
-				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
-		#endif // PS_SHUFFLE_SAME
-	#endif // PS_SHUFFLE
-
 	// Must be done before alpha correction
 
 	// AA (Fixed one) will output a coverage of 1.0 as alpha
@@ -1254,9 +1235,53 @@ void main()
 	o_col0 = (C.a < 127.5f) ? vec4(gl_PrimitiveID) : vec4(0x7FFFFFFF);
 
 #else
-
 	ps_blend(C, alpha_blend);
 
+#if PS_SHUFFLE
+		#if !PS_SHUFFLE_SAME && !PS_READ16_SRC
+			uvec4 denorm_c_after = uvec4(C);
+			#if PS_READ_BA
+				C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+				C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+			#else
+				C.r = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
+				C.g = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
+			#endif
+		#endif
+
+		uvec4 denorm_c = uvec4(C);
+		uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);
+		
+		// Special case for 32bit input and 16bit output, shuffle used by The Godfather
+		#if PS_SHUFFLE_SAME
+			#if (PS_READ_BA)
+				C = vec4(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
+			#else
+				C.ga = C.rg;
+			#endif
+		// Copy of a 16bit source in to this target
+		#elif PS_READ16_SRC
+			C.rb = vec2(float((denorm_c.r >> 3) | (((denorm_c.g >> 3) & 0x7u) << 5)));
+			if ((denorm_c.a & 0x80u) != 0u)
+				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u)));
+		// Write RB part. Mask will take care of the correct destination
+		#elif PS_READ_BA
+			C.rb = C.bb;
+			if ((denorm_c.a & 0x80u) != 0u)
+				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		#else
+			C.rb = C.rr;
+			if ((denorm_c.g & 0x80u) != 0u)
+				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
+			else
+				C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
+		#endif // PS_SHUFFLE_SAME
+	#endif // PS_SHUFFLE
+
 	ps_dither(C.rgb);
 
 	// Color clamp/wrap needs to be done after sw blending and dithering
diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp
@@ -5169,7 +5169,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 	}
 
 	bool blending_alpha_pass = false;
-	if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && (m_conf.colormask.wrgba & 0x7))
+	if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && ((m_conf.colormask.wrgba & 0x7) || (m_texture_shuffle && !m_copy_16bit_to_target_shuffle && !m_same_group_texture_shuffle)))
 	{
 		EmulateBlending(blend_alpha_min, blend_alpha_max, DATE_PRIMID, DATE_BARRIER, blending_alpha_pass);
 	}
diff --git a/pcsx2/GS/Renderers/Metal/tfx.metal b/pcsx2/GS/Renderers/Metal/tfx.metal
diff --git a/pcsx2/ShaderCacheVersion.h b/pcsx2/ShaderCacheVersion.h

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -5169,7 +5169,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta` |
| `5169` | `5169` | `}` |
| `5170` | `5170` | |
| `5171` | `5171` | `bool blending_alpha_pass = false;` |
| `5172` | | `- if ((!IsOpaque() \|\| m_context->ALPHA.IsBlack()) && rt && (m_conf.colormask.wrgba & 0x7))` |
| | `5172` | `+ if ((!IsOpaque() \|\| m_context->ALPHA.IsBlack()) && rt && ((m_conf.colormask.wrgba & 0x7) \|\| (m_texture_shuffle && !m_copy_16bit_to_target_shuffle && !m_same_group_texture_shuffle)))` |
| `5173` | `5173` | `{` |
| `5174` | `5174` | `EmulateBlending(blend_alpha_min, blend_alpha_max, DATE_PRIMID, DATE_BARRIER, blending_alpha_pass);` |
| `5175` | `5175` | `}` |