From 11a1966bf349ab74035df4673293bdeab41b1a96 Mon Sep 17 00:00:00 2001 From: kub Date: Thu, 18 Jul 2024 21:36:43 +0200 Subject: [PATCH] core vdp, arm rendering speed optimisation --- pico/draw.c | 112 ++++++++++++++++++++++++------------------------ pico/draw_arm.S | 69 +++++++++++++---------------- 2 files changed, 85 insertions(+), 96 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 0f9b137a9..387f3a424 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -304,34 +304,34 @@ TileFlipMakerAS(TileFlipSH_AS_and, pix_sh_as_and) // -------------------------------------------- #ifndef _ASM_DRAW_C -#define DrawTile(mask) { \ - if (code!=oldcode) { \ - oldcode = code; \ - \ - pack = 0; \ - if (code != blank) { \ - /* Get tile address/2: */\ - u32 addr = ((code&0x7ff)<<4) + ty; \ - if (code & 0x1000) addr ^= 0xe; /* Y-flip */ \ - \ - pal = ((code>>9)&0x30) | sh; /* shadow */ \ - \ - pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \ - if (!pack) \ - blank = code; \ - } \ - } \ - \ - if (code & 0x8000) { /* (un-forced) high priority tile */ \ - if (sh | (pack&mask)) { \ - code |= (dx<<16) | (ty<<25); \ - if (code & 0x1000) code ^= 0xe<<25; \ - *hc++ = code, *hc++ = pack&mask; /* cache it */ \ - } \ - } else if (pack&mask) { \ - if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \ - else TileNorm(pd + dx, pack&mask, pal); \ - } \ +#define DrawTile(mask) { \ + if (code!=oldcode) { \ + oldcode = code; \ + \ + pack = 0; \ + if (code != blank) { \ + /* Get tile address/2: */ \ + u32 addr = ((code&0x7ff)<<4) + ty; \ + if (code & 0x1000) addr ^= 0xe; /* Y-flip */ \ + \ + pal = ((code>>9)&0x30) | sh; /* shadow */ \ + \ + pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \ + if (!pack) \ + blank = code; \ + } \ + } \ + \ + if (code & 0x8000) { /* (un-forced) high priority tile */ \ + if (sh | (pack&mask)) { \ + code |= (dx<<16) | (ty<<25); \ + if (code & 0x1000) code ^= 0xe<<25; \ + *hc++ = code, *hc++ = pack&mask; /* cache it */ \ + } \ + } else if (pack&mask) { \ + if (code & 
0x0800) TileFlip(pd + dx, pack&mask, pal); \ + else TileNorm(pd + dx, pack&mask, pal); \ + } \ } static void DrawStrip(struct TileStrip *ts, int lflags, int cellskip) @@ -478,34 +478,34 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip) } #endif -#define DrawTileInterlace(mask) { \ - if (code!=oldcode) { \ - oldcode = code; \ - \ - pack = 0; \ - if (code != blank) { \ - /* Get tile address/2: */ \ - u32 addr = ((code&0x3ff)<<5) + ty; \ - if (code & 0x1000) addr ^= 0x1e; /* Y-flip */ \ - \ - pal = ((code>>9)&0x30) | sh; /* shadow */ \ - \ - pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \ - if (!pack) \ - blank = code; \ - } \ - } \ - \ - if (code & 0x8000) { /* high priority tile */ \ - if (sh | (pack&mask)) { \ - code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25); \ - if (code & 0x1000) code ^= 0x1e<<25; \ - *hc++ = code, *hc++ = pack&mask; /* cache it */ \ - } \ - } else if (pack&mask) { \ - if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \ - else TileNorm(pd + dx, pack&mask, pal); \ - } \ +#define DrawTileInterlace(mask) { \ + if (code!=oldcode) { \ + oldcode = code; \ + \ + pack = 0; \ + if (code != blank) { \ + /* Get tile address/2: */ \ + u32 addr = ((code&0x3ff)<<5) + ty; \ + if (code & 0x1000) addr ^= 0x1e; /* Y-flip */ \ + \ + pal = ((code>>9)&0x30) | sh; /* shadow */ \ + \ + pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \ + if (!pack) \ + blank = code; \ + } \ + } \ + \ + if (code & 0x8000) { /* high priority tile */ \ + if (sh | (pack&mask)) { \ + code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25); \ + if (code & 0x1000) code ^= 0x1e<<25; \ + *hc++ = code, *hc++ = pack&mask; /* cache it */ \ + } \ + } else if (pack&mask) { \ + if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \ + else TileNorm(pd + dx, pack&mask, pal); \ + } \ } #ifndef _ASM_DRAW_C diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 7a2e6f1d6..40d5c4431 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -433,7 +433,7 @@ 
DrawLayer: movs r3, r9, lsl #1 @ (force[31]|sh[30]) << 1 mov r3, #0 orrmi r10,r10, #1<<23 @ r10=cells[31:24]|sh[23]|hi_not_empty[22] -@ orrcc r10,r10, #1<<20 @ |had_output[21]|!force[20]|hscroll[19:17]|ty[15:0] +@ orrcc r10,r10, #1<<20 @ |had_output[21]|!force[20]|hscroll[18:16]|ty[15:0] movmi r3, #0x80 @ default to shadowed pal on sh mode and r4, r7, #7 @@ -452,7 +452,7 @@ DrawLayer: mvn r9, #0 @ r9=prevcode=-1 add r1, r11, r7 @ r1=pdest - @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[19:17]|ty[15:0] + @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[18:16]|ty[15:0] @ r1=pd+dx r2=pack r3=pal r5=xmask r6=hc r8=tilex r9=prevcode r11=HighCol r12=nametab lr=vram @ r4 & r7 are scratch in this loop @@ -467,21 +467,19 @@ DrawLayer: add r8, r8, #1 - movs r2, r9, lsl #20 @ if (code&0x1000) - mov r2, r2, lsl #1 + tst r9, #0x1000 @ if (code&0x1000) + mov r2, r9, lsl #21 add r2, r2, r10, lsl #17 - mov r2, r2, lsr #17 - eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe; + eorne r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe; - ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels + ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels mvn r7, #0 mov r4, r4, lsr #16-2 @ (dx&7)*4 tst r9, #0x0800 moveq r7, r7, lsl r4 @ mask = ~0 [shift] (dx&7)*4 movne r7, r7, lsr r4 - mvn r7, r7, ror #16 - and r2, r2, r7 @ pack&mask + bic r2, r2, r7, ror #16 @ pack&~mask orr r9, r9, #0x80000000 @ invalidate oldcode since pack is masked b .DrawStrip_samecode @@ -504,13 +502,12 @@ DrawLayer: mov r9, r7 @ remember code - movs r2, r9, lsl #20 @ if (code&0x1000) - mov r2, r2, lsl #1 + tst r9, #0x1000 @ if (code&0x1000) + mov r2, r9, lsl #21 add r2, r2, r10, lsl #17 - mov r2, r2, lsr #17 - eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe; + eorne r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe; - ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels + ldr r2, 
[lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels .DrawStrip_samecode: tst r9, #0x8000 @ tstne r10, #1<<20 @ !force[20] @@ -577,21 +574,19 @@ DrawLayer: add r1, r1, #8 - movs r2, r9, lsl #20 @ if (code&0x1000) - mov r2, r2, lsl #1 + tst r9, #0x1000 @ if (code&0x1000) + mov r2, r9, lsl #21 add r2, r2, r10, lsl #17 - mov r2, r2, lsr #17 - eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe; + eorne r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe; - ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels + ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels mvn r7, #0 mov r4, r4, lsr #16-2 @ (dx&7)*4 tst r9, #0x0800 moveq r7, r7, lsl r4 @ mask = ~0 [shift] (dx&7)*4 movne r7, r7, lsr r4 - mov r7, r7, ror #16 - and r2, r2, r7 @ pack&mask + and r2, r2, r7, ror #16 @ pack&mask bic r10,r10, #7<<16 b .DrawStrip_samecode @ one last time, with last tile now masked @@ -743,13 +738,12 @@ DrawLayer: mov r9, r7 @ remember code - movs r2, r9, lsl #20 @ if (code&0x1000) - mov r2, r2, lsl #1 + tst r9, #0x1000 @ if (code&0x1000) + mov r2, r9, lsl #21 add r2, r2, r10, lsl #17 - mov r2, r2, lsr #17 - eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe; + eorne r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe; - ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels + ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels .DrawStrip_vs_samecode: tst r9, #0x8000 @@ -1007,8 +1001,7 @@ DrawTilesFromCache: tst r6, #0x0800 @ flipped?
moveq r12,r12, lsl r4 movne r12,r12, lsr r4 - mov r12,r12, ror #16 - and r2, r2, r12 + and r2, r2, r12, ror #16 mov r12,#0xf tst r8, #1 bne .dtfc_shadow @@ -1152,9 +1145,7 @@ DrawSpriteSHi: cmp r0, #328 bge DrawSpriteSHi - mov r8, r8, lsl #17 - mov r8, r8, lsr #17 @ tile&=0x7fff; // Clip tile address - + bic r8, r8, #0xf8000 @ tile&=~0xf8000; // Clip tile address (clears bits 19:15 only; assumes bits 31:20 are already 0 - TODO confirm) ldr r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels add r1, r11, r0 @ r1=pdest tst r2, r2 @@ -1346,9 +1337,7 @@ DrawSprite: cmp r0, #328 bge DrawSprite - mov r8, r8, lsl #17 - mov r8, r8, lsr #17 @ tile&=0x7fff; // Clip tile address - + bic r8, r8, #0xf8000 @ tile&=~0xf8000; // Clip tile address (clears bits 19:15 only; assumes bits 31:20 are already 0 - TODO confirm) ldr r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels add r1, r11, r0 @ r1=pdest tst r2, r2 @@ -1492,16 +1481,16 @@ DrawWindow: mov r9, r7 @ remember code - movs r2, r9, lsl #20 @ if (code&0x1000) - mov r2, r2, lsl #1 - add r2, r10, r2, lsr #17 @ r2=addr=(code&0x7ff)<<4; addr+=ty - eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe; + tst r9, #0x1000 @ if (code&0x1000) + mov r2, r9, lsl #21 + add r2, r2, r10, lsl #17 + eorne r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe; + + ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels and r3, r9, #0x6000 mov r3, r3, lsr #9 @ r3=pal=((code&0x6000)>>9); - ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels - .dw_samecode: tst r6, #0x100 bne .dw_shadow