@@ -1823,22 +1823,22 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
1823
1823
; GFX1264: ; %bb.0: ; %entry
1824
1824
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1825
1825
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
1826
- ; GFX1264-NEXT: s_mov_b64 s[4:5], exec
1826
+ ; GFX1264-NEXT: s_mov_b32 s9, 0
1827
1827
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
1828
+ ; GFX1264-NEXT: s_mov_b64 s[4:5], exec
1828
1829
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1829
1830
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
1830
1831
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
1831
1832
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
1832
1833
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
1833
1834
; GFX1264-NEXT: ; %bb.1:
1834
- ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
1835
- ; GFX1264-NEXT: v_mov_b32_e32 v1, 0
1836
- ; GFX1264-NEXT: s_wait_alu 0xfffe
1837
- ; GFX1264-NEXT: s_mul_i32 s6, s6, 5
1835
+ ; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
1838
1836
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
1837
+ ; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
1838
+ ; GFX1264-NEXT: s_mov_b32 s10, -1
1839
1839
; GFX1264-NEXT: s_wait_alu 0xfffe
1840
1840
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
1841
- ; GFX1264-NEXT: s_mov_b32 s10, -1
1841
+ ; GFX1264-NEXT: v_mov_b32_e32 v1, s7
1842
1842
; GFX1264-NEXT: s_wait_kmcnt 0x0
1843
1843
; GFX1264-NEXT: s_mov_b32 s8, s2
1844
1844
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -1860,27 +1860,29 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
1860
1860
; GFX1232-LABEL: add_i64_constant:
1861
1861
; GFX1232: ; %bb.0: ; %entry
1862
1862
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1863
+ ; GFX1232-NEXT: s_mov_b32 s7, exec_lo
1864
+ ; GFX1232-NEXT: s_mov_b32 s5, 0
1865
+ ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
1863
1866
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
1864
- ; GFX1232-NEXT: s_mov_b32 s4, exec_lo
1865
- ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
1866
1867
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
1867
1868
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
1868
1869
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
1869
1870
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
1870
1871
; GFX1232-NEXT: ; %bb.1:
1871
- ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6
1872
+ ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7
1872
1873
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
1873
- ; GFX1232-NEXT: s_mul_i32 s5, s5, 5
1874
+ ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
1874
1875
; GFX1232-NEXT: s_mov_b32 s10, -1
1875
- ; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
1876
+ ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1876
1877
; GFX1232-NEXT: s_wait_kmcnt 0x0
1877
1878
; GFX1232-NEXT: s_mov_b32 s8, s2
1878
1879
; GFX1232-NEXT: s_mov_b32 s9, s3
1879
1880
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1880
1881
; GFX1232-NEXT: s_wait_loadcnt 0x0
1881
1882
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
1882
1883
; GFX1232-NEXT: .LBB3_2:
1883
- ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
1884
+ ; GFX1232-NEXT: s_wait_alu 0xfffe
1885
+ ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
1884
1886
; GFX1232-NEXT: s_wait_kmcnt 0x0
1885
1887
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
1886
1888
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
@@ -5370,22 +5372,22 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
5370
5372
; GFX1264: ; %bb.0: ; %entry
5371
5373
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
5372
5374
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
5373
- ; GFX1264-NEXT: s_mov_b64 s[4:5], exec
5375
+ ; GFX1264-NEXT: s_mov_b32 s9, 0
5374
5376
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
5377
+ ; GFX1264-NEXT: s_mov_b64 s[4:5], exec
5375
5378
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5376
5379
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
5377
5380
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
5378
5381
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
5379
5382
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
5380
5383
; GFX1264-NEXT: ; %bb.1:
5381
- ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
5382
- ; GFX1264-NEXT: v_mov_b32_e32 v1, 0
5383
- ; GFX1264-NEXT: s_wait_alu 0xfffe
5384
- ; GFX1264-NEXT: s_mul_i32 s6, s6, 5
5384
+ ; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
5385
5385
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
5386
+ ; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
5387
+ ; GFX1264-NEXT: s_mov_b32 s10, -1
5386
5388
; GFX1264-NEXT: s_wait_alu 0xfffe
5387
5389
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
5388
- ; GFX1264-NEXT: s_mov_b32 s10, -1
5390
+ ; GFX1264-NEXT: v_mov_b32_e32 v1, s7
5389
5391
; GFX1264-NEXT: s_wait_kmcnt 0x0
5390
5392
; GFX1264-NEXT: s_mov_b32 s8, s2
5391
5393
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -5410,27 +5412,29 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
5410
5412
; GFX1232-LABEL: sub_i64_constant:
5411
5413
; GFX1232: ; %bb.0: ; %entry
5412
5414
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
5415
+ ; GFX1232-NEXT: s_mov_b32 s7, exec_lo
5416
+ ; GFX1232-NEXT: s_mov_b32 s5, 0
5417
+ ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
5413
5418
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
5414
- ; GFX1232-NEXT: s_mov_b32 s4, exec_lo
5415
- ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
5416
5419
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
5417
5420
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
5418
5421
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
5419
5422
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
5420
5423
; GFX1232-NEXT: ; %bb.1:
5421
- ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6
5424
+ ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7
5422
5425
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
5423
- ; GFX1232-NEXT: s_mul_i32 s5, s5, 5
5426
+ ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
5424
5427
; GFX1232-NEXT: s_mov_b32 s10, -1
5425
- ; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
5428
+ ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
5426
5429
; GFX1232-NEXT: s_wait_kmcnt 0x0
5427
5430
; GFX1232-NEXT: s_mov_b32 s8, s2
5428
5431
; GFX1232-NEXT: s_mov_b32 s9, s3
5429
5432
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5430
5433
; GFX1232-NEXT: s_wait_loadcnt 0x0
5431
5434
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
5432
5435
; GFX1232-NEXT: .LBB9_2:
5433
- ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
5436
+ ; GFX1232-NEXT: s_wait_alu 0xfffe
5437
+ ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
5434
5438
; GFX1232-NEXT: s_wait_kmcnt 0x0
5435
5439
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
5436
5440
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
0 commit comments