Skip to content

Commit 7f14b2a

Browse files
authored
Revert "[AMDGPU][CodeGenPrepare] Narrow 64 bit math to 32 bit if profitable" (#133880)
Reverts #130577
1 parent 66fca06 commit 7f14b2a

5 files changed

+34
-347
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

-84
Original file line numberDiff line numberDiff line change
@@ -1561,87 +1561,6 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
15611561
llvm_unreachable("not a division");
15621562
}
15631563

/// Search for a narrower type in which the operation performed by \p I is
/// legal or custom-lowered on the target.
///
/// \param I             Instruction whose opcode and type drive the query.
/// \param OrigBit       Bit width the operation currently uses.
/// \param MaxBitsNeeded First candidate width to try; doubled after each
///                      failed query.
/// \param TLI           Target lowering used for the legality query.
/// \param DL            Data layout used to map the IR type to a value type.
/// \returns the first candidate type narrower than \p OrigBit that passes
///          isOperationLegalOrCustom, or nullptr when \p MaxBitsNeeded is
///          already >= \p OrigBit or no candidate below \p OrigBit qualifies.
///
/// NOTE(review): this helper is not marked `static` here, unlike
/// tryNarrowMathIfNoOverflow; whether it sits inside an anonymous namespace
/// is not visible from this hunk -- confirm the intended linkage.
/// NOTE(review): the width parameters are `int`, while the visible caller
/// passes `unsigned` values.
1564-
Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded,
1565-
const TargetLowering *TLI, const DataLayout &DL) {
// Nothing to narrow if the requested width is not smaller than the original.
1566-
if (MaxBitsNeeded >= OrigBit)
1567-
return nullptr;
1568-
1569-
Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
// Try successively doubled widths, stopping before reaching OrigBit.
1570-
while (OrigBit > MaxBitsNeeded) {
1571-
if (TLI->isOperationLegalOrCustom(
1572-
TLI->InstructionOpcodeToISD(I->getOpcode()),
1573-
TLI->getValueType(DL, NewType, true)))
1574-
return NewType;
1575-
// Candidate rejected: double the width and rebuild the candidate type.
1576-
MaxBitsNeeded *= 2;
1577-
NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
1578-
}
// Every width below OrigBit was rejected.
1579-
return nullptr;
1580-
}
1581-
/// Try to replace an add or mul with the same operation on a narrower type,
/// when known-bits analysis bounds the result width and the cost model says
/// the narrow form is strictly cheaper.
///
/// On success the rewrite is: trunc both operands to the narrow type, apply
/// the same binary opcode, zext the result back to the original type, replace
/// all uses of \p I with the zext, and erase \p I.
///
/// \param I   Candidate instruction; only Instruction::Add and
///            Instruction::Mul are handled.
/// \param TLI Target lowering, forwarded to findSmallestLegalBits.
/// \param TTI Cost model used to compare the original and narrowed forms.
/// \param DL  Data layout for computeKnownBits and the legality query.
/// \returns true if \p I was replaced. In that case \p I has been erased and
///          must not be used by the caller. Returns false, leaving the IR
///          untouched, otherwise.
1582-
static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
1583-
const TargetTransformInfo &TTI,
1584-
const DataLayout &DL) {
1585-
unsigned Opc = I->getOpcode();
1586-
Type *OldType = I->getType();
1587-
// Only add and mul are candidates for narrowing.
1588-
if (Opc != Instruction::Add && Opc != Instruction::Mul)
1589-
return false;
1590-
1591-
unsigned OrigBit = OldType->getScalarSizeInBits();
1592-
unsigned MaxBitsNeeded = OrigBit;
1593-
// Bound the number of active bits of the result from the known bits of the
// two operands.
1594-
switch (Opc) {
1595-
case Instruction::Add:
1596-
MaxBitsNeeded = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
1597-
computeKnownBits(I->getOperand(1), DL))
1598-
.countMaxActiveBits();
1599-
break;
1600-
case Instruction::Mul:
1601-
MaxBitsNeeded = KnownBits::mul(computeKnownBits(I->getOperand(0), DL),
1602-
computeKnownBits(I->getOperand(1), DL))
1603-
.countMaxActiveBits();
1604-
break;
// Not reachable: other opcodes were rejected by the early return above.
1605-
default:
1606-
llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1607-
"Instruction::Mul.");
1608-
}
1609-
// Round the bound with bit_ceil and use at least 8 bits as the first
// candidate width.
1610-
MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1611-
Type *NewType = findSmallestLegalBits(I, OrigBit, MaxBitsNeeded, TLI, DL);
1612-
// No narrower type passed the legality query.
1613-
if (!NewType)
1614-
return false;
1615-
1616-
// Old cost
1617-
InstructionCost OldCost =
1618-
TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
1619-
// New cost of new op
1620-
InstructionCost NewCost =
1621-
TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
1622-
// New cost of narrowing 2 operands (use trunc)
1623-
NewCost += 2 * TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
1624-
TTI.getCastContextHint(I),
1625-
TTI::TCK_RecipThroughput);
1626-
// New cost of zext narrowed result to original type
1627-
NewCost +=
1628-
TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1629-
TTI.getCastContextHint(I), TTI::TCK_RecipThroughput);
// Transform only when strictly cheaper; a tie keeps the original instruction.
1630-
if (NewCost >= OldCost)
1631-
return false;
1632-
// Emit the narrowed sequence immediately before I.
1633-
IRBuilder<> Builder(I);
1634-
Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1635-
Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1636-
Value *Arith =
1637-
Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1638-
1639-
Value *Zext = Builder.CreateZExt(Arith, OldType);
// Redirect all users to the widened result, then delete the original; I is
// dangling after this point.
1640-
I->replaceAllUsesWith(Zext);
1641-
I->eraseFromParent();
1642-
return true;
1643-
}
1644-
16451564
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
16461565
if (foldBinOpIntoSelect(I))
16471566
return true;
@@ -1726,9 +1645,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
17261645
}
17271646
}
17281647

1729-
Changed = tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
1730-
TM.getTargetTransformInfo(F), DL);
1731-
17321648
return Changed;
17331649
}
17341650

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll

+1-4
Original file line numberDiff line numberDiff line change
@@ -414,10 +414,7 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) {
414414
; DISABLED-LABEL: @umul24_i64_2(
415415
; DISABLED-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 65535
416416
; DISABLED-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535
417-
; DISABLED-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
418-
; DISABLED-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
419-
; DISABLED-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
420-
; DISABLED-NEXT: [[MUL:%.*]] = zext i32 [[TMP3]] to i64
417+
; DISABLED-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
421418
; DISABLED-NEXT: ret i64 [[MUL]]
422419
;
423420
%lhs24 = and i64 %lhs, 65535

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

+28-24
Original file line numberDiff line numberDiff line change
@@ -1823,22 +1823,22 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
18231823
; GFX1264: ; %bb.0: ; %entry
18241824
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
18251825
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
1826-
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
1826+
; GFX1264-NEXT: s_mov_b32 s9, 0
18271827
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
1828+
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
18281829
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18291830
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
18301831
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
18311832
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
18321833
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
18331834
; GFX1264-NEXT: ; %bb.1:
1834-
; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
1835-
; GFX1264-NEXT: v_mov_b32_e32 v1, 0
1836-
; GFX1264-NEXT: s_wait_alu 0xfffe
1837-
; GFX1264-NEXT: s_mul_i32 s6, s6, 5
1835+
; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
18381836
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
1837+
; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
1838+
; GFX1264-NEXT: s_mov_b32 s10, -1
18391839
; GFX1264-NEXT: s_wait_alu 0xfffe
18401840
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
1841-
; GFX1264-NEXT: s_mov_b32 s10, -1
1841+
; GFX1264-NEXT: v_mov_b32_e32 v1, s7
18421842
; GFX1264-NEXT: s_wait_kmcnt 0x0
18431843
; GFX1264-NEXT: s_mov_b32 s8, s2
18441844
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -1860,27 +1860,29 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
18601860
; GFX1232-LABEL: add_i64_constant:
18611861
; GFX1232: ; %bb.0: ; %entry
18621862
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1863+
; GFX1232-NEXT: s_mov_b32 s7, exec_lo
1864+
; GFX1232-NEXT: s_mov_b32 s5, 0
1865+
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
18631866
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
1864-
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
1865-
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
18661867
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
18671868
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
18681869
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
18691870
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
18701871
; GFX1232-NEXT: ; %bb.1:
1871-
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6
1872+
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7
18721873
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
1873-
; GFX1232-NEXT: s_mul_i32 s5, s5, 5
1874+
; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
18741875
; GFX1232-NEXT: s_mov_b32 s10, -1
1875-
; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
1876+
; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
18761877
; GFX1232-NEXT: s_wait_kmcnt 0x0
18771878
; GFX1232-NEXT: s_mov_b32 s8, s2
18781879
; GFX1232-NEXT: s_mov_b32 s9, s3
18791880
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
18801881
; GFX1232-NEXT: s_wait_loadcnt 0x0
18811882
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
18821883
; GFX1232-NEXT: .LBB3_2:
1883-
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
1884+
; GFX1232-NEXT: s_wait_alu 0xfffe
1885+
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
18841886
; GFX1232-NEXT: s_wait_kmcnt 0x0
18851887
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
18861888
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
@@ -5370,22 +5372,22 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
53705372
; GFX1264: ; %bb.0: ; %entry
53715373
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
53725374
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
5373-
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
5375+
; GFX1264-NEXT: s_mov_b32 s9, 0
53745376
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
5377+
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
53755378
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
53765379
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
53775380
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
53785381
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
53795382
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
53805383
; GFX1264-NEXT: ; %bb.1:
5381-
; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
5382-
; GFX1264-NEXT: v_mov_b32_e32 v1, 0
5383-
; GFX1264-NEXT: s_wait_alu 0xfffe
5384-
; GFX1264-NEXT: s_mul_i32 s6, s6, 5
5384+
; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
53855385
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
5386+
; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
5387+
; GFX1264-NEXT: s_mov_b32 s10, -1
53865388
; GFX1264-NEXT: s_wait_alu 0xfffe
53875389
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
5388-
; GFX1264-NEXT: s_mov_b32 s10, -1
5390+
; GFX1264-NEXT: v_mov_b32_e32 v1, s7
53895391
; GFX1264-NEXT: s_wait_kmcnt 0x0
53905392
; GFX1264-NEXT: s_mov_b32 s8, s2
53915393
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -5410,27 +5412,29 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
54105412
; GFX1232-LABEL: sub_i64_constant:
54115413
; GFX1232: ; %bb.0: ; %entry
54125414
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
5415+
; GFX1232-NEXT: s_mov_b32 s7, exec_lo
5416+
; GFX1232-NEXT: s_mov_b32 s5, 0
5417+
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
54135418
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
5414-
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
5415-
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
54165419
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
54175420
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
54185421
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
54195422
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
54205423
; GFX1232-NEXT: ; %bb.1:
5421-
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6
5424+
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7
54225425
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
5423-
; GFX1232-NEXT: s_mul_i32 s5, s5, 5
5426+
; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
54245427
; GFX1232-NEXT: s_mov_b32 s10, -1
5425-
; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
5428+
; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
54265429
; GFX1232-NEXT: s_wait_kmcnt 0x0
54275430
; GFX1232-NEXT: s_mov_b32 s8, s2
54285431
; GFX1232-NEXT: s_mov_b32 s9, s3
54295432
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
54305433
; GFX1232-NEXT: s_wait_loadcnt 0x0
54315434
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
54325435
; GFX1232-NEXT: .LBB9_2:
5433-
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
5436+
; GFX1232-NEXT: s_wait_alu 0xfffe
5437+
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
54345438
; GFX1232-NEXT: s_wait_kmcnt 0x0
54355439
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
54365440
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2

0 commit comments

Comments
 (0)