Skip to content

Commit f3f741e

Browse files
authored
Merge branch 'release/rocm-rel-7.1' into amd/dev/yaxunl/SWDEV-550134-7.1
2 parents 1730cbe + 1b0eada commit f3f741e

23 files changed

+2540
-535
lines changed

clang/lib/Driver/ToolChains/CommonArgs.cpp

Lines changed: 32 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,23 @@ using namespace clang::driver::tools;
7878
using namespace clang;
7979
using namespace llvm::opt;
8080

81+
/// Append a "-rpath" / "<dir>" argument pair to \p CmdArgs for
/// \p pathCandidate.
///
/// The candidate is first simplified with llvm::sys::path::remove_dots (with
/// remove_dot_dot enabled); the simplified form is what gets checked for
/// existence and what is emitted on the command line.
///
/// \param Args             Used to allocate the emitted path string via
///                         MakeArgString.
/// \param CmdArgs          Argument list that receives the two new entries.
/// \param pathCandidate    Directory to add as an rpath. Taken by const
///                         reference to avoid copying the string on each call.
/// \param onlyIfPathExists When true (the default), nothing is appended unless
///                         llvm::sys::fs::exists reports the simplified path
///                         as existing.
/// \returns true if the simplified path exists. With
///          \p onlyIfPathExists == false the rpath is appended even when this
///          returns false, so callers can use the result to decide whether a
///          fallback directory should be added as well.
static bool addRPathCmdArg(const llvm::opt::ArgList &Args,
                           ArgStringList &CmdArgs,
                           const std::string &pathCandidate,
                           bool onlyIfPathExists = true) {
  SmallString<0> simplifiedPathCandidate(pathCandidate);
  llvm::sys::path::remove_dots(simplifiedPathCandidate,
                               /*remove_dot_dot=*/true);

  const bool pathExists = llvm::sys::fs::exists(simplifiedPathCandidate);

  // Skip missing directories unless the caller explicitly asked for the rpath
  // to be emitted unconditionally.
  if (onlyIfPathExists && !pathExists)
    return false;

  CmdArgs.push_back("-rpath");
  CmdArgs.push_back(Args.MakeArgString(simplifiedPathCandidate));
  return pathExists;
}
97+
8198
static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args,
8299
const llvm::Triple &Triple) {
83100
if (Args.hasArg(clang::driver::options::OPT_pg) &&
@@ -1261,12 +1278,8 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC,
12611278
// one of the LIBRARY_PATH directories.
12621279
ArgStringList EnvLibraryPaths;
12631280
addDirectoryList(Args, EnvLibraryPaths, "", "LIBRARY_PATH");
1264-
for (auto &EnvLibraryPath : EnvLibraryPaths) {
1265-
if (llvm::sys::fs::exists(EnvLibraryPath)) {
1266-
CmdArgs.push_back("-rpath");
1267-
CmdArgs.push_back(Args.MakeArgString(EnvLibraryPath));
1268-
}
1269-
}
1281+
for (auto &EnvLibraryPath : EnvLibraryPaths)
1282+
addRPathCmdArg(Args, CmdArgs, EnvLibraryPath);
12701283

12711284
if (Args.hasFlag(options::OPT_fopenmp_implicit_rpath,
12721285
options::OPT_fno_openmp_implicit_rpath, true)) {
@@ -1275,46 +1288,33 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC,
12751288
SmallString<256> DefaultLibPath =
12761289
llvm::sys::path::parent_path(TC.getDriver().Dir);
12771290
llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
1278-
if (TC.getSanitizerArgs(Args).needsAsanRt()) {
1279-
CmdArgs.push_back("-rpath");
1280-
CmdArgs.push_back(Args.MakeArgString(TC.getCompilerRTPath()));
1281-
}
1291+
if (TC.getSanitizerArgs(Args).needsAsanRt())
1292+
addRPathCmdArg(Args, CmdArgs, TC.getCompilerRTPath(),
1293+
/*onlyIfPathExists=*/false);
12821294

12831295
// In case LibSuffix was not built, try lib
12841296
std::string CandidateRPath_suf = D.Dir + "/../" + LibSuffix;
1285-
CmdArgs.push_back("-rpath");
1286-
CmdArgs.push_back(Args.MakeArgString(CandidateRPath_suf.c_str()));
1287-
12881297
// Add lib directory in case LibSuffix does not exist
12891298
std::string CandidateRPath_lib = D.Dir + "/../lib";
1290-
if ((!llvm::sys::fs::exists(CandidateRPath_suf)) &&
1291-
(llvm::sys::fs::exists(CandidateRPath_lib))) {
1292-
CmdArgs.push_back("-rpath");
1293-
CmdArgs.push_back(Args.MakeArgString(CandidateRPath_lib.c_str()));
1294-
}
1299+
if (!addRPathCmdArg(Args, CmdArgs, CandidateRPath_suf,
1300+
/*onlyIfPathExists=*/false))
1301+
addRPathCmdArg(Args, CmdArgs, CandidateRPath_lib);
12951302

12961303
std::string rocmPath =
12971304
Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ).str();
12981305
if (rocmPath.size() != 0) {
12991306
std::string rocmPath_lib = rocmPath + "/lib";
13001307
std::string rocmPath_suf = rocmPath + "/" + LibSuffix;
1301-
if (llvm::sys::fs::exists(rocmPath_suf)) {
1302-
CmdArgs.push_back("-rpath");
1303-
CmdArgs.push_back(Args.MakeArgString(rocmPath_suf.c_str()));
1304-
} else if (llvm::sys::fs::exists(rocmPath_lib)) {
1305-
CmdArgs.push_back("-rpath");
1306-
CmdArgs.push_back(Args.MakeArgString(rocmPath_lib.c_str()));
1307-
}
1308+
if (!addRPathCmdArg(Args, CmdArgs, rocmPath_suf))
1309+
addRPathCmdArg(Args, CmdArgs, rocmPath_lib);
13081310
}
13091311

13101312
// Add Default lib path to ensure llvm dynamic library is picked up for
13111313
// lib-debug/lib-perf
1312-
if (LibSuffix != "lib" && llvm::sys::fs::exists(DefaultLibPath)){
1313-
CmdArgs.push_back("-rpath");
1314-
CmdArgs.push_back(Args.MakeArgString(DefaultLibPath.c_str()));
1315-
}
1314+
if (LibSuffix != "lib")
1315+
addRPathCmdArg(Args, CmdArgs, DefaultLibPath.c_str());
13161316

1317-
if (llvm::find_if(CmdArgs, [](StringRef str) {
1317+
if (llvm::find_if(CmdArgs, [](StringRef str) {
13181318
return !str.compare("--enable-new-dtags");
13191319
}) == CmdArgs.end())
13201320
CmdArgs.push_back("--disable-new-dtags");
@@ -1351,10 +1351,8 @@ void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args,
13511351
CandidateRPaths.emplace_back(*CandidateRPath);
13521352

13531353
for (const auto &CandidateRPath : CandidateRPaths) {
1354-
if (TC.getVFS().exists(CandidateRPath)) {
1355-
CmdArgs.push_back("-rpath");
1356-
CmdArgs.push_back(Args.MakeArgString(CandidateRPath));
1357-
}
1354+
if (TC.getVFS().exists(CandidateRPath))
1355+
addRPathCmdArg(Args, CmdArgs, CandidateRPath, /*onlyIfPathExists=*/false);
13581356
}
13591357
}
13601358

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,48 @@
11
// REQUIRES: amdgpu-registered-target
22

3-
// Asan-Debug: /lib-debug/asan
4-
// Asan-Devel: /lib/asan
5-
// Asan-Perf: /lib-perf/asan
6-
73
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=lib-debug %s -O3 2>&1 \
8-
// RUN: | FileCheck -check-prefixes=Debug %s
4+
// RUN: | FileCheck -check-prefixes=Debug,Debug-Rel %s
95

106
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=lib-perf %s -O3 2>&1 \
11-
// RUN: | FileCheck -check-prefixes=Perf %s
7+
// RUN: | FileCheck -check-prefixes=Perf,Perf-Rel %s
128

139
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=lib %s -O3 2>&1 \
14-
// RUN: | FileCheck -check-prefixes=Devel %s
10+
// RUN: | FileCheck -check-prefixes=Devel,Devel-Rel %s
1511

1612
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-target-fast %s -O3 2>&1 \
17-
// RUN: | FileCheck -check-prefixes=Default %s
13+
// RUN: | FileCheck -check-prefixes=Devel,Devel-Rel %s
1814

1915
// RUN: not %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=oopsy %s -O3 2>&1 \
2016
// RUN: | FileCheck -check-prefixes=Error %s
2117

2218
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-runtimelib=lib-debug -fsanitize=address -shared-libasan %s -O3 2>&1 \
23-
// RUN: | FileCheck -check-prefix=Asan-Debug %s
19+
// RUN: | FileCheck -check-prefixes=Asan-Debug,Asan-Debug-Rel %s
2420

2521
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-runtimelib=lib -fsanitize=address -shared-libasan %s -O3 2>&1 \
26-
// RUN: | FileCheck -check-prefix=Asan-Devel %s
22+
// RUN: | FileCheck -check-prefixes=Asan-Devel,Asan-Devel-Rel %s
2723

2824
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-runtimelib=lib-perf -fsanitize=address -shared-libasan %s -O3 2>&1 \
29-
// RUN: | FileCheck -check-prefix=Asan-Perf %s
25+
// RUN: | FileCheck -check-prefixes=Asan-Perf,Asan-Perf-Rel %s
3026

3127
// RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-target-fast -fsanitize=address -shared-libasan %s -O3 2>&1 \
32-
// RUN: | FileCheck -check-prefix=Asan-Devel %s
28+
// RUN: | FileCheck -check-prefixes=Asan-Devel,Asan-Devel-Rel %s
29+
30+
// Devel: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib]]"
31+
// Devel-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]"
32+
33+
// Debug: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-debug]]"
34+
// Debug-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]"
35+
36+
// Perf: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-perf]]"
37+
// Perf-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]"
38+
39+
// Asan-Devel: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib(/|\\\\)asan]]"
40+
// Asan-Devel-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]"
41+
42+
// Asan-Debug: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-debug(/|\\\\)asan]]"
43+
// Asan-Debug-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]"
44+
45+
// Asan-Perf: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-perf(/|\\\\)asan]]"
46+
// Asan-Perf-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]"
3347

34-
// Debug: /lib-debug
35-
// Perf: /lib-perf
36-
// Devel: /../lib
37-
// Default: /../lib
3848
// Error: clang: error: unsupported argument 'oopsy' to option '-fopenmp-runtimelib='

flang/test/Driver/arch-specific-libdir-rpath.f95

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,7 @@
3232
!
3333
!
3434
! RESDIR: "-resource-dir" "[[RESDIR:[^"]*]]"
35-
!
3635
! LIBPATH-X86_64: -L[[RESDIR]]{{(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}
37-
! RPATH-X86_64: "-rpath" "[[RESDIR]]{{(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}"
3836
!
39-
! NO-RPATH-X86_64-NOT: "-rpath" "[[RESDIR]]{{(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}"
37+
! RPATH-X86_64: "-rpath" "{{[^"]*(/|\\\\)resource_dir_with_arch_subdir(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}"
38+
! NO-RPATH-X86_64-NOT: "-rpath" "{{[^"]*(/|\\\\)resource_dir_with_arch_subdir(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}"

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,8 @@ static bool isDSAddress(const Constant *C) {
129129
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
130130
}
131131

132-
/// Returns true if the function requires the implicit argument be passed
133-
/// regardless of the function contents.
134-
static bool funcRequiresHostcallPtr(const Function &F) {
135-
// Sanitizers require the hostcall buffer passed in the implicit arguments.
132+
/// Returns true if sanitizer attributes are present on a function.
133+
static bool hasSanitizerAttributes(const Function &F) {
136134
return F.hasFnAttribute(Attribute::SanitizeAddress) ||
137135
F.hasFnAttribute(Attribute::SanitizeThread) ||
138136
F.hasFnAttribute(Attribute::SanitizeMemory) ||
@@ -465,15 +463,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
465463

466464
// If the function requires the implicit arg pointer due to sanitizers,
467465
// assume it's needed even if explicitly marked as not requiring it.
468-
const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
469-
if (NeedsHostcall) {
466+
// Flat scratch initialization is needed because `asan_malloc_impl`
467+
// calls introduced later in pipeline will have flat scratch accesses.
468+
// FIXME: FLAT_SCRATCH_INIT will not be required here if device-libs
469+
// implementation for `asan_malloc_impl` is updated.
470+
const bool HasSanitizerAttrs = hasSanitizerAttributes(*F);
471+
if (HasSanitizerAttrs) {
470472
removeAssumedBits(IMPLICIT_ARG_PTR);
471473
removeAssumedBits(HOSTCALL_PTR);
474+
removeAssumedBits(FLAT_SCRATCH_INIT);
472475
}
473476

474477
for (auto Attr : ImplicitAttrs) {
475-
if (NeedsHostcall &&
476-
(Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
478+
if (HasSanitizerAttrs &&
479+
(Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
480+
Attr.first == FLAT_SCRATCH_INIT))
477481
continue;
478482

479483
if (F->hasFnAttribute(Attr.second))

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,8 @@ class AMDGPULowerModuleLDS {
648648
ModuleScopeVariables.insert(GV);
649649
} else if (K.second.size() == 1) {
650650
KernelAccessVariables.insert(GV);
651-
} else if (set_is_subset(K.second, HybridModuleRootKernels)) {
651+
} else if (K.second == HybridModuleRootKernels &&
652+
set_is_subset(K.second, HybridModuleRootKernels)) {
652653
ModuleScopeVariables.insert(GV);
653654
} else {
654655
TableLookupVariables.insert(GV);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1874,10 +1874,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
18741874
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
18751875
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
18761876
bool AlignedBy4 = Alignment >= Align(4);
1877+
if (Subtarget->hasUnalignedScratchAccessEnabled()) {
1878+
if (IsFast)
1879+
*IsFast = AlignedBy4 ? Size : 1;
1880+
return true;
1881+
}
1882+
18771883
if (IsFast)
18781884
*IsFast = AlignedBy4;
18791885

1880-
return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1886+
return AlignedBy4;
18811887
}
18821888

18831889
// So long as they are correct, wide global memory operations perform better
@@ -3193,7 +3199,7 @@ bool SITargetLowering::CanLowerReturn(
31933199

31943200
// We must use the stack if return would require unavailable registers.
31953201
unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3196-
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3202+
unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
31973203
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
31983204
if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
31993205
return false;

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1733,13 +1733,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17331733

17341734
// LOAD_CNT is only relevant to vgpr or LDS.
17351735
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1736-
// Only objects with alias scope info were added to LDSDMAScopes array.
1737-
// In the absense of the scope info we will not be able to disambiguate
1738-
// aliasing here. There is no need to try searching for a corresponding
1739-
// store slot. This is conservatively correct because in that case we
1740-
// will produce a wait using the first (general) LDS DMA wait slot which
1741-
// will wait on all of them anyway.
1742-
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1736+
if (Ptr && Memop->getAAInfo()) {
17431737
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17441738
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
17451739
if (MI.mayAlias(AA, *LDSDMAStores[I], true))

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6081,6 +6081,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
60816081
return isImmOperandLegal(MI, OpIdx, *MO);
60826082
}
60836083

6084+
/// Returns true if \p MI belongs to one of the instruction groups this
/// function lists as never co-issuing.
///
/// Always returns false when the subtarget has neither GFX940 nor GFX950
/// instructions, and for any instruction that is not a VALU instruction.
bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
  // The classification below only applies to subtargets that have GFX940 or
  // GFX950 instructions.
  if (!ST.hasGFX950Insts() && !ST.hasGFX940Insts())
    return false;

  // Only VALU instructions are considered.
  if (!isVALU(MI))
    return false;

  // Whole instruction classes that are always on the list:
  //  - transcendentals (V_COS, V_EXP, V_RCP, etc.)
  //  - dot products (DOT2, DOT2C, DOT4, etc.)
  //  - MFMA, SMFMA
  if (isTRANS(MI) || isDOT(MI) || isMFMA(MI))
    return true;

  // Remaining entries are identified by individual opcode.
  switch (MI.getOpcode()) {
  // FP8/BF8 packed conversions.
  case AMDGPU::V_CVT_PK_BF8_F32_e64:
  case AMDGPU::V_CVT_PK_FP8_F32_e64:
  // (M)QSAD instructions.
  case AMDGPU::V_MQSAD_PK_U16_U8_e64:
  case AMDGPU::V_MQSAD_U32_U8_e64:
  case AMDGPU::V_QSAD_PK_U16_U8_e64:
  // Packed (V_PK_*) arithmetic, shifts and moves.
  case AMDGPU::V_PK_ADD_F16:
  case AMDGPU::V_PK_ADD_F32:
  case AMDGPU::V_PK_ADD_I16:
  case AMDGPU::V_PK_ADD_U16:
  case AMDGPU::V_PK_ASHRREV_I16:
  case AMDGPU::V_PK_FMA_F16:
  case AMDGPU::V_PK_FMA_F32:
  case AMDGPU::V_PK_FMAC_F16_e32:
  case AMDGPU::V_PK_FMAC_F16_e64:
  case AMDGPU::V_PK_LSHLREV_B16:
  case AMDGPU::V_PK_LSHRREV_B16:
  case AMDGPU::V_PK_MAD_I16:
  case AMDGPU::V_PK_MAD_U16:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_PK_MAX_I16:
  case AMDGPU::V_PK_MAX_U16:
  case AMDGPU::V_PK_MIN_F16:
  case AMDGPU::V_PK_MIN_I16:
  case AMDGPU::V_PK_MIN_U16:
  case AMDGPU::V_PK_MOV_B32:
  case AMDGPU::V_PK_MUL_F16:
  case AMDGPU::V_PK_MUL_F32:
  case AMDGPU::V_PK_MUL_LO_U16:
  case AMDGPU::V_PK_SUB_I16:
  case AMDGPU::V_PK_SUB_U16:
    return true;
  default:
    break;
  }

  return false;
}
6143+
60846144
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
60856145
MachineInstr &MI) const {
60866146
unsigned Opc = MI.getOpcode();

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11391139
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
11401140
const MachineOperand &MO) const;
11411141

1142+
bool isNeverCoissue(MachineInstr &MI) const;
1143+
11421144
/// Return true if this 64-bit VALU instruction has a 32-bit encoding.
11431145
/// This function will return false if you pass it a 32-bit instruction.
11441146
bool hasVALU32BitEncoding(unsigned Opcode) const;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
102102
MaxKernArgAlign);
103103

104104
if (ST.hasGFX90AInsts() &&
105-
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
105+
ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
106106
!mayUseAGPRs(F))
107107
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
108108
}

0 commit comments

Comments
 (0)