From 2962968b95c4e1be13b851ee7b0e5623d23e870a Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 6 Nov 2024 12:49:45 -0500 Subject: [PATCH] [AMDGPU] Introduce a new generic target `gfx9-4-generic` --- clang/include/clang/Basic/Cuda.h | 1 + clang/lib/Basic/Cuda.cpp | 1 + clang/lib/Basic/Targets/NVPTX.cpp | 1 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 3 + clang/test/Driver/amdgpu-macros.cl | 1 + clang/test/Driver/amdgpu-mcpu.cl | 2 + .../Misc/target-invalid-cpu-note/amdgcn.c | 1 + .../test/Misc/target-invalid-cpu-note/nvptx.c | 1 + llvm/docs/AMDGPUUsage.rst | 6 + llvm/include/llvm/BinaryFormat/ELF.h | 3 +- llvm/include/llvm/TargetParser/TargetParser.h | 3 +- llvm/lib/Object/ELFObjectFile.cpp | 2 + llvm/lib/ObjectYAML/ELFYAML.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPU.td | 31 +- llvm/lib/Target/AMDGPU/GCNProcessors.td | 5 + .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 5 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 + llvm/lib/TargetParser/TargetParser.cpp | 9 +- .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 4 + .../CodeGen/AMDGPU/div-rem-by-constant-64.ll | 443 +++++++ llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 1 + .../CodeGen/AMDGPU/elf-header-flags-mach.ll | 2 + .../AMDGPU/generic-targets-require-v6.ll | 3 + .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1144 +++++++++++++++++ .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 496 +++++++ .../AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll | 1 + .../AMDGPU/no-corresponding-integer-type.ll | 1 + .../MC/AMDGPU/gfx9_4_generic_unsupported.s | 104 ++ .../Object/AMDGPU/elf-header-flags-mach.yaml | 7 + .../llvm-objdump/ELF/AMDGPU/subtarget.ll | 6 + .../llvm-readobj/ELF/AMDGPU/elf-headers.test | 3 + llvm/tools/llvm-readobj/ELFDumper.cpp | 1 + 33 files changed, 2284 insertions(+), 10 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 
7b4f435dc39f29..721e8981af6ffc 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -103,6 +103,7 @@ enum class OffloadArch { GFX909, GFX90a, GFX90c, + GFX9_4_GENERIC, GFX940, GFX941, GFX942, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index d765baef913e2f..59c932468cd891 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -121,6 +121,7 @@ static const OffloadArchToStringMap arch_names[] = { GFX(909), // gfx909 GFX(90a), // gfx90a GFX(90c), // gfx90c + {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, GFX(940), // gfx940 GFX(941), // gfx941 GFX(942), // gfx942 diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index e0bd0b096324d8..0897032c4b8546 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -205,6 +205,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX909: case OffloadArch::GFX90a: case OffloadArch::GFX90c: + case OffloadArch::GFX9_4_GENERIC: case OffloadArch::GFX940: case OffloadArch::GFX941: case OffloadArch::GFX942: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 598b946ad88dbb..43dc0e62284602 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2301,6 +2301,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX909: case OffloadArch::GFX90a: case OffloadArch::GFX90c: + case OffloadArch::GFX9_4_GENERIC: case OffloadArch::GFX940: case OffloadArch::GFX941: case OffloadArch::GFX942: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index f3473346baae5a..692f5103724342 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -56,6 +56,8 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 
-target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck --check-prefix=GFX9_4_Generic %s + // NOCPU-NOT: "target-features" // NOCPU-WAVE32: "target-features"="+wavefrontsize32" // NOCPU-WAVE64: "target-features"="+wavefrontsize64" @@ -85,6 +87,7 @@ // GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX9_4_Generic: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index dd5a4483e4d607..9352c59ef28b7f 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -133,6 +133,7 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1201 -DFAMILY=GFX12 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_generic -DFAMILY=GFX9 +// RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_4_generic -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx10-1-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_1_generic -DFAMILY=GFX10 // RUN: %clang -E -dM -target 
amdgcn -mcpu=gfx10-3-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_3_generic -DFAMILY=GFX10 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx11_generic -DFAMILY=GFX11 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index 42ce33db6eec07..ba578435072985 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -118,6 +118,7 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefix=GFX1201 %s // RUN: %clang -### -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefix=GFX9_GENERIC %s +// RUN: %clang -### -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefix=GFX9_4_GENERIC %s // RUN: %clang -### -target amdgcn -mcpu=gfx10-1-generic %s 2>&1 | FileCheck --check-prefix=GFX10_1_GENERIC %s // RUN: %clang -### -target amdgcn -mcpu=gfx10-3-generic %s 2>&1 | FileCheck --check-prefix=GFX10_3_GENERIC %s // RUN: %clang -### -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefix=GFX11_GENERIC %s @@ -172,6 +173,7 @@ // GFX1201: "-target-cpu" "gfx1201" // GFX9_GENERIC: "-target-cpu" "gfx9-generic" +// GFX9_4_GENERIC: "-target-cpu" "gfx9-4-generic" // GFX10_1_GENERIC: "-target-cpu" "gfx10-1-generic" // GFX10_3_GENERIC: "-target-cpu" "gfx10-3-generic" // GFX11_GENERIC: "-target-cpu" "gfx11-generic" diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c index b3ddbd53a0391b..cae31797c1febe 100644 --- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c +++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c @@ -70,6 +70,7 @@ // CHECK-SAME: {{^}}, gfx1200 // CHECK-SAME: {{^}}, gfx1201 // CHECK-SAME: {{^}}, gfx9-generic +// CHECK-SAME: {{^}}, gfx9-4-generic // CHECK-SAME: {{^}}, gfx10-1-generic // CHECK-SAME: {{^}}, gfx10-3-generic // CHECK-SAME: {{^}}, 
gfx11-generic diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c index a59e1c6fab1c49..44fe07065b2428 100644 --- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -50,6 +50,7 @@ // CHECK-SAME: {{^}}, gfx909 // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c +// CHECK-SAME: {{^}}, gfx9-4-generic // CHECK-SAME: {{^}}, gfx940 // CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5b83ea428c0bff..b9eb71e73dad97 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -576,6 +576,12 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``v_dot2_f32_f16`` + ``gfx9-4-generic`` ``amdgcn`` - ``gfx940`` - xnack - Absolute flat FP8 and BF8 instructions, + - ``gfx941`` scratch FP8 and BF8 conversion instructions, + - ``gfx942`` as well as instructions with XF32 format support + are not available. + + ``gfx10-1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are - ``gfx1011`` - wavefrontsize64 scratch not available on ``gfx1011`` - ``gfx1012`` - cumode and ``gfx1012`` diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index c591a96232f115..6c05ea7208e1f1 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -822,11 +822,12 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057, EF_AMDGPU_MACH_AMDGCN_GFX1153 = 0x058, EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC = 0x059, + EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC = 0x05f, // clang-format on // First/last AMDGCN-based processors. 
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index ae86ff39083d89..c6db4dfd7f5159 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -119,9 +119,10 @@ enum GPUKind : uint32_t { GK_GFX10_3_GENERIC = 194, GK_GFX11_GENERIC = 195, GK_GFX12_GENERIC = 196, + GK_GFX9_4_GENERIC = 197, GK_AMDGCN_GENERIC_FIRST = GK_GFX9_GENERIC, - GK_AMDGCN_GENERIC_LAST = GK_GFX12_GENERIC, + GK_AMDGCN_GENERIC_LAST = GK_GFX9_4_GENERIC, }; /// Instruction set architecture version. diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 5096877d2a4b00..9dc39936ffd8bb 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -602,6 +602,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { // Generic AMDGCN targets case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: return "gfx9-generic"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC: + return "gfx9-4-generic"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: return "gfx10-1-generic"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index e97248cbcf5682..76c6c496c5e6bc 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -631,6 +631,7 @@ void ScalarBitSetTraits::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, EF_AMDGPU_MACH); 
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, EF_AMDGPU_MACH); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d068402e95716e..b7ff8a86b3c939 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1451,11 +1451,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureDPALU_DPP, FeaturePackedFP32Ops, FeatureMAIInsts, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, FeaturePkFmacF16Inst, - FeatureXF32Insts, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts, @@ -1476,15 +1472,36 @@ def FeatureISAVersion9_4_Common : FeatureSet< def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, - [FeatureForceStoreSC0SC1])>; + [ + FeatureForceStoreSC0SC1, + FeatureFP8Insts, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, + FeatureXF32Insts + ])>; def FeatureISAVersion9_4_1 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, - [FeatureForceStoreSC0SC1])>; + [ + FeatureForceStoreSC0SC1, + FeatureFP8Insts, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, + FeatureXF32Insts + ])>; def FeatureISAVersion9_4_2 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, - [])>; + [ + FeatureFP8Insts, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, + FeatureXF32Insts + ])>; + +def FeatureISAVersion9_4_Generic : FeatureSet< + !listconcat(FeatureISAVersion9_4_Common.Features, + [FeatureRequiresCOV6])>; def FeatureISAVersion10_Common : FeatureSet< [FeatureGFX10, diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 547941633fda61..e12a6127b17063 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -209,6 +209,11 @@ def : ProcessorModel<"gfx9-generic", 
SIQuarterSpeedModel, FeatureISAVersion9_Generic.Features >; +// [gfx940, gfx941, gfx942] +def : ProcessorModel<"gfx9-4-generic", SIQuarterSpeedModel, + FeatureISAVersion9_4_Generic.Features +>; + //===----------------------------------------------------------------------===// // GCN GFX10. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 5c625c3d83ff1b..507725b91a9bee 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -119,6 +119,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC: AK = GK_GFX9_4_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: AK = GK_GFX10_1_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: AK = GK_GFX10_3_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC: AK = GK_GFX11_GENERIC; break; @@ -204,6 +205,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC; + case GK_GFX9_4_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC; case GK_GFX10_1_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC; case GK_GFX10_3_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC; case GK_GFX11_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC; @@ -821,6 +823,9 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV6() { case AMDGPU::GK_GFX9_GENERIC: Version = 
GenericVersion::GFX9; break; + case AMDGPU::GK_GFX9_4_GENERIC: + Version = GenericVersion::GFX9_4; + break; case AMDGPU::GK_GFX10_1_GENERIC: Version = GenericVersion::GFX10_1; break; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index beebe320b2cf3a..88a6d75b72c7d0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -46,6 +46,7 @@ struct IsaVersion; /// within a generic family. namespace GenericVersion { static constexpr unsigned GFX9 = 1; +static constexpr unsigned GFX9_4 = 1; static constexpr unsigned GFX10_1 = 1; static constexpr unsigned GFX10_3 = 1; static constexpr unsigned GFX11 = 1; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 059d7b4f5ff2d0..f1a1d78e801e8e 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -130,6 +130,7 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, + {{"gfx9-4-generic"}, {"gfx9-4-generic"}, GK_GFX9_4_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx10-1-generic"}, {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, {{"gfx10-3-generic"}, {"gfx10-3-generic"}, GK_GFX10_3_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx11-generic"}, {"gfx11-generic"}, GK_GFX11_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, @@ -156,6 +157,8 @@ StringRef llvm::AMDGPU::getArchFamilyNameAMDGCN(GPUKind AK) { switch (AK) { case AMDGPU::GK_GFX9_GENERIC: return "gfx9"; + case AMDGPU::GK_GFX9_4_GENERIC: + return "gfx9"; case 
AMDGPU::GK_GFX10_1_GENERIC: case AMDGPU::GK_GFX10_3_GENERIC: return "gfx10"; @@ -296,6 +299,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { // TODO: Split up this API depending on its caller so // generic target handling is more obvious and less risky. case GK_GFX9_GENERIC: return {9, 0, 0}; + case GK_GFX9_4_GENERIC: return {9, 4, 0}; case GK_GFX10_1_GENERIC: return {10, 1, 0}; case GK_GFX10_3_GENERIC: return {10, 3, 0}; case GK_GFX11_GENERIC: return {11, 0, 3}; @@ -466,9 +470,12 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, case GK_GFX942: case GK_GFX941: case GK_GFX940: - Features["gfx940-insts"] = true; Features["fp8-insts"] = true; Features["fp8-conversion-insts"] = true; + Features["xf32-insts"] = true; + [[fallthrough]]; + case GK_GFX9_4_GENERIC: + Features["gfx940-insts"] = true; Features["atomic-ds-pk-add-16-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; Features["atomic-global-pk-add-bf16-inst"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll index 5986d2d38ef1ad..4eac26e853c2a0 100644 --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -112,6 +112,8 @@ ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_NOXNACK %s ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_XNACK %s +; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_4_GENERIC_NOXNACK %s +; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_4_GENERIC_XNACK %s ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx10-1-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_NOXNACK %s ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_XNACK %s ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic < %s | FileCheck --check-prefixes=GFX10_3_GENERIC %s @@ -210,6 +212,8 @@ ; GFX9_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack-" ; GFX9_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack+" +; GFX9_4_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-4-generic:xnack-" +; GFX9_4_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-4-generic:xnack+" ; GFX10_1_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx10-1-generic:xnack-" ; GFX10_1_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx10-1-generic:xnack+" ; GFX10_3_GENERIC: .amdgcn_target "amdgcn-amd-amdhsa--gfx10-3-generic" diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 662de47413654f..10ef4dc280d091 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -o - %s | FileCheck -check-prefixes=GFX9_4 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - %s | FileCheck -check-prefixes=GFX1030 %s ; Sample test to check how we deal with division/modulos by 64 bit constants. 
@@ -66,6 +67,37 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: srem64_3: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556 +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555 +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5] +; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_mul_lo_u32 v6, v2, s3 +; GFX9_4-NEXT: v_mul_lo_u32 v7, v2, s2 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, v[4:5] +; GFX9_4-NEXT: v_add3_u32 v5, v7, v5, v6 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v2 +; GFX9_4-NEXT: s_nop 0 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: srem64_3: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -158,6 +190,37 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: srem64_6: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556 +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2 +; GFX9_4-NEXT: v_mov_b32_e32 
v3, 0 +; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555 +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5] +; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_mul_lo_u32 v6, v2, s3 +; GFX9_4-NEXT: v_mul_lo_u32 v7, v2, s2 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, v[4:5] +; GFX9_4-NEXT: v_add3_u32 v5, v7, v5, v6 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v2 +; GFX9_4-NEXT: s_nop 0 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: srem64_6: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -240,6 +303,32 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: urem64_3: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7] +; 
GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v2, v3, v2, 1 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v2 +; GFX9_4-NEXT: s_nop 0 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: urem64_3: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -317,6 +406,32 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: urem64_6: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v2, v3, v2, 2 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 6, 0 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 6, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v2 +; GFX9_4-NEXT: s_nop 0 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: urem64_6: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ 
-393,6 +508,30 @@ define noundef i64 @sdiv64_3(i64 noundef %i) { ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: sdiv64_3: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556 +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555 +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5] +; GFX9_4-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5] +; GFX9_4-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX9_4-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_add3_u32 v1, v6, v1, v2 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: sdiv64_3: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -470,6 +609,30 @@ define noundef i64 @sdiv64_6(i64 noundef %i) { ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: sdiv64_6: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556 +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555 +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5] +; GFX9_4-NEXT: v_ashrrev_i32_e32 
v0, 31, v1 +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5] +; GFX9_4-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX9_4-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_add3_u32 v1, v6, v1, v2 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: sdiv64_6: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -536,6 +699,25 @@ define noundef i64 @udiv64_3(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: udiv64_3: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 1 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: udiv64_3: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -596,6 +778,25 @@ define noundef i64 @udiv64_6(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v1, 2, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: udiv64_6: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0 +; 
GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 2 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: udiv64_6: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -641,6 +842,18 @@ define noundef i64 @srem64_2(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: srem64_2: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_and_b32_e32 v2, -2, v2 +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9_4-NEXT: s_nop 1 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: srem64_2: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -675,6 +888,15 @@ define noundef i64 @sdiv64_2(i64 noundef %i) { ; GFX942-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: sdiv64_2: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: sdiv64_2: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -703,6 +925,13 @@ define noundef i64 @urem64_2(i64 noundef %i) { ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: urem64_2: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9_4-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: urem64_2: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -729,6 +958,13 @@ define noundef i64 @udiv64_2(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: udiv64_2: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 1 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: udiv64_2: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -766,6 +1002,19 @@ define noundef i64 @srem64_64(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: srem64_64: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 26, v2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_and_b32_e32 v2, 0xffffffc0, v2 +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9_4-NEXT: s_nop 1 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: srem64_64: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -803,6 +1052,16 @@ define noundef i64 @sdiv64_64(i64 noundef %i) { ; GFX942-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: sdiv64_64: +; 
GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 26, v2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1] +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: sdiv64_64: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -832,6 +1091,13 @@ define noundef i64 @urem64_64(i64 noundef %i) { ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: urem64_64: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9_4-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: urem64_64: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -858,6 +1124,13 @@ define noundef i64 @udiv64_64(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v1, 6, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: udiv64_64: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 6 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 6, v1 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: udiv64_64: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -895,6 +1168,19 @@ define noundef i64 @srem64_i32min(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: srem64_i32min: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; GFX9_4-NEXT: 
v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9_4-NEXT: s_nop 1 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: srem64_i32min: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -937,6 +1223,19 @@ define noundef i64 @sdiv64_i32min(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: sdiv64_i32min: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v0 +; GFX9_4-NEXT: s_nop 1 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: sdiv64_i32min: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -979,6 +1278,18 @@ define noundef i64 @urem64_i32min(i64 noundef %i) { ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: urem64_i32min: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_alignbit_b32 v4, v1, v0, 31 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, 1, v[2:3] +; GFX9_4-NEXT: v_add_lshl_u32 v2, v2, v5, 30 +; GFX9_4-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: urem64_i32min: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1025,6 +1336,20 @@ define noundef i64 @udiv64_i32min(i64 noundef %i) { ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; 
GFX9_4-LABEL: udiv64_i32min: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_alignbit_b32 v2, v1, v0, 31 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX9_4-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 1, v[0:1] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v3 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v1 +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 1 +; GFX9_4-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: udiv64_i32min: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1117,6 +1442,43 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: srem64_i32max: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 3 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX9_4-NEXT: v_lshl_add_u32 v8, v6, 31, v6 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 3, v[2:3] +; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, 3, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: s_mov_b32 s2, 0x80000001 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_add3_u32 v7, v7, v8, v6 +; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, -1, v[6:7] +; GFX9_4-NEXT: v_mov_b32_e32 v4, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[4:5] +; GFX9_4-NEXT: v_sub_u32_e32 v2, v7, v1 +; GFX9_4-NEXT: v_sub_u32_e32 v7, v2, v0 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 1, v[4:5] +; GFX9_4-NEXT: s_brev_b32 s2, -2 +; GFX9_4-NEXT: v_add_u32_e32 v5, v1, v5 +; GFX9_4-NEXT: 
v_ashrrev_i64 v[6:7], 30, v[4:5] +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[6:7], 0, v[2:3] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v2 +; GFX9_4-NEXT: s_nop 0 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: srem64_i32max: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1216,6 +1578,35 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: sdiv64_i32max: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 3 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX9_4-NEXT: v_lshl_add_u32 v8, v6, 31, v6 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 3, v[2:3] +; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, 3, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: s_mov_b32 s2, 0x80000001 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5] +; GFX9_4-NEXT: v_add3_u32 v7, v7, v8, v6 +; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, -1, v[6:7] +; GFX9_4-NEXT: v_mov_b32_e32 v4, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[4:5] +; GFX9_4-NEXT: v_sub_u32_e32 v2, v7, v1 +; GFX9_4-NEXT: v_sub_u32_e32 v7, v2, v0 +; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 1, v[4:5] +; GFX9_4-NEXT: v_add_u32_e32 v5, v1, v5 +; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 30, v[4:5] +; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GFX9_4-NEXT: 
v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: sdiv64_i32max: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1308,6 +1699,36 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: urem64_i32max: +; GFX9_4: ; %bb.0: ; %entry +; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 5 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: s_brev_b32 s2, -2 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 5, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 2, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, 2, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 +; GFX9_4-NEXT: s_nop 1 +; GFX9_4-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9_4-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v2, v3, v2, 30 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v3, 30, v3 +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9_4-NEXT: v_mov_b32_e32 v3, v2 +; GFX9_4-NEXT: s_nop 0 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: urem64_i32max: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1383,6 +1804,28 @@ define noundef i64 @udiv64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; +; GFX9_4-LABEL: udiv64_i32max: +; GFX9_4: ; %bb.0: ; %entry +; 
GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 5 +; GFX9_4-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_4-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 5, v[2:3] +; GFX9_4-NEXT: v_mov_b32_e32 v2, v5 +; GFX9_4-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 2, v[4:5] +; GFX9_4-NEXT: v_mov_b32_e32 v6, v5 +; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7] +; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, 2, v[2:3] +; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9_4-NEXT: s_nop 1 +; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_4-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 30 +; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 30, v1 +; GFX9_4-NEXT: s_setpc_b64 s[30:31] +; ; GFX1030-LABEL: udiv64_i32max: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index e24817078d8bc9..524e074bb69de4 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx9-4-generic -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s --- diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index f293c52bf6bfb2..f1f4edb94a6178 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -80,6 +80,7 @@ 
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1201 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1201 %s ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_GENERIC %s +; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-4-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_4_GENERIC %s ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10-1-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_1_GENERIC %s ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10-3-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_3_GENERIC %s ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx11-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX11_GENERIC %s @@ -161,6 +162,7 @@ ; GFX1201: EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E) ; GFX9_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51) +; GFX9_4_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F) ; GFX10_1_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52) ; GFX10_3_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC (0x53) ; GFX11_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC (0x54) diff --git a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll index d30cf1531a06b1..d58dc5db8c80f1 100644 --- a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll +++ b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll @@ -1,16 +1,19 @@ ; RUN: not llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX9-V5 %s +; RUN: not llc -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=5 -o - %s 
2>&1 | FileCheck --check-prefix=GFX9-4-V5 %s ; RUN: not llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX101-V5 %s ; RUN: not llc -mtriple=amdgcn -mcpu=gfx10-3-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX103-V5 %s ; RUN: not llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX11-V5 %s ; RUN: not llc -mtriple=amdgcn -mcpu=gfx12-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX12-V5 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -o - %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -o - %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 -o - %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx10-3-generic --amdhsa-code-object-version=6 -o - %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx12-generic --amdhsa-code-object-version=6 -o - %s ; GFX9-V5: gfx9-generic is only available on code object version 6 or better +; GFX9-4-V5: gfx9-4-generic is only available on code object version 6 or better ; GFX101-V5: gfx10-1-generic is only available on code object version 6 or better ; GFX103-V5: gfx10-3-generic is only available on code object version 6 or better ; GFX11-V5: gfx11-generic is only available on code object version 6 or better diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index da1d9972e42dcf..c3f1d47f35ed28 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN9_4 %s --- | define amdgpu_kernel void @largeInterleave() #0 { ret void } @@ -1145,6 +1146,1149 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm + ; + ; GCN9_4-LABEL: largeInterleave: + ; GCN9_4: ; %bb.0: + ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN9_4-NEXT: ; implicit-def: $vgpr0 + ; GCN9_4-NEXT: ; implicit-def: $vgpr1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr8 + ; GCN9_4-NEXT: ; implicit-def: $vgpr94 + ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: ; implicit-def: $vgpr132 + ; GCN9_4-NEXT: ; implicit-def: $vgpr133 + ; GCN9_4-NEXT: ; implicit-def: $vgpr139 + ; GCN9_4-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN9_4-NEXT: ; implicit-def: $sgpr0 + ; GCN9_4-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN9_4-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN9_4-NEXT: ; implicit-def: $sgpr5 + ; GCN9_4-NEXT: ; iglp_opt mask(0x00000002) + ; GCN9_4-NEXT: s_nop 1 + ; GCN9_4-NEXT: v_lshl_add_u32 v0, s7, 4, v1 + ; GCN9_4-NEXT: v_mul_lo_u32 v0, v0, s6 + ; GCN9_4-NEXT: ; implicit-def: $vgpr1 + ; GCN9_4-NEXT: ; implicit-def: $sgpr6 + ; GCN9_4-NEXT: v_add_lshl_u32 v92, v0, v1, 1 + ; GCN9_4-NEXT: v_add_u32_e32 v93, s0, v92 + ; GCN9_4-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv 
sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: s_lshl_b32 s0, s7, 7 + ; GCN9_4-NEXT: v_add_lshl_u32 v95, v8, s0, 1 + ; GCN9_4-NEXT: v_add_u32_e32 v8, 64, v93 + ; GCN9_4-NEXT: ; kill: killed $vgpr8 + ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN9_4-NEXT: ; kill: killed $vgpr92 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b128 v95, v[0:3] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 + ; GCN9_4-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 + ; GCN9_4-NEXT: ; implicit-def: $vgpr88 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN9_4-NEXT: 
ds_read_b128 v[72:75], v88 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN9_4-NEXT: ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b128 v95, v[64:67] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN9_4-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN9_4-NEXT: buffer_load_dwordx4 v[68:71], 
v72, s[8:11], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: ; kill: killed $vgpr72 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: 
v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b128 v95, v[64:67] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN9_4-NEXT: v_add_u32_e32 v66, 0xc0, v93 + ; GCN9_4-NEXT: ; implicit-def: $vgpr64 + ; GCN9_4-NEXT: ; implicit-def: $vgpr67 + ; GCN9_4-NEXT: ; implicit-def: $vgpr70 + ; GCN9_4-NEXT: v_add_u32_e32 v68, v132, v64 + ; GCN9_4-NEXT: ; kill: killed $vgpr66 + ; GCN9_4-NEXT: ; implicit-def: $vgpr71 + ; GCN9_4-NEXT: ; kill: killed $vgpr68 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx4 v[72:75], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx4 v[76:79], v66, s[8:11], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v66, v132, v67 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[64:65], v68, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $sgpr8 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[68:69], v66, s[0:3], 0 
offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v66, v132, v70 + ; GCN9_4-NEXT: v_add_u32_e32 v70, v132, v71 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[66:67], v66, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[70:71], v70, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[84:85], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[86:87], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[84:85], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[86:87], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[84:85], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[84:85], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[86:87], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[84:85], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 
v[48:63], v[82:83], v[86:87], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[84:85], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[86:87], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[84:85], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b128 v95, v[72:75] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b128 v95, v[76:79] offset:1024 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], 
v[84:85], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[86:87], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1024 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1536 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN9_4-NEXT: v_perm_b32 v72, v68, v64, s5 + ; GCN9_4-NEXT: v_perm_b32 v64, v68, v64, s8 + ; 
GCN9_4-NEXT: v_perm_b32 v68, v69, v65, s5 + ; GCN9_4-NEXT: v_perm_b32 v73, v70, v66, s5 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN9_4-NEXT: v_perm_b32 v74, v69, v65, s8 + ; GCN9_4-NEXT: v_perm_b32 v65, v70, v66, s8 + ; GCN9_4-NEXT: v_perm_b32 v69, v71, v67, s5 + ; GCN9_4-NEXT: v_perm_b32 v75, v71, v67, s8 + ; GCN9_4-NEXT: v_mul_f32_e32 v66, s4, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v49 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, s6, v67 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v50 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v51 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v52 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v53 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v54 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v55 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v56 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v57 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v58 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v59 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v60 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v61 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v62 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v63 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v32 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v33 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v34 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v35 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v36 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v37 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v38 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v39 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 
v67, s4, v40 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v41 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v42 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v43 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v44 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v45 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v46 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v47 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v16 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v17 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v18 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v19 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v20 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v21 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v22 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v23 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v24 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v25 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v26 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v27 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v28 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v29 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v30 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v31 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v0 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v1 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v2 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v3 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v4 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v5 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; 
GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v6 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v7 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v8 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v9 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v10 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v11 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v12 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v13 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70 + ; GCN9_4-NEXT: ; implicit-def: $vgpr67 + ; GCN9_4-NEXT: ; implicit-def: $sgpr6 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v14 + ; GCN9_4-NEXT: v_add_u32_e32 v67, s7, v67 + ; GCN9_4-NEXT: v_and_b32_e32 v67, 0x1fffffff, v67 + ; GCN9_4-NEXT: v_mul_lo_u32 v67, v67, s6 + ; GCN9_4-NEXT: v_mul_f32_e32 v71, s4, v15 + ; GCN9_4-NEXT: v_max3_f32 v66, v66, v70, v71 + ; GCN9_4-NEXT: ; implicit-def: $vgpr70 + ; GCN9_4-NEXT: ; implicit-def: $sgpr6_sgpr7 + ; GCN9_4-NEXT: v_add_lshl_u32 v135, v70, v67, 1 + ; GCN9_4-NEXT: ds_bpermute_b32 v67, v133, v66 + ; GCN9_4-NEXT: ; implicit-def: $vgpr70 + ; GCN9_4-NEXT: v_lshl_add_u32 v136, v70, 1, v135 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v135, v[72:73] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v136, v[64:65] + ; GCN9_4-NEXT: ; implicit-def: $vgpr64 + ; GCN9_4-NEXT: v_lshl_add_u32 v137, v64, 1, v136 + ; GCN9_4-NEXT: v_max_f32_e32 v65, v67, v67 + ; GCN9_4-NEXT: v_max_f32_e32 v65, v66, v65 + ; GCN9_4-NEXT: ds_bpermute_b32 v66, v133, v65 + ; GCN9_4-NEXT: ; implicit-def: $vgpr64 + ; GCN9_4-NEXT: v_lshl_add_u32 v138, v64, 1, v137 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v137, v[68:69] + ; GCN9_4-NEXT: ; implicit-def: $vgpr68 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v138, 
v[74:75] + ; GCN9_4-NEXT: v_add_u32_e32 v68, v132, v68 + ; GCN9_4-NEXT: v_cndmask_b32_e64 v64, v66, v65, s[6:7] + ; GCN9_4-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN9_4-NEXT: ; implicit-def: $vgpr65 + ; GCN9_4-NEXT: v_max_f32_e32 v66, v65, v65 + ; GCN9_4-NEXT: v_max_f32_e32 v134, v66, v64 + ; GCN9_4-NEXT: ; implicit-def: $vgpr64 + ; GCN9_4-NEXT: ; implicit-def: $vgpr66 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v64, v132, v64 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v64, v132, v66 + ; GCN9_4-NEXT: ; implicit-def: $vgpr67 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v64, v132, v67 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v48, s4, v48, -v134 + ; GCN9_4-NEXT: v_fma_f32 v57, s4, v57, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN9_4-NEXT: v_fma_f32 v64, s4, v49, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN9_4-NEXT: v_fma_f32 v66, s4, v50, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v162, v57 + ; GCN9_4-NEXT: v_exp_f32_e32 v49, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 + ; GCN9_4-NEXT: v_fma_f32 v67, s4, v51, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v50, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 + ; GCN9_4-NEXT: v_fma_f32 v68, s4, v52, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v51, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 + ; GCN9_4-NEXT: v_fma_f32 v69, s4, v53, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v52, v48 + ; GCN9_4-NEXT: 
v_mul_f32_e32 v48, 0x3fb8aa3b, v68 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_fma_f32 v70, s4, v54, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v53, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 + ; GCN9_4-NEXT: v_fma_f32 v71, s4, v55, -v134 + ; GCN9_4-NEXT: ds_read_b128 v[140:143], v139 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v54, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 + ; GCN9_4-NEXT: v_exp_f32_e32 v55, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 + ; GCN9_4-NEXT: v_fma_f32 v66, s4, v56, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v56, v48 + ; GCN9_4-NEXT: v_sub_f32_e32 v48, v65, v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v64, v49 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v67, v50 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v68, v51 + ; GCN9_4-NEXT: v_fma_f32 v112, s4, v58, -v134 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v58, v52 + ; GCN9_4-NEXT: v_exp_f32_e32 v48, v48 + ; GCN9_4-NEXT: ds_read_b128 v[144:147], v139 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v160, s4, v59, -v134 + ; GCN9_4-NEXT: v_pack_b32_f16 v59, v68, v58 + ; GCN9_4-NEXT: v_pack_b32_f16 v58, v64, v67 + ; GCN9_4-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: ds_read_b128 
v[148:151], v139 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[152:155], v139 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[140:141], v[58:59], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v66 + ; GCN9_4-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_exp_f32_e32 v161, v64 + ; GCN9_4-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN9_4-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v112 + ; GCN9_4-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95] + ; GCN9_4-NEXT: v_fma_f32 v144, s4, v61, -v134 + ; GCN9_4-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v144 + ; 
GCN9_4-NEXT: v_exp_f32_e32 v164, v144 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v140, v53 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v141, v54 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v61, v55 + ; GCN9_4-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GCN9_4-NEXT: v_fma_f32 v145, s4, v62, -v134 + ; GCN9_4-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[148:149], v[58:59], v[64:79] + ; GCN9_4-NEXT: v_exp_f32_e32 v148, v57 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v57, v56 + ; GCN9_4-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_fma_f32 v149, s4, v63, -v134 + ; GCN9_4-NEXT: v_pack_b32_f16 v63, v61, v57 + ; GCN9_4-NEXT: v_pack_b32_f16 v62, v140, v141 + ; GCN9_4-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v160 + ; GCN9_4-NEXT: v_fma_f32 v60, s4, v60, -v134 + ; GCN9_4-NEXT: v_fma_f32 v163, s4, v33, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v145 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v160, v161 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v144, v148 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127] + ; GCN9_4-NEXT: v_exp_f32_e32 v152, v57 + ; GCN9_4-NEXT: v_mul_f32_e32 v153, 0x3fb8aa3b, v60 + ; GCN9_4-NEXT: ; implicit-def: $vgpr57 + ; GCN9_4-NEXT: ds_read_b128 v[58:61], v57 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v32, s4, v32, -v134 
+ ; GCN9_4-NEXT: v_fma_f32 v40, s4, v40, -v134 + ; GCN9_4-NEXT: v_fma_f32 v44, s4, v44, -v134 + ; GCN9_4-NEXT: v_fma_f32 v16, s4, v16, -v134 + ; GCN9_4-NEXT: v_fma_f32 v24, s4, v24, -v134 + ; GCN9_4-NEXT: v_fma_f32 v28, s4, v28, -v134 + ; GCN9_4-NEXT: v_fma_f32 v0, s4, v0, -v134 + ; GCN9_4-NEXT: v_fma_f32 v8, s4, v8, -v134 + ; GCN9_4-NEXT: v_fma_f32 v12, s4, v12, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[142:143], v[62:63], v[96:111] + ; GCN9_4-NEXT: v_exp_f32_e32 v153, v153 + ; GCN9_4-NEXT: ds_read_b128 v[140:143], v57 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[62:63], v[80:95] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v146, v162 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[150:151], v[62:63], v[64:79] + ; GCN9_4-NEXT: v_exp_f32_e32 v151, v33 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v33, v152 + ; GCN9_4-NEXT: v_fma_f32 v150, s4, v34, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[62:63], v[112:127] + ; GCN9_4-NEXT: v_pack_b32_f16 v63, v144, v33 + ; GCN9_4-NEXT: v_pack_b32_f16 v62, v160, v146 + ; GCN9_4-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v149 + ; GCN9_4-NEXT: v_fma_f32 v149, s4, v35, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v154, v33 + ; GCN9_4-NEXT: v_fma_f32 v160, s4, v36, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v163 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v155, v153 + ; GCN9_4-NEXT: v_fma_f32 v163, s4, v37, -v134 + ; GCN9_4-NEXT: v_perm_b32 v37, v130, v128, s8 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[58:59], v[62:63], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v32 + ; GCN9_4-NEXT: ds_read_b128 v[32:35], v57 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[144:147], v57 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v165, v58 + ; GCN9_4-NEXT: v_perm_b32 
v59, v131, v129, s5 + ; GCN9_4-NEXT: v_perm_b32 v58, v159, v157, s5 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[62:63], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v166, v36 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v141, v151 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v140, v164 + ; GCN9_4-NEXT: v_perm_b32 v36, v158, v156, s8 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[62:63], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 + ; GCN9_4-NEXT: v_fma_f32 v150, s4, v38, -v134 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v38, v154 + ; GCN9_4-NEXT: v_exp_f32_e32 v167, v32 + ; GCN9_4-NEXT: v_perm_b32 v32, v158, v156, s5 + ; GCN9_4-NEXT: v_perm_b32 v33, v130, v128, s5 + ; GCN9_4-NEXT: v_pack_b32_f16 v128, v155, v140 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b64 v135, v[32:33] + ; GCN9_4-NEXT: ; implicit-def: $vgpr33 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v136, v[36:37] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v137, v[58:59] + ; GCN9_4-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN9_4-NEXT: ; implicit-def: $vgpr36 + ; GCN9_4-NEXT: ; implicit-def: $vgpr37 + ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v160 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[62:63], v[112:127] + ; GCN9_4-NEXT: v_perm_b32 v63, v131, v129, s8 + ; GCN9_4-NEXT: v_pack_b32_f16 v129, v141, v38 + ; GCN9_4-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v149 + ; GCN9_4-NEXT: v_perm_b32 v62, v159, v157, s8 + ; GCN9_4-NEXT: v_exp_f32_e32 v155, v38 + ; GCN9_4-NEXT: ; implicit-def: $vgpr38 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v138, v[62:63] + ; GCN9_4-NEXT: v_add_u32_e32 v38, v132, v38 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx2 v[62:63], v38, 
s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[130:131], v33, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v33, v132, v36 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_u32_e32 v33, v132, v37 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_fma_f32 v149, s4, v39, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[60:61], v[128:129], v[96:111] + ; GCN9_4-NEXT: ds_read_b128 v[36:39], v139 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v156, v32 + ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v163 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v33, v165 + ; GCN9_4-NEXT: ds_read_b128 v[58:61], v139 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v157, v32 + ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v142, v166 + ; GCN9_4-NEXT: v_fma_f32 v143, s4, v41, -v134 + ; GCN9_4-NEXT: v_fma_f32 v150, s4, v42, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[128:129], v[64:79] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v34, v167 + ; GCN9_4-NEXT: v_exp_f32_e32 v158, v32 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v32, v155 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] + ; GCN9_4-NEXT: v_pack_b32_f16 v129, v34, v32 + ; GCN9_4-NEXT: v_pack_b32_f16 v128, v33, v142 + ; GCN9_4-NEXT: v_mul_f32_e32 v32, 
0x3fb8aa3b, v149 + ; GCN9_4-NEXT: v_exp_f32_e32 v146, v32 + ; GCN9_4-NEXT: ds_read_b128 v[32:35], v139 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v142, s4, v43, -v134 + ; GCN9_4-NEXT: v_fma_f32 v149, s4, v46, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[36:37], v[128:129], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 + ; GCN9_4-NEXT: ds_read_b128 v[40:43], v139 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v147, v36 + ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v37, v156 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[58:59], v[128:129], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v143, v36 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v58, v157 + ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 + ; GCN9_4-NEXT: v_fma_f32 v59, s4, v45, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[128:129], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v33, v158 + ; GCN9_4-NEXT: v_exp_f32_e32 v150, v32 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v32, v146 + ; GCN9_4-NEXT: v_pack_b32_f16 v33, v33, v32 + ; GCN9_4-NEXT: v_pack_b32_f16 v32, v37, v58 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v58, v147 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] + ; GCN9_4-NEXT: v_exp_f32_e32 v129, v36 + ; GCN9_4-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 + ; GCN9_4-NEXT: v_fma_f32 v128, s4, v47, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[38:39], v[32:33], v[96:111] + ; GCN9_4-NEXT: ds_read_b128 v[36:39], v57 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v142, v40 + ; GCN9_4-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v59 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v59, v143 + ; GCN9_4-NEXT: ds_read_b128 v[44:47], v57 offset:576 + ; GCN9_4-NEXT: 
s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[32:33], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v61, v40 + ; GCN9_4-NEXT: v_fma_f32 v60, s4, v17, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v149 + ; GCN9_4-NEXT: v_pack_b32_f16 v40, v58, v59 + ; GCN9_4-NEXT: v_fma_f32 v149, s4, v18, -v134 + ; GCN9_4-NEXT: v_fma_f32 v58, s4, v20, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v60 + ; GCN9_4-NEXT: v_fma_f32 v60, s4, v21, -v134 + ; GCN9_4-NEXT: v_perm_b32 v21, v144, v140, s8 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[32:33], v[64:79] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v34, v150 + ; GCN9_4-NEXT: v_exp_f32_e32 v159, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v17, v129 + ; GCN9_4-NEXT: v_pack_b32_f16 v41, v34, v17 + ; GCN9_4-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] + ; GCN9_4-NEXT: v_fma_f32 v42, s4, v19, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v128, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v43, v142 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[36:37], v[40:41], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 + ; GCN9_4-NEXT: ds_read_b128 v[16:19], v57 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[32:35], v57 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v160, v36 + ; GCN9_4-NEXT: v_perm_b32 v36, v131, v63, s8 + ; GCN9_4-NEXT: v_perm_b32 v37, v145, v141, s8 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v163, v20 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v45, v159 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v44, v61 + ; GCN9_4-NEXT: v_perm_b32 v20, v130, v62, s8 + ; GCN9_4-NEXT: 
v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[40:41], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v149 + ; GCN9_4-NEXT: v_fma_f32 v149, s4, v22, -v134 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v128 + ; GCN9_4-NEXT: v_exp_f32_e32 v168, v16 + ; GCN9_4-NEXT: v_perm_b32 v16, v130, v62, s5 + ; GCN9_4-NEXT: v_perm_b32 v17, v144, v140, s5 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b64 v135, v[16:17] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v136, v[20:21] + ; GCN9_4-NEXT: ; implicit-def: $vgpr17 + ; GCN9_4-NEXT: ; implicit-def: $vgpr20 + ; GCN9_4-NEXT: ; implicit-def: $vgpr21 + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v58 + ; GCN9_4-NEXT: v_add_u32_e32 v17, v132, v17 + ; GCN9_4-NEXT: v_add_u32_e32 v20, v132, v20 + ; GCN9_4-NEXT: v_add_u32_e32 v21, v132, v21 + ; GCN9_4-NEXT: v_fma_f32 v62, s4, v23, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] + ; GCN9_4-NEXT: v_pack_b32_f16 v41, v45, v22 + ; GCN9_4-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v42 + ; GCN9_4-NEXT: v_perm_b32 v32, v131, v63, s5 + ; GCN9_4-NEXT: v_perm_b32 v33, v145, v141, s5 + ; GCN9_4-NEXT: v_exp_f32_e32 v63, v22 + ; GCN9_4-NEXT: ; implicit-def: $vgpr22 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v137, v[32:33] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v138, v[36:37] + ; GCN9_4-NEXT: v_add_u32_e32 v22, v132, v22 + ; GCN9_4-NEXT: v_pack_b32_f16 v40, v43, v44 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx2 v[32:33], v22, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen 
sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: ds_read_b128 v[20:23], v139 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[38:39], v[40:41], v[96:111] + ; GCN9_4-NEXT: v_exp_f32_e32 v130, v16 + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v60 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v17, v160 + ; GCN9_4-NEXT: ds_read_b128 v[36:39], v139 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v131, s4, v26, -v134 + ; GCN9_4-NEXT: ; implicit-def: $sgpr0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[40:41], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v60, v16 + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v149 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v46, v163 + ; GCN9_4-NEXT: v_fma_f32 v47, s4, v25, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[40:41], v[64:79] + ; GCN9_4-NEXT: v_exp_f32_e32 v132, v16 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v168 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v16, v63 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[40:41], v[112:127] + ; GCN9_4-NEXT: v_pack_b32_f16 v35, v18, v16 + ; GCN9_4-NEXT: v_pack_b32_f16 v34, v17, v46 + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN9_4-NEXT: v_exp_f32_e32 v41, v16 + ; GCN9_4-NEXT: ds_read_b128 v[16:19], v139 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v40, s4, v27, -v134 + ; GCN9_4-NEXT: v_fma_f32 v62, s4, v30, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[34:35], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 + ; GCN9_4-NEXT: 
ds_read_b128 v[24:27], v139 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v46, v20 + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v130 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[34:35], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v47, v20 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v36, v60 + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v40 + ; GCN9_4-NEXT: v_fma_f32 v37, s4, v29, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[34:35], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN9_4-NEXT: v_exp_f32_e32 v131, v16 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v17, v132 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v16, v41 + ; GCN9_4-NEXT: v_pack_b32_f16 v17, v17, v16 + ; GCN9_4-NEXT: v_pack_b32_f16 v16, v21, v36 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v36, v46 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[34:35], v[112:127] + ; GCN9_4-NEXT: v_exp_f32_e32 v35, v20 + ; GCN9_4-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 + ; GCN9_4-NEXT: v_fma_f32 v34, s4, v31, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[16:17], v[96:111] + ; GCN9_4-NEXT: ds_read_b128 v[20:23], v57 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v40, v24 + ; GCN9_4-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v37, v47 + ; GCN9_4-NEXT: ds_read_b128 v[28:31], v57 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] + ; GCN9_4-NEXT: v_fma_f32 v38, s4, v1, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v39, v24 + ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v62 + ; GCN9_4-NEXT: v_pack_b32_f16 v24, v36, v37 + ; GCN9_4-NEXT: v_fma_f32 v62, s4, v2, -v134 + ; GCN9_4-NEXT: v_fma_f32 v37, s4, v6, -v134 + ; GCN9_4-NEXT: v_perm_b32 v6, v42, v32, s8 + ; 
GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[16:17], v[64:79] + ; GCN9_4-NEXT: v_exp_f32_e32 v140, v1 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v131 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v35 + ; GCN9_4-NEXT: v_pack_b32_f16 v25, v18, v1 + ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v34 + ; GCN9_4-NEXT: v_fma_f32 v34, s4, v4, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v38 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] + ; GCN9_4-NEXT: v_fma_f32 v26, s4, v3, -v134 + ; GCN9_4-NEXT: v_exp_f32_e32 v27, v1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[24:25], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[16:19], v57 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v36, v20 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v40 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v20, v39 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v29, v4 + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v26 + ; GCN9_4-NEXT: v_fma_f32 v28, s4, v5, -v134 + ; GCN9_4-NEXT: v_perm_b32 v5, v58, v44, s5 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v26, v36 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[24:25], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v62 + ; GCN9_4-NEXT: v_exp_f32_e32 v38, v0 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v140 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v27 + ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN9_4-NEXT: v_pack_b32_f16 v0, v21, v20 + ; GCN9_4-NEXT: v_perm_b32 v20, v43, v33, s8 + ; GCN9_4-NEXT: v_perm_b32 v21, v59, v45, s8 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] + ; 
GCN9_4-NEXT: v_exp_f32_e32 v25, v4 + ; GCN9_4-NEXT: v_fma_f32 v24, s4, v7, -v134 + ; GCN9_4-NEXT: v_perm_b32 v4, v42, v32, s5 + ; GCN9_4-NEXT: v_perm_b32 v16, v43, v33, s5 + ; GCN9_4-NEXT: v_perm_b32 v7, v58, v44, s8 + ; GCN9_4-NEXT: v_perm_b32 v17, v59, v45, s5 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b64 v135, v[4:5] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v136, v[6:7] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v137, v[16:17] + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b64 v138, v[20:21] + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v34 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_read_b128 v[4:7], v139 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_fma_f32 v33, s4, v10, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[0:1], v[96:111] + ; GCN9_4-NEXT: v_exp_f32_e32 v32, v16 + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v28 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v28, v29 + ; GCN9_4-NEXT: ds_read_b128 v[20:23], v139 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[0:1], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v31, v16 + ; GCN9_4-NEXT: v_pack_b32_f16 v16, v26, v28 + ; GCN9_4-NEXT: v_fma_f32 v30, s4, v9, -v134 + ; GCN9_4-NEXT: v_fma_f32 v28, s4, v14, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v37 + ; GCN9_4-NEXT: v_exp_f32_e32 v34, v2 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v3, v38 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[0:1], v[112:127] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v25 + ; GCN9_4-NEXT: 
v_fma_f32 v18, s4, v11, -v134 + ; GCN9_4-NEXT: v_pack_b32_f16 v17, v3, v0 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 + ; GCN9_4-NEXT: v_exp_f32_e32 v19, v0 + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v139 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[16:17], v[96:111] + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 + ; GCN9_4-NEXT: ds_read_b128 v[8:11], v139 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v24, v4 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v5, v32 + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v30 + ; GCN9_4-NEXT: v_exp_f32_e32 v26, v4 + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v24 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v20, v31 + ; GCN9_4-NEXT: v_fma_f32 v21, s4, v13, -v134 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[16:17], v[64:79] + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v33 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v34 + ; GCN9_4-NEXT: v_exp_f32_e32 v30, v0 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v19 + ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN9_4-NEXT: v_pack_b32_f16 v0, v5, v20 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] + ; GCN9_4-NEXT: v_exp_f32_e32 v17, v4 + ; GCN9_4-NEXT: v_fma_f32 v16, s4, v15, -v134 + ; GCN9_4-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v12 + ; GCN9_4-NEXT: v_exp_f32_e32 v20, v8 + ; GCN9_4-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v21 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v26 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[0:1], v[96:111] + ; GCN9_4-NEXT: ds_read_b128 v[4:7], v57 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[12:15], v57 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 
sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] + ; GCN9_4-NEXT: v_exp_f32_e32 v22, v8 + ; GCN9_4-NEXT: v_pack_b32_f16 v8, v18, v21 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v3, v30 + ; GCN9_4-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 + ; GCN9_4-NEXT: v_exp_f32_e32 v23, v2 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v2, v22 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v23 + ; GCN9_4-NEXT: v_pack_b32_f16 v9, v3, v0 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v16 + ; GCN9_4-NEXT: v_exp_f32_e32 v10, v0 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v3, v20 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v10 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[8:9], v[96:111] + ; GCN9_4-NEXT: v_pack_b32_f16 v5, v1, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, 0, v49 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v50, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v51, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v52, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v53, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v54, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v55, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v56, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v161, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v162, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v148, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v152, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v153, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v164, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v151, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v154, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v165, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v166, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v167, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v155, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v156, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v157, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v158, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v146, v0 + ; GCN9_4-NEXT: v_add_f32_e32 
v0, v147, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v143, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v150, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v129, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v142, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v61, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v159, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v128, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v160, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v163, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v168, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v63, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v130, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v60, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v132, v0 + ; GCN9_4-NEXT: v_pack_b32_f16 v4, v3, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v41, v0 + ; GCN9_4-NEXT: s_nop 0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[4:5], v[96:111] + ; GCN9_4-NEXT: v_add_f32_e32 v6, v46, v0 + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v47, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v131, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v35, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v40, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v39, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v140, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v27, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v36, v6 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[8:9], v[64:79] + ; GCN9_4-NEXT: v_add_f32_e32 v0, v29, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v38, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v25, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v32, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v31, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v34, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v19, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v24, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v26, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v30, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v17, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v20, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v22, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, 
v23, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v10, v0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[4:5], v[64:79] + ; GCN9_4-NEXT: ds_bpermute_b32 v1, v133, v0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[4:5], v[80:95] + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: v_add_f32_e32 v4, v0, v1 + ; GCN9_4-NEXT: ds_bpermute_b32 v5, v133, v4 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr0 + ; GCN9_4-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[6:7] + ; GCN9_4-NEXT: v_fmac_f32_e32 v1, v0, v48 + ; GCN9_4-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} !0 = !{i64 2862105} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0473e017f193cb..5497d859c97dc6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN9_4 %s --- | define amdgpu_kernel void @smallInterleave() #0 { ret void } @@ -488,6 +489,501 @@ ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 ; GCN-NEXT: s_endpgm + ; + ; GCN9_4-LABEL: smallInterleave: + ; GCN9_4: ; %bb.0: + ; GCN9_4-NEXT: ; implicit-def: $vgpr2 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN9_4-NEXT: 
v_readfirstlane_b32 s20, v2 + ; GCN9_4-NEXT: ; implicit-def: $sgpr0 + ; GCN9_4-NEXT: ; implicit-def: $vgpr3 + ; GCN9_4-NEXT: ; implicit-def: $vgpr0_vgpr1 + ; GCN9_4-NEXT: s_lshl_b32 s4, s20, 7 + ; GCN9_4-NEXT: ; implicit-def: $vgpr6 + ; GCN9_4-NEXT: ; implicit-def: $vgpr7 + ; GCN9_4-NEXT: ; implicit-def: $vgpr49 + ; GCN9_4-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN9_4-NEXT: ; implicit-def: $vgpr48 + ; GCN9_4-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 + ; GCN9_4-NEXT: ; implicit-def: $vgpr51 + ; GCN9_4-NEXT: ; implicit-def: $vgpr68 + ; GCN9_4-NEXT: ; implicit-def: $vgpr69 + ; GCN9_4-NEXT: v_max_f32_e32 v79, v69, v69 + ; GCN9_4-NEXT: v_lshl_add_u32 v2, s20, 4, v3 + ; GCN9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s0, v2, v[0:1] + ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN9_4-NEXT: ; implicit-def: $vgpr5 + ; GCN9_4-NEXT: v_add_lshl_u32 v50, v5, s4, 1 + ; GCN9_4-NEXT: v_add_u32_e32 v6, v6, v49 + ; GCN9_4-NEXT: v_add_u32_e32 v7, v7, v49 + ; GCN9_4-NEXT: ; kill: killed $vgpr7 + ; GCN9_4-NEXT: ; kill: killed $vgpr6 + ; GCN9_4-NEXT: ; kill: killed $vgpr4 + ; GCN9_4-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: s_nop 1 + ; GCN9_4-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN9_4-NEXT: ; iglp_opt mask(0x00000002) + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b128 v50, v[0:3] + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[64:65], v6, s[16:19], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[66:67], 
v7, s[16:19], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: ds_read_b128 v[36:39], v48 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ds_read_b128 v[44:47], v48 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 + ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN9_4-NEXT: ; implicit-def: $sgpr6 + ; GCN9_4-NEXT: ; implicit-def: $sgpr5 + ; GCN9_4-NEXT: ; implicit-def: $sgpr2 + ; GCN9_4-NEXT: ; implicit-def: $sgpr3 + ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GCN9_4-NEXT: v_perm_b32 v80, v66, v64, s2 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 + ; GCN9_4-NEXT: v_perm_b32 v64, v66, v64, s3 + ; GCN9_4-NEXT: v_perm_b32 v66, v67, v65, s2 + ; GCN9_4-NEXT: v_perm_b32 v65, v67, v65, s3 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[36:39], v51 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b128 v50, v[32:35] + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_read_b128 v[32:35], v48 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + 
; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] + ; GCN9_4-NEXT: ; implicit-def: $vgpr44 + ; GCN9_4-NEXT: v_add_u32_e32 v86, v44, v49 + ; GCN9_4-NEXT: ; kill: killed $vgpr86 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN9_4-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[40:43], v48 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] + ; GCN9_4-NEXT: ds_read_b128 v[32:35], v51 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] + ; GCN9_4-NEXT: ds_read_b128 v[40:43], v51 offset:512 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] + ; GCN9_4-NEXT: ; implicit-def: $vgpr32 + ; GCN9_4-NEXT: ; implicit-def: $vgpr33 + ; GCN9_4-NEXT: v_add_u32_e32 v32, s20, v32 + ; GCN9_4-NEXT: v_and_b32_e32 v32, 0x1fffffff, v32 + ; GCN9_4-NEXT: v_mul_lo_u32 v32, v32, s5 + ; GCN9_4-NEXT: v_add_lshl_u32 v81, v33, v32, 1 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b32 v81, v80 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] + ; GCN9_4-NEXT: ; implicit-def: $vgpr36 + ; GCN9_4-NEXT: v_lshl_add_u32 v82, v36, 1, v81 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt 
lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b32 v82, v64 + ; GCN9_4-NEXT: ; implicit-def: $vgpr37 + ; GCN9_4-NEXT: v_lshl_add_u32 v83, v37, 1, v82 + ; GCN9_4-NEXT: ; implicit-def: $vgpr40 + ; GCN9_4-NEXT: v_lshl_add_u32 v84, v40, 1, v83 + ; GCN9_4-NEXT: ; implicit-def: $vgpr41 + ; GCN9_4-NEXT: v_add_u32_e32 v85, v41, v49 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b32 v83, v66 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b32 v84, v65 + ; GCN9_4-NEXT: ; kill: killed $vgpr85 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] + ; GCN9_4-NEXT: s_nop 7 + ; GCN9_4-NEXT: s_nop 7 + ; GCN9_4-NEXT: s_nop 1 + ; GCN9_4-NEXT: v_mul_f32_e32 v34, s4, v16 + ; GCN9_4-NEXT: v_mul_f32_e32 v35, s4, v17 + ; GCN9_4-NEXT: v_mul_f32_e32 v45, s4, v18 + ; GCN9_4-NEXT: v_mul_f32_e32 v46, s4, v19 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, s6, v35 + ; GCN9_4-NEXT: v_mul_f32_e32 v47, s4, v20 + ; GCN9_4-NEXT: v_mul_f32_e32 v48, s4, v21 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v45, v46 + ; GCN9_4-NEXT: v_mul_f32_e32 v50, s4, v22 + ; GCN9_4-NEXT: v_mul_f32_e32 v51, s4, v23 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v47, v48 + ; GCN9_4-NEXT: v_mul_f32_e32 v52, s4, v24 + ; GCN9_4-NEXT: v_mul_f32_e32 v53, s4, v25 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v50, v51 + ; GCN9_4-NEXT: v_mul_f32_e32 v54, s4, v26 + ; GCN9_4-NEXT: v_mul_f32_e32 v55, s4, v27 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v52, v53 + ; GCN9_4-NEXT: v_mul_f32_e32 v56, s4, v28 + ; GCN9_4-NEXT: v_mul_f32_e32 v57, s4, v29 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v54, v55 + ; GCN9_4-NEXT: v_mul_f32_e32 v58, s4, v30 + ; GCN9_4-NEXT: v_mul_f32_e32 v59, s4, v31 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v56, v57 + ; GCN9_4-NEXT: v_mul_f32_e32 v35, s4, v0 + ; GCN9_4-NEXT: v_mul_f32_e32 v38, s4, v1 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v58, v59 + ; GCN9_4-NEXT: 
v_mul_f32_e32 v39, s4, v2 + ; GCN9_4-NEXT: v_mul_f32_e32 v42, s4, v3 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v35, v38 + ; GCN9_4-NEXT: v_mul_f32_e32 v43, s4, v4 + ; GCN9_4-NEXT: v_mul_f32_e32 v45, s4, v5 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v39, v42 + ; GCN9_4-NEXT: v_mul_f32_e32 v46, s4, v6 + ; GCN9_4-NEXT: v_mul_f32_e32 v47, s4, v7 + ; GCN9_4-NEXT: v_max3_f32 v34, v34, v43, v45 + ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v8 + ; GCN9_4-NEXT: v_mul_f32_e32 v71, s4, v9 + ; GCN9_4-NEXT: v_max3_f32 v78, v34, v46, v47 + ; GCN9_4-NEXT: v_mul_f32_e32 v72, s4, v10 + ; GCN9_4-NEXT: v_mul_f32_e32 v73, s4, v11 + ; GCN9_4-NEXT: v_max3_f32 v70, v78, v70, v71 + ; GCN9_4-NEXT: v_mul_f32_e32 v74, s4, v12 + ; GCN9_4-NEXT: v_mul_f32_e32 v75, s4, v13 + ; GCN9_4-NEXT: v_max3_f32 v70, v70, v72, v73 + ; GCN9_4-NEXT: v_mul_f32_e32 v76, s4, v14 + ; GCN9_4-NEXT: v_mul_f32_e32 v77, s4, v15 + ; GCN9_4-NEXT: v_max3_f32 v70, v70, v74, v75 + ; GCN9_4-NEXT: v_max3_f32 v70, v70, v76, v77 + ; GCN9_4-NEXT: ds_bpermute_b32 v71, v68, v70 + ; GCN9_4-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN9_4-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: v_max_f32_e32 v64, v71, v71 + ; GCN9_4-NEXT: v_max_f32_e32 v70, v70, v64 + ; GCN9_4-NEXT: ds_bpermute_b32 v71, v68, v70 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_load_dwordx2 v[64:65], v85, s[16:19], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: buffer_load_dwordx2 v[66:67], v86, s[16:19], 0 offen sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt vmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] + ; GCN9_4-NEXT: v_max_f32_e32 
v70, v70, v70 + ; GCN9_4-NEXT: v_max_f32_e32 v72, v79, v70 + ; GCN9_4-NEXT: v_fma_f32 v16, s4, v16, -v72 + ; GCN9_4-NEXT: v_fma_f32 v18, s4, v18, -v72 + ; GCN9_4-NEXT: v_fma_f32 v19, s4, v19, -v72 + ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 + ; GCN9_4-NEXT: v_fma_f32 v17, s4, v17, -v72 + ; GCN9_4-NEXT: v_fma_f32 v20, s4, v20, -v72 + ; GCN9_4-NEXT: v_fma_f32 v21, s4, v21, -v72 + ; GCN9_4-NEXT: v_fma_f32 v22, s4, v22, -v72 + ; GCN9_4-NEXT: v_fma_f32 v23, s4, v23, -v72 + ; GCN9_4-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 + ; GCN9_4-NEXT: v_exp_f32_e32 v73, v16 + ; GCN9_4-NEXT: v_exp_f32_e32 v74, v18 + ; GCN9_4-NEXT: v_exp_f32_e32 v75, v19 + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN9_4-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 + ; GCN9_4-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN9_4-NEXT: v_exp_f32_e32 v76, v20 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v16, v73 + ; GCN9_4-NEXT: v_fma_f32 v18, s4, v24, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v77, v21 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v74 + ; GCN9_4-NEXT: v_fma_f32 v20, s4, v25, -v72 + ; GCN9_4-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN9_4-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 + ; GCN9_4-NEXT: v_exp_f32_e32 v78, v22 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v75 + ; GCN9_4-NEXT: v_fma_f32 v26, s4, v26, -v72 + ; GCN9_4-NEXT: v_sub_f32_e32 v24, v69, v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v69, v23 + ; GCN9_4-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 + ; GCN9_4-NEXT: v_fma_f32 v27, s4, v27, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v85, v23 + ; GCN9_4-NEXT: v_exp_f32_e32 v17, v17 + ; GCN9_4-NEXT: v_pack_b32_f16 v71, v21, v22 + ; GCN9_4-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 + ; GCN9_4-NEXT: ; implicit-def: $vgpr79 + ; GCN9_4-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v19, v17 + ; GCN9_4-NEXT: v_fma_f32 v28, s4, v28, -v72 + ; GCN9_4-NEXT: v_fma_f32 v29, s4, v29, -v72 + ; GCN9_4-NEXT: v_pack_b32_f16 v70, v16, v19 + ; 
GCN9_4-NEXT: ds_read_b128 v[18:21], v79 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_exp_f32_e32 v16, v24 + ; GCN9_4-NEXT: s_nop 0 + ; GCN9_4-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] + ; GCN9_4-NEXT: v_add_f32_e32 v18, 0, v73 + ; GCN9_4-NEXT: v_fma_f32 v31, s4, v31, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v73, v22 + ; GCN9_4-NEXT: ds_read_b128 v[22:25], v79 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v19, v76 + ; GCN9_4-NEXT: v_fma_f32 v0, s4, v0, -v72 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] + ; GCN9_4-NEXT: v_add_f32_e32 v17, v17, v18 + ; GCN9_4-NEXT: 
v_mul_f32_e32 v18, 0x3fb8aa3b, v26 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v80, v77 + ; GCN9_4-NEXT: v_fma_f32 v23, s4, v30, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v26, v18 + ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v27 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v78 + ; GCN9_4-NEXT: v_fma_f32 v1, s4, v1, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v27, v18 + ; GCN9_4-NEXT: v_add_f32_e32 v17, v74, v17 + ; GCN9_4-NEXT: v_fma_f32 v4, s4, v4, -v72 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v30, v69 + ; GCN9_4-NEXT: v_pack_b32_f16 v18, v19, v80 + ; GCN9_4-NEXT: v_fma_f32 v5, s4, v5, -v72 + ; GCN9_4-NEXT: ; implicit-def: $vgpr70 + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4 + ; GCN9_4-NEXT: v_pack_b32_f16 v19, v22, v30 + ; GCN9_4-NEXT: s_nop 1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v28 + ; GCN9_4-NEXT: v_add_f32_e32 v17, v75, v17 + ; GCN9_4-NEXT: v_fma_f32 v6, s4, v6, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v28, v20 + ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v29 + ; GCN9_4-NEXT: v_fma_f32 v7, s4, v7, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v30, v20 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] + ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v23 + ; GCN9_4-NEXT: v_add_f32_e32 v17, v76, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v73 + ; GCN9_4-NEXT: v_fma_f32 v24, s4, v2, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v25, v18 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v29, v85 + ; GCN9_4-NEXT: ds_read_b128 v[18:21], v70 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_pack_b32_f16 v22, v22, v29 + ; GCN9_4-NEXT: v_fma_f32 v29, s4, v3, -v72 + ; GCN9_4-NEXT: v_add_f32_e32 v17, v77, v17 + ; GCN9_4-NEXT: v_fma_f32 v10, s4, v10, -v72 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v71, v30 + ; GCN9_4-NEXT: v_add_f32_e32 v17, v78, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v23, v26 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v2, v27 + ; GCN9_4-NEXT: v_pack_b32_f16 v23, v23, v2 + ; 
GCN9_4-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v31 + ; GCN9_4-NEXT: v_exp_f32_e32 v31, v2 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[22:23], v[48:63] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v28 + ; GCN9_4-NEXT: v_exp_f32_e32 v19, v0 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 + ; GCN9_4-NEXT: v_exp_f32_e32 v74, v0 + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v70 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[22:23], v[32:47] + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v25 + ; GCN9_4-NEXT: v_fma_f32 v24, s4, v9, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v22, v0 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v31 + ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN9_4-NEXT: v_pack_b32_f16 v0, v18, v71 + ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v29 + ; GCN9_4-NEXT: s_nop 0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[0:1], v[48:63] + ; GCN9_4-NEXT: v_fma_f32 v21, s4, v8, -v72 + ; GCN9_4-NEXT: v_exp_f32_e32 v18, v18 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v20, v19 + ; GCN9_4-NEXT: v_exp_f32_e32 v23, v4 + ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v5 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v5, v74 + ; GCN9_4-NEXT: v_exp_f32_e32 v29, v4 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[0:1], v[32:47] + ; GCN9_4-NEXT: v_perm_b32 v0, v66, v64, s2 + ; GCN9_4-NEXT: v_perm_b32 v1, v66, v64, s3 + ; GCN9_4-NEXT: v_perm_b32 v2, v67, v65, s2 + ; GCN9_4-NEXT: v_perm_b32 v3, v67, v65, s3 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: ds_write_b32 v81, v0 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b32 v82, v1 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b32 v83, v2 + ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1 + ; 
GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_write_b32 v84, v3 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v6 + ; GCN9_4-NEXT: ;;#ASMSTART + ; GCN9_4-NEXT: s_waitcnt vmcnt(8) + ; GCN9_4-NEXT: ;;#ASMEND + ; GCN9_4-NEXT: v_add_f32_e32 v4, v69, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v6, v22 + ; GCN9_4-NEXT: v_exp_f32_e32 v17, v0 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v79 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v8, v18 + ; GCN9_4-NEXT: ; implicit-def: $sgpr2 + ; GCN9_4-NEXT: v_pack_b32_f16 v9, v6, v8 + ; GCN9_4-NEXT: v_pack_b32_f16 v8, v20, v5 + ; GCN9_4-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v7 + ; GCN9_4-NEXT: s_nop 0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] + ; GCN9_4-NEXT: v_add_f32_e32 v0, v73, v4 + ; GCN9_4-NEXT: v_exp_f32_e32 v20, v5 + ; GCN9_4-NEXT: ds_read_b128 v[4:7], v79 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v21 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN9_4-NEXT: v_add_f32_e32 v4, v85, v0 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 + ; GCN9_4-NEXT: v_exp_f32_e32 v8, v0 + ; GCN9_4-NEXT: v_exp_f32_e32 v64, v1 + ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v24 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v24, v29 + ; GCN9_4-NEXT: v_exp_f32_e32 v65, v1 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v23 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v17 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v20 + ; GCN9_4-NEXT: v_fma_f32 v9, s4, v15, -v72 + ; GCN9_4-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 + ; GCN9_4-NEXT: v_exp_f32_e32 v9, v9 + ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN9_4-NEXT: v_pack_b32_f16 v0, v21, v24 + ; GCN9_4-NEXT: s_nop 1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN9_4-NEXT: v_add_f32_e32 v2, v26, v4 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v4, v64 + ; 
GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] + ; GCN9_4-NEXT: v_add_f32_e32 v6, v27, v2 + ; GCN9_4-NEXT: v_fma_f32 v11, s4, v11, -v72 + ; GCN9_4-NEXT: v_fma_f32 v12, s4, v12, -v72 + ; GCN9_4-NEXT: v_fma_f32 v5, s4, v14, -v72 + ; GCN9_4-NEXT: v_fma_f32 v13, s4, v13, -v72 + ; GCN9_4-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 + ; GCN9_4-NEXT: v_mul_f32_e32 v10, 0x3fb8aa3b, v11 + ; GCN9_4-NEXT: v_exp_f32_e32 v11, v3 + ; GCN9_4-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 + ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v5 + ; GCN9_4-NEXT: v_exp_f32_e32 v10, v10 + ; GCN9_4-NEXT: v_exp_f32_e32 v13, v3 + ; GCN9_4-NEXT: v_exp_f32_e32 v7, v0 + ; GCN9_4-NEXT: ds_read_b128 v[0:3], v70 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v12, v65 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v5, v8 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v14, v10 + ; GCN9_4-NEXT: v_pack_b32_f16 v4, v4, v12 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v12, v13 + ; GCN9_4-NEXT: v_pack_b32_f16 v5, v5, v14 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v14, v11 + ; GCN9_4-NEXT: s_nop 0 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v9 + ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v7 + ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN9_4-NEXT: v_pack_b32_f16 v0, v14, v12 + ; GCN9_4-NEXT: s_nop 1 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN9_4-NEXT: v_add_f32_e32 v0, v28, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v30, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v25, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v31, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v19, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v74, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v22, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v18, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v23, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v29, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v0, v17, v0 + ; GCN9_4-NEXT: v_add_f32_e32 v6, v20, v0 + ; GCN9_4-NEXT: ds_read_b128 
v[0:3], v70 offset:576 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: buffer_inv sc0 sc1 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v64, v6 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v65, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v8, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v10, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v11, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v13, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v7, v2 + ; GCN9_4-NEXT: v_add_f32_e32 v2, v9, v2 + ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[4:5], v[32:47] + ; GCN9_4-NEXT: ds_bpermute_b32 v0, v68, v2 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: v_add_f32_e32 v0, v2, v0 + ; GCN9_4-NEXT: ds_bpermute_b32 v1, v68, v0 + ; GCN9_4-NEXT: v_mov_b32_e32 v2, 0 + ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0) + ; GCN9_4-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1] + ; GCN9_4-NEXT: v_fmac_f32_e32 v0, v2, v16 + ; GCN9_4-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} !0 = !{i64 2862105} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll index a3d789c1ccc36f..daec7e9b91e71e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s ; DPP control value 337 is valid for 64-bit DPP on gfx942 diff --git a/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll b/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll index 5201f188afd5f8..df717d0ae497d3 100644 --- 
a/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll +++ b/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 %s -o - | FileCheck %s define void @no_corresponding_integer_type(i8 %arg, ptr addrspace(1) %ptr) { ; CHECK-LABEL: no_corresponding_integer_type: diff --git a/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s b/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s new file mode 100644 index 00000000000000..6ade556f21a1d1 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s @@ -0,0 +1,104 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx9-4-generic -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx9-4-generic -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s + +v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf8 v1, 3 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf8_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf8_e64 v5, v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this 
GPU + +v_cvt_f32_bf8_sdwa v5, v1 src0_sel:BYTE_0 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_fp8 v1, 3 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_fp8_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_fp8_e64 v5, v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_fp8_sdwa v5, v1 src0_sel:BYTE_0 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_pk_f32_fp8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_pk_f32_fp8_dpp v[10:11], v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_pk_f32_fp8_sdwa v[10:11], v1 src0_sel:WORD_0 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_pk_f32_bf8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_pk_f32_bf8_dpp v[10:11], v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_pk_f32_bf8_sdwa v[10:11], v1 src0_sel:WORD_0 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + 
+v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml index 
37234dba7d9b4c..9c79ea588f6247 100644 --- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml +++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml @@ -250,6 +250,10 @@ # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_GENERIC %s # RUN: obj2yaml %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_GENERIC %s +# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX9_4_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX9_4_GENERIC +# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_4_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_4_GENERIC %s +# RUN: obj2yaml %t.o.AMDGCN_GFX9_4_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_4_GENERIC %s + # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX10_1_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX10_1_GENERIC # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX10_1_GENERIC %s # RUN: obj2yaml %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX10_1_GENERIC %s @@ -473,6 +477,9 @@ # ELF-AMDGCN-GFX9_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51) # YAML-AMDGCN-GFX9_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC ] +# ELF-AMDGCN-GFX9_4_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F) +# YAML-AMDGCN-GFX9_4_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC ] + # ELF-AMDGCN-GFX10_1_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52) # YAML-AMDGCN-GFX10_1_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC ] diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll index c38f6b4e7833cd..45071ecb751321 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll @@ -137,6 +137,12 @@ define amdgpu_kernel void @test_kernel() { ; 
----------------------------------GFX9--------------------------------------- ; + +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-4-generic -filetype=obj -O0 -o %t.o %s +; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-4-generic %t.o > %t-specify.txt +; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt +; RUN: diff %t-specify.txt %t-detect.txt + ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-generic -filetype=obj -O0 -o %t.o %s ; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-generic %t.o > %t-specify.txt ; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test index 78acbd657b7635..34c22dca3aa183 100644 --- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test @@ -364,6 +364,9 @@ # RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC -DFLAG_VALUE=0x51 +# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC -DFLAG_VALUE=0x5F + # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100 -DFLAG_VALUE=0x41 diff --git 
a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 7d92a492d8b181..1012cd020d525e 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1641,6 +1641,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, "gfx9-generic"), \ + ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, "gfx9-4-generic"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, "gfx10-1-generic"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, "gfx10-3-generic"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, "gfx11-generic"), \