
Commit 0c565fe

Merge: 2 parents 99e0cc6 + 4363a82

234 files changed, +10281 −2569 lines


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ## Composable Kernel 1.2.0 for ROCm 7.0.0
 
 ### Added
+* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
 * Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
 * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
 * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
@@ -15,6 +16,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
 * Added support for Stream-K version of mixed fp8/bf16 GEMM
 * Added support for Multiple D GEMM
+* Added support for Multiple ABD GEMM
 * Added GEMM pipeline for microscaling (MX) FP8/FP6/FP4 data types
 * Added support for FP16 2:4 structured sparsity to universal GEMM.
 * Added support for Split K for grouped convolution backward data.
@@ -29,6 +31,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added benchmarking support for tile engine GEMM Multi D.
 * Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
 * Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
+* Added tensor-wise quantization for CK_TILE GEMM
 
 ### Optimized
 
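
An aside on the first new entry: pk_int4_t is a packed-int4 storage type, i.e. two signed 4-bit values per byte, which halves the footprint and bandwidth of the B tensor in the weight-preshuffle GEMM. A minimal sketch of the packing idea; the struct and method names here are illustrative stand-ins, not CK's actual pk_int4_t API:

    #include <cstdint>
    #include <iostream>

    // Illustrative packed-int4 pair: two signed 4-bit values in one byte.
    // (Hypothetical stand-in; CK's real pk_int4_t differs in interface.)
    struct pk_int4
    {
        std::uint8_t data;

        // lo and hi must each be in [-8, 7]
        static pk_int4 pack(int lo, int hi)
        {
            return {static_cast<std::uint8_t>((lo & 0xF) | ((hi & 0xF) << 4))};
        }

        // shift the nibble into the sign position, then arithmetic-shift back to sign-extend
        int lo() const { return static_cast<std::int8_t>(static_cast<std::uint8_t>(data << 4)) >> 4; }
        int hi() const { return static_cast<std::int8_t>(data) >> 4; }
    };

    int main()
    {
        const pk_int4 p = pk_int4::pack(-3, 5);
        std::cout << p.lo() << " " << p.hi() << "\n"; // prints: -3 5
    }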

Jenkinsfile

Lines changed: 46 additions & 16 deletions
@@ -157,9 +157,9 @@ def getDockerImage(Map conf=[:]){
         image = getDockerImageName()
         echo "Using default docker: ${image}"
     }
-    //Check if image exists
+    //Check if image exists
     def retimage
-    try
+    try
     {
         echo "Pulling image: ${image}"
         retimage = docker.image("${image}")
@@ -232,7 +232,7 @@ def cmake_build(Map conf=[:]){
     def setup_args = conf.get("setup_args","")
     // make sure all unit tests always run on develop branch
     def runAllUnitTests = (env.BRANCH_NAME == "develop") ? true : params.RUN_ALL_UNIT_TESTS
-
+
     if (prefixpath != "/usr/local"){
         setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
     }
@@ -357,7 +357,7 @@ def cmake_build(Map conf=[:]){
         "build_cmd",
         "${build_envs} ninja -j${nt} ${config_targets}"
     )
-
+
     cmd = conf.get("cmd", """
         ${setup_cmd}
         ${build_cmd}
@@ -449,7 +449,7 @@ def buildHipClangJob(Map conf=[:]){
     checkout scm
     def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
-    // Jenkins is complaining about the render group
+    // Jenkins is complaining about the render group
     def dockerOpts
     if ( params.BUILD_INSTANCES_ONLY ){
         dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
@@ -515,7 +515,7 @@ def Build_CK(Map conf=[:]){
     checkout scm
     def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
-    // Jenkins is complaining about the render group
+    // Jenkins is complaining about the render group
     def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
@@ -719,7 +719,7 @@ def process_results(Map conf=[:]){
     def image = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm6.3"
     def prefixpath = "/opt/rocm"
 
-    // Jenkins is complaining about the render group
+    // Jenkins is complaining about the render group
     def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
@@ -956,20 +956,20 @@ pipeline {
            defaultValue: '',
            description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
        string(
-            name: 'ROCMVERSION',
+            name: 'ROCMVERSION',
            defaultValue: '6.4.1',
            description: 'Specify which ROCM version to use: 6.4.1 (default).')
        string(
-            name: 'COMPILER_VERSION',
-            defaultValue: '',
+            name: 'COMPILER_VERSION',
+            defaultValue: '',
            description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
        string(
-            name: 'COMPILER_COMMIT',
-            defaultValue: '',
+            name: 'COMPILER_COMMIT',
+            defaultValue: '',
            description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit (default), or use some specific commit of llvm-project branch.')
        string(
-            name: 'BUILD_COMPILER',
-            defaultValue: '/opt/rocm/llvm/bin/clang++',
+            name: 'BUILD_COMPILER',
+            defaultValue: '/opt/rocm/llvm/bin/clang++',
            description: 'Build CK with /opt/rocm/bin/hipcc, /llvm-project/build/bin/clang++, or with /opt/rocm/llvm/bin/clang++ (default).')
        booleanParam(
            name: "RUN_FULL_QA",
@@ -1448,6 +1448,36 @@ pipeline {
                 cleanWs()
             }
         }
+        stage("Run TILE_ENGINE_GEMM Tests on gfx1201")
+        {
+            when {
+                beforeAgent true
+                expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
+            }
+            agent{ label rocmnode("gfx1201") }
+            environment{
+                setup_args = "NO_CK_BUILD"
+                execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
+                                   -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+                                   -D CMAKE_BUILD_TYPE=Release \
+                                   -D GPU_TARGETS="gfx1201" \
+                                   -D GEMM_DATATYPE="fp16" \
+                                   -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                   -DGEMM_CONFIG_FILE=gfx120x_config.json \
+                                   -DCMAKE_CXX_FLAGS=" -O3 " .. && \
+                                   ninja -j64 benchmark_gemm_all && \
+                                   python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                   --warmup 5 --repeat 5 --verbose --json results.json && \
+                                   ninja -j64 benchmark_gemm_fp16_rcr && \
+                                   ninja -j64 benchmark_gemm_fp16_rrr && \
+                                   ninja -j64 benchmark_gemm_fp16_crr && \
+                                   ninja -j64 benchmark_gemm_fp16_ccr """
+            }
+            steps{
+                buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                cleanWs()
+            }
+        }
     }
 }
 
@@ -1591,7 +1621,7 @@ pipeline {
            agent{ label rocmnode("gfx942") }
            steps{
                script {
-                    def execute_args = params.NINJA_FTIME_TRACE ?
+                    def execute_args = params.NINJA_FTIME_TRACE ?
                    """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
                        -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                        -D CMAKE_BUILD_TYPE=Release \
@@ -1600,7 +1630,7 @@ pipeline {
                        -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                        -D CMAKE_BUILD_TYPE=Release \
                        -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
-
+
                    buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
                }
                cleanWs()

example/01_gemm/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
@@ -105,6 +105,16 @@ foreach(gpu IN LISTS GPU_TARGETS)
     endif()
 endforeach()
 
+list(APPEND gpu_list_tf32 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_gemm_xdl_lds_direct_load_fp32_tf32 gemm_xdl_lds_direct_load_fp32_tf32.cpp)
+        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp32_tf32)
+        set(target 1)
+    endif()
+endforeach()
+
 add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)

example/01_gemm/common.hpp

Lines changed: 12 additions & 4 deletions
@@ -310,10 +310,14 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
     return true;
 }
 
-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
     {
         return 1e-3;
     }
@@ -351,10 +355,14 @@ inline __host__ __device__ constexpr double get_rtol()
     }
 }
 
-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
     {
         return 1e-3;
     }
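
A note on the shape of this change: because the new ComputeDataType parameter defaults to DataType, every existing one-argument call such as get_rtol<CDataType>() keeps compiling unchanged, and only ComputeDataType-aware examples pass the second argument. A self-contained sketch of the same dispatch pattern, using a stand-in tf32_t tag in place of the real ck::tf32_t:

    #include <iostream>
    #include <type_traits>

    struct tf32_t; // stand-in tag type; only its identity matters at compile time

    // Defaulted second parameter keeps old call sites source-compatible.
    template <typename DataType, typename ComputeDataType = DataType>
    constexpr double get_rtol()
    {
        if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, tf32_t>)
        {
            return 1e-3; // fp32 results computed at TF32 precision
        }
        else if constexpr(std::is_same_v<DataType, float>)
        {
            return 1e-3; // plain fp32 path
        }
        else
        {
            return 1e-2; // placeholder for the remaining branches in common.hpp
        }
    }

    int main()
    {
        std::cout << get_rtol<float>() << "\n";         // old one-argument form still works
        std::cout << get_rtol<float, tf32_t>() << "\n"; // new TF32-aware form
    }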
example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "common.hpp"
+
+#define USING_DIRECT_LOADS 1
+#if USING_DIRECT_LOADS
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp"
+#else
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#endif
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using F32 = float;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F32;
+using ComputeDataType  = ck::tf32_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+#if USING_DIRECT_LOADS
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad
+// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer|
+// ######| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockLds|
+// ######| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler | pipeline ver | gemm type |
+// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block|
+// ######| XDL| XDL| Per| Per| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraM| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| | | PerVector| | Lengths_K0_N_K1| | | PerVector| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+    < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 128, 128, 32,
+      8, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, 2, 1, 1, S<4, 8, 8>, S<1, 0, 2>, 2, 1, 1,
+      1, 1, S<1, 8, 1, 8>, 4, ck::LoopScheduler::Default, ck::PipelineVersion::v4, ComputeDataType>;
+// clang-format on
+#else
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
+// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+    < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 8, 1, 8>, 4>;
+// clang-format on
+#endif
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CElementOp,
+                                                                        ComputeDataType,
+                                                                        ComputeDataType>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
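
For context on the ComputeDataType = ck::tf32_t choice above: TF32 keeps fp32's 8-bit exponent but computes with a 10-bit mantissa in the matrix unit, so fp32 data run through a TF32 pipeline carries roughly three decimal digits of precision, which is what the 1e-3 tolerance branch in common.hpp accounts for. A host-side sketch of the effect, emulating TF32 by truncating the binary32 mantissa (hardware rounds rather than truncates; this is only an order-of-magnitude illustration, not CK code):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Emulate TF32 by clearing the low 13 of the 23 binary32 mantissa bits,
    // leaving a 10-bit TF32 mantissa (sign and exponent are unchanged).
    float to_tf32_truncated(float x)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits &= 0xFFFFE000u;
        float y;
        std::memcpy(&y, &bits, sizeof(y));
        return y;
    }

    int main()
    {
        const float x = 1.2345678f;
        const float t = to_tf32_truncated(x);
        std::cout << "fp32: " << x << "  tf32: " << t
                  << "  rel. error: " << (x - t) / x << "\n"; // on the order of 1e-4..1e-3
    }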

example/01_gemm/run_gemm_example.inc

Lines changed: 9 additions & 4 deletions
@@ -4,6 +4,11 @@
 #pragma once
 #include "ck/library/utility/validation_common.hpp"
 
+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
+
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {
@@ -218,8 +223,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         pass &= ck::utils::check_err(c_m_n_device_result,
                                      c_m_n_host_result,
                                      "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
 #endif
     }
 
@@ -249,8 +254,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         pass &= ck::utils::check_err(c_m_n_device_result,
                                      c_m_n_device_ref_result,
                                      "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
     }
 
     return pass == true;
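
The two hunks above only change which tolerances feed ck::utils::check_err; the comparison itself is the usual mixed absolute/relative closeness test. A simplified stand-in might look like the following (the real utility also reports mismatch locations and handles integer and packed types):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Simplified closeness check in the spirit of ck::utils::check_err:
    // an element passes if its error is within atol or within rtol of the
    // reference magnitude. (Stand-in only; not CK's actual implementation.)
    bool check_close(const std::vector<float>& out,
                     const std::vector<float>& ref,
                     double rtol,
                     double atol)
    {
        if(out.size() != ref.size())
            return false;
        for(std::size_t i = 0; i < out.size(); ++i)
        {
            const double err = std::abs(double(out[i]) - double(ref[i]));
            if(err > atol && err > rtol * std::abs(double(ref[i])))
                return false;
        }
        return true;
    }

With the templates from common.hpp, an fp32/TF32 run would then validate via check_close(c_device, c_host, get_rtol<float, ck::tf32_t>(), get_atol<float, ck::tf32_t>()).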

example/09_convnd_fwd/CMakeLists.txt

Lines changed: 10 additions & 1 deletion
@@ -19,4 +19,13 @@ foreach(gpu IN LISTS GPU_TARGETS)
         add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
         set(target 1)
     endif()
-endforeach()
+endforeach()
+
+list(APPEND gpu_list_tf32 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_convnd_fwd_xdl_fp32_tf32 convnd_fwd_xdl_fp32_tf32.cpp)
+        set(target 1)
+    endif()
+endforeach()
