Skip to content

Commit 56f38ca

Browse files
authored
Merge branch 'develop' into mkulikow/ck_examples_more_support_for_rdna3_and_4
2 parents 6b6a30c + ab22f91 commit 56f38ca

File tree

118 files changed

+4403
-573
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

118 files changed

+4403
-573
lines changed

Dockerfile

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,23 @@
1+
12
FROM ubuntu:24.04
23
ARG DEBIAN_FRONTEND=noninteractive
3-
ARG ROCMVERSION=6.4.1
4+
ARG ROCMVERSION=7.0.1
45
ARG compiler_version=""
56
ARG compiler_commit=""
67
ARG CK_SCCACHE=""
78
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
89
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
10+
ENV DEBIAN_FRONTEND=noninteractive
911

1012
# Add rocm repository
1113
RUN set -xe && \
12-
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
13-
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
14-
15-
RUN if [ "$ROCMVERSION" != "6.5" ]; then \
16-
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb --no-check-certificate" && \
17-
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
18-
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
19-
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
20-
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
21-
fi
14+
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
2215

23-
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
24-
amdgpu-install -y --usecase=rocm --no-dkms
16+
RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
17+
apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
18+
apt update && \
19+
apt install python3-setuptools python3-wheel -y && \
20+
apt install rocm-dev -y
2521

2622
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
2723
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
4541
libelf-dev \
4642
libnuma-dev \
4743
libpthread-stubs0-dev \
48-
llvm-amdgpu \
4944
mpich \
5045
net-tools \
5146
pkg-config \
@@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
6156
zip \
6257
libzstd-dev \
6358
openssh-server \
64-
clang-format-12 \
6559
clang-format-18 \
6660
kmod && \
6761
apt-get clean && \
6862
rm -rf /var/lib/apt/lists/* && \
6963
rm -rf amdgpu-install* && \
70-
# Remove unnecessary rocm components that take a lot of space
71-
apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
72-
7364
#Install latest ccache
74-
RUN git clone https://github.com/ccache/ccache.git && \
65+
git clone https://github.com/ccache/ccache.git && \
7566
cd ccache && mkdir build && cd build && cmake .. && make install && \
7667
#Install ninja build tracing tools
7768
cd / && \

Dockerfile.compiler

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
1+
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
22
FROM $BASE_DOCKER
33
ARG compiler_version=""
44
ARG compiler_commit=""

Jenkinsfile

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def getBaseDockerImageName(){
5353
}
5454
else{
5555
def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
56-
if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){
56+
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
5757
img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
5858
}
5959
else{
@@ -930,7 +930,8 @@ def run_pytorch_tests(Map conf=[:]){
930930
}
931931

932932
//launch develop branch daily jobs
933-
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
933+
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true
934+
0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
934935
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
935936
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
936937
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -957,8 +958,8 @@ pipeline {
957958
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
958959
string(
959960
name: 'ROCMVERSION',
960-
defaultValue: '6.4.1',
961-
description: 'Specify which ROCM version to use: 6.4.1 (default).')
961+
defaultValue: '7.0.1',
962+
description: 'Specify which ROCM version to use: 7.0.1 (default).')
962963
string(
963964
name: 'COMPILER_VERSION',
964965
defaultValue: '',
@@ -1037,8 +1038,8 @@ pipeline {
10371038
description: "Build CK and run tests on gfx942 (default: ON)")
10381039
booleanParam(
10391040
name: "BUILD_GFX950",
1040-
defaultValue: false,
1041-
description: "Build CK and run tests on gfx950 (default: OFF)")
1041+
defaultValue: true,
1042+
description: "Build CK and run tests on gfx950 (default: ON)")
10421043
booleanParam(
10431044
name: "BUILD_GFX10",
10441045
defaultValue: true,
@@ -1290,7 +1291,7 @@ pipeline {
12901291
agent{ label rocmnode("gfx90a")}
12911292
environment{
12921293
setup_args = "NO_CK_BUILD"
1293-
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
1294+
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
12941295
make -j64 check"""
12951296
}
12961297
steps{
@@ -1350,7 +1351,7 @@ pipeline {
13501351
}
13511352
agent{ label rocmnode("gfx950") }
13521353
environment{
1353-
def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0"
1354+
def docker_name = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1"
13541355
setup_args = "NO_CK_BUILD"
13551356
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \
13561357
make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
@@ -1566,7 +1567,7 @@ pipeline {
15661567
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
15671568
}
15681569
steps{
1569-
Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
1570+
Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
15701571
cleanWs()
15711572
}
15721573
}
@@ -1631,7 +1632,7 @@ pipeline {
16311632
-D CMAKE_BUILD_TYPE=Release \
16321633
-D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
16331634

1634-
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
1635+
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
16351636
}
16361637
cleanWs()
16371638
}

codegen/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
1212
find_package(ROCM)
1313
include(ROCMInstallTargets)
1414
include(ROCMTest)
15+
find_package(hiprtc REQUIRED)
1516

1617
rocm_setup_version(VERSION 1.0)
1718

@@ -27,7 +28,7 @@ add_compile_options(-std=c++20)
2728
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
2829
# TODO: Use object library
2930
add_library(ck_host STATIC ${SOURCES})
30-
target_link_libraries(ck_host PRIVATE ck_headers)
31+
target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc)
3132

3233
set_target_properties(ck_host PROPERTIES
3334
LINKER_LANGUAGE CXX

example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ using BDataType = ck::half_t;
3636
using CDataType = ck::half_t;
3737
using AccDataType = float;
3838
#else
39-
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
39+
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 128, 4, 4, 16, 16, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
4040
using ADataType = float;
4141
using BDataType = float;
4242
using CDataType = float;
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
185185
auto a_element_op = AElementOp{};
186186
auto b_element_op = BElementOp{};
187187
auto c_element_op = CElementOp{};
188-
189188
// do GEMM
190189
auto gemm = DeviceGemmInstance{};
191190
auto invoker = gemm.MakeInvoker();
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
209208
return 0;
210209
}
211210

212-
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
213-
211+
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
214212
std::size_t flop = std::size_t(2) * M * N * K;
215213
std::size_t num_btype =
216214
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ inline bool parse_cmd_args(int argc,
125125

126126
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
127127
problem_size = ck::utils::conv::parse_conv_param(
128-
num_dim_spatial, threshold_to_catch_partial_args, argv);
128+
num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
129129
}
130130
else
131131
{

example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
2323
static constexpr auto ConvSpec =
2424
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
2525

26-
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
26+
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
2727

2828
// clang-format off
2929
template <ck::index_t NDimSpatial>

example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayern
6565
//######| | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
6666
//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _M_N| _M_N| _M_N| _M|
6767
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
68-
< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType, AElementOp, BElementOp, CDEElementOp, HElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 8, S<8, 32>, 8>;
68+
< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType, AElementOp, BElementOp, CDEElementOp, HElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 4, S<8, 32>, 4>;
6969
// clang-format on
7070

7171
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
@@ -154,8 +154,8 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
154154

155155
int main()
156156
{
157-
// temp disable on gfx11 & gfx12
158-
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
157+
// temp disable on gfx11
158+
if(ck::is_gfx11_supported())
159159
{
160160
return 0;
161161
}

include/ck/ck.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#pragma once
55

66
#include "ck/config.h"
7+
#include <stdint.h>
78

89
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
910
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS

include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ struct BlockwiseGemmXdlops_pipeline_base
5454
static constexpr auto xdlops_gemm =
5555
XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, ComputeDataType, TransposeC>{};
5656

57+
using ComputeDataTypeBuf =
58+
conditional_t<std::is_same<ComputeDataType, ck::tf32_t>::value, float, ComputeDataType>;
59+
5760
static constexpr index_t AMmaKStride = KPack;
5861
static constexpr index_t BMmaKStride = KPack;
5962

@@ -376,7 +379,7 @@ struct BlockwiseGemmXdlops_pipeline_base
376379
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
377380

378381
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
379-
ComputeDataType,
382+
ComputeDataTypeBuf,
380383
decltype(a_block_desc_m0_m1_m2_k),
381384
decltype(a_thread_desc_),
382385
Sequence<1, 1, 1, KPack>,
@@ -386,7 +389,7 @@ struct BlockwiseGemmXdlops_pipeline_base
386389
A_K1>;
387390

388391
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
389-
ComputeDataType,
392+
ComputeDataTypeBuf,
390393
decltype(b_block_desc_n0_n1_n2_k),
391394
decltype(b_thread_desc_),
392395
Sequence<1, 1, 1, KPack>,

0 commit comments

Comments
 (0)