diff --git a/binfo/core/038_aotriton.binfo b/binfo/core/038_aotriton.binfo index 8d9ab28..9a76985 100755 --- a/binfo/core/038_aotriton.binfo +++ b/binfo/core/038_aotriton.binfo @@ -47,9 +47,19 @@ BINFO_APP_PRE_CONFIG_CMD_ARRAY=( "./preconfig_rocm.sh" ) +# aotriton has been patched to check MAX_JOBS environment variable +# in aotriton v2src/CMakeLists.txt and use that for limiting the +# amount of python processes allowed to build and compress hsaco files. +# This fixes the out of memory problem in cases where computer has a lot of +# CPUs compared to amount of memory. +# Note that this fix only works when using Ninja. +# (cmake's limitation for add_custom_command's JOB_POOL option) +export MAX_JOBS=${BUILD_CPU_COUNT_DEFAULT} + BINFO_APP_CMAKE_CFG="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR_PREFIX_SDK_ROOT}" BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -DCMAKE_PREFIX_PATH=${INSTALL_DIR_PREFIX_SDK_ROOT}/lib64/cmake;${INSTALL_DIR_PREFIX_SDK_ROOT}/lib/cmake" BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} ${CFG_TEMP1}" +BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -GNinja" BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -DCMAKE_C_COMPILER=${SDK_C_COMPILER_HIPCC}" BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -DCMAKE_CXX_COMPILER=${SDK_CXX_COMPILER_HIPCC}" BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -DROCM_PATH=${INSTALL_DIR_PREFIX_SDK_ROOT}" @@ -65,3 +75,8 @@ BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -DAMDGCN_ENABLE_DUMP=1" # separate build needed to do a backend mode as if this is enabled, other part of build is skipped??? 
#BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} -DHIP_BACKEND_MODE=1" BINFO_APP_CMAKE_CFG="${BINFO_APP_CMAKE_CFG} ${BINFO_APP_SRC_DIR}" + +BINFO_APP_BUILD_CMD_ARRAY=( + "cd ${BINFO_APP_BUILD_DIR}" + "ninja" +) diff --git a/patches/rocm-6.1.2/aotriton/0001-pass-extra-build-options.patch b/patches/rocm-6.1.2/aotriton/0001-pass-extra-build-options.patch index 8bd4330..41bf30a 100644 --- a/patches/rocm-6.1.2/aotriton/0001-pass-extra-build-options.patch +++ b/patches/rocm-6.1.2/aotriton/0001-pass-extra-build-options.patch @@ -1,7 +1,7 @@ -From d9d6a93e0c5f131130bd8cae6bfd455b331bdf09 Mon Sep 17 00:00:00 2001 +From 0d30e41b904e1027e559ebd54640467bd0226163 Mon Sep 17 00:00:00 2001 From: Mika Laitio Date: Mon, 29 Jul 2024 00:01:32 -0700 -Subject: [PATCH 1/7] pass extra build options +Subject: [PATCH 1/8] pass extra build options Signed-off-by: Mika Laitio --- @@ -227,5 +227,5 @@ index ca7a4b5..5c7d2a2 100644 WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_PARENT_DIR}" ) -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0002-add-gpus-with-gfx-name-to-build-list.patch b/patches/rocm-6.1.2/aotriton/0002-add-gpus-with-gfx-name-to-build-list.patch index a910e0d..5e7a093 100644 --- a/patches/rocm-6.1.2/aotriton/0002-add-gpus-with-gfx-name-to-build-list.patch +++ b/patches/rocm-6.1.2/aotriton/0002-add-gpus-with-gfx-name-to-build-list.patch @@ -1,7 +1,7 @@ -From 3199757072a6291dd3276e315084a6fc32f07529 Mon Sep 17 00:00:00 2001 +From 13cb55b7856490058906ea08610edad6e4e8bbda Mon Sep 17 00:00:00 2001 From: Mika Laitio Date: Mon, 29 Jul 2024 12:48:47 -0700 -Subject: [PATCH 2/7] add gpus with gfx-name to build list +Subject: [PATCH 2/8] add gpus with gfx-name to build list add all rocm sdk gpu's to build list and use the gfx* name for them instead @@ -155,5 +155,5 @@ index 2c47e1d..2a6128b 100644 GpuArch -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0003-changed-the-line-which-allocates-twice-the-number-of.patch 
b/patches/rocm-6.1.2/aotriton/0003-changed-the-line-which-allocates-twice-the-number-of.patch index fbef5fb..9efed26 100644 --- a/patches/rocm-6.1.2/aotriton/0003-changed-the-line-which-allocates-twice-the-number-of.patch +++ b/patches/rocm-6.1.2/aotriton/0003-changed-the-line-which-allocates-twice-the-number-of.patch @@ -1,7 +1,7 @@ -From d01cd7fdad1e40b47505ba683d0703336e6be985 Mon Sep 17 00:00:00 2001 +From 02b1281ee898665efe5d713d8550016e5bcd7488 Mon Sep 17 00:00:00 2001 From: mritunjaymusale Date: Sun, 2 Jun 2024 18:00:59 +0530 -Subject: [PATCH 3/7] changed the line which allocates twice the number of max +Subject: [PATCH 3/8] changed the line which allocates twice the number of max cpu threads to triton build Signed-off-by: mritunjaymusale @@ -23,5 +23,5 @@ index 390ee8b..4030cad 100644 if check_env_flag("TRITON_BUILD_WITH_CLANG_LLD"): -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0004-printout-aotriton-tuning-db-gpu-info.patch b/patches/rocm-6.1.2/aotriton/0004-printout-aotriton-tuning-db-gpu-info.patch index d48c97d..a5059a6 100644 --- a/patches/rocm-6.1.2/aotriton/0004-printout-aotriton-tuning-db-gpu-info.patch +++ b/patches/rocm-6.1.2/aotriton/0004-printout-aotriton-tuning-db-gpu-info.patch @@ -1,7 +1,7 @@ -From 020aceb11f576ffb00d1131320acbc163f9519b1 Mon Sep 17 00:00:00 2001 +From 3bbccabde1409bbd2334d08d80e4fcdb56e942fc Mon Sep 17 00:00:00 2001 From: Mika Laitio Date: Tue, 6 Aug 2024 17:15:39 -0700 -Subject: [PATCH 4/7] printout aotriton tuning db gpu info +Subject: [PATCH 4/8] printout aotriton tuning db gpu info printout information if tuning data was available in database for kernel when build for certain gpu @@ -34,5 +34,5 @@ index 14ef241..fd1dc59 100644 @property -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0005-add-gfx906-908-40-41-and-gfx1150-51.patch b/patches/rocm-6.1.2/aotriton/0005-add-gfx906-908-40-41-and-gfx1150-51.patch index 6bace27..0215cf0 100644 --- 
a/patches/rocm-6.1.2/aotriton/0005-add-gfx906-908-40-41-and-gfx1150-51.patch +++ b/patches/rocm-6.1.2/aotriton/0005-add-gfx906-908-40-41-and-gfx1150-51.patch @@ -1,7 +1,7 @@ -From cd140c9489c60d3e1aba275c363e33f14aa5e30d Mon Sep 17 00:00:00 2001 +From 74b1c111ca07cbc02791ff63b6cf4ef20db9b52e Mon Sep 17 00:00:00 2001 From: Mika Laitio Date: Tue, 22 Oct 2024 21:55:34 -0700 -Subject: [PATCH 5/7] add gfx906/908/40/41 and gfx1150/51 +Subject: [PATCH 5/8] add gfx906/908/40/41 and gfx1150/51 Signed-off-by: Mika Laitio --- @@ -156,5 +156,5 @@ index 2a6128b..e54614a 100644 GpuArch -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0006-separate-each-gpu-files-to-own-dir.patch b/patches/rocm-6.1.2/aotriton/0006-separate-each-gpu-files-to-own-dir.patch index 36caec3..a3fe060 100644 --- a/patches/rocm-6.1.2/aotriton/0006-separate-each-gpu-files-to-own-dir.patch +++ b/patches/rocm-6.1.2/aotriton/0006-separate-each-gpu-files-to-own-dir.patch @@ -1,7 +1,7 @@ -From a68395374e6c9d1b127e6cbaaa6321fee971a594 Mon Sep 17 00:00:00 2001 +From f30bb39e4a3ffe1aae842eaf35118b21e8ef05fe Mon Sep 17 00:00:00 2001 From: Mika Laitio Date: Tue, 5 Nov 2024 00:13:12 -0800 -Subject: [PATCH 6/7] separate each gpu files to own dir +Subject: [PATCH 6/8] separate each gpu files to own dir - should help to avoid errors on situations where there are too many files in single @@ -114,5 +114,5 @@ index ec1e39f..a085ea4 100644 #include #include -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0007-add-preconfig_rocm.sh-script.patch b/patches/rocm-6.1.2/aotriton/0007-add-preconfig_rocm.sh-script.patch index 4cddc9e..2a909db 100644 --- a/patches/rocm-6.1.2/aotriton/0007-add-preconfig_rocm.sh-script.patch +++ b/patches/rocm-6.1.2/aotriton/0007-add-preconfig_rocm.sh-script.patch @@ -1,7 +1,7 @@ -From d95a6c6f73c06cb5e9abbe9240f43b8e564bfc31 Mon Sep 17 00:00:00 2001 +From e544d6b10c482745934203cfc085474cb1847ea1 Mon Sep 17 00:00:00 2001 From: Mika Laitio Date: Thu, 12 Dec 2024 16:14:24 -0800 
-Subject: [PATCH 7/7] add preconfig_rocm.sh script +Subject: [PATCH 7/8] add preconfig_rocm.sh script Signed-off-by: Mika Laitio --- @@ -29,5 +29,5 @@ index 0000000..aa630ef + fi +fi -- -2.41.1 +2.43.0 diff --git a/patches/rocm-6.1.2/aotriton/0008-max-python-process-compile-count-for-hsaco-files.patch b/patches/rocm-6.1.2/aotriton/0008-max-python-process-compile-count-for-hsaco-files.patch new file mode 100644 index 0000000..6f56a83 --- /dev/null +++ b/patches/rocm-6.1.2/aotriton/0008-max-python-process-compile-count-for-hsaco-files.patch @@ -0,0 +1,88 @@ +From 375ce42eb83856885f933bb5214be85efd0a2a88 Mon Sep 17 00:00:00 2001 +From: Mika Laitio +Date: Wed, 18 Dec 2024 21:38:44 -0800 +Subject: [PATCH 8/8] max python process compile count for hsaco files + +use MAX_JOBS environment variable to +limit the amount of python processes to +build and compress hsaco files. + +Note that this will require that aotriton uses +ninja as a builder because cmake's add_custom_command +supports only Ninja for setting the process count. + +This solves the out of memory build problem in cases where +computer has a low amount of memory compared to the amount +of CPUs available. + +Fixes: https://github.com/lamikr/rocm_sdk_builder/issues/178 + +Signed-off-by: Mika Laitio +--- + v2src/CMakeLists.txt | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +diff --git a/v2src/CMakeLists.txt b/v2src/CMakeLists.txt +index 5c7d2a2..9509a27 100644 +--- a/v2src/CMakeLists.txt ++++ b/v2src/CMakeLists.txt +@@ -26,6 +26,17 @@ message("AOTRITON_COMPILER ${AOTRITON_COMPILER}") + # ) + # add_dependencies(aotriton_v2_gen_compile aotriton_venv_triton) + ++if(DEFINED ENV{MAX_JOBS}) ++ set(MAX_JOBS "$ENV{MAX_JOBS}") ++else() ++ cmake_host_system_information(RESULT MAX_JOBS QUERY NUMBER_OF_PHYSICAL_CORES) ++ if(MAX_JOBS LESS 2) # In case of failures. 
++ set(MAX_JOBS 2) ++ endif() ++endif() ++ ++set_property(GLOBAL PROPERTY JOB_POOLS MAX_JOB_CNT__HSACO=${MAX_JOBS}) ++ + if(AOTRITON_BUILD_FOR_TUNING) + set(GENERATE_OPTION "--build_for_tuning") + else(AOTRITON_BUILD_FOR_TUNING) +@@ -36,6 +47,7 @@ execute_process( + COMMAND_ECHO STDOUT + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_PARENT_DIR}" + ) ++ + message("Bare.compile: ${AOTRITON_V2_BUILD_DIR}/Bare.compile") + file(STRINGS "${AOTRITON_V2_BUILD_DIR}/Bare.compile" HSACO_RULES) + set(ALL_HSACOS "") +@@ -65,6 +77,7 @@ foreach(RULE IN LISTS HSACO_RULES) + "--timeout" "${AOTRITON_GPU_BUILD_TIMEOUT}" + COMMAND ${ZSTD_EXEC} "-q" "-f" ${HSACO} + DEPENDS aotriton_venv_triton ++ JOB_POOL MAX_JOB_CNT__HSACO + ) + list(APPEND ALL_HSACOS "${HSACO}.zst") + else(AOTRITON_COMPRESS_KERNEL) +@@ -82,21 +95,13 @@ foreach(RULE IN LISTS HSACO_RULES) + "--signature" "${SIG}" + "--timeout" "${AOTRITON_GPU_BUILD_TIMEOUT}" + DEPENDS aotriton_venv_triton ++ JOB_POOL MAX_JOB_CNT__HSACO + ) + list(APPEND ALL_HSACOS "${HSACO}") + endif(AOTRITON_COMPRESS_KERNEL) + # message("HSACO ${HSACO}") + endforeach(RULE) + +-if(DEFINED ENV{MAX_JOBS}) +- set(MAX_JOBS "$ENV{MAX_JOBS}") +-else() +- cmake_host_system_information(RESULT MAX_JOBS QUERY NUMBER_OF_PHYSICAL_CORES) +- if(MAX_JOBS LESS 2) # In case of failures. +- set(MAX_JOBS 2) +- endif() +-endif() +- + add_custom_target(aotriton_v2_compile ALL DEPENDS ${ALL_HSACOS}) + + # add_custom_target(aotriton_v2_compile +-- +2.43.0 +