Skip to content

Commit

Permalink
Hipify during build instead of before cmake config (#13333)
Browse files Browse the repository at this point in the history
### Description

Currently, hipify happens before cmake is configured and then cmake glob
the directories. This get rids of thoes customized python threading
logic and opt for build system itself to generate the files.

This also supersede the half baked branch
[sukha/hipify-with-cmake](https://github.com/microsoft/onnxruntime/tree/sukha/hipify-with-cmake)
  • Loading branch information
cloudhan authored Oct 21, 2022
1 parent bb16ee7 commit 928c9fc
Show file tree
Hide file tree
Showing 5 changed files with 253 additions and 321 deletions.
4 changes: 4 additions & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1871,6 +1871,10 @@ if (onnxruntime_USE_ROCM)
set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030")
endif()

if (NOT onnxruntime_HIPIFY_PERL)
set(onnxruntime_HIPIFY_PERL ${onnxruntime_ROCM_HOME}/hip/bin/hipify-perl)
endif()

# NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
find_package(HIP)
Expand Down
44 changes: 15 additions & 29 deletions cmake/onnxruntime_providers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -93,16 +93,6 @@ file(GLOB_RECURSE onnxruntime_rocm_contrib_ops_cu_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/contrib_ops/rocm/*.cuh"
)

file(GLOB_RECURSE onnxruntime_rocm_generated_contrib_ops_cc_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.h"
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cc"
)

file(GLOB_RECURSE onnxruntime_rocm_generated_contrib_ops_cu_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cu"
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cuh"
)

file(GLOB onnxruntime_providers_common_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/*.cc"
Expand Down Expand Up @@ -1276,6 +1266,7 @@ endif()

if (onnxruntime_USE_ROCM)
add_definitions(-DUSE_ROCM=1)
include(onnxruntime_rocm_hipify.cmake)

# Add search paths for default hip installation
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip ${onnxruntime_ROCM_HOME}/hcc ${onnxruntime_ROCM_HOME}/miopen ${onnxruntime_ROCM_HOME}/hiprand ${onnxruntime_ROCM_HOME}/rocrand)
Expand Down Expand Up @@ -1306,15 +1297,7 @@ if (onnxruntime_USE_ROCM)
"${ONNXRUNTIME_ROOT}/core/providers/rocm/*.cuh"
)

file(GLOB_RECURSE onnxruntime_providers_rocm_generated_cc_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.h"
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.cc"
)

file(GLOB_RECURSE onnxruntime_providers_rocm_generated_cu_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.cu"
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.cuh"
)
hipify("onnxruntime/core/providers" provider_excluded_files onnxruntime_providers_rocm_generated_cc_srcs onnxruntime_providers_rocm_generated_cu_srcs)

source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_rocm_cc_srcs} ${onnxruntime_providers_rocm_shared_srcs} ${onnxruntime_providers_rocm_cu_srcs})
set(onnxruntime_providers_rocm_src ${onnxruntime_providers_rocm_cc_srcs} ${onnxruntime_providers_rocm_shared_srcs} ${onnxruntime_providers_rocm_cu_srcs})
Expand All @@ -1327,6 +1310,9 @@ if (onnxruntime_USE_ROCM)
"${ONNXRUNTIME_ROOT}/contrib_ops/rocm/aten_ops/aten_op.cc"
)
endif()

hipify("onnxruntime/contrib_ops" contrib_ops_excluded_files onnxruntime_rocm_generated_contrib_ops_cc_srcs onnxruntime_rocm_generated_contrib_ops_cu_srcs)

# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_rocm_contrib_ops_cc_srcs} ${onnxruntime_rocm_contrib_ops_cu_srcs})
list(APPEND onnxruntime_providers_rocm_src ${onnxruntime_rocm_contrib_ops_cc_srcs} ${onnxruntime_rocm_contrib_ops_cu_srcs})
Expand All @@ -1344,15 +1330,7 @@ if (onnxruntime_USE_ROCM)
"${ORTTRAINING_SOURCE_DIR}/training_ops/rocm/*.cuh"
)

file(GLOB_RECURSE onnxruntime_rocm_generated_training_ops_cc_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.h"
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.cc"
)

file(GLOB_RECURSE onnxruntime_rocm_generated_training_ops_cu_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.cu"
"${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.cuh"
)
hipify("orttraining/orttraining/training_ops" training_ops_excluded_files onnxruntime_rocm_generated_training_ops_cc_srcs onnxruntime_rocm_generated_training_ops_cu_srcs)

# NCCL is not support in Windows build
if (WIN32 OR NOT onnxruntime_USE_NCCL)
Expand Down Expand Up @@ -1406,7 +1384,15 @@ if (onnxruntime_USE_ROCM)
endif()
endif()

add_dependencies(onnxruntime_providers_rocm onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
add_custom_target(generate_hipified_files DEPENDS
${onnxruntime_providers_rocm_generated_cc_srcs}
${onnxruntime_providers_rocm_generated_cu_srcs}
${onnxruntime_rocm_generated_contrib_ops_cc_srcs}
${onnxruntime_rocm_generated_contrib_ops_cu_srcs}
${onnxruntime_rocm_generated_training_ops_cc_srcs}
${onnxruntime_rocm_generated_training_ops_cu_srcs})

add_dependencies(onnxruntime_providers_rocm generate_hipified_files onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
target_link_libraries(onnxruntime_providers_rocm PRIVATE ${ONNXRUNTIME_ROCM_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS})
target_include_directories(onnxruntime_providers_rocm SYSTEM
PRIVATE
Expand Down
221 changes: 221 additions & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

find_package(Python3 COMPONENTS Interpreter REQUIRED)

# GLOB pattern of file to be excluded
set(contrib_ops_excluded_files
"bert/attention.cc"
"bert/attention.h"
"bert/attention_impl.cu"
"bert/attention_softmax.h"
"bert/embed_layer_norm.cc"
"bert/embed_layer_norm.h"
"bert/embed_layer_norm_impl.cu"
"bert/embed_layer_norm_impl.h"
"bert/fast_gelu_impl.cu"
"bert/fast_gelu_impl.h"
"bert/fast_gelu.cc"
"bert/fast_gelu.h"
"bert/skip_layer_norm.cc"
"bert/skip_layer_norm.h"
"bert/skip_layer_norm_impl.cu"
"bert/skip_layer_norm_impl.h"
"bert/tensorrt_fused_multihead_attention/*"
"bert/transformer_common.h"
"bert/transformer_common.cc"
"math/complex_mul.cc"
"math/complex_mul.h"
"math/complex_mul_impl.cu"
"math/complex_mul_impl.h"
"math/cufft_plan_cache.h"
"math/fft_ops.cc"
"math/fft_ops.h"
"math/fft_ops_impl.cu"
"math/fft_ops_impl.h"
"quantization/attention_quantization.cc"
"quantization/attention_quantization.h"
"quantization/attention_quantization_impl.cu"
"quantization/attention_quantization_impl.cuh"
"quantization/quantize_dequantize_linear.cc"
"quantization/qordered_ops/qordered_attention_impl.cu"
"quantization/qordered_ops/qordered_attention_impl.h"
"quantization/qordered_ops/qordered_attention_input_enum.h"
"quantization/qordered_ops/qordered_attention.cc"
"quantization/qordered_ops/qordered_attention.h"
"quantization/qordered_ops/qordered_common.cuh"
"quantization/qordered_ops/qordered_layer_norm.h"
"quantization/qordered_ops/qordered_layer_norm.cc"
"quantization/qordered_ops/qordered_layer_norm_impl.h"
"quantization/qordered_ops/qordered_layer_norm_impl.cu"
"quantization/qordered_ops/qordered_longformer_attention.cc"
"quantization/qordered_ops/qordered_longformer_attention.h"
"quantization/qordered_ops/qordered_matmul.h"
"quantization/qordered_ops/qordered_matmul.cc"
"quantization/qordered_ops/qordered_matmul_utils.h"
"quantization/qordered_ops/qordered_matmul_utils.cc"
"quantization/qordered_ops/qordered_qdq_impl.cu"
"quantization/qordered_ops/qordered_qdq_impl.h"
"quantization/qordered_ops/qordered_qdq.cc"
"quantization/qordered_ops/qordered_qdq.h"
"quantization/qordered_ops/qordered_unary_ops.h"
"quantization/qordered_ops/qordered_unary_ops.cc"
"quantization/qordered_ops/qordered_unary_ops_impl.h"
"quantization/qordered_ops/qordered_unary_ops_impl.cu"
"tensor/crop.cc"
"tensor/crop.h"
"tensor/crop_impl.cu"
"tensor/crop_impl.h"
"tensor/dynamicslice.cc"
"tensor/image_scaler.cc"
"tensor/image_scaler.h"
"tensor/image_scaler_impl.cu"
"tensor/image_scaler_impl.h"
"transformers/beam_search.cc"
"transformers/beam_search.h"
"transformers/generation_device_helper.cc"
"transformers/generation_device_helper.h"
"transformers/beam_search_impl.cu"
"transformers/beam_search_impl.h"
"transformers/greedy_search.cc"
"transformers/greedy_search.h"
"transformers/dump_cuda_tensor.cc"
"transformers/dump_cuda_tensor.h"
"conv_transpose_with_dynamic_pads.cc"
"conv_transpose_with_dynamic_pads.h"
"cuda_contrib_kernels.cc"
"cuda_contrib_kernels.h"
"inverse.cc"
"fused_conv.cc"
)

set(provider_excluded_files
"atomic/common.cuh"
"controlflow/if.cc"
"controlflow/if.h"
"controlflow/loop.cc"
"controlflow/loop.h"
"controlflow/scan.cc"
"controlflow/scan.h"
"cu_inc/common.cuh"
"math/einsum_utils/einsum_auxiliary_ops.cc"
"math/einsum_utils/einsum_auxiliary_ops.h"
"math/einsum_utils/einsum_auxiliary_ops_diagonal.cu"
"math/einsum_utils/einsum_auxiliary_ops_diagonal.h"
"math/einsum.cc"
"math/einsum.h"
"math/gemm.cc"
"math/matmul.cc"
"math/softmax_impl.cu"
"math/softmax_warpwise_impl.cuh"
"math/softmax_common.cc"
"math/softmax.cc"
"nn/conv.cc"
"nn/conv.h"
"nn/conv_transpose.cc"
"nn/conv_transpose.h"
"reduction/reduction_ops.cc"
"rnn/cudnn_rnn_base.cc"
"rnn/cudnn_rnn_base.h"
"rnn/gru.cc"
"rnn/gru.h"
"rnn/lstm.cc"
"rnn/lstm.h"
"rnn/rnn.cc"
"rnn/rnn.h"
"rnn/rnn_impl.cu"
"rnn/rnn_impl.h"
"shared_inc/cuda_call.h"
"shared_inc/fpgeneric.h"
"cuda_allocator.cc"
"cuda_allocator.h"
"cuda_call.cc"
"cuda_common.cc"
"cuda_common.h"
"cuda_execution_provider_info.cc"
"cuda_execution_provider_info.h"
"cuda_execution_provider.cc"
"cuda_execution_provider.h"
"cuda_memory_check.cc"
"cuda_memory_check.h"
"cuda_fence.cc"
"cuda_fence.h"
"cuda_fwd.h"
"cuda_kernel.h"
"cuda_pch.cc"
"cuda_pch.h"
"cuda_profiler.cc"
"cuda_profiler.h"
"cuda_provider_factory.cc"
"cuda_provider_factory.h"
"cuda_utils.cu"
"cudnn_common.cc"
"cudnn_common.h"
"fpgeneric.cu"
"gpu_data_transfer.cc"
"gpu_data_transfer.h"
"integer_gemm.cc"
)

set(training_ops_excluded_files
"activation/gelu_grad_impl_common.cuh" # uses custom tanh
"collective/adasum_kernels.cc"
"collective/adasum_kernels.h"
"math/div_grad.cc" # miopen API differs from cudnn, no double type support
"nn/batch_norm_grad.cc" # no double type support
"nn/batch_norm_grad.h" # miopen API differs from cudnn
"nn/batch_norm_internal.cc" # miopen API differs from cudnn, no double type support
"nn/batch_norm_internal.h" # miopen API differs from cudnn, no double type support
"nn/conv_grad.cc"
"nn/conv_grad.h"
"reduction/reduction_all.cc" # deterministic = true, ignore ctx setting
"reduction/reduction_ops.cc" # no double type support
"cuda_training_kernels.cc"
"cuda_training_kernels.h"
)


# cuda_dir must be relative to REPO_ROOT
function(hipify cuda_dir in_excluded_file_patterns out_generated_cc_files out_generated_cu_files)
set(hipify_tool ${REPO_ROOT}/tools/ci_build/amd_hipify.py)

file(GLOB_RECURSE srcs CONFIGURE_DEPENDS
"${REPO_ROOT}/${cuda_dir}/cuda/*.h"
"${REPO_ROOT}/${cuda_dir}/cuda/*.cc"
"${REPO_ROOT}/${cuda_dir}/cuda/*.cuh"
"${REPO_ROOT}/${cuda_dir}/cuda/*.cu"
)

# do exclusion
set(excluded_file_patterns ${${in_excluded_file_patterns}})
list(TRANSFORM excluded_file_patterns PREPEND "${REPO_ROOT}/${cuda_dir}/cuda/")
file(GLOB_RECURSE excluded_srcs CONFIGURE_DEPENDS ${excluded_file_patterns})
foreach(f ${excluded_srcs})
message(STATUS "Excluded from hipify: ${f}")
endforeach()
list(REMOVE_ITEM srcs ${excluded_srcs})

foreach(f ${srcs})
file(RELATIVE_PATH cuda_f_rel "${REPO_ROOT}" ${f})
string(REPLACE "cuda" "rocm" rocm_f_rel ${cuda_f_rel})
set(f_out "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/${rocm_f_rel}")
add_custom_command(
OUTPUT ${f_out}
COMMAND Python3::Interpreter ${hipify_tool}
--hipify_perl ${onnxruntime_HIPIFY_PERL}
${f} -o ${f_out}
DEPENDS ${hipify_tool} ${f}
COMMENT "Hipify: ${cuda_f_rel} -> amdgpu/${rocm_f_rel}"
)
if(f MATCHES "\\..*cuh?")
list(APPEND generated_cu_files ${f_out})
else()
list(APPEND generated_cc_files ${f_out})
endif()
endforeach()

set_source_files_properties(generated_cc_files PROPERTIES GENERATED TRUE)
set_source_files_properties(generated_cu_files PROPERTIES GENERATED TRUE)
set(${out_generated_cc_files} ${generated_cc_files} PARENT_SCOPE)
set(${out_generated_cu_files} ${generated_cu_files} PARENT_SCOPE)
endfunction()
Loading

0 comments on commit 928c9fc

Please sign in to comment.