Hipify during build instead of before cmake config (#13333)

### Description Currently, hipify happens before cmake is configured and then cmake glob the directories. This get rids of thoes customized python threading logic and opt for build system itself to generate the files. This also supersede the half baked branch [sukha/hipify-with-cmake](https://github.com/microsoft/onnxruntime/tree/sukha/hipify-with-cmake)
microsoft · Oct 21, 2022 · 928c9fc · 928c9fc
1 parent bb16ee7
commit 928c9fc
Show file tree

Hide file tree

Showing 5 changed files with 253 additions and 321 deletions.
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -1871,6 +1871,10 @@ if (onnxruntime_USE_ROCM)
     set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030")
   endif()
 
+  if (NOT onnxruntime_HIPIFY_PERL)
+    set(onnxruntime_HIPIFY_PERL ${onnxruntime_ROCM_HOME}/hip/bin/hipify-perl)
+  endif()
+
   # NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
   cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
   find_package(HIP)

diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
@@ -93,16 +93,6 @@ file(GLOB_RECURSE onnxruntime_rocm_contrib_ops_cu_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/contrib_ops/rocm/*.cuh"
 )
 
-file(GLOB_RECURSE onnxruntime_rocm_generated_contrib_ops_cc_srcs CONFIGURE_DEPENDS
-  "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.h"
-  "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cc"
-)
-
-file(GLOB_RECURSE onnxruntime_rocm_generated_contrib_ops_cu_srcs CONFIGURE_DEPENDS
-  "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cu"
-  "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cuh"
-)
-
 file(GLOB onnxruntime_providers_common_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/core/providers/*.h"
   "${ONNXRUNTIME_ROOT}/core/providers/*.cc"
@@ -1276,6 +1266,7 @@ endif()
 
 if (onnxruntime_USE_ROCM)
   add_definitions(-DUSE_ROCM=1)
+  include(onnxruntime_rocm_hipify.cmake)
 
   # Add search paths for default hip installation
   list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip ${onnxruntime_ROCM_HOME}/hcc ${onnxruntime_ROCM_HOME}/miopen ${onnxruntime_ROCM_HOME}/hiprand ${onnxruntime_ROCM_HOME}/rocrand)
@@ -1306,15 +1297,7 @@ if (onnxruntime_USE_ROCM)
     "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.cuh"
   )
 
-  file(GLOB_RECURSE onnxruntime_providers_rocm_generated_cc_srcs CONFIGURE_DEPENDS
-    "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.h"
-    "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.cc"
-  )
-
-  file(GLOB_RECURSE onnxruntime_providers_rocm_generated_cu_srcs CONFIGURE_DEPENDS
-    "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.cu"
-    "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/core/providers/rocm/*.cuh"
-  )
+  hipify("onnxruntime/core/providers" provider_excluded_files onnxruntime_providers_rocm_generated_cc_srcs onnxruntime_providers_rocm_generated_cu_srcs)
 
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_rocm_cc_srcs} ${onnxruntime_providers_rocm_shared_srcs} ${onnxruntime_providers_rocm_cu_srcs})
   set(onnxruntime_providers_rocm_src ${onnxruntime_providers_rocm_cc_srcs} ${onnxruntime_providers_rocm_shared_srcs} ${onnxruntime_providers_rocm_cu_srcs})
@@ -1327,6 +1310,9 @@ if (onnxruntime_USE_ROCM)
         "${ONNXRUNTIME_ROOT}/contrib_ops/rocm/aten_ops/aten_op.cc"
       )
     endif()
+
+    hipify("onnxruntime/contrib_ops" contrib_ops_excluded_files onnxruntime_rocm_generated_contrib_ops_cc_srcs onnxruntime_rocm_generated_contrib_ops_cu_srcs)
+
     # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
     source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_rocm_contrib_ops_cc_srcs} ${onnxruntime_rocm_contrib_ops_cu_srcs})
     list(APPEND onnxruntime_providers_rocm_src ${onnxruntime_rocm_contrib_ops_cc_srcs} ${onnxruntime_rocm_contrib_ops_cu_srcs})
@@ -1344,15 +1330,7 @@ if (onnxruntime_USE_ROCM)
       "${ORTTRAINING_SOURCE_DIR}/training_ops/rocm/*.cuh"
     )
 
-    file(GLOB_RECURSE onnxruntime_rocm_generated_training_ops_cc_srcs CONFIGURE_DEPENDS
-      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.h"
-      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.cc"
-    )
-
-    file(GLOB_RECURSE onnxruntime_rocm_generated_training_ops_cu_srcs CONFIGURE_DEPENDS
-      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.cu"
-      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/*.cuh"
-    )
+    hipify("orttraining/orttraining/training_ops" training_ops_excluded_files onnxruntime_rocm_generated_training_ops_cc_srcs onnxruntime_rocm_generated_training_ops_cu_srcs)
 
     # NCCL is not support in Windows build
     if (WIN32 OR NOT onnxruntime_USE_NCCL)
@@ -1406,7 +1384,15 @@ if (onnxruntime_USE_ROCM)
     endif()
   endif()
 
-  add_dependencies(onnxruntime_providers_rocm onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
+  add_custom_target(generate_hipified_files DEPENDS
+    ${onnxruntime_providers_rocm_generated_cc_srcs}
+    ${onnxruntime_providers_rocm_generated_cu_srcs}
+    ${onnxruntime_rocm_generated_contrib_ops_cc_srcs}
+    ${onnxruntime_rocm_generated_contrib_ops_cu_srcs}
+    ${onnxruntime_rocm_generated_training_ops_cc_srcs}
+    ${onnxruntime_rocm_generated_training_ops_cu_srcs})
+
+  add_dependencies(onnxruntime_providers_rocm generate_hipified_files onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_link_libraries(onnxruntime_providers_rocm PRIVATE ${ONNXRUNTIME_ROCM_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS})
   target_include_directories(onnxruntime_providers_rocm SYSTEM
     PRIVATE

diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
@@ -0,0 +1,221 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+find_package(Python3 COMPONENTS Interpreter REQUIRED)
+
+# GLOB pattern of file to be excluded
+set(contrib_ops_excluded_files
+  "bert/attention.cc"
+  "bert/attention.h"
+  "bert/attention_impl.cu"
+  "bert/attention_softmax.h"
+  "bert/embed_layer_norm.cc"
+  "bert/embed_layer_norm.h"
+  "bert/embed_layer_norm_impl.cu"
+  "bert/embed_layer_norm_impl.h"
+  "bert/fast_gelu_impl.cu"
+  "bert/fast_gelu_impl.h"
+  "bert/fast_gelu.cc"
+  "bert/fast_gelu.h"
+  "bert/skip_layer_norm.cc"
+  "bert/skip_layer_norm.h"
+  "bert/skip_layer_norm_impl.cu"
+  "bert/skip_layer_norm_impl.h"
+  "bert/tensorrt_fused_multihead_attention/*"
+  "bert/transformer_common.h"
+  "bert/transformer_common.cc"
+  "math/complex_mul.cc"
+  "math/complex_mul.h"
+  "math/complex_mul_impl.cu"
+  "math/complex_mul_impl.h"
+  "math/cufft_plan_cache.h"
+  "math/fft_ops.cc"
+  "math/fft_ops.h"
+  "math/fft_ops_impl.cu"
+  "math/fft_ops_impl.h"
+  "quantization/attention_quantization.cc"
+  "quantization/attention_quantization.h"
+  "quantization/attention_quantization_impl.cu"
+  "quantization/attention_quantization_impl.cuh"
+  "quantization/quantize_dequantize_linear.cc"
+  "quantization/qordered_ops/qordered_attention_impl.cu"
+  "quantization/qordered_ops/qordered_attention_impl.h"
+  "quantization/qordered_ops/qordered_attention_input_enum.h"
+  "quantization/qordered_ops/qordered_attention.cc"
+  "quantization/qordered_ops/qordered_attention.h"
+  "quantization/qordered_ops/qordered_common.cuh"
+  "quantization/qordered_ops/qordered_layer_norm.h"
+  "quantization/qordered_ops/qordered_layer_norm.cc"
+  "quantization/qordered_ops/qordered_layer_norm_impl.h"
+  "quantization/qordered_ops/qordered_layer_norm_impl.cu"
+  "quantization/qordered_ops/qordered_longformer_attention.cc"
+  "quantization/qordered_ops/qordered_longformer_attention.h"
+  "quantization/qordered_ops/qordered_matmul.h"
+  "quantization/qordered_ops/qordered_matmul.cc"
+  "quantization/qordered_ops/qordered_matmul_utils.h"
+  "quantization/qordered_ops/qordered_matmul_utils.cc"
+  "quantization/qordered_ops/qordered_qdq_impl.cu"
+  "quantization/qordered_ops/qordered_qdq_impl.h"
+  "quantization/qordered_ops/qordered_qdq.cc"
+  "quantization/qordered_ops/qordered_qdq.h"
+  "quantization/qordered_ops/qordered_unary_ops.h"
+  "quantization/qordered_ops/qordered_unary_ops.cc"
+  "quantization/qordered_ops/qordered_unary_ops_impl.h"
+  "quantization/qordered_ops/qordered_unary_ops_impl.cu"
+  "tensor/crop.cc"
+  "tensor/crop.h"
+  "tensor/crop_impl.cu"
+  "tensor/crop_impl.h"
+  "tensor/dynamicslice.cc"
+  "tensor/image_scaler.cc"
+  "tensor/image_scaler.h"
+  "tensor/image_scaler_impl.cu"
+  "tensor/image_scaler_impl.h"
+  "transformers/beam_search.cc"
+  "transformers/beam_search.h"
+  "transformers/generation_device_helper.cc"
+  "transformers/generation_device_helper.h"
+  "transformers/beam_search_impl.cu"
+  "transformers/beam_search_impl.h"
+  "transformers/greedy_search.cc"
+  "transformers/greedy_search.h"
+  "transformers/dump_cuda_tensor.cc"
+  "transformers/dump_cuda_tensor.h"
+  "conv_transpose_with_dynamic_pads.cc"
+  "conv_transpose_with_dynamic_pads.h"
+  "cuda_contrib_kernels.cc"
+  "cuda_contrib_kernels.h"
+  "inverse.cc"
+  "fused_conv.cc"
+)
+
+set(provider_excluded_files
+  "atomic/common.cuh"
+  "controlflow/if.cc"
+  "controlflow/if.h"
+  "controlflow/loop.cc"
+  "controlflow/loop.h"
+  "controlflow/scan.cc"
+  "controlflow/scan.h"
+  "cu_inc/common.cuh"
+  "math/einsum_utils/einsum_auxiliary_ops.cc"
+  "math/einsum_utils/einsum_auxiliary_ops.h"
+  "math/einsum_utils/einsum_auxiliary_ops_diagonal.cu"
+  "math/einsum_utils/einsum_auxiliary_ops_diagonal.h"
+  "math/einsum.cc"
+  "math/einsum.h"
+  "math/gemm.cc"
+  "math/matmul.cc"
+  "math/softmax_impl.cu"
+  "math/softmax_warpwise_impl.cuh"
+  "math/softmax_common.cc"
+  "math/softmax.cc"
+  "nn/conv.cc"
+  "nn/conv.h"
+  "nn/conv_transpose.cc"
+  "nn/conv_transpose.h"
+  "reduction/reduction_ops.cc"
+  "rnn/cudnn_rnn_base.cc"
+  "rnn/cudnn_rnn_base.h"
+  "rnn/gru.cc"
+  "rnn/gru.h"
+  "rnn/lstm.cc"
+  "rnn/lstm.h"
+  "rnn/rnn.cc"
+  "rnn/rnn.h"
+  "rnn/rnn_impl.cu"
+  "rnn/rnn_impl.h"
+  "shared_inc/cuda_call.h"
+  "shared_inc/fpgeneric.h"
+  "cuda_allocator.cc"
+  "cuda_allocator.h"
+  "cuda_call.cc"
+  "cuda_common.cc"
+  "cuda_common.h"
+  "cuda_execution_provider_info.cc"
+  "cuda_execution_provider_info.h"
+  "cuda_execution_provider.cc"
+  "cuda_execution_provider.h"
+  "cuda_memory_check.cc"
+  "cuda_memory_check.h"
+  "cuda_fence.cc"
+  "cuda_fence.h"
+  "cuda_fwd.h"
+  "cuda_kernel.h"
+  "cuda_pch.cc"
+  "cuda_pch.h"
+  "cuda_profiler.cc"
+  "cuda_profiler.h"
+  "cuda_provider_factory.cc"
+  "cuda_provider_factory.h"
+  "cuda_utils.cu"
+  "cudnn_common.cc"
+  "cudnn_common.h"
+  "fpgeneric.cu"
+  "gpu_data_transfer.cc"
+  "gpu_data_transfer.h"
+  "integer_gemm.cc"
+)
+
+set(training_ops_excluded_files
+  "activation/gelu_grad_impl_common.cuh"  # uses custom tanh
+  "collective/adasum_kernels.cc"
+  "collective/adasum_kernels.h"
+  "math/div_grad.cc"  # miopen API differs from cudnn, no double type support
+  "nn/batch_norm_grad.cc"  # no double type support
+  "nn/batch_norm_grad.h"  # miopen API differs from cudnn
+  "nn/batch_norm_internal.cc"  # miopen API differs from cudnn, no double type support
+  "nn/batch_norm_internal.h"  # miopen API differs from cudnn, no double type support
+  "nn/conv_grad.cc"
+  "nn/conv_grad.h"
+  "reduction/reduction_all.cc"  # deterministic = true, ignore ctx setting
+  "reduction/reduction_ops.cc"  # no double type support
+  "cuda_training_kernels.cc"
+  "cuda_training_kernels.h"
+)
+
+
+# cuda_dir must be relative to REPO_ROOT
+function(hipify cuda_dir in_excluded_file_patterns out_generated_cc_files out_generated_cu_files)
+  set(hipify_tool ${REPO_ROOT}/tools/ci_build/amd_hipify.py)
+
+  file(GLOB_RECURSE srcs CONFIGURE_DEPENDS
+    "${REPO_ROOT}/${cuda_dir}/cuda/*.h"
+    "${REPO_ROOT}/${cuda_dir}/cuda/*.cc"
+    "${REPO_ROOT}/${cuda_dir}/cuda/*.cuh"
+    "${REPO_ROOT}/${cuda_dir}/cuda/*.cu"
+  )
+
+  # do exclusion
+  set(excluded_file_patterns ${${in_excluded_file_patterns}})
+  list(TRANSFORM excluded_file_patterns PREPEND "${REPO_ROOT}/${cuda_dir}/cuda/")
+  file(GLOB_RECURSE excluded_srcs CONFIGURE_DEPENDS ${excluded_file_patterns})
+  foreach(f ${excluded_srcs})
+    message(STATUS "Excluded from hipify: ${f}")
+  endforeach()
+  list(REMOVE_ITEM srcs ${excluded_srcs})
+
+  foreach(f ${srcs})
+    file(RELATIVE_PATH cuda_f_rel "${REPO_ROOT}" ${f})
+    string(REPLACE "cuda" "rocm" rocm_f_rel ${cuda_f_rel})
+    set(f_out "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/${rocm_f_rel}")
+    add_custom_command(
+      OUTPUT ${f_out}
+      COMMAND Python3::Interpreter ${hipify_tool}
+              --hipify_perl ${onnxruntime_HIPIFY_PERL}
+              ${f} -o ${f_out}
+      DEPENDS ${hipify_tool} ${f}
+      COMMENT "Hipify: ${cuda_f_rel} -> amdgpu/${rocm_f_rel}"
+    )
+    if(f MATCHES "\\..*cuh?")
+      list(APPEND generated_cu_files ${f_out})
+    else()
+      list(APPEND generated_cc_files ${f_out})
+    endif()
+  endforeach()
+
+  set_source_files_properties(generated_cc_files PROPERTIES GENERATED TRUE)
+  set_source_files_properties(generated_cu_files PROPERTIES GENERATED TRUE)
+  set(${out_generated_cc_files} ${generated_cc_files} PARENT_SCOPE)
+  set(${out_generated_cu_files} ${generated_cu_files} PARENT_SCOPE)
+endfunction()