From f913242115ec21dd775b020c4ff3de84a39593a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 20 Oct 2025 19:39:01 +0200 Subject: [PATCH 01/10] bump required cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 063c728411..492a2d451e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ # torch_xpu_ops # -- Static archive library target -cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +cmake_minimum_required(VERSION 3.27 FATAL_ERROR) set(PROJECT_NAME "torch-xpu-ops") set(PROJECT_VERSION "2.3.0") From 11534a7e28139eec24fdd8c2405c27b1fe3d39bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 05:23:22 +0000 Subject: [PATCH 02/10] match torch version --- CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 492a2d451e..dfa0a12fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,6 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) -set(PROJECT_NAME "torch-xpu-ops") -set(PROJECT_VERSION "2.3.0") # Avoid SYCL compiler error if(NOT WIN32) string(APPEND CMAKE_CXX_FLAGS " -Wno-error") @@ -30,14 +28,12 @@ if(NOT WIN32) endif() endif() -cmake_policy(SET CMP0048 NEW) -project(${PROJECT_NAME} VERSION "${PROJECT_VERSION}" LANGUAGES C CXX) +project(torch-xpu-ops VERSION 2.10.0 LANGUAGES C CXX) set(TORCH_XPU_OPS_FOUND FALSE) - set(TORCH_XPU_OPS_ROOT ${PROJECT_SOURCE_DIR}) -list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules) +list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules) include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake) include(${TORCH_XPU_OPS_ROOT}/cmake/ONEMKL.cmake) include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake) From a055f23575b9ffe877971a5ace7c6ab13f7a7b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 06:03:14 +0000 Subject: [PATCH 03/10] Simplify install_xpu_headers macro --- src/ATen/CMakeLists.txt | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 493675e804..ebe3da5f3b 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -19,25 +19,25 @@ set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) # ATen XPU headers -macro(install_xpu_headers glob_pattern dest_subdir) - file(GLOB headers ${glob_pattern}) +macro(install_xpu_headers subdir) + file(GLOB headers CONFIGURE_DEPENDS "${subdir}/*.h") if(headers) - install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/${dest_subdir}") + install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/${subdir}") endif() endmacro() -install_xpu_headers("xpu/*.h" "ATen/xpu") -install_xpu_headers("native/xpu/*.h" "ATen/native/xpu") -install_xpu_headers("native/xpu/sycl/*.h" "ATen/native/xpu/sycl") -install_xpu_headers("native/xpu/mkl/*.h" "ATen/native/xpu/mkl") -install_xpu_headers("native/nested/xpu/*.h" "ATen/native/nested/xpu") -install_xpu_headers("native/nested/xpu/sycl/*.h" "ATen/native/nested/xpu/sycl") -install_xpu_headers("native/quantized/*.h" "ATen/native/quantized/xpu") -install_xpu_headers("native/quantized/sycl/*.h" "ATen/native/quantized/xpu/sycl") -install_xpu_headers("native/sparse/xpu/*.h" "ATen/native/sparse/xpu") -install_xpu_headers("native/sparse/xpu/sycl/*.h" "ATen/native/sparse/xpu/sycl") -install_xpu_headers("native/transformers/*.h" "ATen/native/transformers/xpu") 
-install_xpu_headers("native/transformers/sycl/*.h" "ATen/native/transformers/xpu/sycl") +install_xpu_headers("xpu") +install_xpu_headers("native/xpu") +install_xpu_headers("native/xpu/sycl") +install_xpu_headers("native/xpu/mkl") +install_xpu_headers("native/nested/xpu") +install_xpu_headers("native/nested/xpu/sycl") +install_xpu_headers("native/quantized/xpu") +install_xpu_headers("native/quantized/xpu/sycl") +install_xpu_headers("native/sparse/xpu") +install_xpu_headers("native/sparse/xpu/sycl") +install_xpu_headers("native/transformers/xpu") +install_xpu_headers("native/transformers/xpu/sycl") if(xpu_ops_generated_headers) install(FILES ${xpu_ops_generated_headers} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) From caab6f91611ea6293471768b507f5fefe5f26a20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 07:08:41 +0000 Subject: [PATCH 04/10] Align filename in logs --- CMakeLists.txt | 3 ++- cmake/Modules/FindSYCL.cmake | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dfa0a12fe7..04e21cceff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,9 +31,10 @@ endif() project(torch-xpu-ops VERSION 2.10.0 LANGUAGES C CXX) set(TORCH_XPU_OPS_FOUND FALSE) -set(TORCH_XPU_OPS_ROOT ${PROJECT_SOURCE_DIR}) +set(TORCH_XPU_OPS_ROOT ${PROJECT_SOURCE_DIR}) list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules) + include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake) include(${TORCH_XPU_OPS_ROOT}/cmake/ONEMKL.cmake) include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake) diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake index 86457ba362..e78bcbdfcf 100644 --- a/cmake/Modules/FindSYCL.cmake +++ b/cmake/Modules/FindSYCL.cmake @@ -107,7 +107,7 @@ macro(SYCL_INCLUDE_DEPENDENCIES dependency_file) if(SYCL_DEPEND_REGENERATE) set(SYCL_DEPEND ${dependency_file}) - file(WRITE ${dependency_file} "#FindCUDA.cmake generated file. Do not edit.\n") + file(WRITE ${dependency_file} "#FindSYCL.cmake generated file. 
Do not edit.\n") endif() endmacro() From 02a091b77ddf10d491edd2a834908111cfda73bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 07:21:29 +0000 Subject: [PATCH 05/10] Extract common libs for win/linux --- cmake/Modules/FindONEMKL.cmake | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cmake/Modules/FindONEMKL.cmake b/cmake/Modules/FindONEMKL.cmake index 67e801f55f..bc0e5d2485 100644 --- a/cmake/Modules/FindONEMKL.cmake +++ b/cmake/Modules/FindONEMKL.cmake @@ -44,17 +44,18 @@ find_file( if((ONEMKL_INCLUDE_DIR STREQUAL "ONEMKL_INCLUDE_DIR-NOTFOUND") OR(ONEMKL_LIB_DIR STREQUAL "ONEMKL_LIB_DIR-NOTFOUND")) - message(WARNING "oneMKL sdk is incomplete!!") + message(WARNING "oneMKL SDK is incomplete!!") return() endif() +set(MKL_LIB_NAMES "mkl_sycl_blas" "mkl_sycl_dft" "mkl_sycl_lapack" + "mkl_intel_lp64" "mkl_core") + if(WIN32) - set(MKL_LIB_NAMES "mkl_sycl_blas" "mkl_sycl_dft" "mkl_sycl_lapack" - "mkl_intel_lp64" "mkl_intel_thread" "mkl_core") + list(APPEND MKL_LIB_NAMES "mkl_intel_thread") list(TRANSFORM MKL_LIB_NAMES APPEND "_dll.lib") else() - set(MKL_LIB_NAMES "mkl_sycl_blas" "mkl_sycl_dft" "mkl_sycl_lapack" - "mkl_intel_lp64" "mkl_gnu_thread" "mkl_core") + list(APPEND MKL_LIB_NAMES "mkl_gnu_thread") list(TRANSFORM MKL_LIB_NAMES PREPEND "lib") list(TRANSFORM MKL_LIB_NAMES APPEND ".so") endif() From b573bd35205949b162fa832845022c111db27321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 07:34:45 +0000 Subject: [PATCH 06/10] Remove checks for old compilers (compared to 2025.2.1) --- src/BuildOnLinux.cmake | 2 +- src/BuildOnWindows.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake index 3cf18e008d..b87151c0c1 100644 --- a/src/BuildOnLinux.cmake +++ b/src/BuildOnLinux.cmake @@ -38,7 +38,7 @@ if(BUILD_SEPARATE_OPS) endforeach() # Working with the compilers which don't support device code compression, we have to split kernels # into multiple libraries to meet the bin size limitation. -elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205) +elseif(BUILD_SPLIT_KERNEL_LIB) setup_common_libraries() # Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others. set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake index bf067c8e70..ae5c88da19 100644 --- a/src/BuildOnWindows.cmake +++ b/src/BuildOnWindows.cmake @@ -43,7 +43,7 @@ if(BUILD_SEPARATE_OPS) list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) # Working with the compilers which don't support device code compression, we have to split kernels # into multiple libraries to meet the bin size limitation. -elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205) +elseif(BUILD_SPLIT_KERNEL_LIB) setup_common_libraries() # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. 
set(ATen_XPU_SYCL_BINARY_SRCS) From 638184f724229d5cb4e7179b10097ebf296d9f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 12:23:43 +0000 Subject: [PATCH 07/10] move MKL glob to MKL ifdef --- src/ATen/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index ebe3da5f3b..86e1a549ff 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,12 +1,12 @@ # ATen XPU sources file(GLOB xpu_cpp "xpu/*.cpp") -file(GLOB xpu_mkl "native/xpu/mkl/*.cpp") file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp" "native/sparse/xpu/sycl/*.cpp" "native/nested/xpu/sycl/*.cpp" "native/transformers/sycl/*.cpp" "native/quantized/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) if(USE_ONEMKL_XPU) + file(GLOB xpu_mkl "native/xpu/mkl/*.cpp") list(APPEND ATen_XPU_MKL_SRCS ${xpu_mkl}) endif() list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp}) From c65abec23ac6e84432b2e3c9f64f648571c1b125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 21 Oct 2025 12:43:33 +0000 Subject: [PATCH 08/10] Remove BUILD_SPLIT_KERNEL_LIB --- CMakeLists.txt | 2 +- src/BuildOnLinux.cmake | 81 ---------------- src/BuildOnWindows.cmake | 200 --------------------------------------- 3 files changed, 1 insertion(+), 282 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04e21cceff..8778d93338 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ set(BUILD_SEPARATE_OPS $ENV{BUILD_SEPARATE_OPS}) if(CMAKE_BUILD_TYPE MATCHES "(Debug|RelWithDebInfo)") set(BUILD_SEPARATE_OPS TRUE) endif() -set(BUILD_SPLIT_KERNEL_LIB $ENV{BUILD_SPLIT_KERNEL_LIB}) + add_subdirectory(${TORCH_XPU_OPS_ROOT}/src) set(TORCH_XPU_OPS_FOUND TRUE) diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake index b87151c0c1..fd01b26a3f 100644 --- a/src/BuildOnLinux.cmake +++ b/src/BuildOnLinux.cmake @@ -36,87 +36,6 @@ if(BUILD_SEPARATE_OPS) # Decouple with PyTorch cmake definition. install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") endforeach() -# Working with the compilers which don't support device code compression, we have to split kernels -# into multiple libraries to meet the bin size limitation. -elseif(BUILD_SPLIT_KERNEL_LIB) - setup_common_libraries() - # Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others. 
- set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) - set(ATen_XPU_SYCL_REDUCE_SRCS) - set(ATen_XPU_SYCL_FOREACH_SRCS) - set(ATen_XPU_SYCL_OTHERS_SRCS) - - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) - string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) - string(REGEX MATCH "Pow" IS_POW ${sycl_src}) - string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) - string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src}) - string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src}) - string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src}) - - if(NOT IS_FOREACH STREQUAL "") - list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src}) - elseif(NOT IS_REDUCE STREQUAL "") - list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src}) - elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "") - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "") - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - elseif(NOT IS_ACTIVATION STREQUAL "") - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - else() - list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src}) - endif() - endforeach() - - # Unary binary kernel lib - set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) - sycl_add_library( - ${sycl_unary_binary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Reduce kernel lib - set(sycl_reduce_lib torch_xpu_ops_sycl_reduce_kernels) - sycl_add_library( - ${sycl_reduce_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_REDUCE_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_reduce_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_reduce_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_reduce_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Foreach kernel lib - set(sycl_foreach_lib torch_xpu_ops_sycl_foreach_kernels) - sycl_add_library( - ${sycl_foreach_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_FOREACH_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_foreach_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_foreach_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Other kernel lib - set(sycl_lib torch_xpu_ops_sycl_kernels) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") else() sycl_add_library( torch_xpu_ops diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake index ae5c88da19..4005a2f3a6 100644 --- a/src/BuildOnWindows.cmake +++ b/src/BuildOnWindows.cmake @@ -39,206 +39,6 @@ if(BUILD_SEPARATE_OPS) # Decouple with PyTorch cmake definition. install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") endforeach() - list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) - list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) -# Working with the compilers which don't support device code compression, we have to split kernels -# into multiple libraries to meet the bin size limitation. 
-elseif(BUILD_SPLIT_KERNEL_LIB) - setup_common_libraries() - # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. - set(ATen_XPU_SYCL_BINARY_SRCS) - set(ATen_XPU_SYCL_UNARY_SRCS) - set(ATen_XPU_SYCL_REDUCE_SRCS) - set(ATen_XPU_SYCL_ACTIVATION_SRCS) - set(ATen_XPU_SYCL_FOREACH_SRCS) - set(ATen_XPU_SYCL_TENSOR_SRCS) - set(ATen_XPU_SYCL_NORM_LOSS_SRCS) - set(ATen_XPU_SYCL_POLY_SRCS) - set(ATen_XPU_SYCL_DISTRIBUTION_SRCS) - set(ATen_XPU_SYCL_OTHERS_SRCS) - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) - string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) - # Resolve cyclic dependences between - # torch_xpu_ops_sycl_unary_binary_kernels.dll and - # torch_xpu_ops_sycl_kernels.dll. Move definition and invoke of kernels - # into a same kernel library. Here we move elementwise kernel pow and copy - # into torch_xpu_ops_sycl_unary_binary_kernels.dll. - string(REGEX MATCH "Pow" IS_POW ${sycl_src}) - string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) - string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src}) - string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src}) - string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src}) - string(REGEX MATCH "Tensor" IS_TENSOR ${sycl_src}) - string(REGEX MATCH "Norm" IS_NORM ${sycl_src}) - string(REGEX MATCH "Loss" IS_LOSS ${sycl_src}) - string(REGEX MATCH "Polynomial" IS_POLY ${sycl_src}) - #Move resize kernel to Norm and Loss lib, to resolve symbol. - string(REGEX MATCH "Resize" IS_RESIZE ${sycl_src}) - string(REGEX MATCH "Distribution" IS_DISTRIBUTION ${sycl_src}) - - if(NOT IS_FOREACH STREQUAL "") - list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src}) - elseif(NOT IS_BINARY STREQUAL "") - list(APPEND ATen_XPU_SYCL_BINARY_SRCS ${sycl_src}) - elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "") - list(APPEND ATen_XPU_SYCL_UNARY_SRCS ${sycl_src}) - elseif(NOT IS_REDUCE STREQUAL "") - list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src}) - elseif(NOT IS_ACTIVATION STREQUAL "") - list(APPEND ATen_XPU_SYCL_ACTIVATION_SRCS ${sycl_src}) - elseif(NOT IS_TENSOR STREQUAL "") - list(APPEND ATen_XPU_SYCL_TENSOR_SRCS ${sycl_src}) - elseif(NOT IS_DISTRIBUTION STREQUAL "") - list(APPEND ATen_XPU_SYCL_DISTRIBUTION_SRCS ${sycl_src}) - elseif(NOT IS_NORM STREQUAL "" OR NOT IS_LOSS STREQUAL "" OR NOT IS_RESIZE STREQUAL "") - list(APPEND ATen_XPU_SYCL_NORM_LOSS_SRCS ${sycl_src}) - elseif(NOT IS_POLY STREQUAL "") - list(APPEND ATen_XPU_SYCL_POLY_SRCS ${sycl_src}) - else() - list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src}) - endif() - endforeach() - # Binary kernel lib - set(sycl_binary_lib torch_xpu_ops_sycl_binary_kernels) - sycl_add_library( - ${sycl_binary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_BINARY_SRCS}) - target_compile_definitions(${sycl_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_binary_lib}) - target_link_libraries(${sycl_binary_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_binary_lib}) - - # Decouple with PyTorch cmake definition. 
- install(TARGETS ${sycl_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Unary kernel lib - set(sycl_unary_lib torch_xpu_ops_sycl_unary_kernels) - sycl_add_library( - ${sycl_unary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_SRCS}) - target_compile_definitions(${sycl_unary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_lib}) - target_link_libraries(${sycl_unary_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_unary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Reduce kernel lib - set(sycl_reduce_lib torch_xpu_ops_sycl_reduce_kernels) - sycl_add_library( - ${sycl_reduce_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_REDUCE_SRCS}) - target_compile_definitions(${sycl_reduce_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_reduce_lib}) - target_link_libraries(${sycl_reduce_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_reduce_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_reduce_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Activation kernel lib - set(sycl_activation_lib torch_xpu_ops_sycl_activation_kernels) - sycl_add_library( - ${sycl_activation_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_ACTIVATION_SRCS}) - target_compile_definitions(${sycl_activation_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_activation_lib}) - target_link_libraries(${sycl_activation_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_activation_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_activation_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Foreach kernel lib - set(sycl_foreach_lib torch_xpu_ops_sycl_foreach_kernels) - sycl_add_library( - ${sycl_foreach_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_FOREACH_SRCS}) - target_compile_definitions(${sycl_foreach_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_foreach_lib}) - target_link_libraries(${sycl_foreach_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_foreach_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Tensor kernel lib - set(sycl_tensor_lib torch_xpu_ops_sycl_tensor_kernels) - sycl_add_library( - ${sycl_tensor_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_TENSOR_SRCS}) - target_compile_definitions(${sycl_tensor_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_tensor_lib}) - target_link_libraries(${sycl_tensor_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_tensor_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_tensor_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Norm and Loss kernel lib - set(sycl_norm_loss_lib torch_xpu_ops_sycl_norm_loss_kernels) - sycl_add_library( - ${sycl_norm_loss_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_NORM_LOSS_SRCS}) - target_compile_definitions(${sycl_norm_loss_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_norm_loss_lib}) - target_link_libraries(${sycl_norm_loss_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_norm_loss_lib}) - - # Decouple with PyTorch cmake definition. 
- install(TARGETS ${sycl_norm_loss_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Polynomial kernel lib - set(sycl_poly_lib torch_xpu_ops_sycl_poly_kernels) - sycl_add_library( - ${sycl_poly_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_POLY_SRCS}) - target_compile_definitions(${sycl_poly_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_poly_lib}) - target_link_libraries(${sycl_poly_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_poly_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_poly_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Distribution kernel lib - set(sycl_dist_lib torch_xpu_ops_sycl_dist_kernels) - sycl_add_library( - ${sycl_dist_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_DISTRIBUTION_SRCS}) - target_compile_definitions(${sycl_dist_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_dist_lib}) - target_link_libraries(${sycl_dist_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_dist_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_dist_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - # Other kernel lib - set(sycl_lib torch_xpu_ops_sycl_kernels) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS}) - target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) - target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) - target_link_libraries(${sycl_lib} PUBLIC torch_xpu) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) else() From 4c8ede205a5362556454027c2e9ddcb61b583f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Thu, 23 Oct 2025 05:29:20 +0000 Subject: [PATCH 09/10] Move part of diff to second PR --- src/BuildOnLinux.cmake | 81 ++++++++++++++++ src/BuildOnWindows.cmake | 200 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 281 insertions(+) diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake index fd01b26a3f..3cf18e008d 100644 --- a/src/BuildOnLinux.cmake +++ b/src/BuildOnLinux.cmake @@ -36,6 +36,87 @@ if(BUILD_SEPARATE_OPS) # Decouple with PyTorch cmake definition. install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") endforeach() +# Working with the compilers which don't support device code compression, we have to split kernels +# into multiple libraries to meet the bin size limitation. +elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205) + setup_common_libraries() + # Split SYCL kernels into 4 libraries as categories 1) Unary+Binary 2) Reduce 3) Foreach 4) Others. 
+ set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_REDUCE_SRCS) + set(ATen_XPU_SYCL_FOREACH_SRCS) + set(ATen_XPU_SYCL_OTHERS_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src}) + string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src}) + string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src}) + + if(NOT IS_FOREACH STREQUAL "") + list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src}) + elseif(NOT IS_REDUCE STREQUAL "") + list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src}) + elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_BINARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + elseif(NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "") + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + elseif(NOT IS_ACTIVATION STREQUAL "") + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src}) + endif() + endforeach() + + # Unary binary kernel lib + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Reduce kernel lib + set(sycl_reduce_lib torch_xpu_ops_sycl_reduce_kernels) + sycl_add_library( + ${sycl_reduce_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_REDUCE_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_reduce_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_reduce_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_reduce_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Foreach kernel lib + set(sycl_foreach_lib torch_xpu_ops_sycl_foreach_kernels) + sycl_add_library( + ${sycl_foreach_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_FOREACH_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_foreach_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_foreach_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Other kernel lib + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") else() sycl_add_library( torch_xpu_ops diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake index 4005a2f3a6..bf067c8e70 100644 --- a/src/BuildOnWindows.cmake +++ b/src/BuildOnWindows.cmake @@ -39,6 +39,206 @@ if(BUILD_SEPARATE_OPS) # Decouple with PyTorch cmake definition. install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") endforeach() + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) +# Working with the compilers which don't support device code compression, we have to split kernels +# into multiple libraries to meet the bin size limitation. 
+elseif(BUILD_SPLIT_KERNEL_LIB OR __INTEL_LLVM_COMPILER LESS 20250004 OR ICX_DATE LESS 20241205) + setup_common_libraries() + # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_BINARY_SRCS) + set(ATen_XPU_SYCL_UNARY_SRCS) + set(ATen_XPU_SYCL_REDUCE_SRCS) + set(ATen_XPU_SYCL_ACTIVATION_SRCS) + set(ATen_XPU_SYCL_FOREACH_SRCS) + set(ATen_XPU_SYCL_TENSOR_SRCS) + set(ATen_XPU_SYCL_NORM_LOSS_SRCS) + set(ATen_XPU_SYCL_POLY_SRCS) + set(ATen_XPU_SYCL_DISTRIBUTION_SRCS) + set(ATen_XPU_SYCL_OTHERS_SRCS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + # Resolve cyclic dependences between + # torch_xpu_ops_sycl_unary_binary_kernels.dll and + # torch_xpu_ops_sycl_kernels.dll. Move definition and invoke of kernels + # into a same kernel library. Here we move elementwise kernel pow and copy + # into torch_xpu_ops_sycl_unary_binary_kernels.dll. + string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src}) + string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src}) + string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src}) + string(REGEX MATCH "Tensor" IS_TENSOR ${sycl_src}) + string(REGEX MATCH "Norm" IS_NORM ${sycl_src}) + string(REGEX MATCH "Loss" IS_LOSS ${sycl_src}) + string(REGEX MATCH "Polynomial" IS_POLY ${sycl_src}) + #Move resize kernel to Norm and Loss lib, to resolve symbol. + string(REGEX MATCH "Resize" IS_RESIZE ${sycl_src}) + string(REGEX MATCH "Distribution" IS_DISTRIBUTION ${sycl_src}) + + if(NOT IS_FOREACH STREQUAL "") + list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src}) + elseif(NOT IS_BINARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_BINARY_SRCS ${sycl_src}) + elseif(NOT IS_UNARY STREQUAL "" OR NOT IS_COPY STREQUAL "" OR NOT IS_POW STREQUAL "") + list(APPEND ATen_XPU_SYCL_UNARY_SRCS ${sycl_src}) + elseif(NOT IS_REDUCE STREQUAL "") + list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src}) + elseif(NOT IS_ACTIVATION STREQUAL "") + list(APPEND ATen_XPU_SYCL_ACTIVATION_SRCS ${sycl_src}) + elseif(NOT IS_TENSOR STREQUAL "") + list(APPEND ATen_XPU_SYCL_TENSOR_SRCS ${sycl_src}) + elseif(NOT IS_DISTRIBUTION STREQUAL "") + list(APPEND ATen_XPU_SYCL_DISTRIBUTION_SRCS ${sycl_src}) + elseif(NOT IS_NORM STREQUAL "" OR NOT IS_LOSS STREQUAL "" OR NOT IS_RESIZE STREQUAL "") + list(APPEND ATen_XPU_SYCL_NORM_LOSS_SRCS ${sycl_src}) + elseif(NOT IS_POLY STREQUAL "") + list(APPEND ATen_XPU_SYCL_POLY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src}) + endif() + endforeach() + # Binary kernel lib + set(sycl_binary_lib torch_xpu_ops_sycl_binary_kernels) + sycl_add_library( + ${sycl_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_BINARY_SRCS}) + target_compile_definitions(${sycl_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_binary_lib}) + target_link_libraries(${sycl_binary_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_binary_lib}) + + # Decouple with PyTorch cmake definition. 
+ install(TARGETS ${sycl_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Unary kernel lib + set(sycl_unary_lib torch_xpu_ops_sycl_unary_kernels) + sycl_add_library( + ${sycl_unary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_SRCS}) + target_compile_definitions(${sycl_unary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_lib}) + target_link_libraries(${sycl_unary_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_unary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Reduce kernel lib + set(sycl_reduce_lib torch_xpu_ops_sycl_reduce_kernels) + sycl_add_library( + ${sycl_reduce_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_REDUCE_SRCS}) + target_compile_definitions(${sycl_reduce_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_reduce_lib}) + target_link_libraries(${sycl_reduce_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_reduce_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_reduce_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Activation kernel lib + set(sycl_activation_lib torch_xpu_ops_sycl_activation_kernels) + sycl_add_library( + ${sycl_activation_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_ACTIVATION_SRCS}) + target_compile_definitions(${sycl_activation_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_activation_lib}) + target_link_libraries(${sycl_activation_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_activation_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_activation_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Foreach kernel lib + set(sycl_foreach_lib torch_xpu_ops_sycl_foreach_kernels) + sycl_add_library( + ${sycl_foreach_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_FOREACH_SRCS}) + target_compile_definitions(${sycl_foreach_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_foreach_lib}) + target_link_libraries(${sycl_foreach_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_foreach_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Tensor kernel lib + set(sycl_tensor_lib torch_xpu_ops_sycl_tensor_kernels) + sycl_add_library( + ${sycl_tensor_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_TENSOR_SRCS}) + target_compile_definitions(${sycl_tensor_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_tensor_lib}) + target_link_libraries(${sycl_tensor_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_tensor_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_tensor_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Norm and Loss kernel lib + set(sycl_norm_loss_lib torch_xpu_ops_sycl_norm_loss_kernels) + sycl_add_library( + ${sycl_norm_loss_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NORM_LOSS_SRCS}) + target_compile_definitions(${sycl_norm_loss_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_norm_loss_lib}) + target_link_libraries(${sycl_norm_loss_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_norm_loss_lib}) + + # Decouple with PyTorch cmake definition. 
+ install(TARGETS ${sycl_norm_loss_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Polynomial kernel lib + set(sycl_poly_lib torch_xpu_ops_sycl_poly_kernels) + sycl_add_library( + ${sycl_poly_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_POLY_SRCS}) + target_compile_definitions(${sycl_poly_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_poly_lib}) + target_link_libraries(${sycl_poly_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_poly_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_poly_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Distribution kernel lib + set(sycl_dist_lib torch_xpu_ops_sycl_dist_kernels) + sycl_add_library( + ${sycl_dist_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_DISTRIBUTION_SRCS}) + target_compile_definitions(${sycl_dist_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_dist_lib}) + target_link_libraries(${sycl_dist_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_dist_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_dist_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # Other kernel lib + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_OTHERS_SRCS}) + target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + target_link_libraries(${sycl_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) else() From e7555e462b13639865f39574633544df03b1417d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Thu, 23 Oct 2025 05:31:41 +0000 Subject: [PATCH 10/10] Restore missing line --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8778d93338..04e21cceff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ set(BUILD_SEPARATE_OPS $ENV{BUILD_SEPARATE_OPS}) if(CMAKE_BUILD_TYPE MATCHES "(Debug|RelWithDebInfo)") set(BUILD_SEPARATE_OPS TRUE) endif() - +set(BUILD_SPLIT_KERNEL_LIB $ENV{BUILD_SPLIT_KERNEL_LIB}) add_subdirectory(${TORCH_XPU_OPS_ROOT}/src) set(TORCH_XPU_OPS_FOUND TRUE)
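
Note for reviewers: the standalone CMake sketch below is not part of any patch in this series; it only restates, under made-up names and paths, three idioms the series relies on, so their intent is easier to check at a glance. It covers the single-argument install_xpu_headers macro from PATCH 03 (one subdirectory drives both the header glob and the install destination), the shared MKL_LIB_NAMES list from PATCH 05 (list(TRANSFORM) turns the common names into platform-specific file names), and the string(REGEX MATCH) bucketing of SYCL sources that PATCH 08 removes from and PATCH 09 restores to BuildOnLinux.cmake and BuildOnWindows.cmake. The project name, the AT_INSTALL_INCLUDE_DIR value and the example source file names are placeholders, not values taken from torch-xpu-ops.

# --- Standalone illustration (not part of the patches above); all names are placeholders. ---
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(xpu_ops_idioms LANGUAGES NONE)

set(AT_INSTALL_INCLUDE_DIR "include")   # placeholder install subdirectory

# PATCH 03 idiom: one subdir argument drives both the glob and the destination.
macro(install_xpu_headers subdir)
  file(GLOB headers CONFIGURE_DEPENDS "${subdir}/*.h")
  if(headers)
    install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/${subdir}")
  endif()
endmacro()

install_xpu_headers("native/xpu")       # installs to include/ATen/native/xpu, if any headers exist

# PATCH 05 idiom: keep the common oneMKL libraries in one list, append only the
# platform-specific thread library, then rewrite the names into real file names.
set(MKL_LIB_NAMES "mkl_sycl_blas" "mkl_sycl_dft" "mkl_sycl_lapack"
                  "mkl_intel_lp64" "mkl_core")
if(WIN32)
  list(APPEND MKL_LIB_NAMES "mkl_intel_thread")
  list(TRANSFORM MKL_LIB_NAMES APPEND "_dll.lib")   # e.g. mkl_core_dll.lib
else()
  list(APPEND MKL_LIB_NAMES "mkl_gnu_thread")
  list(TRANSFORM MKL_LIB_NAMES PREPEND "lib")
  list(TRANSFORM MKL_LIB_NAMES APPEND ".so")        # e.g. libmkl_core.so
endif()
message(STATUS "MKL library file names: ${MKL_LIB_NAMES}")

# PATCH 08/09 context: BuildOnLinux.cmake and BuildOnWindows.cmake bucket SYCL
# sources into kernel libraries by matching substrings of the file name.
# A reduced version of that classification, with made-up file names:
set(example_srcs "UnaryKernels.cpp" "ReduceOps.cpp" "ForeachBinaryOps.cpp" "Sort.cpp")
foreach(src ${example_srcs})
  string(REGEX MATCH "Foreach" IS_FOREACH ${src})
  string(REGEX MATCH "Reduce" IS_REDUCE ${src})
  if(NOT IS_FOREACH STREQUAL "")
    list(APPEND foreach_srcs ${src})    # ForeachBinaryOps.cpp lands here (Foreach is checked first)
  elseif(NOT IS_REDUCE STREQUAL "")
    list(APPEND reduce_srcs ${src})     # ReduceOps.cpp lands here
  else()
    list(APPEND other_srcs ${src})      # UnaryKernels.cpp and Sort.cpp land here in this reduced version
  endif()
endforeach()
message(STATUS "foreach: ${foreach_srcs} | reduce: ${reduce_srcs} | other: ${other_srcs}")

Configuring this sketch with cmake -S . -B build prints the per-platform MKL file names and the bucketed example sources; it is only meant to make the intent of the refactors easier to review, not to be built as part of the project.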