make aoti_torch_empty_strided support creating incontiguous tensor (#15321)

pytorchbot · Gasoonjia · web-flow · commit efccca45f1dc · 2025-10-21T13:36:01.000-07:00
This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: #15228 by @Gasoonjia ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/gasoonjia/59/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/59/head Merge bot PR base: https://github.com/pytorch/executorch/tree/main Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/59/orig Differential Revision: [D84938258](https://our.internmc.facebook.com/intern/diff/D84938258/) @diff-train-skip-merge Co-authored-by: gasoonjia <gasoonjia@icloud.com> Co-authored-by: Gasoonjia <gasoonjia@meta.com>
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support incontiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against ExecuTorch core libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp
@@ -62,9 +95,10 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
+# CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -27,6 +27,25 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "tensor_maker",
+    srcs = [
+        "tensor/tensor_maker.cpp",
+    ],
+    headers = [
+        "tensor/tensor_maker.h",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    visibility = ["@EXECUTORCH_CLIENTS"],
+    deps = [
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/core/exec_aten:lib",
+        "//executorch/runtime/core/exec_aten/util:tensor_util",
+    ],
+)
+
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
@@ -52,8 +71,8 @@ runtime.cxx_library(
     compiler_flags = ["-Wno-global-constructors"],
     visibility = ["@EXECUTORCH_CLIENTS"],
     deps = [
+        ":tensor_maker",
         "//executorch/backends/aoti:common_shims",
-        "//executorch/extension/tensor:tensor",
         "//executorch/runtime/core:core",
         "//executorch/runtime/core/exec_aten:lib",
         "//executorch/runtime/platform:platform",
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
@@ -11,6 +11,7 @@
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
+#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>
 #include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
@@ -163,9 +164,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
 
   // Create ExecutorTorch tensor that wraps the existing memory
   // Note: We're NOT copying the data, just wrapping it
-  auto tensor = executorch::extension::from_blob(
-      data, // existing memory (don't copy!)
+  // Using CUDA-specific tensor maker that supports incontiguous tensors
+  auto tensor = make_tensor(
       sizes, // tensor dimensions
+      data, // existing memory (don't copy!)
+      {}, // dim_order (empty, will be auto-generated)
       strides, // tensor strides (allows different strides)
       dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType
   );
@@ -210,10 +213,6 @@ AOTITorchError aoti_torch_empty_strided(
 
   // This requires us to reserve CUDA memory and put it into a ETensor
   void* ptr;
-  int64_t numel = 1;
-  for (int64_t i = 0; i < ndim; i++) {
-    numel *= sizes_ptr[i];
-  }
 
   ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
 
@@ -223,7 +222,28 @@ AOTITorchError aoti_torch_empty_strided(
       InvalidArgument,
       "Invalid element size for dtype: %d",
       dtype);
-  int64_t nbytes = numel * element_size;
+
+  // Calculate storage size based on strides, matching PyTorch's behavior
+  // This is critical when sizes and strides don't match the expected contiguous
+  // layout Reference: PyTorch's computeStorageNbytes in EmptyTensor.cpp
+  int64_t storage_size = 1; // storage offset (0) + 1
+  for (int64_t i = 0; i < ndim; i++) {
+    if (sizes_ptr[i] == 0) {
+      storage_size = 0;
+      break;
+    }
+    // For each dimension, add stride[i] * (size[i] - 1)
+    // This gives us the maximum offset in that dimension
+    int64_t stride_i = (strides_ptr != nullptr) ? strides_ptr[i] : 1;
+    if (strides_ptr == nullptr) {
+      // Calculate contiguous stride if not provided
+      for (int64_t j = i + 1; j < ndim; j++) {
+        stride_i *= sizes_ptr[j];
+      }
+    }
+    storage_size += stride_i * (sizes_ptr[i] - 1);
+  }
+  int64_t nbytes = storage_size * element_size;
 
   if (device_type == static_cast<int32_t>(SupportedDevices::CUDA)) {
     ET_CUDA_CHECK_OR_RETURN_ERROR(
@@ -250,16 +270,20 @@ AOTITorchError aoti_torch_empty_strided(
   auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
 
   // ETensor creation with dynamic shape support for edge cases
-  auto tensor = executorch::extension::from_blob(
-      ptr, sizes, strides, dtype_to_scalar_type(dtype));
+  // Using CUDA-specific tensor maker that supports incontiguous tensors
+  auto tensor = make_tensor(
+      sizes,
+      ptr,
+      {}, // dim_order (empty, will be auto-generated)
+      strides,
+      dtype_to_scalar_type(dtype));
 
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
   *ret_new_tensor = tensor.get();
 
   // This tensor owns the memory it allocated, set reference count to 1
   memory_to_n_tensor[ptr] = 1;
-
   return Error::Ok;
 }
 
@@ -630,9 +654,11 @@ AOTITorchError aoti_torch__reinterpret_tensor(
 
   // Create new tensor view that reinterprets the same memory with different
   // shape/strides This creates a view, not a copy - the data pointer is shared
-  std::shared_ptr<Tensor> tensor = executorch::extension::from_blob(
-      data_ptr, // Reuse the same memory from source tensor
+  // Using CUDA-specific tensor maker that supports incontiguous tensors
+  std::shared_ptr<Tensor> tensor = make_tensor(
       sizes, // New sizes with explicit SizesType
+      data_ptr, // Reuse the same memory from source tensor
+      {}, // dim_order (empty, will be auto-generated)
       strides, // New strides with explicit StridesType
       dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting
   );
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -8,8 +8,8 @@
 
 #pragma once
 
-#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <cstdint>
 
 namespace executorch::backends::cuda {
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
@@ -12,6 +12,7 @@
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
 #include <executorch/backends/cuda/runtime/utils.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
@@ -278,30 +278,6 @@ TEST_F(AOTITorchEmptyStridedTest, LargeTensor) {
   EXPECT_EQ(tensor->size(2), 50);
 }
 
-// Test error handling with memory allocation failures
-TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) {
-  // Try to create a very large tensor that might cause allocation failure
-  // (This test may pass or fail depending on available memory)
-  std::vector<int64_t> huge_sizes = {10000, 10000, 100}; // ~38GB for float32
-  Tensor* tensor;
-
-  AOTITorchError error = aoti_torch_empty_strided(
-      huge_sizes.size(),
-      huge_sizes.data(),
-      nullptr,
-      6, // float32
-      1, // CUDA device
-      0, // device index
-      &tensor);
-
-  // Either succeed or fail with memory allocation error
-  if (error == Error::Ok) {
-    EXPECT_NE(tensor, nullptr);
-  } else {
-    EXPECT_EQ(error, Error::MemoryAllocationFailed);
-  }
-}
-
 // Test aoti_torch_empty_strided with bfloat16 dtype
 TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) {
   // Test creating bfloat16 tensor on CUDA
@@ -509,11 +485,11 @@ TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) {
   EXPECT_EQ(sizes_ptr[2], 3);
 }
 
-// Test different data types (only float32 is currently supported)
+// Test different data types (currently we support bf16, fp32 and int32)
 TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) {
   std::vector<int64_t> sizes = {2, 3};
 
-  // Test float32 (dtype 6) - currently the only supported type
+  // Test float32 (dtype 6) - one of the supported types
   Tensor* tensor_float32;
   AOTITorchError error = aoti_torch_empty_strided(
       sizes.size(),
@@ -527,7 +503,7 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) {
   EXPECT_EQ(error, Error::Ok);
   EXPECT_NE(tensor_float32, nullptr);
 
-  // Test unsupported data types should return error
+  // Test int32 (dtype 3) - one of the supported types
   Tensor* tensor_int32;
   error = aoti_torch_empty_strided(
       sizes.size(),
@@ -538,7 +514,8 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) {
       0, // device index
       &tensor_int32);
 
-  EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(tensor_int32, nullptr);
 
   // Test another unsupported data type
   Tensor* tensor_float64;
@@ -586,3 +563,105 @@ TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) {
   EXPECT_EQ(tensor_5d->size(3), 4);
   EXPECT_EQ(tensor_5d->size(4), 5);
 }
+
+// Test incontiguous tensor creation - transpose-like layout
+TEST_F(AOTITorchEmptyStridedTest, IncontiguousTransposeLayout) {
+  // Create a tensor with transpose-like strides (column-major)
+  // For a 3x4 tensor in column-major order, strides should be [1, 3]
+  // This means each row step is 1, and each column step is 3
+  std::vector<int64_t> sizes = {3, 4};
+  std::vector<int64_t> strides = {1, 3}; // Column-major (incontiguous)
+
+  Tensor* tensor;
+  AOTITorchError error = aoti_torch_empty_strided(
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CUDA),
+      0, // device index
+      &tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(tensor, nullptr);
+
+  // Verify tensor properties
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->size(1), 4);
+
+  // Verify the strides are what we specified
+  int64_t* strides_ptr;
+  EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok);
+  EXPECT_EQ(strides_ptr[0], 1); // Column-major stride for dimension 0
+  EXPECT_EQ(strides_ptr[1], 3); // Column-major stride for dimension 1
+
+  // Verify that memory was allocated correctly for incontiguous layout
+  // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] -
+  // 1) + 1 = 1 * (3 - 1) + 3 * (4 - 1) + 1 = 1 * 2 + 3 * 3 + 1 = 2 + 9 + 1 = 12
+  // elements Total bytes = 12 * 4 = 48 bytes (for float32)
+  EXPECT_EQ(tensor->numel(), 12); // numel is still 3*4=12 for logical shape
+
+  // The tensor should be accessible and writable
+  void* data_ptr = tensor->mutable_data_ptr();
+  EXPECT_NE(data_ptr, nullptr);
+
+  // Verify we can use CUDA to write to the memory
+  std::vector<float> test_data(12, 1.0f);
+  cudaError_t cuda_err = cudaMemcpy(
+      data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice);
+  EXPECT_EQ(cuda_err, cudaSuccess);
+}
+
+// Test incontiguous tensor creation - expanded/broadcasted stride pattern
+TEST_F(AOTITorchEmptyStridedTest, IncontiguousExpandedStrides) {
+  // Create a tensor with expanded strides (simulating broadcasting)
+  // A 2x3x4 tensor where the first dimension has stride 0 (expanded)
+  // This creates a tensor where the first dimension is "broadcasted"
+  std::vector<int64_t> sizes = {2, 3, 4};
+  std::vector<int64_t> strides = {0, 4, 1}; // First dimension has stride 0
+
+  Tensor* tensor;
+  AOTITorchError error = aoti_torch_empty_strided(
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CUDA),
+      0, // device index
+      &tensor);
+
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(tensor, nullptr);
+
+  // Verify tensor properties
+  EXPECT_EQ(tensor->dim(), 3);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 3);
+  EXPECT_EQ(tensor->size(2), 4);
+
+  // Verify the strides are what we specified
+  int64_t* strides_ptr;
+  EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok);
+  EXPECT_EQ(strides_ptr[0], 0); // Expanded dimension stride
+  EXPECT_EQ(strides_ptr[1], 4);
+  EXPECT_EQ(strides_ptr[2], 1);
+
+  // Verify that memory was allocated correctly for this incontiguous layout
+  // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] -
+  // 1) + stride[2] * (size[2] - 1) + 1 = 0 * (2 - 1) + 4 * (3 - 1) + 1 * (4 -
+  // 1) + 1 = 0 + 8 + 3 + 1 = 12 elements Note: numel() returns logical number
+  // of elements (2*3*4=24), not storage size
+  EXPECT_EQ(tensor->numel(), 24); // Logical numel is 2*3*4=24
+
+  // The tensor should be accessible and writable
+  void* data_ptr = tensor->mutable_data_ptr();
+  EXPECT_NE(data_ptr, nullptr);
+
+  // Verify we can use CUDA to write to the allocated memory
+  // We only need to allocate 12 elements (storage size), not 24
+  std::vector<float> test_data(12, 2.0f);
+  cudaError_t cuda_err = cudaMemcpy(
+      data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice);
+  EXPECT_EQ(cuda_err, cudaSuccess);
+}
diff --git a/backends/cuda/runtime/tensor/tensor_maker.cpp b/backends/cuda/runtime/tensor/tensor_maker.cpp
diff --git a/backends/cuda/runtime/tensor/tensor_maker.h b/backends/cuda/runtime/tensor/tensor_maker.h