Skip to content

Commit 17df15a

Browse files
committed
PR1: Fixing allocation logic
1. Refactor output buffer allocation to use the allocation domain instead of the logical domain. 2. Fix the projection from the allocation domain to the logical domain on the special path where projection is not possible: we now compute the correct extent instead of returning the allocation buffer as-is. This allows the layout op to return a tensor with the correct logical size, while still allocating a buffer large enough to accommodate the padding requirement.
1 parent bf85c0b commit 17df15a

File tree

2 files changed

+40
-9
lines changed

2 files changed

+40
-9
lines changed

csrc/runtime/allocations.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -273,13 +273,29 @@ KernelArgumentHolder allocateOutputs(
273273
for (auto out_idx : arange(output_infos.size())) {
274274
auto out_info = output_infos.at(out_idx);
275275
if (output_alias_to_input_map.at(out_idx) == -1) {
276-
auto alloc_tensor = at::native::empty_strided_cuda(
277-
out_info.shape_info.logical_sizes,
278-
out_info.shape_info.logical_strides,
279-
out_info.type,
280-
c10::nullopt,
281-
device,
282-
c10::nullopt);
276+
at::Tensor alloc_tensor;
277+
if (!out_info.shape_info.allocation_sizes.empty()) {
278+
// allocate based on allocation size & stride and restride with logical
279+
// size & stride afterwards.
280+
alloc_tensor = at::native::empty_strided_cuda(
281+
out_info.shape_info.allocation_sizes,
282+
out_info.shape_info.allocation_strides,
283+
out_info.type,
284+
c10::nullopt,
285+
device,
286+
c10::nullopt);
287+
alloc_tensor = alloc_tensor.as_strided_(
288+
out_info.shape_info.logical_sizes,
289+
out_info.shape_info.logical_strides);
290+
} else {
291+
alloc_tensor = at::native::empty_strided_cuda(
292+
out_info.shape_info.logical_sizes,
293+
out_info.shape_info.logical_strides,
294+
out_info.type,
295+
c10::nullopt,
296+
device,
297+
c10::nullopt);
298+
}
283299
if (shouldFillAllocationWithNan()) {
284300
fillTensorWithNan(alloc_tensor);
285301
}
@@ -741,13 +757,22 @@ at::Tensor transformFromAllocationToLogical(
741757
.run(logical, alloc);
742758
NVF_ERROR(frontier.size() == logical.size());
743759

744-
// give up on producing right shape/stride when allocation domain has
760+
// give up on producing right stride when allocation domain has
745761
// transformation that cannot be represented via permutation. This is
746762
// currently used by PreprocessGroupedMatmulInputSf, where output is padded.
747763
std::set<IterDomain*> frontier_set(frontier.begin(), frontier.end());
748764
std::set<IterDomain*> logical_set(logical.begin(), logical.end());
749765
if (frontier_set != logical_set) {
750-
return tensor;
766+
std::vector<int64_t> logical_sizes(logical.size(), 0);
767+
std::vector<int64_t> logical_strides(logical.size(), 0);
768+
int64_t cur_stride = 1;
769+
for (const auto&& [i, id] : enumerate(logical) | std::views::reverse) {
770+
int64_t cur_size = ee.evaluate(id->extent()).as<int64_t>();
771+
logical_sizes[i] = cur_size;
772+
logical_strides[i] = cur_stride;
773+
cur_stride *= cur_size;
774+
}
775+
return tensor.as_strided(logical_sizes, logical_strides);
751776
}
752777

753778
// Now that all affine transformations are handled, and frontiers should

tests/cpp/test_layout_op.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,14 @@ bool validateGroupedLayout(
2727
NVF_ERROR(BlockScalingFactorLayout::Block128x4 == layout);
2828
int num_group = expert_offsets.size(0) - 1;
2929

30+
// validate output logical shape
31+
EXPECT_EQ(out.sizes(), ref.sizes());
32+
3033
// take length of reference for un-padded k size.
3134
int k = ref.size(1);
35+
int padded_k = (k + 4 - 1) / 4 * 4;
36+
int padded_m = sf_offsets[num_group].item().to<int>();
37+
out.as_strided_({padded_m, padded_k}, {padded_k, 1});
3238

3339
// We validate each group individually
3440
for (int i = 0; i < num_group; ++i) {

0 commit comments

Comments
 (0)