Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/ck_tile/core/tensor/tile_window.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,7 @@ struct tile_window_with_static_lengths
using ThreadBuf = thread_buffer<DataType, 2>;
auto buf = tensor_view.template get_vectorized_elements<ThreadBuf>(coord, 0);
auto value = buf.at(number<0>{}); // Extract first element from thread buffer
printf(" %s[%d,%d] = %f", label, i, j, static_cast<float>(value));
printf(" %s[%d,%d] = %f", label, i, j, type_convert<float>(value));
}
printf("\n");
}
Expand Down
6 changes: 6 additions & 0 deletions test/ck_tile/utility/print/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,9 @@ add_gtest_executable(test_print_coordinate_transform test_print_coordinate_trans
add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp)
add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp)
add_gtest_executable(test_print_basic_types test_print_basic_types.cpp)
add_gtest_executable(test_print_tile_window test_print_tile_window.cpp)

# Apply OCP FP8 flag for tile_window test to ensure host/device FP8 format consistency
if(CK_USE_OCP_FP8)
target_compile_options(test_print_tile_window PRIVATE -DCK_TILE_USE_OCP_FP8)
endif()
223 changes: 223 additions & 0 deletions test/ck_tile/utility/print/test_print_tile_window.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#include "test_print_common.hpp"
#include "ck_tile/core.hpp"
#include <hip/hip_runtime.h>

namespace ck_tile {

template <typename DataType>
__global__ void KernelPrintTileWindow(DataType* data, int M, int N)
{
using namespace ck_tile;

auto tv = make_naive_tensor_view<address_space_enum::global>(
data, make_tuple(M, N), make_tuple(N, 1));

constexpr auto window_lengths = make_tuple(number<2>{}, number<3>{});

// Create tile window with static lengths 2x3 with origin (0,0)
auto tw = make_tile_window(tv, window_lengths, make_multi_index(0, 0));

if(threadIdx.x == 0 && blockIdx.x == 0)
{
tw.template print_tile_window_range<DataType>(0, 2, 0, 3, "TW");
}
}

class PrintTileWindowTest : public PrintTest
{
protected:
void SetUp() override
{
// Initialize HIP
hipError_t err = hipSetDevice(0);
if(err != hipSuccess)
{
GTEST_SKIP() << "No GPU available for tile window test";
}
}

void TearDown() override {}

template <typename DataType>
std::string CaptureTileWindowPrintOutput(const std::vector<DataType>& host_data, int M, int N)
{
// Allocate device memory
DataType* device_data = nullptr;
size_t size_bytes = host_data.size() * sizeof(DataType);
hipError_t err = hipMalloc(&device_data, size_bytes);
if(err != hipSuccess)
{
ADD_FAILURE() << "Failed to allocate device memory: " << hipGetErrorString(err);
return "";
}

// Copy data to device
err = hipMemcpy(device_data, host_data.data(), size_bytes, hipMemcpyHostToDevice);
if(err != hipSuccess)
{
ADD_FAILURE() << "Failed to copy data to device: " << hipGetErrorString(err);
(void)hipFree(device_data);
return "";
}

// Capture stdout
testing::internal::CaptureStdout();

// Launch kernel
dim3 grid_dim(1, 1, 1);
dim3 block_dim(1, 1, 1);
hipLaunchKernelGGL(
KernelPrintTileWindow<DataType>, grid_dim, block_dim, 0, 0, device_data, M, N);

// Synchronize to ensure print output is captured
err = hipDeviceSynchronize();
if(err != hipSuccess)
{
ADD_FAILURE() << "Failed to synchronize device: " << hipGetErrorString(err);
testing::internal::GetCapturedStdout(); // Consume captured output
(void)hipFree(device_data);
return "";
}

// Get captured output
std::string output = testing::internal::GetCapturedStdout();

// Cleanup
err = hipFree(device_data);
if(err != hipSuccess)
{
ADD_FAILURE() << "Failed to free device memory: " << hipGetErrorString(err);
}

return output;
}
};

TEST_F(PrintTileWindowTest, PrintTileWindow2x3)
{
// Create a 4x4 tensor with values 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
const int M = 4, N = 4;
std::vector<float> host_data(M * N);
for(int i = 0; i < M * N; ++i)
{
host_data[i] = static_cast<float>(i);
}

std::string output = CaptureTileWindowPrintOutput(host_data, M, N);

// Expected output for a 2x3 window starting at (0,0) from a 4x4 tensor
// Values should be: [0,1,2] in first row, [4,5,6] in second row
std::string expected = "TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
" TW[0,0] = 0.000000 TW[0,1] = 1.000000 TW[0,2] = 2.000000\n"
" TW[1,0] = 4.000000 TW[1,1] = 5.000000 TW[1,2] = 6.000000\n"
"\n";

EXPECT_EQ(output, expected);
}

TEST_F(PrintTileWindowTest, PrintTileWindowScaledValues)
{
// Test with scaled values (multiples of 10)
const int M = 3, N = 3;
std::vector<float> host_data(M * N);
for(int i = 0; i < M * N; ++i)
{
host_data[i] = static_cast<float>(i * 10); // 0, 10, 20, 30, 40, 50, 60, 70, 80
}

std::string output = CaptureTileWindowPrintOutput(host_data, M, N);

// For a 2x3 window from this 3x3 tensor, we should get:
// [0, 10, 20] in first row, [30, 40, 50] in second row
std::string expected = "TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
" TW[0,0] = 0.000000 TW[0,1] = 10.000000 TW[0,2] = 20.000000\n"
" TW[1,0] = 30.000000 TW[1,1] = 40.000000 TW[1,2] = 50.000000\n"
"\n";

EXPECT_EQ(output, expected);
}

TEST_F(PrintTileWindowTest, PrintTileWindowFp8)
{
// Test with fp8_t data type
const int M = 4, N = 4;
std::vector<ck_tile::fp8_t> host_data(M * N);
for(int i = 0; i < M * N; ++i)
{
host_data[i] = ck_tile::fp8_t(static_cast<float>(i));
}

std::string output = CaptureTileWindowPrintOutput<ck_tile::fp8_t>(host_data, M, N);

// Expected output for a 2x3 window starting at (0,0) from a 4x4 tensor
// Values should be: [0, 1, 2] in first row, [4, 5, 6] in second row
// we type convert on host to match the function implementation
float val_00 = type_convert<float>(ck_tile::fp8_t(0.0f));
float val_01 = type_convert<float>(ck_tile::fp8_t(1.0f));
float val_02 = type_convert<float>(ck_tile::fp8_t(2.0f));
float val_10 = type_convert<float>(ck_tile::fp8_t(4.0f));
float val_11 = type_convert<float>(ck_tile::fp8_t(5.0f));
float val_12 = type_convert<float>(ck_tile::fp8_t(6.0f));

char expected_buf[512];
snprintf(expected_buf,
sizeof(expected_buf),
"TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
" TW[0,0] = %f TW[0,1] = %f TW[0,2] = %f\n"
" TW[1,0] = %f TW[1,1] = %f TW[1,2] = %f\n"
"\n",
val_00,
val_01,
val_02,
val_10,
val_11,
val_12);
std::string expected(expected_buf);

EXPECT_EQ(output, expected);
}

TEST_F(PrintTileWindowTest, PrintTileWindowBf8)
{
// Test with bf8_t data type
const int M = 3, N = 3;
std::vector<ck_tile::bf8_t> host_data(M * N);
for(int i = 0; i < M * N; ++i)
{
host_data[i] = ck_tile::bf8_t(static_cast<float>(i * 10));
}

std::string output = CaptureTileWindowPrintOutput<ck_tile::bf8_t>(host_data, M, N);

// Expected output for a 2x3 window starting at (0,0) from a 3x3 tensor
// Values should be: [0, 10, 20] in first row, [30, 40, 50] in second row
// we type convert on host to match the function implementation
float val_00 = type_convert<float>(ck_tile::bf8_t(0.0f));
float val_01 = type_convert<float>(ck_tile::bf8_t(10.0f));
float val_02 = type_convert<float>(ck_tile::bf8_t(20.0f));
float val_10 = type_convert<float>(ck_tile::bf8_t(30.0f));
float val_11 = type_convert<float>(ck_tile::bf8_t(40.0f));
float val_12 = type_convert<float>(ck_tile::bf8_t(50.0f));

char expected_buf[512];
snprintf(expected_buf,
sizeof(expected_buf),
"TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
" TW[0,0] = %f TW[0,1] = %f TW[0,2] = %f\n"
" TW[1,0] = %f TW[1,1] = %f TW[1,2] = %f\n"
"\n",
val_00,
val_01,
val_02,
val_10,
val_11,
val_12);
std::string expected(expected_buf);

EXPECT_EQ(output, expected);
}

} // namespace ck_tile