From 1fb85e1cef9235fbdf7944475a09787558e0ebb2 Mon Sep 17 00:00:00 2001 From: AviralGoelAMD Date: Wed, 29 Oct 2025 18:41:47 +0000 Subject: [PATCH 1/3] fix: fix bug in print tile window when printing bf8/fp8 tiles --- include/ck_tile/core/tensor/tile_window.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index ad02bec8c1..cfa2420f2f 100644 --- a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -1076,7 +1076,7 @@ struct tile_window_with_static_lengths using ThreadBuf = thread_buffer; auto buf = tensor_view.template get_vectorized_elements(coord, 0); auto value = buf.at(number<0>{}); // Extract first element from thread buffer - printf(" %s[%d,%d] = %f", label, i, j, static_cast(value)); + printf(" %s[%d,%d] = %f", label, i, j, type_convert(value)); } printf("\n"); } From 44138cdc6cc6d4a1d4ce40d7b7735279792e1145 Mon Sep 17 00:00:00 2001 From: AviralGoelAMD Date: Thu, 30 Oct 2025 17:43:57 +0000 Subject: [PATCH 2/3] test(print_tile_window_range): add unit tests to maintain function integrity --- test/ck_tile/utility/print/CMakeLists.txt | 1 + .../utility/print/test_print_tile_window.cpp | 223 ++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 test/ck_tile/utility/print/test_print_tile_window.cpp diff --git a/test/ck_tile/utility/print/CMakeLists.txt b/test/ck_tile/utility/print/CMakeLists.txt index 5300dd20ca..63261a8780 100644 --- a/test/ck_tile/utility/print/CMakeLists.txt +++ b/test/ck_tile/utility/print/CMakeLists.txt @@ -6,3 +6,4 @@ add_gtest_executable(test_print_coordinate_transform test_print_coordinate_trans add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp) add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp) add_gtest_executable(test_print_basic_types test_print_basic_types.cpp) +add_gtest_executable(test_print_tile_window test_print_tile_window.cpp) diff --git a/test/ck_tile/utility/print/test_print_tile_window.cpp b/test/ck_tile/utility/print/test_print_tile_window.cpp new file mode 100644 index 0000000000..0921525734 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_tile_window.cpp @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core.hpp" +#include + +namespace ck_tile { + +template +__global__ void KernelPrintTileWindow(DataType* data, int M, int N) +{ + using namespace ck_tile; + + auto tv = make_naive_tensor_view( + data, make_tuple(M, N), make_tuple(N, 1)); + + constexpr auto window_lengths = make_tuple(number<2>{}, number<3>{}); + + // Create tile window with static lengths 2x3 with origin (0,0) + auto tw = make_tile_window(tv, window_lengths, make_multi_index(0, 0)); + + if(threadIdx.x == 0 && blockIdx.x == 0) + { + tw.template print_tile_window_range(0, 2, 0, 3, "TW"); + } +} + +class PrintTileWindowTest : public PrintTest +{ + protected: + void SetUp() override + { + // Initialize HIP + hipError_t err = hipSetDevice(0); + if(err != hipSuccess) + { + GTEST_SKIP() << "No GPU available for tile window test"; + } + } + + void TearDown() override {} + + template + std::string CaptureTileWindowPrintOutput(const std::vector& host_data, int M, int N) + { + // Allocate device memory + DataType* device_data = nullptr; + size_t size_bytes = host_data.size() * sizeof(DataType); + hipError_t err = hipMalloc(&device_data, size_bytes); + if(err != hipSuccess) + { + ADD_FAILURE() << "Failed to allocate device memory: " << hipGetErrorString(err); + return ""; + } + + // Copy data to device + err = hipMemcpy(device_data, host_data.data(), size_bytes, hipMemcpyHostToDevice); + if(err != hipSuccess) + { + ADD_FAILURE() << "Failed to copy data to device: " << hipGetErrorString(err); + (void)hipFree(device_data); + return ""; + } + + // Capture stdout + testing::internal::CaptureStdout(); + + // Launch kernel + dim3 grid_dim(1, 1, 1); + dim3 block_dim(1, 1, 1); + hipLaunchKernelGGL( + KernelPrintTileWindow, grid_dim, block_dim, 0, 0, device_data, M, N); + + // Synchronize to ensure print output is captured + err = hipDeviceSynchronize(); + if(err != hipSuccess) + { + ADD_FAILURE() << "Failed to synchronize device: " << hipGetErrorString(err); + testing::internal::GetCapturedStdout(); // Consume captured output + (void)hipFree(device_data); + return ""; + } + + // Get captured output + std::string output = testing::internal::GetCapturedStdout(); + + // Cleanup + err = hipFree(device_data); + if(err != hipSuccess) + { + ADD_FAILURE() << "Failed to free device memory: " << hipGetErrorString(err); + } + + return output; + } +}; + +TEST_F(PrintTileWindowTest, PrintTileWindow2x3) +{ + // Create a 4x4 tensor with values 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + const int M = 4, N = 4; + std::vector host_data(M * N); + for(int i = 0; i < M * N; ++i) + { + host_data[i] = static_cast(i); + } + + std::string output = CaptureTileWindowPrintOutput(host_data, M, N); + + // Expected output for a 2x3 window starting at (0,0) from a 4x4 tensor + // Values should be: [0,1,2] in first row, [4,5,6] in second row + std::string expected = "TW Window Range [0:1, 0:2] (origin: 0, 0):\n" + " TW[0,0] = 0.000000 TW[0,1] = 1.000000 TW[0,2] = 2.000000\n" + " TW[1,0] = 4.000000 TW[1,1] = 5.000000 TW[1,2] = 6.000000\n" + "\n"; + + EXPECT_EQ(output, expected); +} + +TEST_F(PrintTileWindowTest, PrintTileWindowScaledValues) +{ + // Test with scaled values (multiples of 10) + const int M = 3, N = 3; + std::vector host_data(M * N); + for(int i = 0; i < M * N; ++i) + { + host_data[i] = static_cast(i * 10); // 0, 10, 20, 30, 40, 50, 60, 70, 80 + } + + std::string output = CaptureTileWindowPrintOutput(host_data, M, N); + + // For a 2x3 window from this 3x3 tensor, we should get: + // [0, 10, 20] in first row, [30, 40, 50] in second row + std::string expected = "TW Window Range [0:1, 0:2] (origin: 0, 0):\n" + " TW[0,0] = 0.000000 TW[0,1] = 10.000000 TW[0,2] = 20.000000\n" + " TW[1,0] = 30.000000 TW[1,1] = 40.000000 TW[1,2] = 50.000000\n" + "\n"; + + EXPECT_EQ(output, expected); +} + +TEST_F(PrintTileWindowTest, PrintTileWindowFp8) +{ + // Test with fp8_t data type + const int M = 4, N = 4; + std::vector host_data(M * N); + for(int i = 0; i < M * N; ++i) + { + host_data[i] = ck_tile::fp8_t(static_cast(i)); + } + + std::string output = CaptureTileWindowPrintOutput(host_data, M, N); + + // Expected output for a 2x3 window starting at (0,0) from a 4x4 tensor + // Values should be: [0, 1, 2] in first row, [4, 5, 6] in second row + // we type convert on host to match the function implementation + float val_00 = type_convert(ck_tile::fp8_t(0.0f)); + float val_01 = type_convert(ck_tile::fp8_t(1.0f)); + float val_02 = type_convert(ck_tile::fp8_t(2.0f)); + float val_10 = type_convert(ck_tile::fp8_t(4.0f)); + float val_11 = type_convert(ck_tile::fp8_t(5.0f)); + float val_12 = type_convert(ck_tile::fp8_t(6.0f)); + + char expected_buf[512]; + snprintf(expected_buf, + sizeof(expected_buf), + "TW Window Range [0:1, 0:2] (origin: 0, 0):\n" + " TW[0,0] = %f TW[0,1] = %f TW[0,2] = %f\n" + " TW[1,0] = %f TW[1,1] = %f TW[1,2] = %f\n" + "\n", + val_00, + val_01, + val_02, + val_10, + val_11, + val_12); + std::string expected(expected_buf); + + EXPECT_EQ(output, expected); +} + +TEST_F(PrintTileWindowTest, PrintTileWindowBf8) +{ + // Test with bf8_t data type + const int M = 3, N = 3; + std::vector host_data(M * N); + for(int i = 0; i < M * N; ++i) + { + host_data[i] = ck_tile::bf8_t(static_cast(i * 10)); + } + + std::string output = CaptureTileWindowPrintOutput(host_data, M, N); + + // Expected output for a 2x3 window starting at (0,0) from a 3x3 tensor + // Values should be: [0, 10, 20] in first row, [30, 40, 50] in second row + // we type convert on host to match the function implementation + float val_00 = type_convert(ck_tile::bf8_t(0.0f)); + float val_01 = type_convert(ck_tile::bf8_t(10.0f)); + float val_02 = type_convert(ck_tile::bf8_t(20.0f)); + float val_10 = type_convert(ck_tile::bf8_t(30.0f)); + float val_11 = type_convert(ck_tile::bf8_t(40.0f)); + float val_12 = type_convert(ck_tile::bf8_t(50.0f)); + + char expected_buf[512]; + snprintf(expected_buf, + sizeof(expected_buf), + "TW Window Range [0:1, 0:2] (origin: 0, 0):\n" + " TW[0,0] = %f TW[0,1] = %f TW[0,2] = %f\n" + " TW[1,0] = %f TW[1,1] = %f TW[1,2] = %f\n" + "\n", + val_00, + val_01, + val_02, + val_10, + val_11, + val_12); + std::string expected(expected_buf); + + EXPECT_EQ(output, expected); +} + +} // namespace ck_tile From b4fec80bc9d53df0cf553637467b8a64ab844098 Mon Sep 17 00:00:00 2001 From: AviralGoelAMD Date: Thu, 30 Oct 2025 23:40:17 +0000 Subject: [PATCH 3/3] fix: fp8 numerical mismatch error on gfx950 by adding DCK_TILE_USE_OCP_FP8 --- test/ck_tile/utility/print/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/ck_tile/utility/print/CMakeLists.txt b/test/ck_tile/utility/print/CMakeLists.txt index 63261a8780..888f23b4c5 100644 --- a/test/ck_tile/utility/print/CMakeLists.txt +++ b/test/ck_tile/utility/print/CMakeLists.txt @@ -7,3 +7,8 @@ add_gtest_executable(test_print_static_encoding_pattern test_print_static_encodi add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp) add_gtest_executable(test_print_basic_types test_print_basic_types.cpp) add_gtest_executable(test_print_tile_window test_print_tile_window.cpp) + +# Apply OCP FP8 flag for tile_window test to ensure host/device FP8 format consistency +if(CK_USE_OCP_FP8) + target_compile_options(test_print_tile_window PRIVATE -DCK_TILE_USE_OCP_FP8) +endif()