Skip to content

Commit 45be741

Browse files
fix: fix bug in print tile window when printing bf8/fp8 tiles (#3120)
* fix: fix bug in print tile window when printing bf8/fp8 tiles * test(print_tile_window_range): add unit tests to maintain function integrity * fix: fp8 numerical mismatch error on gfx950 by adding DCK_TILE_USE_OCP_FP8
1 parent ab1a835 commit 45be741

File tree

3 files changed

+230
-1
lines changed

3 files changed

+230
-1
lines changed

include/ck_tile/core/tensor/tile_window.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1076,7 +1076,7 @@ struct tile_window_with_static_lengths
10761076
using ThreadBuf = thread_buffer<DataType, 2>;
10771077
auto buf = tensor_view.template get_vectorized_elements<ThreadBuf>(coord, 0);
10781078
auto value = buf.at(number<0>{}); // Extract first element from thread buffer
1079-
printf(" %s[%d,%d] = %f", label, i, j, static_cast<float>(value));
1079+
printf(" %s[%d,%d] = %f", label, i, j, type_convert<float>(value));
10801080
}
10811081
printf("\n");
10821082
}

test/ck_tile/utility/print/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,9 @@ add_gtest_executable(test_print_coordinate_transform test_print_coordinate_trans
66
add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp)
77
add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp)
88
add_gtest_executable(test_print_basic_types test_print_basic_types.cpp)
9+
add_gtest_executable(test_print_tile_window test_print_tile_window.cpp)
10+
11+
# Apply OCP FP8 flag for tile_window test to ensure host/device FP8 format consistency
12+
if(CK_USE_OCP_FP8)
13+
target_compile_options(test_print_tile_window PRIVATE -DCK_TILE_USE_OCP_FP8)
14+
endif()
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
// SPDX-License-Identifier: MIT
2+
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
3+
4+
#include "test_print_common.hpp"
5+
#include "ck_tile/core.hpp"
6+
#include <hip/hip_runtime.h>
7+
8+
namespace ck_tile {
9+
10+
template <typename DataType>
11+
__global__ void KernelPrintTileWindow(DataType* data, int M, int N)
12+
{
13+
using namespace ck_tile;
14+
15+
auto tv = make_naive_tensor_view<address_space_enum::global>(
16+
data, make_tuple(M, N), make_tuple(N, 1));
17+
18+
constexpr auto window_lengths = make_tuple(number<2>{}, number<3>{});
19+
20+
// Create tile window with static lengths 2x3 with origin (0,0)
21+
auto tw = make_tile_window(tv, window_lengths, make_multi_index(0, 0));
22+
23+
if(threadIdx.x == 0 && blockIdx.x == 0)
24+
{
25+
tw.template print_tile_window_range<DataType>(0, 2, 0, 3, "TW");
26+
}
27+
}
28+
29+
class PrintTileWindowTest : public PrintTest
30+
{
31+
protected:
32+
void SetUp() override
33+
{
34+
// Initialize HIP
35+
hipError_t err = hipSetDevice(0);
36+
if(err != hipSuccess)
37+
{
38+
GTEST_SKIP() << "No GPU available for tile window test";
39+
}
40+
}
41+
42+
void TearDown() override {}
43+
44+
template <typename DataType>
45+
std::string CaptureTileWindowPrintOutput(const std::vector<DataType>& host_data, int M, int N)
46+
{
47+
// Allocate device memory
48+
DataType* device_data = nullptr;
49+
size_t size_bytes = host_data.size() * sizeof(DataType);
50+
hipError_t err = hipMalloc(&device_data, size_bytes);
51+
if(err != hipSuccess)
52+
{
53+
ADD_FAILURE() << "Failed to allocate device memory: " << hipGetErrorString(err);
54+
return "";
55+
}
56+
57+
// Copy data to device
58+
err = hipMemcpy(device_data, host_data.data(), size_bytes, hipMemcpyHostToDevice);
59+
if(err != hipSuccess)
60+
{
61+
ADD_FAILURE() << "Failed to copy data to device: " << hipGetErrorString(err);
62+
(void)hipFree(device_data);
63+
return "";
64+
}
65+
66+
// Capture stdout
67+
testing::internal::CaptureStdout();
68+
69+
// Launch kernel
70+
dim3 grid_dim(1, 1, 1);
71+
dim3 block_dim(1, 1, 1);
72+
hipLaunchKernelGGL(
73+
KernelPrintTileWindow<DataType>, grid_dim, block_dim, 0, 0, device_data, M, N);
74+
75+
// Synchronize to ensure print output is captured
76+
err = hipDeviceSynchronize();
77+
if(err != hipSuccess)
78+
{
79+
ADD_FAILURE() << "Failed to synchronize device: " << hipGetErrorString(err);
80+
testing::internal::GetCapturedStdout(); // Consume captured output
81+
(void)hipFree(device_data);
82+
return "";
83+
}
84+
85+
// Get captured output
86+
std::string output = testing::internal::GetCapturedStdout();
87+
88+
// Cleanup
89+
err = hipFree(device_data);
90+
if(err != hipSuccess)
91+
{
92+
ADD_FAILURE() << "Failed to free device memory: " << hipGetErrorString(err);
93+
}
94+
95+
return output;
96+
}
97+
};
98+
99+
TEST_F(PrintTileWindowTest, PrintTileWindow2x3)
100+
{
101+
// Create a 4x4 tensor with values 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
102+
const int M = 4, N = 4;
103+
std::vector<float> host_data(M * N);
104+
for(int i = 0; i < M * N; ++i)
105+
{
106+
host_data[i] = static_cast<float>(i);
107+
}
108+
109+
std::string output = CaptureTileWindowPrintOutput(host_data, M, N);
110+
111+
// Expected output for a 2x3 window starting at (0,0) from a 4x4 tensor
112+
// Values should be: [0,1,2] in first row, [4,5,6] in second row
113+
std::string expected = "TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
114+
" TW[0,0] = 0.000000 TW[0,1] = 1.000000 TW[0,2] = 2.000000\n"
115+
" TW[1,0] = 4.000000 TW[1,1] = 5.000000 TW[1,2] = 6.000000\n"
116+
"\n";
117+
118+
EXPECT_EQ(output, expected);
119+
}
120+
121+
TEST_F(PrintTileWindowTest, PrintTileWindowScaledValues)
122+
{
123+
// Test with scaled values (multiples of 10)
124+
const int M = 3, N = 3;
125+
std::vector<float> host_data(M * N);
126+
for(int i = 0; i < M * N; ++i)
127+
{
128+
host_data[i] = static_cast<float>(i * 10); // 0, 10, 20, 30, 40, 50, 60, 70, 80
129+
}
130+
131+
std::string output = CaptureTileWindowPrintOutput(host_data, M, N);
132+
133+
// For a 2x3 window from this 3x3 tensor, we should get:
134+
// [0, 10, 20] in first row, [30, 40, 50] in second row
135+
std::string expected = "TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
136+
" TW[0,0] = 0.000000 TW[0,1] = 10.000000 TW[0,2] = 20.000000\n"
137+
" TW[1,0] = 30.000000 TW[1,1] = 40.000000 TW[1,2] = 50.000000\n"
138+
"\n";
139+
140+
EXPECT_EQ(output, expected);
141+
}
142+
143+
TEST_F(PrintTileWindowTest, PrintTileWindowFp8)
144+
{
145+
// Test with fp8_t data type
146+
const int M = 4, N = 4;
147+
std::vector<ck_tile::fp8_t> host_data(M * N);
148+
for(int i = 0; i < M * N; ++i)
149+
{
150+
host_data[i] = ck_tile::fp8_t(static_cast<float>(i));
151+
}
152+
153+
std::string output = CaptureTileWindowPrintOutput<ck_tile::fp8_t>(host_data, M, N);
154+
155+
// Expected output for a 2x3 window starting at (0,0) from a 4x4 tensor
156+
// Values should be: [0, 1, 2] in first row, [4, 5, 6] in second row
157+
// we type convert on host to match the function implementation
158+
float val_00 = type_convert<float>(ck_tile::fp8_t(0.0f));
159+
float val_01 = type_convert<float>(ck_tile::fp8_t(1.0f));
160+
float val_02 = type_convert<float>(ck_tile::fp8_t(2.0f));
161+
float val_10 = type_convert<float>(ck_tile::fp8_t(4.0f));
162+
float val_11 = type_convert<float>(ck_tile::fp8_t(5.0f));
163+
float val_12 = type_convert<float>(ck_tile::fp8_t(6.0f));
164+
165+
char expected_buf[512];
166+
snprintf(expected_buf,
167+
sizeof(expected_buf),
168+
"TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
169+
" TW[0,0] = %f TW[0,1] = %f TW[0,2] = %f\n"
170+
" TW[1,0] = %f TW[1,1] = %f TW[1,2] = %f\n"
171+
"\n",
172+
val_00,
173+
val_01,
174+
val_02,
175+
val_10,
176+
val_11,
177+
val_12);
178+
std::string expected(expected_buf);
179+
180+
EXPECT_EQ(output, expected);
181+
}
182+
183+
TEST_F(PrintTileWindowTest, PrintTileWindowBf8)
184+
{
185+
// Test with bf8_t data type
186+
const int M = 3, N = 3;
187+
std::vector<ck_tile::bf8_t> host_data(M * N);
188+
for(int i = 0; i < M * N; ++i)
189+
{
190+
host_data[i] = ck_tile::bf8_t(static_cast<float>(i * 10));
191+
}
192+
193+
std::string output = CaptureTileWindowPrintOutput<ck_tile::bf8_t>(host_data, M, N);
194+
195+
// Expected output for a 2x3 window starting at (0,0) from a 3x3 tensor
196+
// Values should be: [0, 10, 20] in first row, [30, 40, 50] in second row
197+
// we type convert on host to match the function implementation
198+
float val_00 = type_convert<float>(ck_tile::bf8_t(0.0f));
199+
float val_01 = type_convert<float>(ck_tile::bf8_t(10.0f));
200+
float val_02 = type_convert<float>(ck_tile::bf8_t(20.0f));
201+
float val_10 = type_convert<float>(ck_tile::bf8_t(30.0f));
202+
float val_11 = type_convert<float>(ck_tile::bf8_t(40.0f));
203+
float val_12 = type_convert<float>(ck_tile::bf8_t(50.0f));
204+
205+
char expected_buf[512];
206+
snprintf(expected_buf,
207+
sizeof(expected_buf),
208+
"TW Window Range [0:1, 0:2] (origin: 0, 0):\n"
209+
" TW[0,0] = %f TW[0,1] = %f TW[0,2] = %f\n"
210+
" TW[1,0] = %f TW[1,1] = %f TW[1,2] = %f\n"
211+
"\n",
212+
val_00,
213+
val_01,
214+
val_02,
215+
val_10,
216+
val_11,
217+
val_12);
218+
std::string expected(expected_buf);
219+
220+
EXPECT_EQ(output, expected);
221+
}
222+
223+
} // namespace ck_tile

0 commit comments

Comments
 (0)