Skip to content

Commit 10feef9

Browse files
committed
add unit tests for shfl_up, shfl_down and shfl_xor methods
1 parent 09a947c commit 10feef9

File tree

3 files changed

+509
-0
lines changed

3 files changed

+509
-0
lines changed

Diff for: test/unit/warp/src/ShflDown.cpp

+180
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
/* Copyright 2023 Aurora Perego
2+
*
3+
* This file is part of Alpaka.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this
7+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8+
*/
9+
10+
#include <alpaka/math/FloatEqualExact.hpp>
11+
#include <alpaka/test/KernelExecutionFixture.hpp>
12+
#include <alpaka/test/acc/TestAccs.hpp>
13+
#include <alpaka/test/queue/Queue.hpp>
14+
#include <alpaka/warp/Traits.hpp>
15+
16+
#include <catch2/catch_template_test_macros.hpp>
17+
#include <catch2/catch_test_macros.hpp>
18+
19+
#include <cstdint>
20+
#include <limits>
21+
22+
#if BOOST_COMP_GNUC
23+
# pragma GCC diagnostic push
24+
# pragma GCC diagnostic ignored "-Wstrict-overflow"
25+
#endif
26+
27+
struct ShflDownSingleThreadWarpTestKernel
28+
{
29+
ALPAKA_NO_HOST_ACC_WARNING
30+
template<typename TAcc>
31+
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
32+
{
33+
if constexpr(alpaka::Dim<TAcc>::value > 0)
34+
{
35+
ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1);
36+
ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42);
37+
}
38+
else
39+
{
40+
ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0, 1) == 42);
41+
}
42+
ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 12, 0) == 12);
43+
float ans = alpaka::warp::shfl_down(acc, 3.3f, 0);
44+
ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f));
45+
}
46+
};
47+
48+
template<std::uint32_t TWarpSize>
49+
struct ShflDownMultipleThreadWarpTestKernel
50+
{
51+
ALPAKA_NO_HOST_ACC_WARNING
52+
template<typename TAcc>
53+
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
54+
{
55+
auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
56+
auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
57+
std::int32_t const warpExtent = alpaka::warp::getSize(acc);
58+
// Test relies on having a single warp per thread block
59+
ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
60+
auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
61+
62+
ALPAKA_CHECK(*success, warpExtent > 1);
63+
64+
ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42);
65+
ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, threadIdxInWarp, 0) == threadIdxInWarp);
66+
ALPAKA_CHECK(
67+
*success,
68+
alpaka::warp::shfl_down(acc, threadIdxInWarp, 1)
69+
== (threadIdxInWarp + 1 < warpExtent ? threadIdxInWarp + 1 : threadIdxInWarp));
70+
auto const epsilon = std::numeric_limits<float>::epsilon();
71+
72+
// Test various widths
73+
for(int width = 1; width < warpExtent; width *= 2)
74+
{
75+
for(int idx = 0; idx < width; idx++)
76+
{
77+
int const off = width * (threadIdxInWarp / width);
78+
ALPAKA_CHECK(
79+
*success,
80+
alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx), width)
81+
== ((threadIdxInWarp + idx < (width + off)) ? threadIdxInWarp + idx : threadIdxInWarp));
82+
float const ans = alpaka::warp::shfl_down(
83+
acc,
84+
4.0f - float(threadIdxInWarp),
85+
static_cast<std::uint32_t>(idx),
86+
width);
87+
float const expect
88+
= ((threadIdxInWarp + idx < (width + off)) ? (4.0f - float(threadIdxInWarp + idx))
89+
: (4.0f - float(threadIdxInWarp)));
90+
ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
91+
}
92+
}
93+
94+
// Some threads quit the kernel to test that the warp operations
95+
// properly operate on the active threads only
96+
if(threadIdxInWarp >= warpExtent / 2)
97+
return;
98+
99+
for(int idx = 0; idx < warpExtent / 2; idx++)
100+
{
101+
auto const shfl = alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx));
102+
float const ans
103+
= alpaka::warp::shfl_down(acc, 4.0f - float(threadIdxInWarp), static_cast<std::uint32_t>(idx));
104+
float const expect
105+
= ((threadIdxInWarp + idx < warpExtent / 2) ? (4.0f - float(threadIdxInWarp + idx)) : 0);
106+
if(threadIdxInWarp + idx < warpExtent / 2)
107+
{
108+
ALPAKA_CHECK(*success, shfl == threadIdxInWarp + idx);
109+
ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
110+
}
111+
}
112+
}
113+
};
114+
115+
template<std::uint32_t TWarpSize, typename TAcc>
116+
struct alpaka::trait::WarpSize<ShflDownMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
117+
: std::integral_constant<std::uint32_t, TWarpSize>
118+
{
119+
};
120+
121+
// Runs the shfl_down kernels on the first device of every tested accelerator,
// once for each warp size the device reports.
TEMPLATE_LIST_TEST_CASE("shfl_down", "[warp]", alpaka::test::TestAccs)
{
    using Acc = TestType;
    using Dev = alpaka::Dev<Acc>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;

    auto const platform = alpaka::Platform<Acc>{};
    Dev const device(alpaka::getDevByIdx(platform, 0u));

    for(auto const warpSize : alpaka::getWarpSizes(device))
    {
        // Zero-dimensional accelerators and one-thread warps use the single-thread kernel.
        auto const singleThreadWarp = Dim::value == 0 || warpSize == 1;
        if(singleThreadWarp)
        {
            alpaka::test::KernelExecutionFixture<Acc> fixture(Vec::all(4));
            REQUIRE(fixture(ShflDownSingleThreadWarpTestKernel{}));
        }
        else
        {
            // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0
#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
            return;
#else
            using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;

            auto const blocksPerGrid = Vec::all(2);
            auto const elemsPerThread = Vec::ones();
            // One warp per thread block: the warp size goes into the first component,
            // all other components stay at 1.
            auto threadsPerBlock = Vec::ones();
            threadsPerBlock[0] = static_cast<Idx>(warpSize);

            auto workDiv = typename ExecutionFixture::WorkDiv{blocksPerGrid, threadsPerBlock, elemsPerThread};
            auto fixture = ExecutionFixture{workDiv};

            // The kernel carries the warp size as a template argument, so dispatch on
            // the runtime value. Sizes not listed here run no kernel.
            switch(warpSize)
            {
            case 4:
                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<4>{}));
                break;
            case 8:
                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<8>{}));
                break;
            case 16:
                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<16>{}));
                break;
            case 32:
                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<32>{}));
                break;
            case 64:
                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<64>{}));
                break;
            default:
                break;
            }
#endif
        }
    }
}
177+
178+
#if BOOST_COMP_GNUC
179+
# pragma GCC diagnostic pop
180+
#endif

Diff for: test/unit/warp/src/ShflUp.cpp

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/* Copyright 2023 Aurora Perego
2+
*
3+
* This file is part of Alpaka.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this
7+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8+
*/
9+
10+
#include <alpaka/math/FloatEqualExact.hpp>
11+
#include <alpaka/test/KernelExecutionFixture.hpp>
12+
#include <alpaka/test/acc/TestAccs.hpp>
13+
#include <alpaka/test/queue/Queue.hpp>
14+
#include <alpaka/warp/Traits.hpp>
15+
16+
#include <catch2/catch_template_test_macros.hpp>
17+
#include <catch2/catch_test_macros.hpp>
18+
19+
#include <cstdint>
20+
#include <limits>
21+
22+
struct ShflUpSingleThreadWarpTestKernel
23+
{
24+
ALPAKA_NO_HOST_ACC_WARNING
25+
template<typename TAcc>
26+
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
27+
{
28+
if constexpr(alpaka::Dim<TAcc>::value > 0)
29+
{
30+
ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1);
31+
ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42);
32+
}
33+
else
34+
{
35+
ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0, 1) == 42);
36+
}
37+
ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 12, 0) == 12);
38+
float ans = alpaka::warp::shfl_up(acc, 3.3f, 0);
39+
ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f));
40+
}
41+
};
42+
43+
template<std::uint32_t TWarpSize>
44+
struct ShflUpMultipleThreadWarpTestKernel
45+
{
46+
ALPAKA_NO_HOST_ACC_WARNING
47+
template<typename TAcc>
48+
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
49+
{
50+
auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
51+
auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
52+
std::int32_t const warpExtent = alpaka::warp::getSize(acc);
53+
// Test relies on having a single warp per thread block
54+
ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
55+
auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
56+
57+
ALPAKA_CHECK(*success, warpExtent > 1);
58+
59+
ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42);
60+
ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, threadIdxInWarp, 0) == threadIdxInWarp);
61+
ALPAKA_CHECK(
62+
*success,
63+
alpaka::warp::shfl_up(acc, threadIdxInWarp, 1)
64+
== (threadIdxInWarp - 1 >= 0 ? threadIdxInWarp - 1 : threadIdxInWarp));
65+
66+
auto const epsilon = std::numeric_limits<float>::epsilon();
67+
68+
// Test various widths
69+
for(int width = 1; width < warpExtent; width *= 2)
70+
{
71+
for(int idx = 0; idx < width; idx++)
72+
{
73+
int const off = width * (threadIdxInWarp / width);
74+
ALPAKA_CHECK(
75+
*success,
76+
alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx), width)
77+
== ((threadIdxInWarp - idx >= off) ? threadIdxInWarp - idx : threadIdxInWarp));
78+
float const ans = alpaka::warp::shfl_up(
79+
acc,
80+
4.0f - float(threadIdxInWarp),
81+
static_cast<std::uint32_t>(idx),
82+
width);
83+
float const expect
84+
= ((threadIdxInWarp - idx >= off) ? (4.0f - float(threadIdxInWarp - idx))
85+
: (4.0f - float(threadIdxInWarp)));
86+
ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
87+
}
88+
}
89+
90+
// Some threads quit the kernel to test that the warp operations
91+
// properly operate on the active threads only
92+
if(threadIdxInWarp >= warpExtent / 2)
93+
return;
94+
95+
for(int idx = 0; idx < warpExtent / 2; idx++)
96+
{
97+
ALPAKA_CHECK(
98+
*success,
99+
alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx))
100+
== ((threadIdxInWarp - idx >= 0) ? (threadIdxInWarp - idx) : threadIdxInWarp));
101+
float const ans
102+
= alpaka::warp::shfl_up(acc, 4.0f - float(threadIdxInWarp), static_cast<std::uint32_t>(idx));
103+
float const expect
104+
= ((threadIdxInWarp - idx >= 0) ? (4.0f - float(threadIdxInWarp - idx))
105+
: (4.0f - float(threadIdxInWarp)));
106+
ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
107+
}
108+
}
109+
};
110+
111+
template<std::uint32_t TWarpSize, typename TAcc>
112+
struct alpaka::trait::WarpSize<ShflUpMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
113+
: std::integral_constant<std::uint32_t, TWarpSize>
114+
{
115+
};
116+
117+
// Runs the shfl_up kernels on the first device of every tested accelerator,
// once for each warp size the device reports.
TEMPLATE_LIST_TEST_CASE("shfl_up", "[warp]", alpaka::test::TestAccs)
{
    using Acc = TestType;
    using Dev = alpaka::Dev<Acc>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;

    auto const platform = alpaka::Platform<Acc>{};
    Dev const device(alpaka::getDevByIdx(platform, 0u));

    for(auto const warpSize : alpaka::getWarpSizes(device))
    {
        // Zero-dimensional accelerators and one-thread warps use the single-thread kernel.
        auto const singleThreadWarp = Dim::value == 0 || warpSize == 1;
        if(singleThreadWarp)
        {
            alpaka::test::KernelExecutionFixture<Acc> fixture(Vec::all(4));
            REQUIRE(fixture(ShflUpSingleThreadWarpTestKernel{}));
        }
        else
        {
            // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0
#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
            return;
#else
            using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;

            auto const blocksPerGrid = Vec::all(2);
            auto const elemsPerThread = Vec::ones();
            // One warp per thread block: the warp size goes into the first component,
            // all other components stay at 1.
            auto threadsPerBlock = Vec::ones();
            threadsPerBlock[0] = static_cast<Idx>(warpSize);

            auto workDiv = typename ExecutionFixture::WorkDiv{blocksPerGrid, threadsPerBlock, elemsPerThread};
            auto fixture = ExecutionFixture{workDiv};

            // The kernel carries the warp size as a template argument, so dispatch on
            // the runtime value. Sizes not listed here run no kernel.
            switch(warpSize)
            {
            case 4:
                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<4>{}));
                break;
            case 8:
                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<8>{}));
                break;
            case 16:
                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<16>{}));
                break;
            case 32:
                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<32>{}));
                break;
            case 64:
                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<64>{}));
                break;
            default:
                break;
            }
#endif
        }
    }
}

0 commit comments

Comments
 (0)