Skip to content

Commit 73cb8ff

Browse files
authored
[SYCLomatic][Perf] Update to use const reference of sycl::nd_item when possible (#603)
Signed-off-by: Chen, Sheng S <[email protected]>
1 parent 7d7f0b9 commit 73cb8ff

File tree

83 files changed

+396
-351
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+396
-351
lines changed

clang/lib/DPCT/AnalysisInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3391,7 +3391,7 @@ MemVarMap::getItem<MemVarMap::DeclParameter>(ParameterStream &PS) const {
33913391
}
33923392

33933393
std::string ItemParamDecl =
3394-
MapNames::getClNamespace() + NDItem + " " + getItemName();
3394+
"const " + MapNames::getClNamespace() + NDItem + " &" + getItemName();
33953395
return PS << ItemParamDecl;
33963396
}
33973397

clang/runtime/dpct-rt/include/util.hpp.inc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ namespace experimental {
594594
/// a SYCL kernel can be scheduled actively at the same time on a device.
595595
template <int dimensions = 3>
596596
inline void
597-
nd_range_barrier(sycl::nd_item<dimensions> item,
597+
nd_range_barrier(const sycl::nd_item<dimensions> &item,
598598
sycl::atomic_ref<
599599
unsigned int, sycl::memory_order::seq_cst,
600600
sycl::memory_scope::device,
@@ -634,7 +634,7 @@ nd_range_barrier(sycl::nd_item<dimensions> item,
634634
/// a SYCL kernel can be scheduled actively at the same time on a device.
635635
template <>
636636
inline void
637-
nd_range_barrier(sycl::nd_item<1> item,
637+
nd_range_barrier(const sycl::nd_item<1> &item,
638638
sycl::atomic_ref<
639639
unsigned int, sycl::memory_order::seq_cst,
640640
sycl::memory_scope::device,

clang/test/dpct/a_vcxproj_test/a_kernel.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
#include "cuda_runtime.h"
1717
#include <stdio.h>
1818

19-
// CHECK: void addKernel(int *c, const int *a, const int *b, sycl::nd_item<3> item_ct1)
19+
// CHECK: void addKernel(int *c, const int *a, const int *b,
20+
// CHECK-NEXT: const sycl::nd_item<3> &item_ct1)
2021
__global__ void addKernel(int *c, const int *a, const int *b)
2122
{
2223
// CHECK: int i = item_ct1.get_local_id(2);

clang/test/dpct/atomic_functions.cu

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ int main() {
138138
InvokeKernel<double>();
139139
}
140140

141-
// CHECK: void foo(sycl::nd_item<3> item_ct1, uint8_t *dpct_local, uint32_t &share_v) {
141+
// CHECK: void foo(const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local,
142+
// CHECK-NEXT: uint32_t &share_v) {
142143
// CHECK-NEXT: auto share_array = (uint32_t *)dpct_local;
143144
// CHECK-NEXT: for (int b = item_ct1.get_local_id(2); b < 64; b += item_ct1.get_local_range(2)) {
144145
// CHECK-NEXT: dpct::atomic_fetch_add<uint32_t, sycl::access::address_space::generic_space>(&share_array[b], (uint32_t)1);
@@ -157,7 +158,8 @@ __shared__ uint32_t share_v;
157158
atomicAdd(&share_v, 1);
158159
}
159160

160-
// CHECK:void foo_2(sycl::nd_item<3> item_ct1, uint8_t *dpct_local, uint32_t &share_v) {
161+
// CHECK: void foo_2(const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local,
162+
// CHECK-NEXT: uint32_t &share_v) {
161163
// CHECK-NEXT: auto share_array = (uint32_t *)dpct_local;
162164
// CHECK-NEXT: for (int b = item_ct1.get_local_id(2); b < 64; b += item_ct1.get_local_range(2)) {
163165
// CHECK-NEXT: uint32_t *p_1 = &share_array[b];
@@ -451,7 +453,8 @@ __global__ void k() {
451453
atomicAdd(&f, f);
452454
}
453455

454-
// CHECK: void mykernel(unsigned int *dev, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) {
456+
// CHECK: void mykernel(unsigned int *dev, const sycl::nd_item<3> &item_ct1,
457+
// CHECK-NEXT: uint8_t *dpct_local) {
455458
// CHECK-NEXT: auto sm = (unsigned int *)dpct_local;
456459
// CHECK-NEXT: unsigned int* as= (unsigned int*)sm;
457460
// CHECK-NEXT: const int kc=item_ct1.get_local_id(2);
@@ -469,7 +472,8 @@ __global__ void mykernel(unsigned int *dev) {
469472
}
470473

471474
// CHECK: void mykernel_1(unsigned char *buffer, long size,
472-
// CHECK-NEXT: unsigned int *histo, sycl::nd_item<3> item_ct1,
475+
// CHECK-NEXT: unsigned int *histo,
476+
// CHECK-NEXT: const sycl::nd_item<3> &item_ct1,
473477
// CHECK-NEXT: unsigned int *temp) {
474478
// CHECK-EMPTY:
475479
// CHECK-NEXT: temp[item_ct1.get_local_id(2)] = 0;
@@ -546,7 +550,7 @@ __device__ void __gpu_sync(int blocks_to_synch) {
546550
while(g_mutex < blocks_to_synch);
547551
}
548552

549-
//CHECK:void atomicInc_foo(sycl::nd_item<3> item_ct1, uint8_t *dpct_local,
553+
//CHECK:void atomicInc_foo(const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local,
550554
//CHECK-NEXT: unsigned int &share_v) {
551555
//CHECK-NEXT: auto share_array = (unsigned int *)dpct_local;
552556
//CHECK-NEXT: for (int b = item_ct1.get_local_id(2); b < 64; b += item_ct1.get_local_range(2)) {

clang/test/dpct/atomic_functions_no_use_generic_space.cu

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ int main() {
138138
InvokeKernel<double>();
139139
}
140140

141-
// CHECK: void foo(sycl::nd_item<3> item_ct1, uint8_t *dpct_local, uint32_t &share_v) {
141+
// CHECK:void foo(const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local,
142+
// CHECK-NEXT: uint32_t &share_v) {
142143
// CHECK-NEXT: auto share_array = (uint32_t *)dpct_local;
143144
// CHECK-NEXT: for (int b = item_ct1.get_local_id(2); b < 64; b += item_ct1.get_local_range(2)) {
144145
// CHECK-NEXT: dpct::atomic_fetch_add<uint32_t, sycl::access::address_space::local_space>(&share_array[b], (uint32_t)1);
@@ -157,7 +158,8 @@ __shared__ uint32_t share_v;
157158
atomicAdd(&share_v, 1);
158159
}
159160

160-
// CHECK:void foo_2(sycl::nd_item<3> item_ct1, uint8_t *dpct_local, uint32_t &share_v) {
161+
// CHECK: void foo_2(const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local,
162+
// CHECK-NEXT: uint32_t &share_v) {
161163
// CHECK-NEXT: auto share_array = (uint32_t *)dpct_local;
162164
// CHECK-NEXT: for (int b = item_ct1.get_local_id(2); b < 64; b += item_ct1.get_local_range(2)) {
163165
// CHECK-NEXT: uint32_t *p_1 = &share_array[b];
@@ -457,7 +459,8 @@ __global__ void k() {
457459
atomicAdd(&f, f);
458460
}
459461

460-
// CHECK: void mykernel(unsigned int *dev, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) {
462+
// CHECK: void mykernel(unsigned int *dev, const sycl::nd_item<3> &item_ct1,
463+
// CHECK-NEXT: uint8_t *dpct_local) {
461464
// CHECK-NEXT: auto sm = (unsigned int *)dpct_local;
462465
// CHECK-NEXT: unsigned int* as= (unsigned int*)sm;
463466
// CHECK-NEXT: const int kc=item_ct1.get_local_id(2);
@@ -475,7 +478,8 @@ __global__ void mykernel(unsigned int *dev) {
475478
}
476479

477480
// CHECK: void mykernel_1(unsigned char *buffer, long size,
478-
// CHECK-NEXT: unsigned int *histo, sycl::nd_item<3> item_ct1,
481+
// CHECK-NEXT: unsigned int *histo,
482+
// CHECK-NEXT: const sycl::nd_item<3> &item_ct1,
479483
// CHECK-NEXT: unsigned int *temp) {
480484
// CHECK-EMPTY:
481485
// CHECK-NEXT: temp[item_ct1.get_local_id(2)] = 0;

clang/test/dpct/b_vcxproj_test/b_kernel.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323
#include "cuda_runtime.h"
2424
#include <stdio.h>
2525

26-
// CHECK: void addKernel(int *c, const int *a, const int *b, sycl::nd_item<3> item_ct1)
26+
// CHECK: void addKernel(int *c, const int *a, const int *b,
27+
// CHECK-NEXT: const sycl::nd_item<3> &item_ct1)
2728
__global__ void addKernel(int *c, const int *a, const int *b)
2829
{
2930
// CHECK: int i = item_ct1.get_local_id(2);

clang/test/dpct/builtin_warpSize.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ __global__ void foo(){
1717
int c= warpSize;
1818
}
1919

20-
// CHECK: void bar(sycl::nd_item<3> item_ct1){
20+
// CHECK: void bar(const sycl::nd_item<3> &item_ct1){
2121
// CHECK-NEXT: int a = sycl::max((int)item_ct1.get_sub_group().get_local_range().get(0), 0);
2222
// CHECK-NEXT: int warpSize = 1;
2323
// CHECK-NEXT: int b = sycl::max(warpSize, 0);
@@ -28,7 +28,7 @@ __global__ void bar(){
2828
int b = max(warpSize, 0);
2929
}
3030

31-
// CHECK: int tensorPos(const int ct, sycl::nd_item<3> item_ct1, int numLane = 0) {
31+
// CHECK: int tensorPos(const int ct, const sycl::nd_item<3> &item_ct1, int numLane = 0) {
3232
// CHECK-NEXT: if (!numLane) numLane = item_ct1.get_sub_group().get_local_range().get(0);
3333
// CHECK-NEXT: int r = ct * numLane;
3434
// CHECK-NEXT: return r;
@@ -38,17 +38,17 @@ __device__ int tensorPos(const int ct, const int numLane = warpSize) {
3838
return r;
3939
}
4040

41-
// CHECK: int tensorPos(const int ct, sycl::nd_item<3> item_ct1, int numLane);
41+
// CHECK: int tensorPos(const int ct, const sycl::nd_item<3> &item_ct1, int numLane);
4242
__device__ int tensorPos(const int ct, const int numLane);
4343

4444

4545

4646

4747

48-
// CHECK: int tensorPos2(const int ct, sycl::nd_item<3> item_ct1, int numLane);
48+
// CHECK: int tensorPos2(const int ct, const sycl::nd_item<3> &item_ct1, int numLane);
4949
__device__ int tensorPos2(const int ct, const int numLane);
5050

51-
// CHECK: int tensorPos2(const int ct, sycl::nd_item<3> item_ct1, int numLane) {
51+
// CHECK: int tensorPos2(const int ct, const sycl::nd_item<3> &item_ct1, int numLane) {
5252
// CHECK-NEXT: if (!numLane) numLane = item_ct1.get_sub_group().get_local_range().get(0);
5353
// CHECK-NEXT: int r = ct * numLane;
5454
// CHECK-NEXT: return r;
@@ -58,9 +58,9 @@ __device__ int tensorPos2(const int ct, const int numLane) {
5858
return r;
5959
}
6060

61-
// CHECK: int tensorPos2(const int ct, sycl::nd_item<3> item_ct1, int numLane = 0);
61+
// CHECK: int tensorPos2(const int ct, const sycl::nd_item<3> &item_ct1, int numLane = 0);
6262
__device__ int tensorPos2(const int ct, const int numLane = warpSize);
6363

6464

65-
// CHECK: int tensorPos3(const int ct, sycl::nd_item<3> item_ct1, int numLane = 0) {}
65+
// CHECK: int tensorPos3(const int ct, const sycl::nd_item<3> &item_ct1, int numLane = 0) {}
6666
__device__ int tensorPos3(const int ct, const int numLane = warpSize) {}

clang/test/dpct/c_vcxproj_test/c_kernel.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
#include "cuda_runtime.h"
1818
#include <stdio.h>
1919

20-
// CHECK: void addKernel(int *c, const int *a, const int *b, sycl::nd_item<3> item_ct1)
20+
// CHECK: void addKernel(int *c, const int *a, const int *b,
21+
// CHECK-NEXT: const sycl::nd_item<3> &item_ct1)
2122
__global__ void addKernel(int *c, const int *a, const int *b)
2223
{
2324
// CHECK: int i = item_ct1.get_local_id(2);

clang/test/dpct/checkFormatAll.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ __device__ void testDevice(const int *K) {
1717
__device__ void testDevice1(const int *K) { int t = K[0]; }
1818

1919
//CHECK:void testKernelPtr(const int *L, const int *M, int N,
20-
//CHECK-NEXT: sycl::nd_item<3> item_ct1) {
20+
//CHECK-NEXT: const sycl::nd_item<3> &item_ct1) {
2121
//CHECK-NEXT: testDevice(L);
2222
//CHECK-NEXT: int gtid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
2323
//CHECK-NEXT: item_ct1.get_local_id(2);

clang/test/dpct/checkFormatMigrated.cu

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ __device__ void testDevice(const int *K) {
1818
//CHECK:void testDevice1(const int *K) { int t = K[0]; }
1919
__device__ void testDevice1(const int *K) { int t = K[0]; }
2020

21-
//CHECK:void testKernelPtr(const int *L, const int *M, int N,
22-
//CHECK-NEXT: cl::sycl::nd_item<3> item_ct1) {
21+
//CHECK:void testKernelPtr(const int *L, const int *M, int N,
22+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1) {
2323
//CHECK-NEXT: testDevice(L);
2424
//CHECK-NEXT: int gtid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
2525
//CHECK-NEXT: item_ct1.get_local_id(2);
@@ -107,8 +107,8 @@ typedef struct
107107
//CHECK-NEXT: const float g_ewald, const float qqrd2e,
108108
//CHECK-NEXT: const float denom_lj_inv,
109109
//CHECK-NEXT: const int loop_trip,
110-
//CHECK-NEXT: cl::sycl::nd_item<3> item_ct1, float *sp_lj,
111-
//CHECK-NEXT: float *sp_coul, int *ljd,
110+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1,
111+
//CHECK-NEXT: float *sp_lj, float *sp_coul, int *ljd,
112112
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
113113
template <int EFLAG>
114114
__global__ void k_mdppp_outer_nn(const int * __restrict__ pos,
@@ -152,7 +152,7 @@ void test() {
152152

153153

154154

155-
//CHECK:void k_mdppp_outer_n0(cl::sycl::nd_item<3> item_ct1, float *sp_lj,
155+
//CHECK:void k_mdppp_outer_n0(const cl::sycl::nd_item<3> &item_ct1, float *sp_lj,
156156
//CHECK-NEXT: float *sp_coul, int *ljd,
157157
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
158158
__global__ void k_mdppp_outer_n0() {
@@ -164,7 +164,7 @@ __global__ void k_mdppp_outer_n0() {
164164
}
165165

166166
//CHECK:void k_mdppp_outer_n1(const int * __restrict__ pos,
167-
//CHECK-NEXT: cl::sycl::nd_item<3> item_ct1, float *sp_lj,
167+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1, float *sp_lj,
168168
//CHECK-NEXT: float *sp_coul, int *ljd,
169169
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
170170
__global__ void k_mdppp_outer_n1(const int * __restrict__ pos) {
@@ -177,7 +177,7 @@ __global__ void k_mdppp_outer_n1(const int * __restrict__ pos) {
177177

178178
//CHECK:void k_mdppp_outer_22(const int * __restrict__ pos,
179179
//CHECK-NEXT: const float * __restrict__ q,
180-
//CHECK-NEXT: cl::sycl::nd_item<3> item_ct1,
180+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1,
181181
//CHECK-NEXT: float *sp_lj,
182182
//CHECK-NEXT: float *sp_coul,
183183
//CHECK-NEXT: int *ljd,
@@ -193,7 +193,7 @@ __global__ void k_mdppp_outer_22(const int * __restrict__ pos,
193193

194194

195195
//CHECK:void k_mdppp_outer_n2(const int * __restrict__ pos, const float * __restrict__ q,
196-
//CHECK-NEXT: cl::sycl::nd_item<3> item_ct1, float *sp_lj,
196+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1, float *sp_lj,
197197
//CHECK-NEXT: float *sp_coul, int *ljd,
198198
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
199199
void __device__ k_mdppp_outer_n2(const int * __restrict__ pos, const float * __restrict__ q) {
@@ -207,7 +207,7 @@ void __device__ k_mdppp_outer_n2(const int * __restrict__ pos, const float * __r
207207

208208

209209
//CHECK:void k_mdppp_outer_n3(const int * __restrict__ pos, const float * __restrict__ q,
210-
//CHECK-NEXT: cl::sycl::nd_item<3> item_ct1, float *sp_lj,
210+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1, float *sp_lj,
211211
//CHECK-NEXT: float *sp_coul, int *ljd,
212212
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
213213
__device__
@@ -222,8 +222,8 @@ void k_mdppp_outer_n3(const int * __restrict__ pos, const float * __restrict__ q
222222
#define AAA const int * __restrict__ pos
223223
#define BBB const float * __restrict__ q
224224

225-
//CHECK:void foo1(AAA, BBB, cl::sycl::nd_item<3> item_ct1, float *sp_lj, float *sp_coul,
226-
//CHECK-NEXT: int *ljd, cl::sycl::local_accessor<double, 2> la) {
225+
//CHECK:void foo1(AAA, BBB, const cl::sycl::nd_item<3> &item_ct1, float *sp_lj,
226+
//CHECK-NEXT: float *sp_coul, int *ljd, cl::sycl::local_accessor<double, 2> la) {
227227
__device__ void foo1(AAA, BBB) {
228228
__shared__ float sp_lj[4];
229229
__shared__ float sp_coul[4];
@@ -232,9 +232,9 @@ __device__ void foo1(AAA, BBB) {
232232
const int tid = threadIdx.x;
233233
}
234234

235-
//CHECK:void foo2(const int * __restrict__ pos, BBB, cl::sycl::nd_item<3> item_ct1,
236-
//CHECK-NEXT: float *sp_lj, float *sp_coul, int *ljd,
237-
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
235+
//CHECK:void foo2(const int * __restrict__ pos, BBB,
236+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1, float *sp_lj, float *sp_coul,
237+
//CHECK-NEXT: int *ljd, cl::sycl::local_accessor<double, 2> la) {
238238
__device__ void foo2(const int * __restrict__ pos, BBB) {
239239
__shared__ float sp_lj[4];
240240
__shared__ float sp_coul[4];
@@ -243,9 +243,9 @@ __device__ void foo2(const int * __restrict__ pos, BBB) {
243243
const int tid = threadIdx.x;
244244
}
245245

246-
//CHECK:void foo3(AAA, const float * __restrict__ q, cl::sycl::nd_item<3> item_ct1,
247-
//CHECK-NEXT: float *sp_lj, float *sp_coul, int *ljd,
248-
//CHECK-NEXT: cl::sycl::local_accessor<double, 2> la) {
246+
//CHECK:void foo3(AAA, const float * __restrict__ q,
247+
//CHECK-NEXT: const cl::sycl::nd_item<3> &item_ct1, float *sp_lj, float *sp_coul,
248+
//CHECK-NEXT: int *ljd, cl::sycl::local_accessor<double, 2> la) {
249249
__device__ void foo3(AAA, const float * __restrict__ q) {
250250
__shared__ float sp_lj[4];
251251
__shared__ float sp_coul[4];
@@ -254,7 +254,7 @@ __device__ void foo3(AAA, const float * __restrict__ q) {
254254
const int tid = threadIdx.x;
255255
}
256256

257-
//CHECK:#define FFFFF(aaa,bbb) void foo4(const int * __restrict__ aaa, const float * __restrict__ bbb, cl::sycl::nd_item<3> item_ct1, float *sp_lj, float *sp_coul, int *ljd, cl::sycl::local_accessor<double, 2> la)
257+
//CHECK:#define FFFFF(aaa,bbb) void foo4(const int * __restrict__ aaa, const float * __restrict__ bbb, const cl::sycl::nd_item<3> &item_ct1, float *sp_lj, float *sp_coul, int *ljd, cl::sycl::local_accessor<double, 2> la)
258258
#define FFFFF(aaa,bbb) __device__ void foo4(const int * __restrict__ aaa, const float * __restrict__ bbb)
259259

260260
FFFFF(pos, q)

0 commit comments

Comments
 (0)