@@ -950,7 +950,7 @@ public:
950950 ExclusiveScan (T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
951951 {
952952 // Reduce consecutive thread items in registers
953- T thread_prefix = cub::ThreadReduce (input, scan_op);
953+ T thread_prefix = cub::ThreadReduce<::cuda::std:: remove_reference_t < decltype (input)>, ScanOp, T, T> (input, scan_op);
954954
955955 // Exclusive thread block-scan
956956 ExclusiveScan (thread_prefix, thread_prefix, initial_value, scan_op);
@@ -1037,7 +1037,7 @@ public:
10371037 T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
10381038 {
10391039 // Reduce consecutive thread items in registers
1040- T thread_prefix = cub::ThreadReduce (input, scan_op);
1040+ T thread_prefix = cub::ThreadReduce<::cuda::std:: remove_reference_t < decltype (input)>, ScanOp, T, T> (input, scan_op);
10411041
10421042 // Exclusive thread block-scan
10431043 ExclusiveScan (thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
@@ -1121,7 +1121,7 @@ public:
11211121 BlockPrefixCallbackOp& block_prefix_callback_op)
11221122 {
11231123 // Reduce consecutive thread items in registers
1124- T thread_prefix = cub::ThreadReduce (input, scan_op);
1124+ T thread_prefix = cub::ThreadReduce<::cuda::std:: remove_reference_t < decltype (input)>, ScanOp, T, T> (input, scan_op);
11251125
11261126 // Exclusive thread block-scan
11271127 ExclusiveScan (thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
@@ -1231,7 +1231,8 @@ public:
12311231 ExclusiveScan (T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
12321232 {
12331233 // Reduce consecutive thread items in registers
1234- T thread_partial = cub::ThreadReduce (input, scan_op);
1234+ T thread_partial =
1235+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, ScanOp, T, T>(input, scan_op);
12351236
12361237 // Exclusive thread block-scan
12371238 ExclusiveScan (thread_partial, thread_partial, scan_op);
@@ -1275,7 +1276,8 @@ public:
12751276 ExclusiveScan (T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
12761277 {
12771278 // Reduce consecutive thread items in registers
1278- T thread_partial = cub::ThreadReduce (input, scan_op);
1279+ T thread_partial =
1280+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, ScanOp, T, T>(input, scan_op);
12791281
12801282 // Exclusive thread block-scan
12811283 ExclusiveScan (thread_partial, thread_partial, scan_op, block_aggregate);
@@ -1524,7 +1526,8 @@ public:
15241526 {
15251527 // Reduce consecutive thread items in registers
15261528 ::cuda::std::plus<> scan_op;
1527- T thread_prefix = cub::ThreadReduce (input, scan_op);
1529+ T thread_prefix =
1530+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, decltype (scan_op), T, T>(input, scan_op);
15281531
15291532 // Exclusive thread block-scan
15301533 ExclusiveSum (thread_prefix, thread_prefix);
@@ -1601,7 +1604,8 @@ public:
16011604 {
16021605 // Reduce consecutive thread items in registers
16031606 ::cuda::std::plus<> scan_op;
1604- T thread_prefix = cub::ThreadReduce (input, scan_op);
1607+ T thread_prefix =
1608+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, decltype (scan_op), T, T>(input, scan_op);
16051609
16061610 // Exclusive thread block-scan
16071611 ExclusiveSum (thread_prefix, thread_prefix, block_aggregate);
@@ -1682,7 +1686,8 @@ public:
16821686 {
16831687 // Reduce consecutive thread items in registers
16841688 ::cuda::std::plus<> scan_op;
1685- T thread_prefix = cub::ThreadReduce (input, scan_op);
1689+ T thread_prefix =
1690+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, decltype (scan_op), T, T>(input, scan_op);
16861691
16871692 // Exclusive thread block-scan
16881693 ExclusiveSum (thread_prefix, thread_prefix, block_prefix_callback_op);
@@ -1954,7 +1959,8 @@ public:
19541959 else
19551960 {
19561961 // Reduce consecutive thread items in registers
1957- T thread_prefix = cub::ThreadReduce (input, scan_op);
1962+ T thread_prefix =
1963+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, decltype (scan_op), T, T>(input, scan_op);
19581964
19591965 // Exclusive thread block-scan
19601966 ExclusiveScan (thread_prefix, thread_prefix, scan_op);
@@ -2011,7 +2017,7 @@ public:
20112017 InclusiveScan (T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
20122018 {
20132019 // Reduce consecutive thread items in registers
2014- T thread_prefix = cub::ThreadReduce (input, scan_op);
2020+ T thread_prefix = cub::ThreadReduce<::cuda::std:: remove_reference_t < decltype (input)>, ScanOp, T, T> (input, scan_op);
20152021
20162022 // Exclusive thread block-scan
20172023 ExclusiveScan (thread_prefix, thread_prefix, initial_value, scan_op);
@@ -2093,7 +2099,8 @@ public:
20932099 else
20942100 {
20952101 // Reduce consecutive thread items in registers
2096- T thread_prefix = cub::ThreadReduce (input, scan_op);
2102+ T thread_prefix =
2103+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, ScanOp, T, T>(input, scan_op);
20972104
20982105 // Exclusive thread block-scan (with no initial value)
20992106 ExclusiveScan (thread_prefix, thread_prefix, scan_op, block_aggregate);
@@ -2160,7 +2167,7 @@ public:
21602167 T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
21612168 {
21622169 // Reduce consecutive thread items in registers
2163- T thread_prefix = cub::ThreadReduce (input, scan_op);
2170+ T thread_prefix = cub::ThreadReduce<::cuda::std:: remove_reference_t < decltype (input)>, ScanOp, T, T> (input, scan_op);
21642171
21652172 // Exclusive thread block-scan
21662173 ExclusiveScan (thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
@@ -2295,7 +2302,8 @@ public:
22952302 else
22962303 {
22972304 // Reduce consecutive thread items in registers
2298- T thread_prefix = cub::ThreadReduce (input, scan_op);
2305+ T thread_prefix =
2306+ cub::ThreadReduce<::cuda::std::remove_reference_t <decltype (input)>, ScanOp, T, T>(input, scan_op);
22992307
23002308 // Exclusive thread block-scan
23012309 ExclusiveScan (thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
0 commit comments