diff --git a/examples/device/example_device_decoupled_look_back.cu b/examples/device/example_device_decoupled_look_back.cu new file mode 100644 index 000000000..904a24fa9 --- /dev/null +++ b/examples/device/example_device_decoupled_look_back.cu @@ -0,0 +1,123 @@ +/****************************************************************************** + * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include + +#include + +template +__global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid) +{ + tile_state.InitializeStatus(blocks_in_grid); +} + +template +__global__ void decoupled_look_back_kernel(cub::ScanTileState tile_state) +{ + using scan_op_t = cub::Sum; + using scan_tile_state_t = cub::ScanTileState; + using tile_prefix_op = cub::TilePrefixCallbackOp; + using temp_storage_t = typename tile_prefix_op::TempStorage; + + // Allocate temp storage in shared memory + __shared__ temp_storage_t temp_storage; + + scan_op_t scan_op{}; + const unsigned int threads_in_warp = 32; + const unsigned int tid = threadIdx.x; + + // Construct prefix op + tile_prefix_op prefix(tile_state, temp_storage, scan_op); + const unsigned int tile_idx = prefix.GetTileIdx(); + + // Compute block aggregate + MessageT block_aggregate = blockIdx.x; + + if (tile_idx == 0) + { + // There are no blocks to look back to, immediately set the inclusive state + if (tid == 0) + { + tile_state.SetInclusive(tile_idx, block_aggregate); + printf("tile %d: inclusive = %d\n", tile_idx, block_aggregate); + } + } + else + { + // Only the first warp in the block can perform the look back + const unsigned int warp_id = tid / threads_in_warp; + + if (warp_id == 0) + { + // Perform the decoupled look-back + // Invocation of the prefix will block until the look-back is complete. + MessageT exclusive_prefix = prefix(block_aggregate); + + if (tid == 0) + { + MessageT inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + printf("tile %d: exclusive = %d inclusive = %d\n", + tile_idx, + exclusive_prefix, + inclusive_prefix); + } + } + } +} + +template +void decoupled_look_back_example(int blocks_in_grid) +{ + using scan_tile_state_t = cub::ScanTileState; + + // Query temporary storage requirements + std::size_t temp_storage_bytes{}; + scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes); + + // Allocate temporary storage + thrust::device_vector temp_storage(temp_storage_bytes); + std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // Initialize temporary storage + scan_tile_state_t tile_status; + tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes); + const unsigned int threads_in_init_block = 256; + const unsigned int blocks_in_init_grid = cub::DivideAndRoundUp(blocks_in_grid, + threads_in_init_block); + init_kernel<<>>(tile_status, blocks_in_grid); + + // Launch decoupled look-back + const unsigned int threads_in_block = 256; + decoupled_look_back_kernel<<>>(tile_status); + + // Wait for kernel to finish + cudaDeviceSynchronize(); +} + +int main() { decoupled_look_back_example(14); }