diff --git a/cub/agent/agent_scan.cuh b/cub/agent/agent_scan.cuh index e8869034c..ccac7f596 100644 --- a/cub/agent/agent_scan.cuh +++ b/cub/agent/agent_scan.cuh @@ -358,7 +358,7 @@ struct AgentScan // Wait for all threads in the cluster to finish loading / dsmem initialization cooperative_groups::cluster_group::barrier_wait(std::move(token)); - // CTA_SYNC(); + CTA_SYNC(); // Surprisingly, this sync fixes the race // Perform tile scan if (tile_idx == 0) diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh index c8fae92af..ddf90af5b 100644 --- a/cub/agent/single_pass_scan_operators.cuh +++ b/cub/agent/single_pass_scan_operators.cuh @@ -593,6 +593,8 @@ struct ClusterTilePrefixCallbackOp } exclusive_prefix = Reduce(cta_rank, src_cta, tile_descriptor.value); + // Debug note: the second thread of block 31 reads 2399 instead of 1920 + if (__shfl_sync(CUB_DETAIL_CLUSTER_WARP_MASK, tile_descriptor.status == SCAN_TILE_PARTIAL, 0,