Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
- Fix for ambiguity in BlockReduce::Reduce() between generic reduction and
Browse files Browse the repository at this point in the history
  summation.  Summation entrypoints are now called ::Sum(), similar
  to the convention in BlockScan.
		    
- Small edits to mainpage documentation and download tracking

- Refactor test and docs outside of cub directory.  Take generated html
docs off gitignore.

Former-commit-id: 98c1a6b
  • Loading branch information
dumerrill committed Mar 9, 2013
1 parent dbf8dae commit 6e2a232
Show file tree
Hide file tree
Showing 320 changed files with 19,524 additions and 480 deletions.
18 changes: 17 additions & 1 deletion VERSION.TXT
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
CUB version 0.900

0.9.1 03/09/2013

- Fix for ambiguity in BlockReduce::Reduce() between generic reduction and
summation. Summation entrypoints are now called ::Sum(), similar
to the convention in BlockScan.

- Small edits to mainpage documentation and download tracking

//-----------------------------------------------------------------------------

0.9.0 03/07/2013

- Initial "preview" release. CUB is the first durable, high-performance library
of cooperative block-level, warp-level, and thread-level primitives for CUDA
kernel programming. More primitives and examples coming soon!

116 changes: 58 additions & 58 deletions cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ namespace cub {
* ...
*
* // Compute the threadblock-wide sum for thread0
* int aggregate = BlockReduce::Reduce(smem_storage, data);
* int aggregate = BlockReduce::Sum(smem_storage, data);
*
* ...
* \endcode
Expand All @@ -137,7 +137,7 @@ namespace cub {
* if (threadIdx.x < num_elements) data = ...;
*
* // Compute the threadblock-wide sum of valid elements in thread0
* int aggregate = BlockReduce::Reduce(smem_storage, data, num_elements);
* int aggregate = BlockReduce::Sum(smem_storage, data, num_elements);
*
* ...
* \endcode
Expand Down Expand Up @@ -296,63 +296,7 @@ private:

public:

/******************************************************************//**
* \name Summation reductions
*********************************************************************/
//@{

/**
 * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
 *
 * The return value is undefined in threads other than thread<sub>0</sub>.
 *
 * \smemreuse
 */
static __device__ __forceinline__ T Reduce(
    SmemStorage     &smem_storage,      ///< [in] Shared reference to opaque SmemStorage layout
    T               input)              ///< [in] Calling thread's input
{
    // Delegate to the generic reduction, supplying an addition functor inline
    return Reduce(smem_storage, input, Sum<T>());
}

/**
 * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
 *
 * The return value is undefined in threads other than thread<sub>0</sub>.
 *
 * \smemreuse
 *
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 */
template <int ITEMS_PER_THREAD>
static __device__ __forceinline__ T Reduce(
    SmemStorage     &smem_storage,              ///< [in] Shared reference to opaque SmemStorage layout
    T               (&inputs)[ITEMS_PER_THREAD])///< [in] Calling thread's input segment
{
    // Delegate to the generic per-thread-segment reduction with an addition functor
    return Reduce(smem_storage, inputs, Sum<T>());
}


/**
 * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p valid_threads threads each contribute one input element.
 *
 * \smemreuse
 *
 * The return value is undefined in threads other than thread<sub>0</sub>.
 */
static __device__ __forceinline__ T Reduce(
    SmemStorage         &smem_storage,          ///< [in] Shared reference to opaque SmemStorage layout
    T                   input,                  ///< [in] Calling thread's input
    const unsigned int  &valid_threads)         ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
{
    Sum<T> reduction_op;
    // Bug fix: forward the addition functor to the generic partial-tile
    // reduction.  The original body called Reduce(smem_storage, input,
    // valid_threads) — i.e. this same overload — so the functor was never
    // used and the call recursed without bound.
    return Reduce(smem_storage, input, reduction_op, valid_threads);
}


//@}
/******************************************************************//**
* \name Generic reductions
*********************************************************************/
Expand Down Expand Up @@ -430,7 +374,63 @@ public:
}

//@}
/******************************************************************//**
* \name Summation reductions
*********************************************************************/
//@{

/**
 * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
 *
 * The return value is undefined in threads other than thread<sub>0</sub>.
 *
 * \smemreuse
 */
static __device__ __forceinline__ T Sum(
    SmemStorage     &smem_storage,      ///< [in] Shared reference to opaque SmemStorage layout
    T               input)              ///< [in] Calling thread's input
{
    // Delegate to the generic reduction, supplying an addition functor inline
    return Reduce(smem_storage, input, cub::Sum<T>());
}

/**
 * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
 *
 * The return value is undefined in threads other than thread<sub>0</sub>.
 *
 * \smemreuse
 *
 * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
 */
template <int ITEMS_PER_THREAD>
static __device__ __forceinline__ T Sum(
    SmemStorage     &smem_storage,              ///< [in] Shared reference to opaque SmemStorage layout
    T               (&inputs)[ITEMS_PER_THREAD])///< [in] Calling thread's input segment
{
    // Delegate to the generic per-thread-segment reduction with an addition functor
    return Reduce(smem_storage, inputs, cub::Sum<T>());
}


/**
 * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p valid_threads threads each contribute one input element.
 *
 * \smemreuse
 *
 * The return value is undefined in threads other than thread<sub>0</sub>.
 */
static __device__ __forceinline__ T Sum(
    SmemStorage         &smem_storage,          ///< [in] Shared reference to opaque SmemStorage layout
    T                   input,                  ///< [in] Calling thread's input
    const unsigned int  &valid_threads)         ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
{
    // Delegate to the generic partial-tile reduction with an addition functor
    return Reduce(smem_storage, input, cub::Sum<T>(), valid_threads);
}


//@}
};

/** @} */ // end of SimtCoop group
Expand Down
Loading

0 comments on commit 6e2a232

Please sign in to comment.