Skip to content

Commit

Permalink
Enable use of cudaMemcpyAsync for thrust::copy
Browse files Browse the repository at this point in the history
For contiguous ranges of trivially relocatable types, we can call `cudaMemcpyAsync` directly instead of going through `transform`.

Fixes #210
  • Loading branch information
miscco committed Jul 18, 2023
1 parent 2e06e0b commit ce6a462
Showing 1 changed file with 45 additions and 3 deletions.
48 changes: 45 additions & 3 deletions thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,38 @@
#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/detail/transform.h>
#include <thrust/system/cuda/detail/util.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/type_traits/is_trivially_relocatable.h>

THRUST_NAMESPACE_BEGIN
namespace cuda_cub {

namespace __copy {
// Device-to-device copy, bulk-copy overload.
//
// Chosen when the caller passes thrust::detail::true_type as the trailing
// tag. Instead of copying element by element, the iterators are reduced to
// raw pointers and the whole range is handed to
// trivial_copy_device_to_device in a single call.
//
//   policy  execution policy forwarded to the copy routine
//   first   start of the input range
//   last    end of the input range
//   result  start of the output range
//
// Returns `result` advanced by the distance between `first` and `last`.
// The status returned by the copy is passed to cuda_cub::throw_on_error.
template <class Derived, class InputIt, class OutputIt>
OutputIt THRUST_RUNTIME_FUNCTION
device_to_device(execution_policy<Derived>& policy,
                 InputIt                    first,
                 InputIt                    last,
                 OutputIt                   result,
                 thrust::detail::true_type)
{
  using value_t = typename thrust::iterator_traits<InputIt>::value_type;

  const auto count = thrust::distance(first, last);

  // Nothing to transfer: skip the copy call entirely.
  if (!(count > 0))
  {
    return result + count;
  }

  // Both ends are viewed through the input's value type so that the copy
  // routine sees matching pointer types for destination and source.
  value_t* const dst =
    reinterpret_cast<value_t*>(thrust::raw_pointer_cast(&*result));
  value_t const* const src =
    reinterpret_cast<value_t const*>(thrust::raw_pointer_cast(&*first));

  const cudaError status =
    trivial_copy_device_to_device(policy, dst, src, count);
  cuda_cub::throw_on_error(status, "__copy:: D->D: failed");

  return result + count;
}

template <class Derived,
class InputIt,
Expand All @@ -47,16 +73,32 @@ namespace __copy {
device_to_device(execution_policy<Derived>& policy,
InputIt first,
InputIt last,
OutputIt result)
OutputIt result,
thrust::detail::false_type)
{
typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
return cuda_cub::transform(policy,
first,
last,
result,
thrust::identity<InputTy>());
}

// Device-to-device copy entry point.
//
// Keeps the four-argument signature and forwards to one of the tagged
// overloads of device_to_device declared earlier in this namespace. The tag
// is is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type:
// the thrust::detail::true_type overload copies the range through
// trivial_copy_device_to_device, the thrust::detail::false_type overload
// goes through cuda_cub::transform.
//
//   policy  execution policy forwarded to the selected overload
//   first   start of the input range
//   last    end of the input range
//   result  start of the output range
//
// Returns the iterator produced by the selected overload.
template <class Derived,
          class InputIt,
          class OutputIt>
OutputIt THRUST_RUNTIME_FUNCTION
device_to_device(execution_policy<Derived>& policy,
                 InputIt                    first,
                 InputIt                    last,
                 OutputIt                   result)
{
  // Exactly one trailing argument: the dispatch tag. No functor is passed
  // here; the generic overload builds its own identity functor.
  return device_to_device(policy,
                          first,
                          last,
                          result,
                          typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
}

} // namespace __copy

} // namespace cuda_cub
Expand Down

0 comments on commit ce6a462

Please sign in to comment.