Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: kernels for random.vonmises; part 2 #681

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 96 additions & 84 deletions dpnp/backend/kernels/dpnp_krnl_random.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1217,7 +1217,7 @@ void dpnp_rng_uniform_c(void* result, const long low, const long high, const siz
template <typename _DataType>
void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _DataType kappa, const size_t size)
{
if (!size)
if (!size || !result)
{
return;
}
Expand All @@ -1227,6 +1227,7 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da
_DataType s_minus_one, hpt, r_over_two_kappa_minus_one, rho_minus_one;
_DataType* Uvec = nullptr;
_DataType* Vvec = nullptr;
size_t* n = nullptr;
const _DataType d_zero = 0.0, d_one = 1.0;

assert(kappa > 1.0);
Expand All @@ -1242,65 +1243,70 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da

Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));

for (size_t n = 0; n < size;)
n = reinterpret_cast<size_t*>(dpnp_memory_alloc_c(sizeof(size_t)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is quite strange (making a scalar into an array with one element).
I think it should be a scalar, not an array.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A scalar and this are the same. You just cannot pass n into the SYCL region any other way.

for (n[0] = 0; n[0] < size;)
{
size_t diff_size = size - n;
size_t diff_size = size - n[0];
mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, 0.5 * M_PI);
auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
event_out.wait();
// TODO
// use deps case
auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
event_out.wait();
auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);

// TODO
// kernel
for (size_t i = 0; i < diff_size; i++)
{
_DataType sn, cn, sn2, cn2;
_DataType neg_W_minus_one, V, Y;
cl::sycl::range<1> diff_gws(diff_size);

sn = sin(Uvec[i]);
cn = cos(Uvec[i]);
V = Vvec[i];
sn2 = sn * sn;
cn2 = cn * cn;
auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
Copy link
Contributor

@shssf shssf May 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kernel inside the loop with bigger trip count. It would be more efficient to parallelize (make kernel) the algorithm by bigger value size instead size-n. So, it will require a loop inside the kernel.
It is questionable what will be more performant

  1. loop with a kernels queue (data dependent)
  2. kernel with a loop

It is hard to predict it with no perf measurements but I would vote that parallelization with bigger number of threads should be better.

cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
cgh.parallel_for(diff_gws, [=](cl::sycl::id<1> global_id) {
size_t i = global_id[0];

neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
Y = kappa * (s_minus_one + neg_W_minus_one);
_DataType sn, cn, sn2, cn2;
_DataType neg_W_minus_one, V, Y;

if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y))
{
Y = neg_W_minus_one * (2 - neg_W_minus_one);
if (Y < 0)
Y = 0.0;
else if (Y > 1.0)
Y = 1.0;
sn = cl::sycl::sin(Uvec[i]);
cn = cl::sycl::cos(Uvec[i]);
V = Vvec[i];
sn2 = sn * sn;
cn2 = cn * cn;

result1[n++] = asin(sqrt(Y));
}
}
}
neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
Y = kappa * (s_minus_one + neg_W_minus_one);

if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
{
Y = neg_W_minus_one * (2 - neg_W_minus_one);
if (Y < 0)
Y = 0.0;
else if (Y > 1.0)
Y = 1.0;
n[0] = n[0] + 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a mistake. This is parallel environment (SYCL kernel). Writing inside the kernel into same memory cause https://en.wikipedia.org/wiki/Race_condition

result1[n[0]] = cl::sycl::asin(cl::sycl::sqrt(Y));
}
});
};
auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
some_event.wait();
}
dpnp_memory_free_c(Uvec);
dpnp_memory_free_c(n);

mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
auto event_out = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
event_out.wait();
auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);

// TODO
// kernel
for (size_t i = 0; i < size; i++)
{
_DataType mod, resi;
cl::sycl::range<1> gws(size);

resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i];
mod = fabs(resi);
mod = (fmod(mod + M_PI, 2 * M_PI) - M_PI);
result1[i] = (resi < 0) ? -mod : mod;
}
auto paral_kernel_acceptance = [&](cl::sycl::handler& cgh) {
cgh.depends_on({uniform_distr_event});
cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
size_t i = global_id[0];
double mod, resi;
resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i];
mod = cl::sycl::fabs(resi);
mod = (cl::sycl::fmod(mod + M_PI, 2 * M_PI) - M_PI);
result1[i] = (resi < 0) ? -mod : mod;
});
};
auto acceptance_event = DPNP_QUEUE.submit(paral_kernel_acceptance);
acceptance_event.wait();

dpnp_memory_free_c(Vvec);
return;
Expand All @@ -1309,7 +1315,7 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da
template <typename _DataType>
void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _DataType kappa, const size_t size)
{
if (!size)
if (!size || !result)
{
return;
}
Expand All @@ -1318,6 +1324,7 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da
_DataType rho_over_kappa, rho, r, s_kappa;
_DataType* Uvec = nullptr;
_DataType* Vvec = nullptr;
size_t* n = nullptr;

const _DataType d_zero = 0.0, d_one = 1.0;

Expand All @@ -1332,52 +1339,58 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da

Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
n = reinterpret_cast<size_t*>(dpnp_memory_alloc_c(sizeof(size_t)));

for (size_t n = 0; n < size;)
for (n[0] = 0; n[0] < size;)
{
size_t diff_size = size - n;
size_t diff_size = size - n[0];
mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, M_PI);
auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
event_out.wait();
// TODO
// use deps case
auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
event_out.wait();
auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);

// TODO
// kernel
for (size_t i = 0; i < diff_size; i++)
{
_DataType Z, W, Y, V;
Z = cos(Uvec[i]);
V = Vvec[i];
W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
Y = s_kappa - kappa * W;
if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y))
{
result1[n++] = acos(W);
}
}
}
cl::sycl::range<1> diff_gws(diff_size);

auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
cgh.parallel_for(diff_gws, [=](cl::sycl::id<1> global_id) {
size_t i = global_id[0];

_DataType Z, W, Y, V;
Z = cl::sycl::cos(Uvec[i]);
V = Vvec[i];
W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
Y = s_kappa - kappa * W;
if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
{
n[0] = n[0] + 1;
result1[n[0]] = cl::sycl::acos(W);
}
});
};
auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
some_event.wait();
}
dpnp_memory_free_c(Uvec);
dpnp_memory_free_c(n);

mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
auto event_out = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
event_out.wait();

// TODO
// kernel
for (size_t i = 0; i < size; i++)
{
double mod, resi;
auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);

resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i];
mod = fabs(resi);
mod = (fmod(mod + M_PI, 2 * M_PI) - M_PI);
result1[i] = (resi < 0) ? -mod : mod;
}
cl::sycl::range<1> gws(size);
auto paral_kernel_acceptance = [&](cl::sycl::handler& cgh) {
cgh.depends_on({uniform_distr_event});
cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
size_t i = global_id[0];
double mod, resi;
resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i];
mod = cl::sycl::fabs(resi);
mod = (cl::sycl::fmod(mod + M_PI, 2 * M_PI) - M_PI);
result1[i] = (resi < 0) ? -mod : mod;
});
};
auto acceptance_event = DPNP_QUEUE.submit(paral_kernel_acceptance);
acceptance_event.wait();

dpnp_memory_free_c(Vvec);
return;
Expand All @@ -1396,7 +1409,6 @@ void dpnp_rng_vonmises_c(void* result, const _DataType mu, const _DataType kappa
dpnp_rng_vonmises_large_kappa_c<_DataType>(result, mu, kappa, size);
else
dpnp_rng_vonmises_small_kappa_c<_DataType>(result, mu, kappa, size);
// TODO case when kappa < kappa < 1e-8 (very small)
}

template <typename _KernelNameSpecialization>
Expand Down
2 changes: 2 additions & 0 deletions tests/skipped_tests.tbl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ tests/test_linalg.py::test_qr[(5,3)-float32]
tests/test_linalg.py::test_qr[(5,3)-float64]
tests/test_linalg.py::test_qr[(5,3)-int32]
tests/test_linalg.py::test_qr[(5,3)-int64]
tests/test_random.py::TestDistributionsVonmises::test_seed[large_kappa]
tests/test_random.py::TestDistributionsVonmises::test_seed[small_kappa]
tests/test_random.py::TestPermutationsTestShuffle::test_shuffle1[ lambda x: x]
tests/test_random.py::TestPermutationsTestShuffle::test_shuffle1[ lambda x: dpnp.asarray(x).astype(dpnp.int8)]
tests/test_random.py::TestPermutationsTestShuffle::test_shuffle1[ lambda x: dpnp.asarray(x).astype(dpnp.complex64)]
Expand Down