-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: kernels for random.vonmises; part 2
#681
base: master
Are you sure you want to change the base?
Changes from 7 commits
63eeab1
5e6086c
4268517
6f77dc0
b5f539a
e9c17c7
75dc985
7221851
df3160f
0e743d2
ff8de8e
1492555
233cd59
30637c6
3222bc5
2c3eeb2
b2b3c42
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1217,7 +1217,7 @@ void dpnp_rng_uniform_c(void* result, const long low, const long high, const siz | |
template <typename _DataType> | ||
void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _DataType kappa, const size_t size) | ||
{ | ||
if (!size) | ||
if (!size || !result) | ||
{ | ||
return; | ||
} | ||
|
@@ -1227,6 +1227,7 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da | |
_DataType s_minus_one, hpt, r_over_two_kappa_minus_one, rho_minus_one; | ||
_DataType* Uvec = nullptr; | ||
_DataType* Vvec = nullptr; | ||
size_t* n = nullptr; | ||
const _DataType d_zero = 0.0, d_one = 1.0; | ||
|
||
assert(kappa > 1.0); | ||
|
@@ -1242,65 +1243,70 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da | |
|
||
Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType))); | ||
Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType))); | ||
|
||
for (size_t n = 0; n < size;) | ||
n = reinterpret_cast<size_t*>(dpnp_memory_alloc_c(sizeof(size_t))); | ||
for (n[0] = 0; n[0] < size;) | ||
{ | ||
size_t diff_size = size - n; | ||
size_t diff_size = size - n[0]; | ||
mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, 0.5 * M_PI); | ||
auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec); | ||
event_out.wait(); | ||
// TODO | ||
// use deps case | ||
auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec); | ||
mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one); | ||
event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec); | ||
event_out.wait(); | ||
auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec); | ||
|
||
// TODO | ||
// kernel | ||
for (size_t i = 0; i < diff_size; i++) | ||
{ | ||
_DataType sn, cn, sn2, cn2; | ||
_DataType neg_W_minus_one, V, Y; | ||
cl::sycl::range<1> diff_gws(diff_size); | ||
|
||
sn = sin(Uvec[i]); | ||
cn = cos(Uvec[i]); | ||
V = Vvec[i]; | ||
sn2 = sn * sn; | ||
cn2 = cn * cn; | ||
auto paral_kernel_some = [&](cl::sycl::handler& cgh) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The kernel is launched inside a loop that may have a large trip count. It would be more efficient to parallelize (i.e. build the kernel around) the larger dimension of the algorithm.
It is hard to predict without performance measurements, but I would expect that parallelizing over the larger number of threads should be better. |
||
cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event}); | ||
cgh.parallel_for(diff_gws, [=](cl::sycl::id<1> global_id) { | ||
size_t i = global_id[0]; | ||
|
||
neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2); | ||
Y = kappa * (s_minus_one + neg_W_minus_one); | ||
_DataType sn, cn, sn2, cn2; | ||
_DataType neg_W_minus_one, V, Y; | ||
|
||
if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y)) | ||
{ | ||
Y = neg_W_minus_one * (2 - neg_W_minus_one); | ||
if (Y < 0) | ||
Y = 0.0; | ||
else if (Y > 1.0) | ||
Y = 1.0; | ||
sn = cl::sycl::sin(Uvec[i]); | ||
cn = cl::sycl::cos(Uvec[i]); | ||
V = Vvec[i]; | ||
sn2 = sn * sn; | ||
cn2 = cn * cn; | ||
|
||
result1[n++] = asin(sqrt(Y)); | ||
} | ||
} | ||
} | ||
neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2); | ||
Y = kappa * (s_minus_one + neg_W_minus_one); | ||
|
||
if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y)) | ||
{ | ||
Y = neg_W_minus_one * (2 - neg_W_minus_one); | ||
if (Y < 0) | ||
Y = 0.0; | ||
else if (Y > 1.0) | ||
Y = 1.0; | ||
n[0] = n[0] + 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a mistake. This is a parallel environment (a SYCL kernel). Writing to the same memory location from inside the kernel causes a race condition: https://en.wikipedia.org/wiki/Race_condition |
||
result1[n[0]] = cl::sycl::asin(cl::sycl::sqrt(Y)); | ||
} | ||
}); | ||
}; | ||
auto some_event = DPNP_QUEUE.submit(paral_kernel_some); | ||
some_event.wait(); | ||
} | ||
dpnp_memory_free_c(Uvec); | ||
dpnp_memory_free_c(n); | ||
|
||
mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one); | ||
auto event_out = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec); | ||
event_out.wait(); | ||
auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec); | ||
|
||
// TODO | ||
// kernel | ||
for (size_t i = 0; i < size; i++) | ||
{ | ||
_DataType mod, resi; | ||
cl::sycl::range<1> gws(size); | ||
|
||
resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i]; | ||
mod = fabs(resi); | ||
mod = (fmod(mod + M_PI, 2 * M_PI) - M_PI); | ||
result1[i] = (resi < 0) ? -mod : mod; | ||
} | ||
auto paral_kernel_acceptance = [&](cl::sycl::handler& cgh) { | ||
cgh.depends_on({uniform_distr_event}); | ||
cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) { | ||
size_t i = global_id[0]; | ||
double mod, resi; | ||
resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i]; | ||
mod = cl::sycl::fabs(resi); | ||
mod = (cl::sycl::fmod(mod + M_PI, 2 * M_PI) - M_PI); | ||
result1[i] = (resi < 0) ? -mod : mod; | ||
}); | ||
}; | ||
auto acceptance_event = DPNP_QUEUE.submit(paral_kernel_acceptance); | ||
acceptance_event.wait(); | ||
|
||
dpnp_memory_free_c(Vvec); | ||
return; | ||
|
@@ -1309,7 +1315,7 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da | |
template <typename _DataType> | ||
void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _DataType kappa, const size_t size) | ||
{ | ||
if (!size) | ||
if (!size || !result) | ||
{ | ||
return; | ||
} | ||
|
@@ -1318,6 +1324,7 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da | |
_DataType rho_over_kappa, rho, r, s_kappa; | ||
_DataType* Uvec = nullptr; | ||
_DataType* Vvec = nullptr; | ||
size_t* n = nullptr; | ||
|
||
const _DataType d_zero = 0.0, d_one = 1.0; | ||
|
||
|
@@ -1332,52 +1339,58 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da | |
|
||
Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType))); | ||
Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType))); | ||
n = reinterpret_cast<size_t*>(dpnp_memory_alloc_c(sizeof(size_t))); | ||
|
||
for (size_t n = 0; n < size;) | ||
for (n[0] = 0; n[0] < size;) | ||
{ | ||
size_t diff_size = size - n; | ||
size_t diff_size = size - n[0]; | ||
mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, M_PI); | ||
auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec); | ||
event_out.wait(); | ||
// TODO | ||
// use deps case | ||
auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec); | ||
mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one); | ||
event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec); | ||
event_out.wait(); | ||
auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec); | ||
|
||
// TODO | ||
// kernel | ||
for (size_t i = 0; i < diff_size; i++) | ||
{ | ||
_DataType Z, W, Y, V; | ||
Z = cos(Uvec[i]); | ||
V = Vvec[i]; | ||
W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z); | ||
Y = s_kappa - kappa * W; | ||
if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y)) | ||
{ | ||
result1[n++] = acos(W); | ||
} | ||
} | ||
} | ||
cl::sycl::range<1> diff_gws(diff_size); | ||
|
||
auto paral_kernel_some = [&](cl::sycl::handler& cgh) { | ||
cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event}); | ||
cgh.parallel_for(diff_gws, [=](cl::sycl::id<1> global_id) { | ||
size_t i = global_id[0]; | ||
|
||
_DataType Z, W, Y, V; | ||
Z = cl::sycl::cos(Uvec[i]); | ||
V = Vvec[i]; | ||
W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z); | ||
Y = s_kappa - kappa * W; | ||
if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y)) | ||
{ | ||
n[0] = n[0] + 1; | ||
result1[n[0]] = cl::sycl::acos(W); | ||
} | ||
}); | ||
}; | ||
auto some_event = DPNP_QUEUE.submit(paral_kernel_some); | ||
some_event.wait(); | ||
} | ||
dpnp_memory_free_c(Uvec); | ||
dpnp_memory_free_c(n); | ||
|
||
mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one); | ||
auto event_out = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec); | ||
event_out.wait(); | ||
|
||
// TODO | ||
// kernel | ||
for (size_t i = 0; i < size; i++) | ||
{ | ||
double mod, resi; | ||
auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec); | ||
|
||
resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i]; | ||
mod = fabs(resi); | ||
mod = (fmod(mod + M_PI, 2 * M_PI) - M_PI); | ||
result1[i] = (resi < 0) ? -mod : mod; | ||
} | ||
cl::sycl::range<1> gws(size); | ||
auto paral_kernel_acceptance = [&](cl::sycl::handler& cgh) { | ||
cgh.depends_on({uniform_distr_event}); | ||
cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) { | ||
size_t i = global_id[0]; | ||
double mod, resi; | ||
resi = (Vvec[i] < 0.5) ? mu - result1[i] : mu + result1[i]; | ||
mod = cl::sycl::fabs(resi); | ||
mod = (cl::sycl::fmod(mod + M_PI, 2 * M_PI) - M_PI); | ||
result1[i] = (resi < 0) ? -mod : mod; | ||
}); | ||
}; | ||
auto acceptance_event = DPNP_QUEUE.submit(paral_kernel_acceptance); | ||
acceptance_event.wait(); | ||
|
||
dpnp_memory_free_c(Vvec); | ||
return; | ||
|
@@ -1396,7 +1409,6 @@ void dpnp_rng_vonmises_c(void* result, const _DataType mu, const _DataType kappa | |
dpnp_rng_vonmises_large_kappa_c<_DataType>(result, mu, kappa, size); | ||
else | ||
dpnp_rng_vonmises_small_kappa_c<_DataType>(result, mu, kappa, size); | ||
// TODO case when kappa < kappa < 1e-8 (very small) | ||
} | ||
|
||
template <typename _KernelNameSpecialization> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is quite strange (turning a scalar into an array with one element).
I think it should be a scalar, not an array.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A scalar and this are the same thing here; you just cannot pass n into the SYCL region in any other way.