Merged
2 changes: 2 additions & 0 deletions src/gpu/intel/compute/kernel_arg_list.hpp
@@ -249,6 +249,7 @@ class kernel_arg_list_t {
             APPEND_STORED_SCALAR_VALUE(bf16, bfloat16_t)
             APPEND_STORED_SCALAR_VALUE(f32, float)
             APPEND_STORED_SCALAR_VALUE(s32, int32_t)
+            APPEND_STORED_SCALAR_VALUE(s64, int64_t)
             APPEND_STORED_SCALAR_VALUE(s8, int8_t)
             APPEND_STORED_SCALAR_VALUE(u8, uint8_t)
             default:
@@ -300,6 +301,7 @@ class kernel_arg_list_t {
             SET_STORED_SCALAR_VALUE(bf16, bfloat16_t)
             SET_STORED_SCALAR_VALUE(f32, float)
             SET_STORED_SCALAR_VALUE(s32, int32_t)
+            SET_STORED_SCALAR_VALUE(s64, int64_t)
             SET_STORED_SCALAR_VALUE(s8, int8_t)
             SET_STORED_SCALAR_VALUE(u8, uint8_t)
             default:
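The two dispatch tables above route stored scalar kernel arguments by data type; the new s64 entries let 64-bit dropout seeds and offsets travel through the argument list by value. A hedged sketch of the pattern these macros implement (a simplified stand-in, not the actual kernel_arg_list_t):

#include <cstdint>
#include <cstring>

enum class data_type_t { f32, s32, s64, s8, u8 }; // subset, for illustration

// A stored scalar: the tag says how many of the inline bytes are meaningful.
struct scalar_arg_t {
    data_type_t dt;
    unsigned char bytes[8] = {}; // sized for the widest type, now s64
};

template <typename T>
scalar_arg_t pack_scalar(data_type_t dt, T value) {
    static_assert(sizeof(T) <= 8, "scalar must fit the inline storage");
    scalar_arg_t arg {dt};
    std::memcpy(arg.bytes, &value, sizeof(T));
    return arg;
}

// With an s64 case in the dispatch table, a 64-bit seed packs directly:
// scalar_arg_t a = pack_scalar<int64_t>(data_type_t::s64, seed);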
58 changes: 37 additions & 21 deletions src/gpu/intel/include/philox.h
@@ -20,30 +20,46 @@
 #define DT_UNDEF 1
 #include "gpu/intel/include/types.h"
 
-uint philox_4x32(long idx, uint seed) {
+uint philox_4x32_s64(ulong idx, ulong seed, ulong offset) {
 #define PHILOX_4UINT_ROUND(mul, ctr, key) \
-    as_uint4(convert_ulong2(ctr.s31) * mul) ^ (uint4)(ctr.s20 ^ key, 0, 0).s3120
+    as_uint4(convert_ulong2(ctr.s02) * mul).s3210 \
+            ^ (uint4)(ctr.s1 ^ key.s0, 0, ctr.s3 ^ key.s1, 0)
 
-    uint4 ctr = 0;
-    const ulong2 ctr_mul = (ulong2)(0xD2511F53uL, 0xCD9E8D57uL);
-    const ulong key_add = as_ulong((uint2)(0x9E3779B9u, 0xBB67AE85u));
-    const uint16 key0 = (uint16)(seed)
-            + as_uint16((ulong8)(key_add))
+    ulong x = (idx & ~3L);
+    uint4 ctr = (uint4)((uint)offset, (uint)(offset >> 32), (uint)x,
+            (uint)(x >> 32));
+    uint seed_lo = (uint)seed;
+    uint seed_hi = (uint)(seed >> 32);
+    const ulong seeds = as_ulong((uint2)(seed_lo, seed_hi));
+
+    const ulong2 PHILOX_M4x32 = (ulong2)(0xD2511F53uL, 0xCD9E8D57uL);
+    const ulong PHILOX_W4x32 = as_ulong((uint2)(0x9E3779B9u, 0xBB67AE85u));
+    const uint16 key0 = as_uint16((ulong8)(seeds))
+            + as_uint16((ulong8)(PHILOX_W4x32))
             * (uint16)(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
-    const uint4 key1
-            = (uint4)(seed) + as_uint4((ulong2)(key_add)) * (uint4)(8, 8, 9, 9);
-    ctr = (uint4)(idx & ~3L) + (uint4)(3, 2, 1, 0);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s01);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s23);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s45);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s67);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s89);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sAB);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sCD);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sEF);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key1.s01);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key1.s23);
-    return ctr[~idx & 3L];
+    const uint4 key1 = as_uint4((ulong2)seeds)
+            + as_uint4((ulong2)(PHILOX_W4x32)) * (uint4)(8, 8, 9, 9);
+
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s01);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s23);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s45);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s67);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s89);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sAB);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sCD);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sEF);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key1.s01);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key1.s23);
+    return ctr[idx & 3L];
 }
 
+uint philox_4x32(uint idx, uint seed) {
+    // Note: this is for compatibility with impls that don't support s64 rand
+    ulong x = idx & ~3L;
+    ulong idx_64 = ((x + 3) << 32) + (x + 2);
+    ulong offset_64 = ((x + 1) << 32) + x;
+    ulong seed_64 = ((ulong)(seed) << 32) + seed;
+    return philox_4x32_s64(idx_64, seed_64, offset_64);
+}
+
 ushort philox_8x16(long idx, uint seed) {
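For reference, PHILOX_4UINT_ROUND above is a single round of the Philox-4x32 generator, and key0/key1 precompute its key schedule (key + r * W for rounds r = 0..9). A minimal host-side sketch of one round in C++, assuming the standard Random123 constants that appear in the diff and little-endian lane order for the as_uint4 punning:

#include <array>
#include <cstdint>

// One Philox-4x32 round: two 32x32 -> 64-bit products, a lane shuffle, and a
// key mix; this is what PHILOX_4UINT_ROUND computes once the OpenCL
// as_uint4/.s3210 punning is unfolded.
std::array<uint32_t, 4> philox_4x32_round(
        std::array<uint32_t, 4> x, std::array<uint32_t, 2> key) {
    const uint64_t p0 = uint64_t(0xD2511F53u) * x[0]; // M0 * ctr.s0
    const uint64_t p1 = uint64_t(0xCD9E8D57u) * x[2]; // M1 * ctr.s2
    return {uint32_t(p1 >> 32) ^ x[1] ^ key[0], uint32_t(p1),
            uint32_t(p0 >> 32) ^ x[3] ^ key[1], uint32_t(p0)};
}

The full generator applies ten such rounds with the key bumped by (0x9E3779B9, 0xBB67AE85) each round, which is exactly what the key0/key1 vectors encode; the legacy philox_4x32 entry point simply re-packs its 32-bit idx and seed into the 64-bit counter/offset/key layout so one code path serves both.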
29 changes: 22 additions & 7 deletions src/gpu/intel/matmul/ref.cl
@@ -88,8 +88,13 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B,
         long c_stride_m, long c_stride_n
 #if WITH_DROPOUT
         ,
-        __global uchar *dropout_mask_buf, __global uint *dropout_seed_buf,
-        __global float *dropout_p_buf
+        __global uchar *dropout_mask_buf,
+#if USE_HOST_SCALARS
+        long dropout_seed, long dropout_offset, float dropout_p
+#else
+        __global SEED_DATA_T *dropout_seed_buf,
+        __global long *dropout_offset_buf, __global float *dropout_p_buf
+#endif
 #endif
 #if WITH_SROUND
         ,
@@ -117,10 +122,13 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B,
 #endif
 
 #if WITH_DROPOUT
-    uint dropout_seed = dropout_seed_buf[0];
-    uint dropout_threshold = get_dropout_threshold(dropout_p_buf[0]);
-    float dropout_inv_q
-            = (dropout_p_buf[0] != 1.f) ? 1.f / (1.f - dropout_p_buf[0]) : 0.f;
+#if !USE_HOST_SCALARS
+    SEED_DATA_T dropout_seed = dropout_seed_buf[0];
+    long dropout_offset = USE_OFFSET ? dropout_offset_buf[0] : 0;
+    float dropout_p = dropout_p_buf[0];
+#endif
+    uint dropout_threshold = get_dropout_threshold(dropout_p);
+    float dropout_inv_q = (dropout_p != 1.f) ? 1.f / (1.f - dropout_p) : 0.f;
 #endif
 #if WITH_SROUND
     uint sround_seed = sround_seed_buf[0];
@@ -286,11 +294,18 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B,
         float po_acc = convert_float(temp);
 
 #if WITH_DROPOUT
-        uint res = philox_4x32(dst_off, dropout_seed);
+#if USE_OFFSET
+        uint res = philox_4x32_s64(
+                dst_off, (ulong)dropout_seed, (ulong)dropout_offset);
+#else
+        uint res = philox_4x32((uint)dst_off, (uint)dropout_seed);
+#endif
         uchar dropout = res > dropout_threshold;
         po_acc = (dropout) ? po_acc * dropout_inv_q : 0;
+#if HAS_OUTPUT_MASK
         dropout_mask_buf[dst_off] = dropout;
+#endif
 #endif
 
 #if WITH_SROUND
         po_acc = stochastic_round_fwd(po_acc, dst_off, sround_seed);

Review thread on the `#if USE_OFFSET` branch:

Contributor: I think this should check whether SEED_DATA_T is int or int64_t. If it isn't, the assumption is that the offset won't be passed anyway. If it is, use the new version, and if the offset wasn't specified, the kernel sets it to 0.

Contributor Author: Good catch! The offset will already be set to zero when use_offset = 0, but this condition is misleading.

Contributor Author: Based on the condition at https://github.com/uxlfoundation/oneDNN/blob/main/src/cpu/primitive_attr_postops.cpp#L304, use_offset is what selects between the two philox functions. I will stick with what I have and add an explicit conversion of the seed for the philox variant without an offset; changing the condition would mean adding another, nearly identical function to handle an s64 seed without an offset.

Contributor: Alternatively, you could have two versions of philox_4x32 with different type signatures, so the call would dispatch implicitly based on type.

Contributor Author: We strictly prefer to keep the CPU and GPU code the same. Additionally, adding another philox function would just repeat the same code with fewer lines.
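Stepping back from the thread: the kernel's dropout is standard inverted-scaling dropout. A hedged C++ sketch of just the arithmetic, where mapping get_dropout_threshold (defined elsewhere in the tree) to roughly p * 2^32 is an assumption:

#include <cmath>
#include <cstdint>

// Assumed behavior of get_dropout_threshold: map p in [0, 1] to a uint cutoff.
inline uint32_t dropout_threshold(float p) {
    return (uint32_t)std::llround((double)p * 4294967295.0);
}

// Mirror of the kernel's branch: keep the value iff the Philox draw beats the
// cutoff, and rescale survivors by 1/(1 - p) so the expectation is preserved.
inline float apply_dropout(float acc, uint32_t rnd, float p) {
    const float inv_q = (p != 1.f) ? 1.f / (1.f - p) : 0.f;
    return (rnd > dropout_threshold(p)) ? acc * inv_q : 0.f;
}

With this reading, USE_OFFSET only changes where the random draw comes from (the s64 seed/offset Philox versus the legacy 32-bit one); the thresholding and rescaling are identical on both paths.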
42 changes: 39 additions & 3 deletions src/gpu/intel/matmul/ref.cpp
@@ -328,16 +328,52 @@ status_t ref_t::execute_ref(const exec_ctx_t &ctx) const {

     const bool dropout = !pd()->attr()->dropout_.has_default_values();
     if (dropout) {
+        const bool use_host_scalars = pd()->attr()->dropout_.use_host_scalars_;
+        const bool use_offset = pd()->attr()->dropout_.use_offset_;
+
+        const auto &dropout_p
+                = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_PROBABILITY);
+        const auto &dropout_seed = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_SEED);
+        const auto &dropout_offset
+                = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_OFFSET);
+
         arg_list.set(arg_idx++, CTX_OUT_STORAGE(DNNL_ARG_ATTR_DROPOUT_MASK));
-        arg_list.set(arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_SEED));
-        arg_list.set(
-                arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_PROBABILITY));
+        if (use_host_scalars) {
+            int64_t scalar_seed = 0;
+            int64_t scalar_offset = 0;
+            float scalar_prob = 0.f;
+            const host_scalar_memory_storage_t *seed_storage
+                    = utils::downcast<const host_scalar_memory_storage_t *>(
+                            &dropout_seed);
+            CHECK(seed_storage->get_scalar_value(
+                    &scalar_seed, sizeof(scalar_seed)));
+            if (use_offset) {
+                const host_scalar_memory_storage_t *offset_storage
+                        = utils::downcast<const host_scalar_memory_storage_t *>(
+                                &dropout_offset);
+                CHECK(offset_storage->get_scalar_value(
+                        &scalar_offset, sizeof(scalar_offset)));
+            }
+            const host_scalar_memory_storage_t *prob_storage
+                    = utils::downcast<const host_scalar_memory_storage_t *>(
+                            &dropout_p);
+            CHECK(prob_storage->get_scalar_value(
+                    &scalar_prob, sizeof(scalar_prob)));
+            arg_list.set(arg_idx++, scalar_seed);
+            arg_list.set(arg_idx++, scalar_offset);
+            arg_list.set(arg_idx++, scalar_prob);
+        } else {
+            arg_list.set(arg_idx++, dropout_seed);
+            arg_list.set(arg_idx++, dropout_offset);
+            arg_list.set(arg_idx++, dropout_p);
+        }
     }
 
     const bool sround = !pd()->attr()->rounding_mode_.has_default_values();
     if (sround) {
         arg_list.set(arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_ROUNDING_SEED));
     }
 
     append_post_ops_to_arg_list(
             ctx, arg_list, arg_idx, pd()->attr()->post_ops_, *pd()->dst_md());
 
38 changes: 26 additions & 12 deletions src/gpu/intel/matmul/ref.hpp
@@ -115,22 +115,11 @@ struct ref_t : public primitive_t {
                     VERBOSE_UNSUPPORTED_POSTOP);
             VDISPATCH_MATMUL(post_ops_with_binary_ok(attr(), *dst_md(), 6),
                     VERBOSE_UNSUPPORTED_POSTOP);
-            VDISPATCH_MATMUL(IMPLICATION(!attr_.dropout_.has_default_values(),
-                                     attr_.dropout_.seed_dt_ == data_type::s32),
-                    VERBOSE_UNSUPPORTED_DROPOUT);
-            const memory_desc_wrapper dropout_md(attr_.dropout_.dropout_desc_);
-            VDISPATCH_MATMUL(
-                    IMPLICATION(!attr_.dropout_.has_default_values(),
-                            dropout_md.similar_to(dst_md(), true, false)),
-                    VERBOSE_INCONSISTENT_MDS, "dropout", "dst");
-            VDISPATCH_MATMUL(
-                    IMPLICATION(!attr_.dropout_.has_default_values(),
-                            utils::one_of(dropout_md.data_type(), u8, s8)),
-                    VERBOSE_UNSUPPORTED_DT);
             VDISPATCH_MATMUL(
                     IMPLICATION(utils::one_of(f64, src_dt_, wei_dt_, dst_dt_),
                             dev_info_->has_native(f64)),
                     VERBOSE_UNSUPPORTED_DT);
+            CHECK(dropout_ok());
             subbyte_pack_ = utils::one_of(
                     dst_dt_, data_type::f4_e2m1, data_type::f4_e3m0);
             mx_scales_ = attr()->scales_.get(DNNL_ARG_DST).is_mx();
@@ -214,6 +203,24 @@ struct ref_t : public primitive_t {
             }
             return true;
         }
+        status_t dropout_ok() const {
+            if (attr_.dropout_.has_default_values()) return status::success;
+
+            assert(memory_desc_wrapper(dst_md(0)).format_kind()
+                    == format_kind::blocked);
+
+            using namespace format_tag;
+            // Note: for `offset = 0` keep the legacy logic without the `offset`.
+            VDISPATCH_MATMUL_IC(memory_desc_matches_one_of_tag(
+                                        *dst_md(0), ncdhw, nchw, ncw, nc)
+                            && IMPLICATION(attr_.dropout_.has_output_mask(),
+                                    memory_desc_wrapper(dst_md(0)).similar_to(
+                                            attr_.dropout_.dropout_desc_, true,
+                                            false)),
+                    VERBOSE_UNSUPPORTED_DROPOUT);
+
+            return status::success;
+        }
         bool precomputed_reductions_ok() const {
             const auto &pr = attr()->precomputed_reductions_;
             if (pr.has_default_values(DNNL_ARG_SRC)) return true;
@@ -245,6 +252,11 @@ struct ref_t : public primitive_t {
kernel_ctx.define_int("WITH_BIAS", pd()->with_bias());
kernel_ctx.define_int(
"WITH_DROPOUT", !pd()->attr()->dropout_.has_default_values());
kernel_ctx.define_int(
"USE_HOST_SCALARS", pd()->attr()->dropout_.use_host_scalars_);
kernel_ctx.define_int("USE_OFFSET", pd()->attr()->dropout_.use_offset_);
kernel_ctx.define_int(
"HAS_OUTPUT_MASK", pd()->attr()->dropout_.has_output_mask());
kernel_ctx.define_int("NON_DEFAULT_ATTRS", pd()->non_default_attrs_);

auto dst_rnd_mode = pd()->attr()->rounding_mode_.get(DNNL_ARG_DST);
@@ -286,6 +298,7 @@ struct ref_t : public primitive_t {
         def_data_type(kernel_ctx, pd()->dst_dt_, "DST");
         def_data_type(kernel_ctx, pd()->bia_dt_, "BIA");
         data_type_t acc_type = pd()->desc()->accum_data_type;
+        data_type_t seed_type = pd()->attr()->dropout_.seed_dt_;
         switch (pd()->attr()->acc_mode_) {
             case accumulation_mode::strict:
             case accumulation_mode::relaxed:
@@ -296,6 +309,7 @@ struct ref_t : public primitive_t {
             default: break;
         }
         def_data_type(kernel_ctx, acc_type, "ACC");
+        def_data_type(kernel_ctx, seed_type, "SEED");
         def_data_type(kernel_ctx,
                 pd()->attr()->scales_.get_data_type(DNNL_ARG_WEIGHTS),
                 "WEI_SCALES");
5 changes: 5 additions & 0 deletions src/gpu/intel/primitive_conf.cpp
@@ -280,6 +280,7 @@ const char *get_type_name(data_type_t dt, bool with_punning) {
         case data_type::s4: return with_punning ? "uchar" : "s4";
         case data_type::u4: return with_punning ? "uchar" : "u4";
         case data_type::s32: return "int";
+        case data_type::s64: return "long";
         default:
             gpu_error_not_expected()
                     << "Unexpected data type " << dnnl_dt2str(dt);
@@ -352,6 +353,10 @@ void def_data_type(compute::kernel_ctx_t &kernel_ctx, data_type_t dt,
             kernel_ctx.add_option(
                     utils::format("-D%s_DATA_T=int -D%s_DT_S32", str, str));
             break;
+        case data_type::s64:
+            kernel_ctx.add_option(
+                    utils::format("-D%s_DATA_T=int -D%s_DT_S64", str, str));
+            break;
         default:
             gpu_error_not_expected()
                     << "Unexpected data type " << dnnl_dt2str(dt);
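For context, def_data_type expands a prefix into -D options that give kernels a typed alias plus a dispatch macro. A hedged illustration of what the new s64 branch emits (format string copied from the diff; the real helper handles many more types):

#include <cstdio>
#include <string>

// Illustrative stand-in for the utils::format call in the s64 case above.
std::string seed_defines(const std::string &prefix) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), "-D%s_DATA_T=int -D%s_DT_S64",
            prefix.c_str(), prefix.c_str());
    return std::string(buf);
}

// seed_defines("SEED") yields "-DSEED_DATA_T=int -DSEED_DT_S64": the kernel
// sees dropout_seed_buf as __global SEED_DATA_T *, with SEED_DT_S64 available
// for type-specific branches.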
2 changes: 1 addition & 1 deletion tests/benchdnn/inputs/matmul/harness_matmul_dropout
@@ -3,7 +3,7 @@
 --attr-fpmath=,bf16
 --check-ref-impl=false
 
---attr-dropout=0.5:12345678,0.75:12345678:undef,0.25:843921:any:1238976:true
+--attr-dropout=0.5:12345678,0.75:12345678:undef,0.25:843921:any:1238976:true,0.75:111786:any:121716:false
 
 --stag=ab --dtag=ab
 --batch=shapes_2d

Review thread on the new dropout case:

Contributor: Are we able to specify an s32/s64 seed type from benchdnn? It might be worth verifying both, if they're still supported.

Contributor Author: benchdnn only supports s64 as the standard; we keep s32 for older test pathways. I checked the s32 data type by making the change at https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/dnn_types.cpp#L1449-L1454, and all the tests pass for harness_matmul_dropout.
9 changes: 0 additions & 9 deletions tests/benchdnn/matmul/matmul.cpp
@@ -697,15 +697,6 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) {
                 return;
             }
         }
-
-        if (!prb->attr.dropout.is_def()) {
-            BENCHDNN_PRINT(2,
-                    "[SKIP][%s:%d]: Dropout with s64 seed isn't supported.\n",
-                    __FILE__, __LINE__);
-            res->state = SKIPPED;
-            res->reason = skip_reason::case_not_supported;
-            return;
-        }
     }
 }
 