diff --git a/src/gpu/intel/compute/kernel_arg_list.hpp b/src/gpu/intel/compute/kernel_arg_list.hpp index 5064aae3e5a..04fecf6d778 100644 --- a/src/gpu/intel/compute/kernel_arg_list.hpp +++ b/src/gpu/intel/compute/kernel_arg_list.hpp @@ -249,6 +249,7 @@ class kernel_arg_list_t { APPEND_STORED_SCALAR_VALUE(bf16, bfloat16_t) APPEND_STORED_SCALAR_VALUE(f32, float) APPEND_STORED_SCALAR_VALUE(s32, int32_t) + APPEND_STORED_SCALAR_VALUE(s64, int64_t) APPEND_STORED_SCALAR_VALUE(s8, int8_t) APPEND_STORED_SCALAR_VALUE(u8, uint8_t) default: @@ -300,6 +301,7 @@ class kernel_arg_list_t { SET_STORED_SCALAR_VALUE(bf16, bfloat16_t) SET_STORED_SCALAR_VALUE(f32, float) SET_STORED_SCALAR_VALUE(s32, int32_t) + SET_STORED_SCALAR_VALUE(s64, int64_t) SET_STORED_SCALAR_VALUE(s8, int8_t) SET_STORED_SCALAR_VALUE(u8, uint8_t) default: diff --git a/src/gpu/intel/include/philox.h b/src/gpu/intel/include/philox.h index 103de96e158..2926a114923 100644 --- a/src/gpu/intel/include/philox.h +++ b/src/gpu/intel/include/philox.h @@ -20,30 +20,46 @@ #define DT_UNDEF 1 #include "gpu/intel/include/types.h" -uint philox_4x32(long idx, uint seed) { +uint philox_4x32_s64(ulong idx, ulong seed, ulong offset) { #define PHILOX_4UINT_ROUND(mul, ctr, key) \ - as_uint4(convert_ulong2(ctr.s31) * mul) ^ (uint4)(ctr.s20 ^ key, 0, 0).s3120 + as_uint4(convert_ulong2(ctr.s02) * mul).s3210 \ + ^ (uint4)(ctr.s1 ^ key.s0, 0, ctr.s3 ^ key.s1, 0) - uint4 ctr = 0; - const ulong2 ctr_mul = (ulong2)(0xD2511F53uL, 0xCD9E8D57uL); - const ulong key_add = as_ulong((uint2)(0x9E3779B9u, 0xBB67AE85u)); - const uint16 key0 = (uint16)(seed) - + as_uint16((ulong8)(key_add)) + ulong x = (idx & ~3L); + uint4 ctr = (uint4)((uint)offset, (uint)(offset >> 32), (uint)x, + (uint)(x >> 32)); + uint seed_lo = (uint)seed; + uint seed_hi = (uint)(seed >> 32); + const ulong seeds = as_ulong((uint2)(seed_lo, seed_hi)); + + const ulong2 PHILOX_M4x32 = (ulong2)(0xD2511F53uL, 0xCD9E8D57uL); + const ulong PHILOX_W4x32 = as_ulong((uint2)(0x9E3779B9u, 0xBB67AE85u)); + const uint16 key0 = as_uint16((ulong8)(seeds)) + + as_uint16((ulong8)(PHILOX_W4x32)) * (uint16)(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); - const uint4 key1 - = (uint4)(seed) + as_uint4((ulong2)(key_add)) * (uint4)(8, 8, 9, 9); - ctr = (uint4)(idx & ~3L) + (uint4)(3, 2, 1, 0); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s01); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s23); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s45); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s67); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s89); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sAB); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sCD); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sEF); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key1.s01); - ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key1.s23); - return ctr[~idx & 3L]; + const uint4 key1 = as_uint4((ulong2)seeds) + + as_uint4((ulong2)(PHILOX_W4x32)) * (uint4)(8, 8, 9, 9); + + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s01); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s23); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s45); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s67); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s89); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sAB); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sCD); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sEF); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key1.s01); + ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key1.s23); + return ctr[idx & 3L]; +} + +uint philox_4x32(uint idx, uint seed) { + // Note: this is for compatibility with impls that don't support s64 rand + ulong x = idx & ~3L; + ulong idx_64 = ((x + 3) << 32) + (x + 2); + ulong offset_64 = ((x + 1) << 32) + x; + ulong seed_64 = ((ulong)(seed) << 32) + seed; + return philox_4x32_s64(idx_64, seed_64, offset_64); } ushort philox_8x16(long idx, uint seed) { diff --git a/src/gpu/intel/matmul/ref.cl b/src/gpu/intel/matmul/ref.cl index 4e016a2eed9..863730e4690 100644 --- a/src/gpu/intel/matmul/ref.cl +++ b/src/gpu/intel/matmul/ref.cl @@ -88,8 +88,13 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B, long c_stride_m, long c_stride_n #if WITH_DROPOUT , - __global uchar *dropout_mask_buf, __global uint *dropout_seed_buf, - __global float *dropout_p_buf + __global uchar *dropout_mask_buf, +#if USE_HOST_SCALARS + long dropout_seed, long dropout_offset, float dropout_p +#else + __global SEED_DATA_T *dropout_seed_buf, + __global long *dropout_offset_buf, __global float *dropout_p_buf +#endif #endif #if WITH_SROUND , @@ -117,10 +122,13 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B, #endif #if WITH_DROPOUT - uint dropout_seed = dropout_seed_buf[0]; - uint dropout_threshold = get_dropout_threshold(dropout_p_buf[0]); - float dropout_inv_q - = (dropout_p_buf[0] != 1.f) ? 1.f / (1.f - dropout_p_buf[0]) : 0.f; +#if !USE_HOST_SCALARS + SEED_DATA_T dropout_seed = dropout_seed_buf[0]; + long dropout_offset = USE_OFFSET ? dropout_offset_buf[0] : 0; + float dropout_p = dropout_p_buf[0]; +#endif + uint dropout_threshold = get_dropout_threshold(dropout_p); + float dropout_inv_q = (dropout_p != 1.f) ? 1.f / (1.f - dropout_p) : 0.f; #endif #if WITH_SROUND uint sround_seed = sround_seed_buf[0]; @@ -286,11 +294,18 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B, float po_acc = convert_float(temp); #if WITH_DROPOUT - uint res = philox_4x32(dst_off, dropout_seed); +#if USE_OFFSET + uint res = philox_4x32_s64( + dst_off, (ulong)dropout_seed, (ulong)dropout_offset); +#else + uint res = philox_4x32((uint)dst_off, (uint)dropout_seed); +#endif uchar dropout = res > dropout_threshold; po_acc = (dropout) ? po_acc * dropout_inv_q : 0; +#if HAS_OUTPUT_MASK dropout_mask_buf[dst_off] = dropout; #endif +#endif #if WITH_SROUND po_acc = stochastic_round_fwd(po_acc, dst_off, sround_seed); diff --git a/src/gpu/intel/matmul/ref.cpp b/src/gpu/intel/matmul/ref.cpp index 4b1ee6e03e4..cd537afd7ce 100644 --- a/src/gpu/intel/matmul/ref.cpp +++ b/src/gpu/intel/matmul/ref.cpp @@ -328,16 +328,52 @@ status_t ref_t::execute_ref(const exec_ctx_t &ctx) const { const bool dropout = !pd()->attr()->dropout_.has_default_values(); if (dropout) { + const bool use_host_scalars = pd()->attr()->dropout_.use_host_scalars_; + const bool use_offset = pd()->attr()->dropout_.use_offset_; + + const auto &dropout_p + = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_PROBABILITY); + const auto &dropout_seed = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_SEED); + const auto &dropout_offset + = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_OFFSET); + arg_list.set(arg_idx++, CTX_OUT_STORAGE(DNNL_ARG_ATTR_DROPOUT_MASK)); - arg_list.set(arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_SEED)); - arg_list.set( - arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_PROBABILITY)); + if (use_host_scalars) { + int64_t scalar_seed = 0; + int64_t scalar_offset = 0; + float scalar_prob = 0.f; + const host_scalar_memory_storage_t *seed_storage + = utils::downcast( + &dropout_seed); + CHECK(seed_storage->get_scalar_value( + &scalar_seed, sizeof(scalar_seed))); + if (use_offset) { + const host_scalar_memory_storage_t *offset_storage + = utils::downcast( + &dropout_offset); + CHECK(offset_storage->get_scalar_value( + &scalar_offset, sizeof(scalar_offset))); + } + const host_scalar_memory_storage_t *prob_storage + = utils::downcast( + &dropout_p); + CHECK(prob_storage->get_scalar_value( + &scalar_prob, sizeof(scalar_prob))); + arg_list.set(arg_idx++, scalar_seed); + arg_list.set(arg_idx++, scalar_offset); + arg_list.set(arg_idx++, scalar_prob); + } else { + arg_list.set(arg_idx++, dropout_seed); + arg_list.set(arg_idx++, dropout_offset); + arg_list.set(arg_idx++, dropout_p); + } } const bool sround = !pd()->attr()->rounding_mode_.has_default_values(); if (sround) { arg_list.set(arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_ROUNDING_SEED)); } + append_post_ops_to_arg_list( ctx, arg_list, arg_idx, pd()->attr()->post_ops_, *pd()->dst_md()); diff --git a/src/gpu/intel/matmul/ref.hpp b/src/gpu/intel/matmul/ref.hpp index 2ef52b9b8a3..1b4a06350e7 100644 --- a/src/gpu/intel/matmul/ref.hpp +++ b/src/gpu/intel/matmul/ref.hpp @@ -115,22 +115,11 @@ struct ref_t : public primitive_t { VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_MATMUL(post_ops_with_binary_ok(attr(), *dst_md(), 6), VERBOSE_UNSUPPORTED_POSTOP); - VDISPATCH_MATMUL(IMPLICATION(!attr_.dropout_.has_default_values(), - attr_.dropout_.seed_dt_ == data_type::s32), - VERBOSE_UNSUPPORTED_DROPOUT); - const memory_desc_wrapper dropout_md(attr_.dropout_.dropout_desc_); - VDISPATCH_MATMUL( - IMPLICATION(!attr_.dropout_.has_default_values(), - dropout_md.similar_to(dst_md(), true, false)), - VERBOSE_INCONSISTENT_MDS, "dropout", "dst"); - VDISPATCH_MATMUL( - IMPLICATION(!attr_.dropout_.has_default_values(), - utils::one_of(dropout_md.data_type(), u8, s8)), - VERBOSE_UNSUPPORTED_DT); VDISPATCH_MATMUL( IMPLICATION(utils::one_of(f64, src_dt_, wei_dt_, dst_dt_), dev_info_->has_native(f64)), VERBOSE_UNSUPPORTED_DT); + CHECK(dropout_ok()); subbyte_pack_ = utils::one_of( dst_dt_, data_type::f4_e2m1, data_type::f4_e3m0); mx_scales_ = attr()->scales_.get(DNNL_ARG_DST).is_mx(); @@ -214,6 +203,24 @@ struct ref_t : public primitive_t { } return true; } + status_t dropout_ok() const { + if (attr_.dropout_.has_default_values()) return status::success; + + assert(memory_desc_wrapper(dst_md(0)).format_kind() + == format_kind::blocked); + + using namespace format_tag; + // Note: for `offset = 0` keep the legacy logic without the `offset`. + VDISPATCH_MATMUL_IC(memory_desc_matches_one_of_tag( + *dst_md(0), ncdhw, nchw, ncw, nc) + && IMPLICATION(attr_.dropout_.has_output_mask(), + memory_desc_wrapper(dst_md(0)).similar_to( + attr_.dropout_.dropout_desc_, true, + false)), + VERBOSE_UNSUPPORTED_DROPOUT); + + return status::success; + } bool precomputed_reductions_ok() const { const auto &pr = attr()->precomputed_reductions_; if (pr.has_default_values(DNNL_ARG_SRC)) return true; @@ -245,6 +252,11 @@ struct ref_t : public primitive_t { kernel_ctx.define_int("WITH_BIAS", pd()->with_bias()); kernel_ctx.define_int( "WITH_DROPOUT", !pd()->attr()->dropout_.has_default_values()); + kernel_ctx.define_int( + "USE_HOST_SCALARS", pd()->attr()->dropout_.use_host_scalars_); + kernel_ctx.define_int("USE_OFFSET", pd()->attr()->dropout_.use_offset_); + kernel_ctx.define_int( + "HAS_OUTPUT_MASK", pd()->attr()->dropout_.has_output_mask()); kernel_ctx.define_int("NON_DEFAULT_ATTRS", pd()->non_default_attrs_); auto dst_rnd_mode = pd()->attr()->rounding_mode_.get(DNNL_ARG_DST); @@ -286,6 +298,7 @@ struct ref_t : public primitive_t { def_data_type(kernel_ctx, pd()->dst_dt_, "DST"); def_data_type(kernel_ctx, pd()->bia_dt_, "BIA"); data_type_t acc_type = pd()->desc()->accum_data_type; + data_type_t seed_type = pd()->attr()->dropout_.seed_dt_; switch (pd()->attr()->acc_mode_) { case accumulation_mode::strict: case accumulation_mode::relaxed: @@ -296,6 +309,7 @@ struct ref_t : public primitive_t { default: break; } def_data_type(kernel_ctx, acc_type, "ACC"); + def_data_type(kernel_ctx, seed_type, "SEED"); def_data_type(kernel_ctx, pd()->attr()->scales_.get_data_type(DNNL_ARG_WEIGHTS), "WEI_SCALES"); diff --git a/src/gpu/intel/primitive_conf.cpp b/src/gpu/intel/primitive_conf.cpp index 59b0df800ae..303db819fe3 100644 --- a/src/gpu/intel/primitive_conf.cpp +++ b/src/gpu/intel/primitive_conf.cpp @@ -280,6 +280,7 @@ const char *get_type_name(data_type_t dt, bool with_punning) { case data_type::s4: return with_punning ? "uchar" : "s4"; case data_type::u4: return with_punning ? "uchar" : "u4"; case data_type::s32: return "int"; + case data_type::s64: return "long"; default: gpu_error_not_expected() << "Unexpected data type " << dnnl_dt2str(dt); @@ -352,6 +353,10 @@ void def_data_type(compute::kernel_ctx_t &kernel_ctx, data_type_t dt, kernel_ctx.add_option( utils::format("-D%s_DATA_T=int -D%s_DT_S32", str, str)); break; + case data_type::s64: + kernel_ctx.add_option( + utils::format("-D%s_DATA_T=int -D%s_DT_S64", str, str)); + break; default: gpu_error_not_expected() << "Unexpected data type " << dnnl_dt2str(dt); diff --git a/tests/benchdnn/inputs/matmul/harness_matmul_dropout b/tests/benchdnn/inputs/matmul/harness_matmul_dropout index 5256d8b0c7c..f643df445a6 100644 --- a/tests/benchdnn/inputs/matmul/harness_matmul_dropout +++ b/tests/benchdnn/inputs/matmul/harness_matmul_dropout @@ -3,7 +3,7 @@ --attr-fpmath=,bf16 --check-ref-impl=false ---attr-dropout=0.5:12345678,0.75:12345678:undef,0.25:843921:any:1238976:true +--attr-dropout=0.5:12345678,0.75:12345678:undef,0.25:843921:any:1238976:true,0.75:111786:any:121716:false --stag=ab --dtag=ab --batch=shapes_2d diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp index 7f3fcb9d53d..ee141e13f80 100644 --- a/tests/benchdnn/matmul/matmul.cpp +++ b/tests/benchdnn/matmul/matmul.cpp @@ -697,15 +697,6 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { return; } } - - if (!prb->attr.dropout.is_def()) { - BENCHDNN_PRINT(2, - "[SKIP][%s:%d]: Dropout with s64 seed isn't supported.\n", - __FILE__, __LINE__); - res->state = SKIPPED; - res->reason = skip_reason::case_not_supported; - return; - } } }