diff --git a/src/gpu/intel/compute/kernel_arg_list.hpp b/src/gpu/intel/compute/kernel_arg_list.hpp
index 5064aae3e5a..04fecf6d778 100644
--- a/src/gpu/intel/compute/kernel_arg_list.hpp
+++ b/src/gpu/intel/compute/kernel_arg_list.hpp
@@ -249,6 +249,7 @@ class kernel_arg_list_t {
             APPEND_STORED_SCALAR_VALUE(bf16, bfloat16_t)
             APPEND_STORED_SCALAR_VALUE(f32, float)
             APPEND_STORED_SCALAR_VALUE(s32, int32_t)
+            APPEND_STORED_SCALAR_VALUE(s64, int64_t)
             APPEND_STORED_SCALAR_VALUE(s8, int8_t)
             APPEND_STORED_SCALAR_VALUE(u8, uint8_t)
             default:
@@ -300,6 +301,7 @@ class kernel_arg_list_t {
             SET_STORED_SCALAR_VALUE(bf16, bfloat16_t)
             SET_STORED_SCALAR_VALUE(f32, float)
             SET_STORED_SCALAR_VALUE(s32, int32_t)
+            SET_STORED_SCALAR_VALUE(s64, int64_t)
             SET_STORED_SCALAR_VALUE(s8, int8_t)
             SET_STORED_SCALAR_VALUE(u8, uint8_t)
             default:
diff --git a/src/gpu/intel/include/philox.h b/src/gpu/intel/include/philox.h
index 103de96e158..2926a114923 100644
--- a/src/gpu/intel/include/philox.h
+++ b/src/gpu/intel/include/philox.h
@@ -20,30 +20,46 @@
 #define DT_UNDEF 1
 #include "gpu/intel/include/types.h"
 
-uint philox_4x32(long idx, uint seed) {
+uint philox_4x32_s64(ulong idx, ulong seed, ulong offset) {
 #define PHILOX_4UINT_ROUND(mul, ctr, key) \
-    as_uint4(convert_ulong2(ctr.s31) * mul) ^ (uint4)(ctr.s20 ^ key, 0, 0).s3120
+    as_uint4(convert_ulong2(ctr.s02) * mul).s3210 \
+            ^ (uint4)(ctr.s1 ^ key.s0, 0, ctr.s3 ^ key.s1, 0)
 
-    uint4 ctr = 0;
-    const ulong2 ctr_mul = (ulong2)(0xD2511F53uL, 0xCD9E8D57uL);
-    const ulong key_add = as_ulong((uint2)(0x9E3779B9u, 0xBB67AE85u));
-    const uint16 key0 = (uint16)(seed)
-            + as_uint16((ulong8)(key_add))
+    ulong x = (idx & ~3L);
+    uint4 ctr = (uint4)((uint)offset, (uint)(offset >> 32), (uint)x,
+            (uint)(x >> 32));
+    uint seed_lo = (uint)seed;
+    uint seed_hi = (uint)(seed >> 32);
+    const ulong seeds = as_ulong((uint2)(seed_lo, seed_hi));
+
+    const ulong2 PHILOX_M4x32 = (ulong2)(0xD2511F53uL, 0xCD9E8D57uL);
+    const ulong PHILOX_W4x32 = as_ulong((uint2)(0x9E3779B9u, 0xBB67AE85u));
+    const uint16 key0 = as_uint16((ulong8)(seeds))
+            + as_uint16((ulong8)(PHILOX_W4x32))
                     * (uint16)(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
-    const uint4 key1
-            = (uint4)(seed) + as_uint4((ulong2)(key_add)) * (uint4)(8, 8, 9, 9);
-    ctr = (uint4)(idx & ~3L) + (uint4)(3, 2, 1, 0);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s01);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s23);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s45);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s67);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.s89);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sAB);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sCD);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key0.sEF);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key1.s01);
-    ctr = PHILOX_4UINT_ROUND(ctr_mul, ctr, key1.s23);
-    return ctr[~idx & 3L];
+    const uint4 key1 = as_uint4((ulong2)seeds)
+            + as_uint4((ulong2)(PHILOX_W4x32)) * (uint4)(8, 8, 9, 9);
+
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s01);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s23);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s45);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s67);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.s89);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sAB);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sCD);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key0.sEF);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key1.s01);
+    ctr = PHILOX_4UINT_ROUND(PHILOX_M4x32, ctr, key1.s23);
+    return ctr[idx & 3L];
+}
+
+uint philox_4x32(uint idx, uint seed) {
+    // Note: this is for compatibility with impls that don't support s64 rand
+    ulong x = idx & ~3L;
+    ulong idx_64 = ((x + 3) << 32) + (x + 2);
+    ulong offset_64 = ((x + 1) << 32) + x;
+    ulong seed_64 = ((ulong)(seed) << 32) + seed;
+    return philox_4x32_s64(idx_64, seed_64, offset_64);
 }
 
 ushort philox_8x16(long idx, uint seed) {
diff --git a/src/gpu/intel/matmul/ref.cl b/src/gpu/intel/matmul/ref.cl
index 4e016a2eed9..863730e4690 100644
--- a/src/gpu/intel/matmul/ref.cl
+++ b/src/gpu/intel/matmul/ref.cl
@@ -88,8 +88,13 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B,
         long c_stride_m, long c_stride_n
 #if WITH_DROPOUT
         ,
-        __global uchar *dropout_mask_buf, __global uint *dropout_seed_buf,
-        __global float *dropout_p_buf
+        __global uchar *dropout_mask_buf,
+#if USE_HOST_SCALARS
+        long dropout_seed, long dropout_offset, float dropout_p
+#else
+        __global SEED_DATA_T *dropout_seed_buf,
+        __global long *dropout_offset_buf, __global float *dropout_p_buf
+#endif
 #endif
 #if WITH_SROUND
         ,
@@ -117,10 +122,13 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B,
 #endif
 
 #if WITH_DROPOUT
-    uint dropout_seed = dropout_seed_buf[0];
-    uint dropout_threshold = get_dropout_threshold(dropout_p_buf[0]);
-    float dropout_inv_q
-            = (dropout_p_buf[0] != 1.f) ? 1.f / (1.f - dropout_p_buf[0]) : 0.f;
+#if !USE_HOST_SCALARS
+    SEED_DATA_T dropout_seed = dropout_seed_buf[0];
+    long dropout_offset = USE_OFFSET ? dropout_offset_buf[0] : 0;
+    float dropout_p = dropout_p_buf[0];
+#endif
+    uint dropout_threshold = get_dropout_threshold(dropout_p);
+    float dropout_inv_q = (dropout_p != 1.f) ? 1.f / (1.f - dropout_p) : 0.f;
 #endif
 #if WITH_SROUND
     uint sround_seed = sround_seed_buf[0];
@@ -286,11 +294,18 @@ __kernel void ref_matmul(__global SRC_DATA_T *A, __global WEI_DATA_T *B,
         float po_acc = convert_float(temp);
 
 #if WITH_DROPOUT
-        uint res = philox_4x32(dst_off, dropout_seed);
+#if USE_OFFSET
+        uint res = philox_4x32_s64(
+                dst_off, (ulong)dropout_seed, (ulong)dropout_offset);
+#else
+        uint res = philox_4x32((uint)dst_off, (uint)dropout_seed);
+#endif
         uchar dropout = res > dropout_threshold;
         po_acc = (dropout) ? po_acc * dropout_inv_q : 0;
+#if HAS_OUTPUT_MASK
         dropout_mask_buf[dst_off] = dropout;
 #endif
+#endif
 
 #if WITH_SROUND
         po_acc = stochastic_round_fwd(po_acc, dst_off, sround_seed);
diff --git a/src/gpu/intel/matmul/ref.cpp b/src/gpu/intel/matmul/ref.cpp
index 4b1ee6e03e4..cd537afd7ce 100644
--- a/src/gpu/intel/matmul/ref.cpp
+++ b/src/gpu/intel/matmul/ref.cpp
@@ -328,16 +328,52 @@ status_t ref_t::execute_ref(const exec_ctx_t &ctx) const {
 
     const bool dropout = !pd()->attr()->dropout_.has_default_values();
     if (dropout) {
+        const bool use_host_scalars = pd()->attr()->dropout_.use_host_scalars_;
+        const bool use_offset = pd()->attr()->dropout_.use_offset_;
+
+        const auto &dropout_p
+                = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_PROBABILITY);
+        const auto &dropout_seed = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_SEED);
+        const auto &dropout_offset
+                = CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_OFFSET);
+
         arg_list.set(arg_idx++, CTX_OUT_STORAGE(DNNL_ARG_ATTR_DROPOUT_MASK));
-        arg_list.set(arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_SEED));
-        arg_list.set(
-                arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_DROPOUT_PROBABILITY));
+        if (use_host_scalars) {
+            int64_t scalar_seed = 0;
+            int64_t scalar_offset = 0;
+            float scalar_prob = 0.f;
+            const host_scalar_memory_storage_t *seed_storage
+                    = utils::downcast<const host_scalar_memory_storage_t *>(
+                            &dropout_seed);
+            CHECK(seed_storage->get_scalar_value(
+                    &scalar_seed, sizeof(scalar_seed)));
+            if (use_offset) {
+                const host_scalar_memory_storage_t *offset_storage
+                        = utils::downcast<const host_scalar_memory_storage_t *>(
+                                &dropout_offset);
+                CHECK(offset_storage->get_scalar_value(
+                        &scalar_offset, sizeof(scalar_offset)));
+            }
+            const host_scalar_memory_storage_t *prob_storage
+                    = utils::downcast<const host_scalar_memory_storage_t *>(
+                            &dropout_p);
+            CHECK(prob_storage->get_scalar_value(
+                    &scalar_prob, sizeof(scalar_prob)));
+            arg_list.set(arg_idx++, scalar_seed);
+            arg_list.set(arg_idx++, scalar_offset);
+            arg_list.set(arg_idx++, scalar_prob);
+        } else {
+            arg_list.set(arg_idx++, dropout_seed);
+            arg_list.set(arg_idx++, dropout_offset);
+            arg_list.set(arg_idx++, dropout_p);
+        }
     }
 
     const bool sround = !pd()->attr()->rounding_mode_.has_default_values();
     if (sround) {
         arg_list.set(arg_idx++, CTX_IN_STORAGE(DNNL_ARG_ATTR_ROUNDING_SEED));
     }
+
     append_post_ops_to_arg_list(
             ctx, arg_list, arg_idx, pd()->attr()->post_ops_, *pd()->dst_md());
 
diff --git a/src/gpu/intel/matmul/ref.hpp b/src/gpu/intel/matmul/ref.hpp
index 2ef52b9b8a3..1b4a06350e7 100644
--- a/src/gpu/intel/matmul/ref.hpp
+++ b/src/gpu/intel/matmul/ref.hpp
@@ -115,22 +115,11 @@ struct ref_t : public primitive_t {
                     VERBOSE_UNSUPPORTED_POSTOP);
             VDISPATCH_MATMUL(post_ops_with_binary_ok(attr(), *dst_md(), 6),
                     VERBOSE_UNSUPPORTED_POSTOP);
-            VDISPATCH_MATMUL(IMPLICATION(!attr_.dropout_.has_default_values(),
-                                     attr_.dropout_.seed_dt_ == data_type::s32),
-                    VERBOSE_UNSUPPORTED_DROPOUT);
-            const memory_desc_wrapper dropout_md(attr_.dropout_.dropout_desc_);
-            VDISPATCH_MATMUL(
-                    IMPLICATION(!attr_.dropout_.has_default_values(),
-                            dropout_md.similar_to(dst_md(), true, false)),
-                    VERBOSE_INCONSISTENT_MDS, "dropout", "dst");
-            VDISPATCH_MATMUL(
-                    IMPLICATION(!attr_.dropout_.has_default_values(),
-                            utils::one_of(dropout_md.data_type(), u8, s8)),
-                    VERBOSE_UNSUPPORTED_DT);
             VDISPATCH_MATMUL(
                     IMPLICATION(utils::one_of(f64, src_dt_, wei_dt_, dst_dt_),
                             dev_info_->has_native(f64)),
                     VERBOSE_UNSUPPORTED_DT);
+            CHECK(dropout_ok());
             subbyte_pack_ = utils::one_of(
                     dst_dt_, data_type::f4_e2m1, data_type::f4_e3m0);
             mx_scales_ = attr()->scales_.get(DNNL_ARG_DST).is_mx();
@@ -214,6 +203,24 @@ struct ref_t : public primitive_t {
             }
             return true;
         }
+        status_t dropout_ok() const {
+            if (attr_.dropout_.has_default_values()) return status::success;
+
+            assert(memory_desc_wrapper(dst_md(0)).format_kind()
+                    == format_kind::blocked);
+
+            using namespace format_tag;
+            // Note: for `offset = 0` keep the legacy logic without the `offset`.
+            VDISPATCH_MATMUL_IC(memory_desc_matches_one_of_tag(
+                                        *dst_md(0), ncdhw, nchw, ncw, nc)
+                            && IMPLICATION(attr_.dropout_.has_output_mask(),
+                                    memory_desc_wrapper(dst_md(0)).similar_to(
+                                            attr_.dropout_.dropout_desc_, true,
+                                            false)),
+                    VERBOSE_UNSUPPORTED_DROPOUT);
+
+            return status::success;
+        }
         bool precomputed_reductions_ok() const {
             const auto &pr = attr()->precomputed_reductions_;
             if (pr.has_default_values(DNNL_ARG_SRC)) return true;
@@ -245,6 +252,11 @@ struct ref_t : public primitive_t {
         kernel_ctx.define_int("WITH_BIAS", pd()->with_bias());
         kernel_ctx.define_int(
                 "WITH_DROPOUT", !pd()->attr()->dropout_.has_default_values());
+        kernel_ctx.define_int(
+                "USE_HOST_SCALARS", pd()->attr()->dropout_.use_host_scalars_);
+        kernel_ctx.define_int("USE_OFFSET", pd()->attr()->dropout_.use_offset_);
+        kernel_ctx.define_int(
+                "HAS_OUTPUT_MASK", pd()->attr()->dropout_.has_output_mask());
         kernel_ctx.define_int("NON_DEFAULT_ATTRS", pd()->non_default_attrs_);
 
         auto dst_rnd_mode = pd()->attr()->rounding_mode_.get(DNNL_ARG_DST);
@@ -286,6 +298,7 @@ struct ref_t : public primitive_t {
         def_data_type(kernel_ctx, pd()->dst_dt_, "DST");
         def_data_type(kernel_ctx, pd()->bia_dt_, "BIA");
         data_type_t acc_type = pd()->desc()->accum_data_type;
+        data_type_t seed_type = pd()->attr()->dropout_.seed_dt_;
         switch (pd()->attr()->acc_mode_) {
             case accumulation_mode::strict:
             case accumulation_mode::relaxed:
@@ -296,6 +309,7 @@ struct ref_t : public primitive_t {
             default: break;
         }
         def_data_type(kernel_ctx, acc_type, "ACC");
+        def_data_type(kernel_ctx, seed_type, "SEED");
         def_data_type(kernel_ctx,
                 pd()->attr()->scales_.get_data_type(DNNL_ARG_WEIGHTS),
                 "WEI_SCALES");
diff --git a/src/gpu/intel/primitive_conf.cpp b/src/gpu/intel/primitive_conf.cpp
index 59b0df800ae..303db819fe3 100644
--- a/src/gpu/intel/primitive_conf.cpp
+++ b/src/gpu/intel/primitive_conf.cpp
@@ -280,6 +280,7 @@ const char *get_type_name(data_type_t dt, bool with_punning) {
         case data_type::s4: return with_punning ? "uchar" : "s4";
         case data_type::u4: return with_punning ? "uchar" : "u4";
         case data_type::s32: return "int";
+        case data_type::s64: return "long";
         default:
             gpu_error_not_expected()
                     << "Unexpected data type " << dnnl_dt2str(dt);
@@ -352,6 +353,10 @@ void def_data_type(compute::kernel_ctx_t &kernel_ctx, data_type_t dt,
             kernel_ctx.add_option(
                     utils::format("-D%s_DATA_T=int -D%s_DT_S32", str, str));
             break;
+        case data_type::s64:
+            kernel_ctx.add_option(
+                    utils::format("-D%s_DATA_T=int -D%s_DT_S64", str, str));
+            break;
         default:
             gpu_error_not_expected()
                     << "Unexpected data type " << dnnl_dt2str(dt);
diff --git a/tests/benchdnn/inputs/matmul/harness_matmul_dropout b/tests/benchdnn/inputs/matmul/harness_matmul_dropout
index 5256d8b0c7c..f643df445a6 100644
--- a/tests/benchdnn/inputs/matmul/harness_matmul_dropout
+++ b/tests/benchdnn/inputs/matmul/harness_matmul_dropout
@@ -3,7 +3,7 @@
 --attr-fpmath=,bf16
 --check-ref-impl=false
 
---attr-dropout=0.5:12345678,0.75:12345678:undef,0.25:843921:any:1238976:true
+--attr-dropout=0.5:12345678,0.75:12345678:undef,0.25:843921:any:1238976:true,0.75:111786:any:121716:false
 
 --stag=ab --dtag=ab
 --batch=shapes_2d
diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp
index 7f3fcb9d53d..ee141e13f80 100644
--- a/tests/benchdnn/matmul/matmul.cpp
+++ b/tests/benchdnn/matmul/matmul.cpp
@@ -697,15 +697,6 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) {
                 return;
             }
         }
-
-        if (!prb->attr.dropout.is_def()) {
-            BENCHDNN_PRINT(2,
-                    "[SKIP][%s:%d]: Dropout with s64 seed isn't supported.\n",
-                    __FILE__, __LINE__);
-            res->state = SKIPPED;
-            res->reason = skip_reason::case_not_supported;
-            return;
-        }
     }
 }