From 70edc8abc52079cbc90b1c1877107bb08a2c5375 Mon Sep 17 00:00:00 2001
From: Raahul Kalyaan Jakka
Date: Mon, 27 Oct 2025 13:12:08 -0700
Subject: [PATCH 1/2] Changing Backend Tensor initialization (#5055)

Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/2065

**Context:**
Currently, RocksDB stores data in a row-wise format. To enable optimizer
offloading for the kernel, we append the optimizer state to its corresponding
row. During initialization, the weights need to be randomly initialized, while
the optimizer values need to be initialized to zero when optimizer offloading
is enabled.

**In this diff:**
We add two new arguments:
1. enable_optimizer_offloading: This flag toggles whether the last optimizer_D
   columns are initialized to zero.
2. optimizer_D: The number of columns in the table that need to be initialized
   to zero. This set of columns represents the optimizer values (with or
   without padding).

**Scenarios:**
1. Optimizer offloading is False:
   max_D = dimensions of the weights only, optimizer_D = 0
2. Optimizer offloading is True:
   max_D = dimensions of the weights (w_D) + optimizers (o_D),
   optimizer_D = dimensions of the optimizers (o_D);
   the o_D columns are initialized to zero

Differential Revision: D85157732
---
 .../ssd_table_batched_embeddings.h | 34 +++++++++++++------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
index 47e1d29893..c440eec69b 100644
--- a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
+++ b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
@@ -121,7 +121,9 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       std::optional table_dims = std::nullopt,
       std::optional hash_size_cumsum = std::nullopt,
       int64_t flushing_block_size = 2000000000 /*2GB*/,
-      bool disable_random_init = false)
+      bool disable_random_init = false,
+      bool enable_optimizer_offloading = false,
+      int64_t optimizer_D = 0)
       : kv_db::EmbeddingKVDB(
             num_shards,
             max_D,
@@ -266,7 +268,9 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
         uniform_init_lower,
         uniform_init_upper,
         row_storage_bitwidth,
-        disable_random_init);
+        disable_random_init,
+        enable_optimizer_offloading,
+        optimizer_D);
     executor_ = std::make_unique(num_shards);
     ro_.verify_checksums = false;
     ro_.async_io = true;
@@ -421,19 +425,29 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       float uniform_init_lower,
       float uniform_init_upper,
       int64_t row_storage_bitwidth,
-      bool disable_random_init) {
+      bool disable_random_init,
+      bool enable_optimizer_offloading = false,
+      int64_t optimizer_D = 0) {
     for (auto i = 0; i < num_shards; ++i) {
       auto* gen = at::check_generator(
           at::detail::getDefaultCPUGenerator());
       {
         std::lock_guard lock(gen->mutex_);
-        initializers_.push_back(
-            std::make_unique(
-                gen->random64(),
-                max_D,
-                uniform_init_lower,
-                uniform_init_upper,
-                row_storage_bitwidth));
+        auto initializer = std::make_unique(
+            gen->random64(),
+            max_D,
+            uniform_init_lower,
+            uniform_init_upper,
+            row_storage_bitwidth);
+
+        // When optimizer offloading is enabled, we want to initialize the last
+        // optimizer_D columns (optimizer values) to zero
+        if (enable_optimizer_offloading) {
+          auto& tensor = initializer->row_storage_;
+          tensor.index({"...", at::indexing::Slice(max_D - optimizer_D, max_D)})
+              .zero_();
+        }
+        initializers_.push_back(std::move(initializer));
       }
     }
     disable_random_init_ = disable_random_init;
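(Illustrative note, not part of the patch series.) The zero-initialization added in PATCH 1/2 amounts to slicing the trailing optimizer_D columns of the initializer's row-storage tensor and zeroing them in place. Below is a minimal standalone sketch of that slicing, assuming libtorch/ATen is available and using a local `row_storage` tensor as a stand-in for `initializer->row_storage_`:

```cpp
#include <ATen/ATen.h>
#include <ATen/TensorIndexing.h>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t num_rows = 4;
  const int64_t max_D = 8;        // weight columns + optimizer columns
  const int64_t optimizer_D = 2;  // trailing columns holding optimizer state

  // Stand-in for initializer->row_storage_: randomly initialized rows.
  at::Tensor row_storage = at::rand({num_rows, max_D});

  // Zero the last optimizer_D columns, mirroring the patch's
  // tensor.index({"...", Slice(max_D - optimizer_D, max_D)}).zero_().
  row_storage
      .index({"...", at::indexing::Slice(max_D - optimizer_D, max_D)})
      .zero_();

  std::cout << row_storage << std::endl;
  return 0;
}
```

Each row keeps its randomly initialized weight columns [0, max_D - optimizer_D) and gets zeroed optimizer columns [max_D - optimizer_D, max_D), matching Scenario 2 in the summary above.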
From 58631d9a0db920dbcae8d57a017e9bfd31f99c3b Mon Sep 17 00:00:00 2001
From: Raahul Kalyaan Jakka
Date: Mon, 27 Oct 2025 13:12:08 -0700
Subject: [PATCH 2/2] Changing Backend Tensor initialization (#5056)

Summary:
X-link: https://github.com/meta-pytorch/torchrec/pull/3484

X-link: https://github.com/facebookresearch/FBGEMM/pull/2066

**Context:**
We are enabling SSD optimizer offloading for the SSD TBE kernel.

**In this diff:**
We retrieve the newly added parameters from the TBE config and pass them down
to the TBE.

Differential Revision: D85353134
---
 fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py  |  1 +
 .../embedding_rocksdb_wrapper.h            |  8 ++++++--
 .../ssd_split_table_batched_embeddings.cpp |  6 +++++-
 .../ssd_table_batched_embeddings.h         | 17 +++++++++++------
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
index a497cf9a5b..031d0eec56 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -179,6 +179,7 @@ def __init__(
         table_names: Optional[list[str]] = None,
         use_rowwise_bias_correction: bool = False,  # For Adam use
         optimizer_state_dtypes: dict[str, SparseType] = {},  # noqa: B006
+        enable_optimizer_offloading: bool = False,
     ) -> None:
         super(SSDTableBatchedEmbeddingBags, self).__init__()

diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h
index 8cebdef1eb..82ce53445d 100644
--- a/fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h
+++ b/fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h
@@ -45,7 +45,9 @@ class EmbeddingRocksDBWrapper : public torch::jit::CustomClassHolder {
       std::optional table_dims = std::nullopt,
       std::optional hash_size_cumsum = std::nullopt,
       int64_t flushing_block_size = 2000000000 /*2GB*/,
-      bool disable_random_init = false)
+      bool disable_random_init = false,
+      std::optional<bool> enable_optimizer_offloading = std::nullopt,
+      std::optional<int64_t> optimizer_D = std::nullopt)
       : impl_(
             std::make_shared(
                 path,
@@ -77,7 +79,9 @@ class EmbeddingRocksDBWrapper : public torch::jit::CustomClassHolder {
                 table_dims,
                 hash_size_cumsum,
                 flushing_block_size,
-                disable_random_init)) {}
+                disable_random_init,
+                enable_optimizer_offloading,
+                optimizer_D)) {}

   void set_cuda(
       at::Tensor indices,
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp
index 0b95285a8f..fab6ad9253 100644
--- a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp
+++ b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp
@@ -809,7 +809,9 @@ static auto embedding_rocks_db_wrapper =
             std::optional,
             std::optional,
             int64_t,
-            bool>(),
+            bool,
+            std::optional<bool>,
+            std::optional<int64_t>>(),
         "",
         {
             torch::arg("path"),
@@ -842,6 +844,8 @@ static auto embedding_rocks_db_wrapper =
             torch::arg("hash_size_cumsum") = std::nullopt,
             torch::arg("flushing_block_size") = 2000000000 /* 2GB */,
             torch::arg("disable_random_init") = false,
+            torch::arg("enable_optimizer_offloading") = std::nullopt,
+            torch::arg("optimizer_D") = std::nullopt,
         })
     .def(
         "set_cuda",
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
index c440eec69b..d95421dce3 100644
--- a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
+++ b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
@@ -122,8 +122,8 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       std::optional hash_size_cumsum = std::nullopt,
       int64_t flushing_block_size = 2000000000 /*2GB*/,
       bool disable_random_init = false,
-      bool enable_optimizer_offloading = false,
-      int64_t optimizer_D = 0)
+      std::optional<bool> enable_optimizer_offloading = std::nullopt,
+      std::optional<int64_t> optimizer_D = std::nullopt)
       : kv_db::EmbeddingKVDB(
             num_shards,
             max_D,
@@ -426,8 +426,8 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       float uniform_init_upper,
       int64_t row_storage_bitwidth,
       bool disable_random_init,
-      bool enable_optimizer_offloading = false,
-      int64_t optimizer_D = 0) {
+      std::optional<bool> enable_optimizer_offloading = std::nullopt,
+      std::optional<int64_t> optimizer_D = std::nullopt) {
     for (auto i = 0; i < num_shards; ++i) {
       auto* gen = at::check_generator(
           at::detail::getDefaultCPUGenerator());
@@ -442,9 +442,13 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {

         // When optimizer offloading is enabled, we want to initialize the last
         // optimizer_D columns (optimizer values) to zero
-        if (enable_optimizer_offloading) {
+        if (enable_optimizer_offloading.has_value() &&
+            enable_optimizer_offloading.value() && optimizer_D.has_value()) {
           auto& tensor = initializer->row_storage_;
-          tensor.index({"...", at::indexing::Slice(max_D - optimizer_D, max_D)})
+          tensor
+              .index(
+                  {"...",
+                   at::indexing::Slice(max_D - optimizer_D.value(), max_D)})
              .zero_();
         }
         initializers_.push_back(std::move(initializer));
@@ -1378,6 +1382,7 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
   std::vector db_paths_;

   bool disable_random_init_;
+  std::optional<bool> enable_optimizer_offloading = std::nullopt;
 }; // class EmbeddingRocksDB

 /// @ingroup embedding-ssd
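(Illustrative note, not part of the patch series.) PATCH 2/2 switches the new arguments to std::optional so that callers that do not pass them leave the existing initialization behavior untouched; the zeroing only runs when enable_optimizer_offloading is both provided and true and optimizer_D is provided. Below is a sketch of that guard, assuming libtorch/ATen; the free function `zero_optimizer_columns` is hypothetical and only mirrors the guard and slicing in the patched initializer loop:

```cpp
#include <ATen/ATen.h>
#include <ATen/TensorIndexing.h>
#include <cstdint>
#include <optional>

// Hypothetical helper: mirrors the std::optional guard and column slicing
// from the patched initializer loop; not an actual FBGEMM API.
void zero_optimizer_columns(
    at::Tensor& row_storage,
    int64_t max_D,
    std::optional<bool> enable_optimizer_offloading,
    std::optional<int64_t> optimizer_D) {
  // Only zero the trailing columns when offloading is explicitly enabled
  // and the optimizer width is provided.
  if (enable_optimizer_offloading.has_value() &&
      enable_optimizer_offloading.value() && optimizer_D.has_value()) {
    row_storage
        .index(
            {"...",
             at::indexing::Slice(max_D - optimizer_D.value(), max_D)})
        .zero_();
  }
}

int main() {
  at::Tensor row_storage = at::rand({4, 8});

  // std::nullopt (or false) leaves the random initialization untouched;
  // true plus a width zeroes the trailing optimizer columns.
  zero_optimizer_columns(row_storage, /*max_D=*/8, true, /*optimizer_D=*/2);
  return 0;
}
```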