Merged
36 commits
a8f55ba
Move pipeline to commons
JacoCheung Nov 27, 2025
0250e5b
Move jagged concat ops and embedding to common
JacoCheung Dec 2, 2025
64ce0e5
Move ShardedEmbeddingConfig to commons
JacoCheung Dec 4, 2025
4b0cfca
Add sid gr model definition
JacoCheung Dec 4, 2025
5541234
Move distributed to commons
JacoCheung Dec 5, 2025
bbc4429
Move triton_ops.common to commons
JacoCheung Dec 8, 2025
c86417d
Add runnable GPTSID GR model
JacoCheung Dec 8, 2025
97a4b94
Add training pipeline and random dataset trainable
JacoCheung Dec 8, 2025
265d5a2
Restore mypy check
JacoCheung Dec 9, 2025
a46a445
Add disk dataset/dataloader and its utest
JacoCheung Dec 9, 2025
acea303
Separate history_seqlen and candidate_seqlen
JacoCheung Dec 9, 2025
7c54cd1
Fix the dataset and emb args feat name mismatch
JacoCheung Dec 10, 2025
2bb9090
Sort samples by userid for debugging
JacoCheung Dec 10, 2025
4e4118e
Enable arbitrary mask with local attention impl
JacoCheung Dec 12, 2025
1f814ea
Add beam search functionality
JacoCheung Dec 16, 2025
a7a0316
Add beam search individual module and eval metric test
JacoCheung Dec 18, 2025
05a498b
Add beam history sids check and eval metrics to gptmodel
JacoCheung Dec 18, 2025
6a05e79
Support dynamic beam and handle case when topk > num_candidates
JacoCheung Dec 18, 2025
580c37f
Fix bos split bug and add more hist info in BeamSearch
JacoCheung Dec 19, 2025
070e938
Fix mask def for mcore and mask construction error, make model overfi…
JacoCheung Dec 25, 2025
260e986
Add RMSNorm
JacoCheung Dec 30, 2025
b9ffa87
Fix attention mask definition and enable loss on history
JacoCheung Jan 3, 2026
59ff4c7
Add incomplete batch dataset test
JacoCheung Jan 4, 2026
d4a93e4
Fix incomplete eval batch
JacoCheung Jan 4, 2026
240644d
Fix generation mask
JacoCheung Jan 5, 2026
7b80791
Fix config for debugging eval
JacoCheung Jan 6, 2026
8a9e3a3
Enable single shared lm_head or individual lm_head across hierarchies
JacoCheung Jan 7, 2026
1af0f04
Add license header
JacoCheung Jan 7, 2026
482b277
Add sid_gr README
JacoCheung Jan 8, 2026
caa9ed9
Fix all utests of sid gr and update sid_amazn config
JacoCheung Jan 8, 2026
7ceeba1
Adjust the img size of sid ReadMe
JacoCheung Jan 8, 2026
6d55440
Lessen the max_train_iters
JacoCheung Jan 8, 2026
eae52ab
Rename data -> datasets
JacoCheung Jan 9, 2026
01c39f7
Rename hstu/dataset -> hstu/datasets and fix commons import error in …
JacoCheung Jan 9, 2026
74b1a0c
Restore HKV commit
JacoCheung Jan 9, 2026
6007d51
Remove sid/ops and move training link to head of README
JacoCheung Jan 9, 2026
2 changes: 2 additions & 0 deletions README.md
@@ -5,6 +5,7 @@ NVIDIA RecSys Examples is a collection of optimized recommender models and compo

The project includes:
- Examples for large-scale HSTU ranking and retrieval models through [TorchRec](https://github.com/pytorch/torchrec) and [Megatron-Core](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) integration
- Examples for semantic-id based retrieval model through [TorchRec](https://github.com/pytorch/torchrec) and [Megatron-Core](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) integration
- HSTU (Hierarchical Sequential Transduction Unit) attention operator support
- Dynamic Embeddings with GPU acceleration

@@ -47,6 +48,7 @@ For more detailed release notes, please refer our [releases](https://github.com/
# Get Started
The examples we supported:
- [HSTU recommender examples](./examples/hstu/README.md)
- [SID based generative recommender examples](./examples/sid_gr/README.md)

# Contribution Guidelines
Please see our [contributing guidelines](./CONTRIBUTING.md) for details on how to contribute to this project.
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -79,5 +79,5 @@ RUN cd /workspace/recsys-examples/corelib/hstu && \
cd hopper && \
HSTU_DISABLE_ARBITRARY=TRUE HSTU_DISABLE_SM8x=TRUE HSTU_DISABLE_LOCAL=TRUE HSTU_DISABLE_RAB=TRUE HSTU_DISABLE_DELTA_Q=FALSE HSTU_DISABLE_DRAB=TRUE pip install .

RUN cd /workspace/recsys-examples/examples/hstu && \
RUN cd /workspace/recsys-examples/examples/commons && \
TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0" python3 setup.py install
@@ -1,16 +1,15 @@
from typing import Dict, Optional, Union
from typing import Any, Dict, Optional, Union

import torch
import torch.distributed as dist
from dataset.utils import RankingBatch, RetrievalBatch
from megatron.core import parallel_state
from ops.collective_ops import (
from commons.ops.collective_ops import (
gather_along_first_dim,
gatherv_along_first_dim,
jagged_tensor_allgather,
keyed_jagged_tensor_allgather,
)
from ops.grad_scaling import grad_scaling
from commons.ops.grad_scaling import grad_scaling
from megatron.core import parallel_state
from torchrec.sparse.jagged_tensor import JaggedTensor


@@ -35,9 +34,7 @@ def jt_dict_grad_scaling_and_allgather(


# The features field is a KJT, the input to the embedding module.
def dmp_batch_to_tp(
batch: Union[RetrievalBatch, RankingBatch], exclude_features: bool = True
) -> Union[RetrievalBatch, RankingBatch]:
def dmp_batch_to_tp(batch: Any, exclude_features: bool = True) -> Any:
tp_pg = parallel_state.get_tensor_model_parallel_group()
tp_size = dist.get_world_size(group=tp_pg)
batch_cls = type(batch)
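Judging from the imports above, dmp_batch_to_tp replicates each data-parallel batch across the tensor-parallel group by all-gathering its tensors along the first dimension (the jagged and keyed-jagged variants handle ragged inputs). As a rough illustration only, not the commons.ops implementation, a dense first-dim all-gather in plain torch.distributed could look like the sketch below; the helper name is hypothetical.

import torch
import torch.distributed as dist


def allgather_along_first_dim(t: torch.Tensor, group: dist.ProcessGroup) -> torch.Tensor:
    # Every rank contributes its local tensor and receives the concatenation
    # of all ranks' tensors along dim 0. Equal per-rank shapes are assumed;
    # a "gatherv"-style variant would be needed for unequal shapes.
    world_size = dist.get_world_size(group=group)
    chunks = [torch.empty_like(t) for _ in range(world_size)]
    dist.all_gather(chunks, t.contiguous(), group=group)
    return torch.cat(chunks, dim=0)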
@@ -19,10 +19,11 @@
import torch
import torch.distributed as dist
import torchrec
from configs.task_config import OptimizerParam

# import our own finalize model grads
from distributed.finalize_model_grads import finalize_model_grads
from commons.distributed.finalize_model_grads import finalize_model_grads
from commons.modules.embedding import DataParallelEmbeddingCollection
from commons.optimizer import OptimizerParam
from dynamicemb import DynamicEmbTableOptions
from dynamicemb.get_planner import get_planner
from dynamicemb.planner import (
@@ -40,7 +41,6 @@
from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
from megatron.core.transformer import TransformerConfig
from megatron.core.transformer.module import Float16Module
from modules.embedding import DataParallelEmbeddingCollection
from torch import distributed as dist
from torch.distributed.optim import (
_apply_optimizer_in_backward as apply_optimizer_in_backward,
Empty file.
@@ -14,6 +14,7 @@
# limitations under the License.
import copy
import os
from dataclasses import dataclass

# pyre-strict
from typing import Any, Dict, Iterator, List, Optional, Tuple
@@ -23,7 +24,6 @@
import torch.fx
import torch.nn as nn
from commons.utils.nvtx_op import output_nvtx_hook, register_setter_and_getter_for_nvtx
from configs.task_config import ShardedEmbeddingConfig
from dynamicemb.planner import (
DynamicEmbeddingShardingPlanner as DynamicEmbeddingShardingPlanner,
)
@@ -52,6 +52,40 @@
from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor


@dataclass
class ShardedEmbeddingConfig:
"""
Configuration for sharded embeddings with sharding type. Inherits from BaseShardedEmbeddingConfig.

Args:
config (EmbeddingConfig): The embedding configuration.
sharding_type (str): The type of sharding, ``'data_parallel'`` | ``'model_parallel'``.
"""

"""
Base configuration for sharded embeddings.

Args:
feature_names (List[str]): The name of the features in this embedding.
table_name (str): The name of the table.
vocab_size (int): The size of the vocabulary.
dim (int): The dimension size of the embeddings.
sharding_type (str): The type of sharding, ``'data_parallel'`` | ``'model_parallel'``.
"""

feature_names: List[str]
table_name: str
vocab_size: int
dim: int
sharding_type: str

def __post_init__(self):
assert self.sharding_type in [
"data_parallel",
"model_parallel",
], "sharding type should be data_parallel or model_parallel"


def create_data_parallel_sharding_infos_by_sharding(
module: EmbeddingCollectionInterface,
table_name_to_parameter_sharding: Dict[str, ParameterSharding],
@@ -132,6 +166,7 @@ class DataParallelEmbeddingCollection(torch.nn.Module):
"""
Sharded implementation of `EmbeddingCollection`.
This is part of the public API to allow for manual data dist pipelining.
We re-implement the DP embedding so that it can be wrapped by Megatron DDP.
"""

def __init__(
@@ -354,14 +389,19 @@ def forward(self, kjt: KeyedJaggedTensor) -> Dict[str, JaggedTensor]:
Returns:
`Dict[str, JaggedTensor <https://pytorch.org/torchrec/concepts.html#jaggedtensor>]`: The output embeddings.
"""
mp_embeddings_awaitables = self._model_parallel_embedding_collection(kjt)
assert not (
self._model_parallel_embedding_collection is None
and self._data_parallel_embedding_collection is None
), "either model_parallel_embedding_collection or data_parallel_embedding_collection must be not None"
embeddings: Dict[str, JaggedTensor] = {}
if self._model_parallel_embedding_collection is not None:
mp_embeddings_awaitables = self._model_parallel_embedding_collection(kjt)
embeddings = {**embeddings, **(mp_embeddings_awaitables.wait())}
if self._data_parallel_embedding_collection is not None:
with torch.cuda.stream(self._side_stream):
dp_embeddings = self._data_parallel_embedding_collection(kjt)
torch.cuda.current_stream().wait_stream(self._side_stream)
embeddings = {**mp_embeddings_awaitables.wait(), **dp_embeddings}
else:
embeddings = mp_embeddings_awaitables.wait()
embeddings = {**embeddings, **dp_embeddings}
return embeddings

def export_local_embedding(self, table_name: str) -> Tuple[np.ndarray, np.ndarray]:
@@ -381,7 +421,7 @@ def export_local_embedding(self, table_name: str) -> Tuple[np.ndarray, np.ndarra
Example:
>>> # assume we have 2 ranks
>>> import torch
>>> from modules.embedding import ShardedEmbedding
>>> from commons.modules.embedding import ShardedEmbedding
>>> from configs.task_config import ShardedEmbeddingConfig
            >>> from commons.utils import initialize as init
>>> from commons.utils.logger import print_rank_0
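The reworked forward above makes both embedding collections optional and overlaps the data-parallel lookup, launched on a side CUDA stream, with the model-parallel awaitable. A minimal sketch of that merge pattern, with hypothetical function and argument names and the two collections treated as opaque callables:

from typing import Dict

import torch
from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor


def lookup_embeddings(
    mp_collection,            # model-parallel collection returning an awaitable, or None
    dp_collection,            # data-parallel collection returning a dict, or None
    kjt: KeyedJaggedTensor,
    side_stream: torch.cuda.Stream,
) -> Dict[str, JaggedTensor]:
    assert mp_collection is not None or dp_collection is not None
    # Launch the model-parallel lookup first so its communication can overlap
    # with the data-parallel work below.
    mp_awaitable = mp_collection(kjt) if mp_collection is not None else None

    dp_embeddings: Dict[str, JaggedTensor] = {}
    if dp_collection is not None:
        # Run the data-parallel lookup on a side stream, then make the main
        # stream wait for it before the results are read.
        with torch.cuda.stream(side_stream):
            dp_embeddings = dp_collection(kjt)
        torch.cuda.current_stream().wait_stream(side_stream)

    embeddings: Dict[str, JaggedTensor] = {}
    if mp_awaitable is not None:
        embeddings.update(mp_awaitable.wait())
    embeddings.update(dp_embeddings)
    return embeddings

A caller would create side_stream once (torch.cuda.Stream()) and reuse it across iterations, mirroring the _side_stream member used by the collection above.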
@@ -16,7 +16,7 @@

import torch
import torch.distributed as dist
from ops.length_to_offsets import length_to_complete_offsets
from commons.ops.length_to_offsets import length_to_complete_offsets
from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor


@@ -2,7 +2,7 @@

import hstu_cuda_ops
import torch
from ops.length_to_offsets import length_to_complete_offsets
from commons.ops.length_to_offsets import length_to_complete_offsets


class _JaggedTensorOpFunction(torch.autograd.Function):
File renamed without changes.
@@ -33,7 +33,7 @@
import dataclasses
from dataclasses import dataclass
from enum import Enum, unique
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

import torch

@@ -129,55 +129,6 @@ class HammerKernel(Enum): # type: ignore[no-redef]
TRITON_CC = "TRITON_CC"


class GRModuleBase(torch.nn.Module):
_is_inference: bool
_use_triton_cc: bool
_custom_kernel: bool
_hammer_kernel: Optional[HammerKernel] = None

def __init__(
self,
is_inference: bool,
use_triton_cc: bool = True,
custom_kernel: bool = True,
hammer_kernel: Optional[HammerKernel] = None,
) -> None:
super().__init__()
self._is_inference = is_inference
self._use_triton_cc = use_triton_cc
self._custom_kernel = custom_kernel
self._hammer_kernel = hammer_kernel

def hammer_kernel(self) -> HammerKernel:
kernel = self._hammer_kernel
if kernel is not None:
return kernel
if self._custom_kernel:
if self._is_inference and self._use_triton_cc:
return HammerKernel.TRITON_CC
else:
return HammerKernel.TRITON
else:
return HammerKernel.PYTORCH

# pyre-ignore[2]
def recursive_setattr(self, name: str, value: Any) -> None:
for _, module in self.named_modules():
if hasattr(module, name):
setattr(module, name, value)

@property
def predict_mode(self) -> bool:
return self._is_inference

@property
def eval_mode(self) -> bool:
return (not self._is_inference) and (not self.training)

@property
def train_mode(self) -> bool:
return (not self._is_inference) and self.training


def generate_sparse_seq_len(
size: int,