2 changes: 2 additions & 0 deletions .github/unittest/linux_sota/scripts/test_sota.py
@@ -284,6 +284,7 @@
collector.total_frames=600 \
collector.init_random_frames=10 \
collector.frames_per_batch=200 \
collector.num_collectors=1 \
env.n_parallel_envs=1 \
optimization.optim_steps_per_batch=1 \
optimization.compile=False \
@@ -292,6 +293,7 @@
replay_buffer.buffer_size=120 \
replay_buffer.batch_size=24 \
replay_buffer.batch_length=12 \
replay_buffer.prefetch=1 \
networks.rssm_hidden_dim=17
""",
}
134 changes: 128 additions & 6 deletions sota-implementations/dreamer/README.md
@@ -1,7 +1,129 @@
# Dreamer example
# Dreamer V1

## Note:
This example is not included in the benchmarked results of the current release (v0.3). We intend to include it in
the benchmarking of future releases, to ensure that it runs with the release code and that the
results are consistent. For now, be aware that this check has not been performed for this
specific example.
This is an implementation of the Dreamer algorithm from the paper
["Dream to Control: Learning Behaviors by Latent Imagination"](https://arxiv.org/abs/1912.01603) (Hafner et al., ICLR 2020).

Dreamer is a model-based reinforcement learning algorithm that:
1. Learns a **world model** (RSSM) from experience
2. **Imagines** future trajectories in latent space
3. Trains **actor and critic** using analytic gradients through the imagined rollouts

## Setup

### Dependencies

```bash
# Create virtual environment
uv venv torchrl --python 3.12
source torchrl/bin/activate

# Install PyTorch (adjust for your CUDA version)
uv pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128

# Install TorchRL and TensorDict
uv pip install tensordict torchrl

# Install additional dependencies
uv pip install mujoco dm_control wandb tqdm hydra-core
```

### System Dependencies (for MuJoCo rendering)

```bash
apt-get update && apt-get install -y \
libegl1 \
libgl1 \
libgles2 \
libglvnd0
```

### Environment Variables

```bash
export MUJOCO_GL=egl
export MUJOCO_EGL_DEVICE_ID=0
```
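
To verify that headless rendering is wired up before launching a full run, a quick check along these lines can help (a minimal sketch, assuming `dm_control` and the EGL packages above are installed and `MUJOCO_GL=egl` is set):

```python
# Sanity check for headless EGL rendering with dm_control.
from dm_control import suite

env = suite.load(domain_name="cheetah", task_name="run")
frame = env.physics.render(height=64, width=64, camera_id=0)
print(frame.shape)  # (64, 64, 3) -> rendering works
```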

## Running

```bash
python dreamer.py
```

### Configuration

The default configuration trains on DMControl's `cheetah-run` task. You can override settings from the command line:

```bash
# Different environment
python dreamer.py env.name=walker env.task=walk

# Mixed precision options: false, true (=bfloat16), float16, bfloat16
python dreamer.py optimization.autocast=bfloat16 # default
python dreamer.py optimization.autocast=float16 # for older GPUs
python dreamer.py optimization.autocast=false # disable autocast

# Adjust batch size
python dreamer.py replay_buffer.batch_size=1000
```
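
For reference, these dotted override paths map onto the Hydra config object inside the script. A minimal sketch of how the fields surface in code (assuming the standard Hydra entry point; field names follow `config.yaml` in this PR):

```python
import hydra
from omegaconf import DictConfig

@hydra.main(config_path=".", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    # Each CLI override (e.g. env.name=walker) rewrites one of these fields.
    print(cfg.env.name, cfg.env.task)
    print(cfg.optimization.autocast)
    print(cfg.replay_buffer.batch_size)

if __name__ == "__main__":
    main()
```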

## Known Caveats

### 1. Mixed Precision (Autocast) Compatibility

Some GPU/cuBLAS combinations have issues with `bfloat16` autocast, resulting in:
```
RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling cublasGemmEx
```

**Solutions:**
- Try float16: `optimization.autocast=float16`
- Or disable autocast entirely: `optimization.autocast=false`
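
For context, the `autocast` option selects the dtype passed to `torch.autocast` around the loss computation. A minimal sketch of the mapping (assuming the training step wraps its forward pass this way; `loss_fn` is a placeholder, not the actual dreamer.py code):

```python
import torch

# Map config values to autocast dtypes; None disables mixed precision.
_AUTOCAST_DTYPES = {
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,  # usually paired with torch.amp.GradScaler
    "true": torch.bfloat16,
    "false": None,
}

def compute_loss(loss_fn, batch, autocast: str = "bfloat16"):
    dtype = _AUTOCAST_DTYPES[autocast]
    if dtype is None:
        return loss_fn(batch)  # full float32
    with torch.autocast(device_type="cuda", dtype=dtype):
        return loss_fn(batch)
```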

Note: Ensure your PyTorch CUDA version matches your driver. For example, with CUDA 13.0:
```bash
uv pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu130
```

### 2. Benchmarking Status

This implementation has not been fully benchmarked against the original paper's results.
Performance may differ from published numbers.

### 3. Video Logging

To enable video logging of both real and imagined rollouts:
```bash
python dreamer.py logger.video=True
```

This requires additional setup for rendering and significantly increases computation time.

## Architecture Overview

```
World Model:
- ObsEncoder: pixels -> encoded_latents
- RSSMPrior: (state, belief, action) -> next_belief, prior_dist
- RSSMPosterior: (belief, encoded_latents) -> posterior_dist, state
- ObsDecoder: (state, belief) -> reconstructed_pixels
- RewardModel: (state, belief) -> predicted_reward

Actor: (state, belief) -> action_distribution
Critic: (state, belief) -> state_value
```
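
As a concrete (if simplified) illustration, the `RSSMPrior` interface above could be sketched as follows; the dimensions, module internals, and Gaussian parameterization are illustrative, not the actual TorchRL implementation:

```python
import torch
import torch.nn as nn

class RSSMPriorSketch(nn.Module):
    """(state, belief, action) -> next_belief, prior_dist."""

    def __init__(self, state_dim=30, belief_dim=200, action_dim=6):
        super().__init__()
        self.gru = nn.GRUCell(state_dim + action_dim, belief_dim)
        self.to_stats = nn.Linear(belief_dim, 2 * state_dim)

    def forward(self, state, belief, action):
        next_belief = self.gru(torch.cat([state, action], dim=-1), belief)
        mean, log_std = self.to_stats(next_belief).chunk(2, dim=-1)
        prior_dist = torch.distributions.Normal(mean, log_std.exp())
        return next_belief, prior_dist

prior = RSSMPriorSketch()
s, b, a = torch.zeros(1, 30), torch.zeros(1, 200), torch.zeros(1, 6)
next_belief, prior_dist = prior(s, b, a)
```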

## Training Loop

1. **Collect** real experience from environment
2. **Train world model** on sequences from replay buffer (KL + reconstruction + reward loss)
3. **Imagine** trajectories starting from encoded real states
4. **Train actor** to maximize imagined returns (gradients flow through dynamics)
5. **Train critic** to predict lambda returns on imagined trajectories
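
Steps 3 and 4 are the distinctive part: because the rollout happens inside the learned model, the actor loss can be backpropagated through the dynamics. A toy, self-contained sketch of that idea (all modules are stand-in `Linear` layers and `imagine` is a hypothetical helper, not the dreamer.py code):

```python
import torch
import torch.nn as nn

state_dim, belief_dim, action_dim, horizon = 8, 16, 2, 15
feat = state_dim + belief_dim

dynamics = nn.Linear(feat + action_dim, state_dim)  # stand-in RSSM prior
reward_model = nn.Linear(feat, 1)
actor = nn.Linear(feat, action_dim)
actor_opt = torch.optim.Adam(actor.parameters(), lr=8e-5)

def imagine(state, belief):
    # Roll the actor through the learned dynamics in latent space
    # (belief held fixed here purely to keep the toy short).
    rewards = []
    for _ in range(horizon):
        action = torch.tanh(actor(torch.cat([state, belief], dim=-1)))
        state = dynamics(torch.cat([state, belief, action], dim=-1))
        rewards.append(reward_model(torch.cat([state, belief], dim=-1)))
    return torch.stack(rewards).sum(0)

# Actor update: maximize imagined returns; gradients flow through `dynamics`.
state, belief = torch.randn(32, state_dim), torch.randn(32, belief_dim)
actor_loss = -imagine(state, belief).mean()
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()
```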

## References

- Original Paper: [Dream to Control: Learning Behaviors by Latent Imagination](https://arxiv.org/abs/1912.01603)
- PlaNet (predecessor): [Learning Latent Dynamics for Planning from Pixels](https://arxiv.org/abs/1811.04551)
- DreamerV2: [Mastering Atari with Discrete World Models](https://arxiv.org/abs/2010.02193)
- DreamerV3: [Mastering Diverse Domains through World Models](https://arxiv.org/abs/2301.04104)
50 changes: 45 additions & 5 deletions sota-implementations/dreamer/config.yaml
@@ -15,6 +15,9 @@ collector:
total_frames: 5_000_000
init_random_frames: 3000
frames_per_batch: 1000
# Number of parallel collector workers (async mode)
# On multi-GPU: must be <= num_gpus - 1 (cuda:0 reserved for training)
num_collectors: 7
device:

optimization:
@@ -26,13 +29,18 @@
value_lr: 8e-5
kl_scale: 1.0
free_nats: 3.0
optim_steps_per_batch: 80
optim_steps_per_batch: 20
gamma: 0.99
lmbda: 0.95
imagination_horizon: 15
compile: False
compile_backend: inductor
use_autocast: True
compile:
enabled: True
backend: inductor # or cudagraphs
mode: reduce-overhead
# Which losses to compile (subset of: world_model, actor, value)
losses: ["world_model", "actor", "value"]
# Autocast options: false, true (=bfloat16), float16, bfloat16
autocast: bfloat16

networks:
exploration_noise: 0.3
@@ -41,13 +49,21 @@
rssm_hidden_dim: 200
hidden_dim: 400
activation: "elu"
# Use torch.scan for RSSM rollout (faster, no graph breaks with torch.compile)
use_scan: False
rssm_rollout:
# Compile only the per-timestep RSSM rollout step (keeps Python loop, avoids scan/unrolling).
compile: False
compile_backend: inductor
compile_mode: reduce-overhead


replay_buffer:
batch_size: 2500
batch_size: 10000
buffer_size: 1000000
batch_length: 50
scratch_dir: null
prefetch: 8

logger:
backend: wandb
@@ -58,3 +74,27 @@
eval_iter: 10
eval_rollout_steps: 500
video: False

profiling:
# Enable PyTorch profiling (overrides collector.total_frames with profiling.total_frames below)
enabled: False
# Total frames to collect when profiling (default: 5005 = 5 collection iters + buffer warmup)
total_frames: 5005
# Skip the first N optim steps (no profiling at all)
skip_first: 1
# Warmup steps (profiler runs but data discarded for warmup)
warmup_steps: 1
# Number of optim steps to profile (actual traced data)
active_steps: 1
# Export chrome trace to this file (if set)
trace_file: dreamer_trace.json
# Profile CUDA kernels (VERY heavy on GPU - 13GB vs 1GB trace!)
profile_cuda: True
# Record tensor shapes
record_shapes: True
# Profile memory usage
profile_memory: True
# Record Python call stacks
with_stack: True
# Compute FLOPs
with_flops: True
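
For reference, the profiling options above line up naturally with `torch.profiler`. A sketch of the plumbing they suggest (the actual wiring in dreamer.py may differ; `train_step` is a hypothetical stand-in for one optim step, and the CUDA activity assumes a GPU is present):

```python
import torch
from torch.profiler import ProfilerActivity, profile, schedule

def train_step():  # hypothetical stand-in for one optimization step
    torch.randn(256, 256) @ torch.randn(256, 256)

prof_schedule = schedule(skip_first=1, wait=0, warmup=1, active=1)
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]  # profile_cuda

with profile(
    activities=activities,
    schedule=prof_schedule,
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    with_flops=True,
    on_trace_ready=lambda p: p.export_chrome_trace("dreamer_trace.json"),
) as prof:
    for _ in range(3):  # skip_first + warmup + active steps
        train_step()
        prof.step()
```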