
Commit ce5e50c

style(nyz): polish dreamerv3 code style and add readme link

1 parent 1074bab commit ce5e50c
10 files changed (+54, -37 lines)

README.md (+11, -10)
@@ -246,16 +246,17 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo`
 | 41 | [CQL](https://arxiv.org/pdf/2006.04779.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [CQL doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/cql.html)<br>[policy/cql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/cql.py) | python3 -u d4rl_cql_main.py |
 | 42 | [TD3BC](https://arxiv.org/pdf/2106.06860.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [TD3BC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3_bc.html)<br>[policy/td3_bc](https://github.com/opendilab/DI-engine/blob/main/ding/policy/td3_bc.py) | python3 -u d4rl_td3_bc_main.py |
 | 43 | [Decision Transformer](https://arxiv.org/pdf/2106.01345.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [policy/dt](https://github.com/opendilab/DI-engine/blob/main/ding/policy/decision_transformer.py) | python3 -u d4rl_dt_main.py |
-| 44 | MBSAC([SAC](https://arxiv.org/abs/1801.01290)+[MVE](https://arxiv.org/abs/1803.00101)+[SVG](https://arxiv.org/abs/1510.09142)) | ![continuous](https://img.shields.io/badge/-continous-green)![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [policy/mbpolicy/mbsac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mbpolicy/mbsac.py) | python3 -u pendulum_mbsac_mbpo_config.py \ python3 -u pendulum_mbsac_ddppo_config.py |
-| 45 | STEVESAC([SAC](https://arxiv.org/abs/1801.01290)+[STEVE](https://arxiv.org/abs/1807.01675)+[SVG](https://arxiv.org/abs/1510.09142)) | ![continuous](https://img.shields.io/badge/-continous-green)![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [policy/mbpolicy/mbsac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mbpolicy/mbsac.py) | python3 -u pendulum_stevesac_mbpo_config.py |
-| 46 | [MBPO](https://arxiv.org/pdf/1906.08253.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [MBPO doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/mbpo.html)<br>[world_model/mbpo](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/mbpo.py) | python3 -u pendulum_sac_mbpo_config.py |
-| 47 | [DDPPO](https://openreview.net/forum?id=rzvOQrnclO0) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [world_model/ddppo](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/ddppo.py) | python3 -u pendulum_mbsac_ddppo_config.py |
-| 48 | [PER](https://arxiv.org/pdf/1511.05952.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [worker/replay_buffer](https://github.com/opendilab/DI-engine/blob/main/ding/worker/replay_buffer/advanced_buffer.py) | `rainbow demo` |
-| 49 | [GAE](https://arxiv.org/pdf/1506.02438.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [rl_utils/gae](https://github.com/opendilab/DI-engine/blob/main/ding/rl_utils/gae.py) | `ppo demo` |
-| 50 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 |
-| 51 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)<br>[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
-| 52 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
-| 53 | [edac](https://arxiv.org/pdf/2110.01548.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [EDAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/edac.html)<br>[policy/edac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/edac.py) | python3 -u d4rl_edac_main.py |
+| 44 | [EDAC](https://arxiv.org/pdf/2110.01548.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [EDAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/edac.html)<br>[policy/edac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/edac.py) | python3 -u d4rl_edac_main.py |
+| 45 | MBSAC([SAC](https://arxiv.org/abs/1801.01290)+[MVE](https://arxiv.org/abs/1803.00101)+[SVG](https://arxiv.org/abs/1510.09142)) | ![continuous](https://img.shields.io/badge/-continous-green)![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [policy/mbpolicy/mbsac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mbpolicy/mbsac.py) | python3 -u pendulum_mbsac_mbpo_config.py \ python3 -u pendulum_mbsac_ddppo_config.py |
+| 46 | STEVESAC([SAC](https://arxiv.org/abs/1801.01290)+[STEVE](https://arxiv.org/abs/1807.01675)+[SVG](https://arxiv.org/abs/1510.09142)) | ![continuous](https://img.shields.io/badge/-continous-green)![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [policy/mbpolicy/mbsac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mbpolicy/mbsac.py) | python3 -u pendulum_stevesac_mbpo_config.py |
+| 47 | [MBPO](https://arxiv.org/pdf/1906.08253.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [MBPO doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/mbpo.html)<br>[world_model/mbpo](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/mbpo.py) | python3 -u pendulum_sac_mbpo_config.py |
+| 48 | [DDPPO](https://openreview.net/forum?id=rzvOQrnclO0) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [world_model/ddppo](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/ddppo.py) | python3 -u pendulum_mbsac_ddppo_config.py |
+| 49 | [DreamerV3](https://arxiv.org/pdf/2301.04104.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [world_model/dreamerv3](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/dreamerv3.py) | python3 -u cartpole_balance_dreamer_config.py |
+| 50 | [PER](https://arxiv.org/pdf/1511.05952.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [worker/replay_buffer](https://github.com/opendilab/DI-engine/blob/main/ding/worker/replay_buffer/advanced_buffer.py) | `rainbow demo` |
+| 51 | [GAE](https://arxiv.org/pdf/1506.02438.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [rl_utils/gae](https://github.com/opendilab/DI-engine/blob/main/ding/rl_utils/gae.py) | `ppo demo` |
+| 52 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 |
+| 53 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)<br>[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
+| 54 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
 </details>
 
 

ding/entry/serial_entry_mbrl.py (+19, -10)
@@ -283,34 +283,43 @@ def serial_pipeline_dreamer(
         collect_kwargs = commander.step()
         # eval the policy
         if evaluator.should_eval(collector.envstep):
-            stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep, policy_kwargs=dict(world_model=world_model))
+            stop, reward = evaluator.eval(
+                learner.save_checkpoint,
+                learner.train_iter,
+                collector.envstep,
+                policy_kwargs=dict(world_model=world_model)
+            )
             if stop:
                 break
-
+
         # train world model and fill imagination buffer
         steps = (
             cfg.world_model.pretrain
-            if world_model.should_pretrain()
-            else int(world_model.should_train(collector.envstep))
+            if world_model.should_pretrain() else int(world_model.should_train(collector.envstep))
         )
         for _ in range(steps):
             batch_size = learner.policy.get_attribute('batch_size')
             batch_length = cfg.policy.learn.batch_length
-            post, context = world_model.train(env_buffer, collector.envstep, learner.train_iter, batch_size, batch_length)
-
+            post, context = world_model.train(
+                env_buffer, collector.envstep, learner.train_iter, batch_size, batch_length
+            )
+
             start = post
-
+
             learner.train(
                 start, collector.envstep, policy_kwargs=dict(world_model=world_model, envstep=collector.envstep)
             )
-
+
         # fill environment buffer
-        data = collector.collect(train_iter=learner.train_iter, policy_kwargs=dict(world_model=world_model, envstep=collector.envstep, **collect_kwargs))
+        data = collector.collect(
+            train_iter=learner.train_iter,
+            policy_kwargs=dict(world_model=world_model, envstep=collector.envstep, **collect_kwargs)
+        )
         env_buffer.push(data, cur_collector_envstep=collector.envstep)
 
         if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter:
             break
 
     learner.call_hook('after_run')
 
-    return policy
+    return policy

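For readers new to the model-based entry, the `steps` expression reformatted above is the gate that decides how many world-model updates run per loop iteration: a one-off `pretrain` burst the first time training kicks in, and afterwards 0 or 1 updates depending on whether enough new environment steps have been collected. A minimal sketch of such gating, with a hypothetical `GatedWorldModel` and made-up `pretrain`/`train_freq` values (not DI-engine's actual `WorldModel` base class):

```python
class GatedWorldModel:
    """Hypothetical world model exposing the gating used in serial_pipeline_dreamer."""

    def __init__(self, pretrain: int = 100, train_freq: int = 2):
        self.pretrain = pretrain  # size of the initial burst of updates
        self.train_freq = train_freq  # env steps between regular updates
        self.last_train_step = -1  # envstep of the most recent update, -1 before any

    def should_pretrain(self) -> bool:
        # True only before the very first world-model update.
        return self.last_train_step < 0

    def should_train(self, envstep: int) -> bool:
        # True once enough new environment steps have accumulated since the last update.
        return envstep - self.last_train_step >= self.train_freq


world_model = GatedWorldModel()
steps = (
    world_model.pretrain
    if world_model.should_pretrain() else int(world_model.should_train(envstep=10))
)
print(steps)  # 100 here; 0 or 1 per iteration once last_train_step is set
```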
ding/entry/utils.py (+4, -2)
@@ -60,8 +60,10 @@ def random_collect(
         new_data = collector.collect(n_episode=policy_cfg.random_collect_size, policy_kwargs=collect_kwargs)
     else:
         new_data = collector.collect(
-            n_sample=policy_cfg.random_collect_size, random_collect=True,
-            record_random_collect=False, policy_kwargs=collect_kwargs
+            n_sample=policy_cfg.random_collect_size,
+            random_collect=True,
+            record_random_collect=False,
+            policy_kwargs=collect_kwargs
         ) # 'record_random_collect=False' means random collect without output log
     if postprocess_data_fn is not None:
         new_data = postprocess_data_fn(new_data)

ding/envs/env_wrappers/env_wrappers.py (+4, -4)
@@ -182,16 +182,15 @@ def observation(self, frame):
             import sys
             logging.warning("Please install opencv-python first.")
             sys.exit(1)
-        # to do
-        # channel_first
+        # deal with channel_first case
         if frame.shape[0] < 10:
             frame = frame.transpose(1, 2, 0)
             frame = cv2.resize(frame, (self.size, self.size), interpolation=cv2.INTER_AREA)
             frame = frame.transpose(2, 0, 1)
         else:
             frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
             frame = cv2.resize(frame, (self.size, self.size), interpolation=cv2.INTER_AREA)
-
+
         return frame
 
 
@@ -265,6 +264,7 @@ def reward(self, reward):
         """
         return np.sign(reward)
 
+
 @ENV_WRAPPER_REGISTRY.register('action_repeat')
 class ActionRepeatWrapper(gym.Wrapper):
     """
@@ -275,7 +275,7 @@ class ActionRepeatWrapper(gym.Wrapper):
     Properties:
         - env (:obj:`gym.Env`): the environment to wrap.
         - ``action_repeat``
-
+
     """
 
     def __init__(self, env, action_repeat=1):

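The clarified comment in the first hunk explains the `frame.shape[0] < 10` branch: `cv2.resize` and `cv2.cvtColor` expect HWC images, so a channel-first (CHW) frame is transposed to HWC before resizing and back afterwards. A standalone sketch of the same logic, with a hypothetical `resize_frame` helper and an assumed target size of 64 (requires `opencv-python`):

```python
import cv2
import numpy as np

def resize_frame(frame: np.ndarray, size: int = 64) -> np.ndarray:
    """Resize a frame to (size, size), handling both CHW and HWC layouts."""
    if frame.shape[0] < 10:  # heuristic: a leading dim this small is treated as the channel axis (CHW)
        frame = frame.transpose(1, 2, 0)  # CHW -> HWC for OpenCV
        frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA)
        frame = frame.transpose(2, 0, 1)  # back to CHW
    else:  # HWC input: convert to grayscale, as in the wrapper's else branch
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA)
    return frame

chw = (np.random.rand(3, 100, 100) * 255).astype(np.uint8)
print(resize_frame(chw).shape)  # (3, 64, 64)
```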
ding/policy/command_mode_policy_instance.py (+1)
@@ -303,6 +303,7 @@ class MBSACCommandModePolicy(MBSACPolicy, DummyCommandModePolicy):
 class STEVESACCommandModePolicy(STEVESACPolicy, DummyCommandModePolicy):
     pass
 
+
 @POLICY_REGISTRY.register('dreamer_command')
 class DREAMERCommandModePolicy(DREAMERPolicy, DummyCommandModePolicy):
     pass

ding/worker/collector/interaction_serial_evaluator.py (+5, -2)
@@ -190,6 +190,7 @@ def eval(
             envstep: int = -1,
             n_episode: Optional[int] = None,
             force_render: bool = False,
+            policy_kwargs: Optional[Dict] = {},
     ) -> Tuple[bool, Dict[str, List]]:
         '''
         Overview:
@@ -228,7 +229,9 @@
                 eval_monitor.update_video(self._env.ready_imgs)
 
                 if self._policy_cfg.type == 'dreamer_command':
-                    policy_output = self._policy.forward(obs, **policy_kwargs, reset=self._resets, state=self._states)
+                    policy_output = self._policy.forward(
+                        obs, **policy_kwargs, reset=self._resets, state=self._states
+                    )
                     #self._states = {env_id: output['state'] for env_id, output in policy_output.items()}
                     self._states = [output['state'] for output in policy_output.values()]
                 else:
@@ -317,4 +320,4 @@ def eval(
             stop_flag, episode_info = objects
 
         episode_info = to_item(episode_info)
-        return stop_flag, episode_info
+        return stop_flag, episode_info

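The reformatted `forward` call shows what is special about evaluating the dreamer policy: its recurrent world model carries latent state across environment steps, so the evaluator threads the previous `state` (and `reset` flags) into each call and stores the returned state for the next one. A toy illustration of that pattern with a hypothetical `StatefulPolicy` (not the real `DREAMERPolicy` interface):

```python
from typing import Dict, List, Optional

class StatefulPolicy:
    """Toy recurrent policy: the action depends on a per-env integer state."""

    def forward(self, obs: Dict[int, float], reset: Optional[List[bool]] = None,
                state: Optional[List[int]] = None) -> Dict[int, Dict]:
        state = [0] * len(obs) if state is None else list(state)
        if reset is not None:
            # zero the latent state of any environment that just finished an episode
            state = [0 if r else s for s, r in zip(state, reset)]
        return {env_id: {'action': o + s, 'state': s + 1}
                for (env_id, o), s in zip(obs.items(), state)}

policy, states, resets = StatefulPolicy(), None, None
for _ in range(3):
    output = policy.forward({0: 0.0, 1: 1.0}, reset=resets, state=states)
    # as in the evaluator: keep the returned state for the next forward call
    states = [out['state'] for out in output.values()]
    resets = [False, False]
print(states)  # [3, 3]
```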
ding/worker/collector/sample_serial_collector.py (+2, -2)
@@ -266,7 +266,7 @@ def collect(
                     self._states = [output['state'] for output in policy_output.values()]
                 else:
                     policy_output = self._policy.forward(obs, **policy_kwargs)
-                self._policy_output_pool.update(policy_output)
+                self._policy_output_pool.update(policy_output)
                 # Interact with env.
                 actions = {env_id: output['action'] for env_id, output in policy_output.items()}
                 actions = to_ndarray(actions)
@@ -410,4 +410,4 @@ def _output_log(self, train_iter: int) -> None:
                 self._tb_logger.add_scalar('{}_iter/'.format(self._instance_name) + k, v, train_iter)
                 if k in ['total_envstep_count']:
                     continue
-                self._tb_logger.add_scalar('{}_step/'.format(self._instance_name) + k, v, self._total_envstep_count)
+                self._tb_logger.add_scalar('{}_step/'.format(self._instance_name) + k, v, self._total_envstep_count)

ding/worker/replay_buffer/naive_buffer.py (+1, -1)
@@ -541,7 +541,7 @@ def _get_indices(self, size: int, sequence: int, sample_range: slice = None, rep
                 indices.append(np.random.randint(episode * 500, episode * 500 + available + 1))
                 batch += 1
         else:
-            raise NotImplemented("sample_range is not implemented in this version")
+            raise NotImplementedError("sample_range is not implemented in this version")
         return indices
 
     def _sample_with_indices(self, indices: List[int], sequence: int, cur_learner_iter: int) -> list:

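The one-line fix in this file replaces `NotImplemented`, a sentinel value meant to be returned from binary special methods, with `NotImplementedError`, the exception class. The old statement could never raise the intended error, as a quick demonstration shows:

```python
# NotImplemented is not callable and not an exception, so the old statement dies
# with an unrelated TypeError before the message is ever shown.
try:
    raise NotImplemented("sample_range is not implemented in this version")
except TypeError as e:
    print(e)  # 'NotImplementedType' object is not callable

# A bare `raise NotImplemented` fails as well: exceptions must derive from BaseException.

# NotImplementedError carries the message as intended.
try:
    raise NotImplementedError("sample_range is not implemented in this version")
except NotImplementedError as e:
    print(e)  # sample_range is not implemented in this version
```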
ding/world_model/dreamer.py (+6, -4)
@@ -143,7 +143,7 @@ def __init__(self, cfg, env, tb_logger):
             dist="binary",
             device=self._cfg.device,
         )
-
+
         if self._cuda:
             self.cuda()
         # to do
@@ -164,7 +164,9 @@ def should_pretrain(self):
 
     def train(self, env_buffer, envstep, train_iter, batch_size, batch_length):
         self.last_train_step = envstep
-        data = env_buffer.sample(batch_size, batch_length, train_iter) # [len=B, ele=[len=T, ele={dict_key: Tensor(any_dims)}]]
+        data = env_buffer.sample(
+            batch_size, batch_length, train_iter
+        ) # [len=B, ele=[len=T, ele={dict_key: Tensor(any_dims)}]]
         data = default_collate(data) # -> [len=T, ele={dict_key: Tensor(B, any_dims)}]
         data = lists_to_dicts(data, recursive=True) # -> {some_key: T lists}, each list is [B, some_dim]
         data = {k: torch.stack(data[k], dim=1) for k in data} # -> {dict_key: Tensor([B, T, any_dims])}
@@ -186,7 +188,7 @@ def train(self, env_buffer, envstep, train_iter, batch_size, batch_length):
         image = data['image'].reshape([-1] + list(data['image'].shape[-3:]))
         embed = self.encoder(image)
         embed = embed.reshape(list(data['image'].shape[:-3]) + [embed.shape[-1]])
-
+
         post, prior = self.dynamics.observe(embed, data["action"])
         kl_loss, kl_value, loss_lhs, loss_rhs = self.dynamics.kl_loss(
             post, prior, self._cfg.kl_forward, self._cfg.kl_free, self._cfg.kl_lscale, self._cfg.kl_rscale
@@ -209,7 +211,7 @@ def train(self, env_buffer, envstep, train_iter, batch_size, batch_length):
             self.optimizer.zero_grad()
             model_loss.backward()
             self.optimizer.step()
-
+
         self.requires_grad_(requires_grad=False)
         # log
         if self.tb_logger is not None:

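The shape comments preserved in the `train` hunk describe how a sampled batch is reshaped: a list of B trajectories, each a list of T per-step dicts, becomes one dict of (B, T, ...) tensors. A self-contained re-implementation of that pipeline in plain PyTorch with toy data (the real code uses DI-engine's `default_collate` and `lists_to_dicts` helpers for the intermediate steps):

```python
import torch

B, T = 4, 8  # 4 sampled trajectories, 8 steps each
# [len=B, ele=[len=T, ele={dict_key: Tensor(any_dims)}]]
data = [[{'image': torch.rand(3, 64, 64), 'action': torch.rand(6)} for _ in range(T)] for _ in range(B)]

# Collate over the batch dimension: for each step t, stack the B per-step dicts.
# -> [len=T, ele={dict_key: Tensor(B, any_dims)}]
collated = [{k: torch.stack([traj[t][k] for traj in data]) for k in data[0][0]} for t in range(T)]

# Gather each key's T tensors and stack them along dim=1.
# -> {dict_key: Tensor(B, T, any_dims)}
batch = {k: torch.stack([step[k] for step in collated], dim=1) for k in collated[0]}

print(batch['image'].shape)   # torch.Size([4, 8, 3, 64, 64])
print(batch['action'].shape)  # torch.Size([4, 8, 6])
```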
ding/world_model/tests/test_dreamer.py (+1, -2)
@@ -7,7 +7,7 @@
 from ding.utils import deep_merge_dicts
 
 # arguments
-state_size = [3,64,64]
+state_size = [3, 64, 64]
 action_size = [6, 1]
 args = list(product(*[state_size, action_size]))
 
@@ -30,4 +30,3 @@ def test_train(self, state_size, action_size):
         actions = torch.rand(1280, action_size)
 
         model = self.get_world_model(state_size, action_size)
-
