From 4d5307425b5a68b0892ac26ab144d5018180af9b Mon Sep 17 00:00:00 2001 From: zjowowen <93968541+zjowowen@users.noreply.github.com> Date: Wed, 27 Dec 2023 21:22:06 +0800 Subject: [PATCH] doc(zjow): add API doc for ding agent (#758) * polish API doc for agent ppof and dqn * add doc for ding agent * polish code --- ding/bonus/a2c.py | 173 ++++++++++++++++++++++++++++++++- ding/bonus/c51.py | 174 ++++++++++++++++++++++++++++++++- ding/bonus/ddpg.py | 175 ++++++++++++++++++++++++++++++++- ding/bonus/dqn.py | 172 ++++++++++++++++++++++++++++++++- ding/bonus/pg.py | 172 ++++++++++++++++++++++++++++++++- ding/bonus/ppo_offpolicy.py | 177 +++++++++++++++++++++++++++++++++- ding/bonus/ppof.py | 187 +++++++++++++++++++++++++++++++++++- ding/bonus/sac.py | 173 ++++++++++++++++++++++++++++++++- ding/bonus/sql.py | 177 +++++++++++++++++++++++++++++++++- ding/bonus/td3.py | 173 ++++++++++++++++++++++++++++++++- 10 files changed, 1738 insertions(+), 15 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index de666b30df..d10def313b 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -23,7 +23,23 @@ class A2CAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Advantage Actor Critic(A2C). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.a2c import A2CAgent + >>> print(A2CAgent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for A2C algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of A2C algorithm, which should be an instance of class \ + :class:`ding.model.VAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of A2C algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/A2C/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. 
note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with A2C algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = A2CAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = A2CAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = A2CAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = VAC(**cfg.policy.model) + >>> agent = A2CAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = A2CAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -91,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with A2C algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -142,6 +230,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with A2C algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. 
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -227,6 +340,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with A2C algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -258,6 +391,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with A2C algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. 
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -280,7 +434,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'A2CAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`A2CAgent`): The agent with the best model. + Examples: + >>> agent = A2CAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 52ab7ec220..ab4f0be85e 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -24,7 +24,22 @@ class C51Agent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm C51. + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.c51 import C51Agent + >>> print(C51Agent.supported_env_list) + """ def __init__( self, @@ -36,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for C51 algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. 
+ - model (:obj:`torch.nn.Module`): The model of C51 algorithm, which should be an instance of class \ + :class:`ding.model.C51DQN`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of C51 algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/C51/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with C51 algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = C51Agent(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = C51Agent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = C51Agent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = C51DQN(**cfg.policy.model) + >>> agent = C51Agent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = C51Agent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with C51 algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. 
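+            A typical call combining the arguments above is sketched here (illustrative only: the step \
+                budget, environment counts and checkpoint frequency are placeholders, not recommended values):
+                >>> from ding.bonus.c51 import C51Agent
+                >>> agent = C51Agent(env_id='LunarLander-v2')
+                >>> agent.train(step=int(1e6), collector_env_num=4, evaluator_env_num=4, n_iter_save_ckpt=1000)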
+ Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -147,6 +234,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with C51 algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -206,7 +318,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'C51 deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -227,6 +339,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with C51 algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. 
\ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -258,6 +390,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with C51 algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -280,7 +433,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'C51Agent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`C51Agent`): The agent with the best model. + Examples: + >>> agent = C51Agent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index ce7bdee46f..0dade9e38b 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -23,7 +23,23 @@ class DDPGAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Deep Deterministic Policy Gradient(DDPG). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.ddpg import DDPGAgent + >>> print(DDPGAgent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for DDPG algorithm. 
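+            A minimal construction is sketched below (the environment id is illustrative and assumed to \
+                appear verbatim in ``supported_env_list``; the note after the argument list covers the \
+                remaining initialization options):
+                >>> from ding.bonus.ddpg import DDPGAgent
+                >>> assert 'LunarLanderContinuous-v2' in DDPGAgent.supported_env_list
+                >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2', seed=0)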
+ Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of DDPG algorithm, which should be an instance of class \ + :class:`ding.model.ContinuousQAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of DDPG algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/DDPG/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with DDPG algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = DDPGAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = DDPGAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = ContinuousQAC(**cfg.policy.model) + >>> agent = DDPGAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = DDPGAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +154,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with DDPG algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. 
\ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -145,6 +233,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with DDPG algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. 
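+            Examples:
+                An illustrative sketch (the checkpoint path, replay path and seed list are placeholders):
+                >>> from ding.bonus.ddpg import DDPGAgent
+                >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2', policy_state_dict='./ckpt/eval.pth.tar')
+                >>> result = agent.deploy(enable_save_replay=True, replay_save_path='./videos', seed=[0, 1, 2])
+                >>> print(result.eval_value, result.eval_value_std)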
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -202,7 +315,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -223,6 +336,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with DDPG algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -254,6 +387,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with DDPG algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. 
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -276,7 +430,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'DDPGAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`DDPGAgent`): The agent with the best model. + Examples: + >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 038551b083..4894e2aa6f 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -24,7 +24,22 @@ class DQNAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm Deep Q-Learning(DQN). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.dqn import DQNAgent + >>> print(DQNAgent.supported_env_list) + """ def __init__( self, @@ -36,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for DQN algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of DQN algorithm, which should be an instance of class \ + :class:`ding.model.DQN`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of DQN algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/DQN/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. 
note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with DQN algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = DQNAgent(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = DQNAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = DQNAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = DQN(**cfg.policy.model) + >>> agent = DQNAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = DQNAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with DQN algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -148,6 +235,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with DQN algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. 
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -228,6 +340,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with DQN algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -259,6 +391,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with DQN algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. 
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -281,7 +434,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'DQNAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`DQNAgent`): The agent with the best model. + Examples: + >>> agent = DQNAgent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 4f8062d8a4..59c031d65d 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -22,7 +22,22 @@ class PGAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm Policy Gradient(PG). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.pg import PGAgent + >>> print(PGAgent.supported_env_list) + """ def __init__( self, @@ -34,6 +49,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for PG algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. 
+ - model (:obj:`torch.nn.Module`): The model of PG algorithm, which should be an instance of class \ + :class:`ding.model.PG`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of PG algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/PG/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with PG algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = PGAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = PGAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = PGAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = PG(**cfg.policy.model) + >>> agent = PGAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = PGAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -89,6 +150,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with PG algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. 
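+            A typical call combining the arguments above is sketched here (the step budget and environment \
+                counts are placeholders; ``context='spawn'`` is shown only as an example of selecting a \
+                multi-process context):
+                >>> from ding.bonus.pg import PGAgent
+                >>> agent = PGAgent(env_id='LunarLanderContinuous-v2')
+                >>> return_ = agent.train(step=int(1e6), collector_env_num=4, evaluator_env_num=4, context='spawn')
+                >>> print(return_.wandb_url)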
+ Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -134,6 +221,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with PG algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -221,6 +333,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with PG algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. 
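+            Examples:
+                An illustrative sketch (the checkpoint path, sample count and save path are placeholders):
+                >>> from ding.bonus.pg import PGAgent
+                >>> agent = PGAgent(env_id='LunarLanderContinuous-v2', policy_state_dict='./ckpt/eval.pth.tar')
+                >>> agent.collect_data(env_num=8, n_sample=1024, save_data_path='./demo_data')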
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -252,6 +384,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with PG algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -274,7 +427,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'PGAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`PGAgent`): The agent with the best model. + Examples: + >>> agent = PGAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index e1d2d81c80..546aecbd6d 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -23,7 +23,23 @@ class PPOOffPolicyAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Proximal Policy Optimization(PPO) in an off-policy style. + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.ppo_offpolicy import PPOOffPolicyAgent + >>> print(PPOOffPolicyAgent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for PPO (offpolicy) algorithm. 
+ Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of PPO (offpolicy) algorithm, \ + which should be an instance of class :class:`ding.model.VAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of PPO (offpolicy) algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/PPO (offpolicy)/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with PPO (offpolicy) algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = PPOOffPolicyAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = PPOOffPolicyAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = VAC(**cfg.policy.model) + >>> agent = PPOOffPolicyAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = PPOOffPolicyAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -91,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with PPO (offpolicy) algorithm for ``step`` iterations with ``collector_env_num`` \ + collector environments and ``evaluator_env_num`` evaluator environments. \ + Information during training will be recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. 
\
+                If not specified, it will be set according to the configuration.
+            - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
+                If not specified, it will be set according to the configuration.
+            - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
+                Default to 1000.
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+            - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                which is a hyper-parameter optimization process for seeking the best configurations. \
+                Default to False. If True, the wandb sweep id will be used as the experiment name.
+        Returns:
+            - (:obj:`TrainingReturn`): The training result, of which the attributes are:
+                - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
+        """
+
        if debug:
            logging.getLogger().setLevel(logging.DEBUG)
        logging.debug(self.policy._model)
@@ -146,6 +234,32 @@ def deploy(
            seed: Optional[Union[int, List]] = None,
            debug: bool = False
    ) -> EvalReturn:
+        """
+        Overview:
+            Deploy the agent with PPO (offpolicy) algorithm by interacting with the environment, \
+            during which the replay video can be saved if ``enable_save_replay`` is True. \
+            The evaluation result will be returned.
+        Arguments:
+            - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
+            - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
+                Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
+                If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
+                the replay video of each episode will be saved separately.
+            - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
+                If not specified, the video will be saved in ``exp_name/videos``.
+            - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
+                Default to None. If not specified, ``self.seed`` will be used. \
+                If ``seed`` is an integer, the agent will be deployed once. \
+                If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+        Returns:
+            - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
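+        Examples:
+            An illustrative deployment sketch; the environment id, training step count and seed below are \
+            placeholders, see the argument descriptions above for the full options:
+            >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2')
+            >>> agent.train(step=100000)
+            >>> agent.deploy(enable_save_replay=True, seed=0)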
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -214,7 +328,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'PPO (offpolicy) deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -235,6 +349,27 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with PPO (offpolicy) algorithm for ``n_episode`` episodes \ + with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -267,6 +402,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with PPO (offpolicy) algorithm for ``n_evaluator_episode`` episodes \ + with ``env_num`` evaluator environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. 
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -289,7 +445,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'PPOOffPolicyAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`PPOOffPolicyAgent`): The agent with the best model. + Examples: + >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index bf6012240f..88d0b43e1e 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -21,6 +21,16 @@ class PPOF: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Proximal Policy Optimization(PPO). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ + supported_env_list = [ # common 'LunarLander-v2', @@ -53,6 +63,13 @@ class PPOF: 'HalfCheetah-v3', 'Walker2d-v3', ] + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.ppof import PPOF + >>> print(PPOF.supported_env_list) + """ def __init__( self, @@ -64,6 +81,51 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None ) -> None: + """ + Overview: + Initialize agent for PPO algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``PPOF.supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, ``env_id`` or ``cfg.env_id`` must be specified. \ + ``env_id`` or ``cfg.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of PPO algorithm, which should be an instance of class \ + ``ding.model.PPOFModel``. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:`Union[EasyDict, dict]`): The configuration of PPO algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. 
note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with PPO algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = PPOF(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = PPOF(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = PPOF(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = VAC(**cfg.policy.model) + >>> agent = PPOF(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = PPOF(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -146,6 +208,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with PPO algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The number of collector environments. Default to 4. + - evaluator_env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_iter_log_show (:obj:`int`): The frequency of logging every training iteration. Default to 500. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - reward_model (:obj:`str`): The reward model name. Default to None. This argument is not supported yet. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -185,6 +273,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with PPO algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. 
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -249,6 +362,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with PPO algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -274,6 +407,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False, ) -> EvalReturn: + """ + Overview: + Evaluate the agent with PPO algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. 
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+        Returns:
+            - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+        """
+
        if debug:
            logging.getLogger().setLevel(logging.DEBUG)
        # define env and policy
@@ -303,6 +457,20 @@ def _setup_env_manager(
            debug: bool = False,
            caller: str = 'collector'
    ) -> BaseEnvManagerV2:
+        """
+        Overview:
+            Set up the environment manager. The environment manager is used to manage multiple environments.
+        Arguments:
+            - env_num (:obj:`int`): The number of environments.
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+            - caller (:obj:`str`): The caller of the environment manager. Default to 'collector'.
+        Returns:
+            - (:obj:`BaseEnvManagerV2`): The environment manager.
+        """
        assert caller in ['evaluator', 'collector']
        if debug:
            env_cls = BaseEnvManagerV2
@@ -315,7 +483,24 @@ def _setup_env_manager(
        return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg)

    @property
-    def best(self):
+    def best(self) -> 'PPOF':
+        """
+        Overview:
+            Load the best model from the checkpoint directory, \
+            which by default is ``exp_name/ckpt/eval.pth.tar``. \
+            The return value is the agent with the best model.
+        Returns:
+            - (:obj:`PPOF`): The agent with the best model.
+        Examples:
+            >>> agent = PPOF(env_id='LunarLander-v2')
+            >>> agent.train()
+            >>> agent = agent.best
+
+        .. note::
+            The best model is the model with the highest evaluation return. If this property is accessed, \
+            the current model will be replaced by the best model.
+        """
+
        best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
        # Load best model if it exists
        if os.path.exists(best_model_file_path):
diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py
index d12e44086b..cb6046476c 100644
--- a/ding/bonus/sac.py
+++ b/ding/bonus/sac.py
@@ -24,7 +24,23 @@
 class SACAgent:
+    """
+    Overview:
+        Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \
+        Soft Actor-Critic (SAC).
+        For more information about the system design of RL agent, please refer to \
+        .
+    Interface:
+        ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+    """
    supported_env_list = list(supported_env_cfg.keys())
+    """
+    Overview:
+        List of supported envs.
+    Examples:
+        >>> from ding.bonus.sac import SACAgent
+        >>> print(SACAgent.supported_env_list)
+    """

    def __init__(
            self,
@@ -36,6 +52,52 @@ def __init__(
            cfg: Optional[Union[EasyDict, dict]] = None,
            policy_state_dict: str = None,
    ) -> None:
+        """
+        Overview:
+            Initialize agent for SAC algorithm.
+        Arguments:
+            - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium.
\ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of SAC algorithm, which should be an instance of class \ + :class:`ding.model.ContinuousQAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of SAC algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/SAC/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with SAC algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = SACAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = SACAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = SACAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = ContinuousQAC(**cfg.policy.model) + >>> agent = SACAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = SACAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +154,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with SAC algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. 
\ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -145,6 +233,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with SAC algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -224,6 +337,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with SAC algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. 
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -255,6 +388,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with SAC algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -277,7 +431,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'SACAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`SACAgent`): The agent with the best model. + Examples: + >>> agent = SACAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py index 49ffb0c679..63d26acce2 100644 --- a/ding/bonus/sql.py +++ b/ding/bonus/sql.py @@ -24,7 +24,23 @@ class SQLAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Soft Q-Learning(SQL). + For more information about the system design of RL agent, please refer to \ + . 
+    Interface:
+        ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+    """
    supported_env_list = list(supported_env_cfg.keys())
+    """
+    Overview:
+        List of supported envs.
+    Examples:
+        >>> from ding.bonus.sql import SQLAgent
+        >>> print(SQLAgent.supported_env_list)
+    """

    def __init__(
            self,
@@ -36,6 +52,52 @@ def __init__(
            cfg: Optional[Union[EasyDict, dict]] = None,
            policy_state_dict: str = None,
    ) -> None:
+        """
+        Overview:
+            Initialize agent for SQL algorithm.
+        Arguments:
+            - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
+                If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
+                If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
+                ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
+            - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
+                If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
+                ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
+                If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
+            - seed (:obj:`int`): The random seed, which is set before running the program. \
+                Default to 0.
+            - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
+                log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
+            - model (:obj:`torch.nn.Module`): The model of SQL algorithm, which should be an instance of class \
+                :class:`ding.model.DQN`. \
+                If not specified, a default model will be generated according to the configuration.
+            - cfg (:obj:`Union[EasyDict, dict]`): The configuration of SQL algorithm, which is a dict. \
+                Default to None. If not specified, the default configuration will be used. \
+                The default configuration can be found in ``ding/config/example/SQL/gym_lunarlander_v2.py``.
+            - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
+                If specified, the policy will be loaded from this file. Default to None.
+
+        .. note::
+            An RL Agent Instance can be initialized in two basic ways. \
+            For example, we have an environment with id ``LunarLander-v2`` registered in gym, \
+            and we want to train an agent with SQL algorithm with default configuration. \
+            Then we can initialize the agent in the following ways:
+                >>> agent = SQLAgent(env_id='LunarLander-v2')
+            or, if we want to specify the env_id in the configuration:
+                >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
+                >>> agent = SQLAgent(cfg=cfg)
+            There are also other arguments to specify the agent when initializing.
+            For example, if we want to specify the environment instance:
+                >>> env = CustomizedEnv('LunarLander-v2')
+                >>> agent = SQLAgent(cfg=cfg, env=env)
+            or, if we want to specify the model:
+                >>> model = DQN(**cfg.policy.model)
+                >>> agent = SQLAgent(cfg=cfg, model=model)
+            or, if we want to reload the policy from a saved policy state dict:
+                >>> agent = SQLAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
+            Make sure that the configuration is consistent with the saved policy state dict.
+        """
+
        assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +154,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with SQL algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -148,6 +236,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with SQL algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. 
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -207,7 +320,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'SQL deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -228,6 +341,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with SQL algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -249,7 +382,7 @@ def collect_data( task.use(offline_data_saver(save_data_path, data_type='hdf5')) task.run(max_step=1) logging.info( - f'DQN collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + f'SQL collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' ) def batch_evaluate( @@ -259,6 +392,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with SQL algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. 
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -281,7 +435,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'SQLAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`SQLAgent`): The agent with the best model. + Examples: + >>> agent = SQLAgent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 894a7de5b8..a2889a370d 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -23,7 +23,23 @@ class TD3Agent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Twin Delayed Deep Deterministic Policy Gradient(TD3). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.td3 import TD3Agent + >>> print(TD3Agent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for TD3 algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of TD3 algorithm, which should be an instance of class \ + :class:`ding.model.ContinuousQAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of TD3 algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/TD3/gym_lunarlander_v2.py``. 
+            - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
+                If specified, the policy will be loaded from this file. Default to None.
+
+        .. note::
+            An RL Agent Instance can be initialized in two basic ways. \
+            For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
+            and we want to train an agent with TD3 algorithm with default configuration. \
+            Then we can initialize the agent in the following ways:
+                >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2')
+            or, if we want to specify the env_id in the configuration:
+                >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
+                >>> agent = TD3Agent(cfg=cfg)
+            There are also other arguments to specify the agent when initializing.
+            For example, if we want to specify the environment instance:
+                >>> env = CustomizedEnv('LunarLanderContinuous-v2')
+                >>> agent = TD3Agent(cfg=cfg, env=env)
+            or, if we want to specify the model:
+                >>> model = ContinuousQAC(**cfg.policy.model)
+                >>> agent = TD3Agent(cfg=cfg, model=model)
+            or, if we want to reload the policy from a saved policy state dict:
+                >>> agent = TD3Agent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
+            Make sure that the configuration is consistent with the saved policy state dict.
+        """
+
        assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
        if cfg is not None and not isinstance(cfg, EasyDict):
@@ -91,6 +153,32 @@ def train(
            debug: bool = False,
            wandb_sweep: bool = False,
    ) -> TrainingReturn:
+        """
+        Overview:
+            Train the agent with TD3 algorithm for ``step`` iterations with ``collector_env_num`` collector \
+            environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
+            recorded and saved by wandb.
+        Arguments:
+            - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
+            - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
+                If not specified, it will be set according to the configuration.
+            - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
+                If not specified, it will be set according to the configuration.
+            - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
+                Default to 1000.
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+            - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                which is a hyper-parameter optimization process for seeking the best configurations. \
+                Default to False. If True, the wandb sweep id will be used as the experiment name.
+        Returns:
+            - (:obj:`TrainingReturn`): The training result, of which the attributes are:
+                - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
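+        Examples:
+            A minimal training sketch; the environment id and step count are illustrative only:
+            >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2')
+            >>> train_return = agent.train(step=100000)
+            >>> print(train_return.wandb_url)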
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -144,6 +232,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with TD3 algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -222,6 +335,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with TD3 algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -253,6 +386,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with TD3 algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. 
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -275,7 +429,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'TD3Agent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`TD3Agent`): The agent with the best model. + Examples: + >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path):