From 4d5307425b5a68b0892ac26ab144d5018180af9b Mon Sep 17 00:00:00 2001 From: zjowowen <93968541+zjowowen@users.noreply.github.com> Date: Wed, 27 Dec 2023 21:22:06 +0800 Subject: [PATCH] doc(zjow): add API doc for ding agent (#758) * polish API doc for agent ppof and dqn * add doc for ding agent * polish code --- ding/bonus/a2c.py | 173 ++++++++++++++++++++++++++++++++- ding/bonus/c51.py | 174 ++++++++++++++++++++++++++++++++- ding/bonus/ddpg.py | 175 ++++++++++++++++++++++++++++++++- ding/bonus/dqn.py | 172 ++++++++++++++++++++++++++++++++- ding/bonus/pg.py | 172 ++++++++++++++++++++++++++++++++- ding/bonus/ppo_offpolicy.py | 177 +++++++++++++++++++++++++++++++++- ding/bonus/ppof.py | 187 +++++++++++++++++++++++++++++++++++- ding/bonus/sac.py | 173 ++++++++++++++++++++++++++++++++- ding/bonus/sql.py | 177 +++++++++++++++++++++++++++++++++- ding/bonus/td3.py | 173 ++++++++++++++++++++++++++++++++- 10 files changed, 1738 insertions(+), 15 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index de666b30df..d10def313b 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -23,7 +23,23 @@ class A2CAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Advantage Actor Critic(A2C). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.a2c import A2CAgent + >>> print(A2CAgent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for A2C algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of A2C algorithm, which should be an instance of class \ + :class:`ding.model.VAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of A2C algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/A2C/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. 
note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with A2C algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = A2CAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = A2CAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = A2CAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = VAC(**cfg.policy.model) + >>> agent = A2CAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = A2CAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -91,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with A2C algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -142,6 +230,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with A2C algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. 
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -227,6 +340,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with A2C algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -258,6 +391,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with A2C algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. 
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -280,7 +434,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'A2CAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`A2CAgent`): The agent with the best model. + Examples: + >>> agent = A2CAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 52ab7ec220..ab4f0be85e 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -24,7 +24,22 @@ class C51Agent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm C51. + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.c51 import C51Agent + >>> print(C51Agent.supported_env_list) + """ def __init__( self, @@ -36,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for C51 algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. 
+ - model (:obj:`torch.nn.Module`): The model of C51 algorithm, which should be an instance of class \ + :class:`ding.model.C51DQN`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of C51 algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/C51/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with C51 algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = C51Agent(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = C51Agent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = C51Agent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = C51DQN(**cfg.policy.model) + >>> agent = C51Agent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = C51Agent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with C51 algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. 
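+            A typical call combining the arguments above is sketched here (illustrative only: the step \
+                budget, environment counts and checkpoint frequency are placeholders, not recommended values):
+                >>> from ding.bonus.c51 import C51Agent
+                >>> agent = C51Agent(env_id='LunarLander-v2')
+                >>> agent.train(step=int(1e6), collector_env_num=4, evaluator_env_num=4, n_iter_save_ckpt=1000)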
+ Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -147,6 +234,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with C51 algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -206,7 +318,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'C51 deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -227,6 +339,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with C51 algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. 
\ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -258,6 +390,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with C51 algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -280,7 +433,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'C51Agent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`C51Agent`): The agent with the best model. + Examples: + >>> agent = C51Agent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index ce7bdee46f..0dade9e38b 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -23,7 +23,23 @@ class DDPGAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Deep Deterministic Policy Gradient(DDPG). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.ddpg import DDPGAgent + >>> print(DDPGAgent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for DDPG algorithm. 
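+            A minimal construction is sketched below (the environment id is illustrative and assumed to \
+                appear verbatim in ``supported_env_list``; the note after the argument list covers the \
+                remaining initialization options):
+                >>> from ding.bonus.ddpg import DDPGAgent
+                >>> assert 'LunarLanderContinuous-v2' in DDPGAgent.supported_env_list
+                >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2', seed=0)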
+ Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of DDPG algorithm, which should be an instance of class \ + :class:`ding.model.ContinuousQAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of DDPG algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/DDPG/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with DDPG algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = DDPGAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = DDPGAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = ContinuousQAC(**cfg.policy.model) + >>> agent = DDPGAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = DDPGAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +154,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with DDPG algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. 
\ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -145,6 +233,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with DDPG algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. 
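+            Examples:
+                An illustrative sketch (the checkpoint path, replay path and seed list are placeholders):
+                >>> from ding.bonus.ddpg import DDPGAgent
+                >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2', policy_state_dict='./ckpt/eval.pth.tar')
+                >>> result = agent.deploy(enable_save_replay=True, replay_save_path='./videos', seed=[0, 1, 2])
+                >>> print(result.eval_value, result.eval_value_std)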
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -202,7 +315,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -223,6 +336,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with DDPG algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -254,6 +387,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with DDPG algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. 
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -276,7 +430,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'DDPGAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`DDPGAgent`): The agent with the best model. + Examples: + >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 038551b083..4894e2aa6f 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -24,7 +24,22 @@ class DQNAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm Deep Q-Learning(DQN). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.dqn import DQNAgent + >>> print(DQNAgent.supported_env_list) + """ def __init__( self, @@ -36,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for DQN algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of DQN algorithm, which should be an instance of class \ + :class:`ding.model.DQN`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of DQN algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/DQN/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. 
note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with DQN algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = DQNAgent(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = DQNAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = DQNAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = DQN(**cfg.policy.model) + >>> agent = DQNAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = DQNAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with DQN algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -148,6 +235,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with DQN algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. 
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -228,6 +340,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with DQN algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -259,6 +391,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with DQN algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. 
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -281,7 +434,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'DQNAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`DQNAgent`): The agent with the best model. + Examples: + >>> agent = DQNAgent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 4f8062d8a4..59c031d65d 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -22,7 +22,22 @@ class PGAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm Policy Gradient(PG). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.pg import PGAgent + >>> print(PGAgent.supported_env_list) + """ def __init__( self, @@ -34,6 +49,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for PG algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. 
+ - model (:obj:`torch.nn.Module`): The model of PG algorithm, which should be an instance of class \ + :class:`ding.model.PG`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of PG algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/PG/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with PG algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = PGAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = PGAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = PGAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = PG(**cfg.policy.model) + >>> agent = PGAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = PGAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -89,6 +150,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with PG algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. 
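+            A typical call combining the arguments above is sketched here (the step budget and environment \
+                counts are placeholders; ``context='spawn'`` is shown only as an example of selecting a \
+                multi-process context):
+                >>> from ding.bonus.pg import PGAgent
+                >>> agent = PGAgent(env_id='LunarLanderContinuous-v2')
+                >>> return_ = agent.train(step=int(1e6), collector_env_num=4, evaluator_env_num=4, context='spawn')
+                >>> print(return_.wandb_url)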
+ Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -134,6 +221,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with PG algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -221,6 +333,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with PG algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. 
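+            Examples:
+                An illustrative sketch (the checkpoint path, sample count and save path are placeholders):
+                >>> from ding.bonus.pg import PGAgent
+                >>> agent = PGAgent(env_id='LunarLanderContinuous-v2', policy_state_dict='./ckpt/eval.pth.tar')
+                >>> agent.collect_data(env_num=8, n_sample=1024, save_data_path='./demo_data')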
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -252,6 +384,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with PG algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -274,7 +427,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'PGAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`PGAgent`): The agent with the best model. + Examples: + >>> agent = PGAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index e1d2d81c80..546aecbd6d 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -23,7 +23,23 @@ class PPOOffPolicyAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Proximal Policy Optimization(PPO) in an off-policy style. + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.ppo_offpolicy import PPOOffPolicyAgent + >>> print(PPOOffPolicyAgent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for PPO (offpolicy) algorithm. 
+ Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of PPO (offpolicy) algorithm, \ + which should be an instance of class :class:`ding.model.VAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of PPO (offpolicy) algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/PPO (offpolicy)/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with PPO (offpolicy) algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = PPOOffPolicyAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = PPOOffPolicyAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = VAC(**cfg.policy.model) + >>> agent = PPOOffPolicyAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = PPOOffPolicyAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -91,6 +153,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with PPO (offpolicy) algorithm for ``step`` iterations with ``collector_env_num`` \ + collector environments and ``evaluator_env_num`` evaluator environments. \ + Information during training will be recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. 
\
+                If not specified, it will be set according to the configuration.
+            - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
+                If not specified, it will be set according to the configuration.
+            - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
+                Default to 1000.
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+            - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                which is a hyper-parameter optimization process for seeking the best configurations. \
+                Default to False. If True, the wandb sweep id will be used as the experiment name.
+        Returns:
+            - (:obj:`TrainingReturn`): The training result, of which the attributes are:
+                - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
+        """
+
        if debug:
            logging.getLogger().setLevel(logging.DEBUG)
        logging.debug(self.policy._model)
@@ -146,6 +234,32 @@ def deploy(
            seed: Optional[Union[int, List]] = None,
            debug: bool = False
    ) -> EvalReturn:
+        """
+        Overview:
+            Deploy the agent with PPO (offpolicy) algorithm by interacting with the environment, \
+            during which the replay video can be saved if ``enable_save_replay`` is True. \
+            The evaluation result will be returned.
+        Arguments:
+            - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
+            - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
+                Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
+                If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
+                the replay video of each episode will be saved separately.
+            - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
+                If not specified, the video will be saved in ``exp_name/videos``.
+            - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
+                Default to None. If not specified, ``self.seed`` will be used. \
+                If ``seed`` is an integer, the agent will be deployed once. \
+                If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+        Returns:
+            - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
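+        Examples:
+            An illustrative deployment sketch; the environment id, training step count and seed below are \
+            placeholders, see the argument descriptions above for the full options:
+            >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2')
+            >>> agent.train(step=100000)
+            >>> agent.deploy(enable_save_replay=True, seed=0)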
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -214,7 +328,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'PPO (offpolicy) deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -235,6 +349,27 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with PPO (offpolicy) algorithm for ``n_episode`` episodes \ + with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -267,6 +402,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with PPO (offpolicy) algorithm for ``n_evaluator_episode`` episodes \ + with ``env_num`` evaluator environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. 
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -289,7 +445,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'PPOOffPolicyAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`PPOOffPolicyAgent`): The agent with the best model. + Examples: + >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index bf6012240f..88d0b43e1e 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -21,6 +21,16 @@ class PPOF: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Proximal Policy Optimization(PPO). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ + supported_env_list = [ # common 'LunarLander-v2', @@ -53,6 +63,13 @@ class PPOF: 'HalfCheetah-v3', 'Walker2d-v3', ] + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.ppof import PPOF + >>> print(PPOF.supported_env_list) + """ def __init__( self, @@ -64,6 +81,51 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None ) -> None: + """ + Overview: + Initialize agent for PPO algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``PPOF.supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, ``env_id`` or ``cfg.env_id`` must be specified. \ + ``env_id`` or ``cfg.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of PPO algorithm, which should be an instance of class \ + ``ding.model.PPOFModel``. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:`Union[EasyDict, dict]`): The configuration of PPO algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. 
note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLander-v2`` registered in gym, \ + and we want to train an agent with PPO algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = PPOF(env_id='LunarLander-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... } + >>> agent = PPOF(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLander-v2') + >>> agent = PPOF(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = VAC(**cfg.policy.model) + >>> agent = PPOF(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = PPOF(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -146,6 +208,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with PPO algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The number of collector environments. Default to 4. + - evaluator_env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_iter_log_show (:obj:`int`): The frequency of logging every training iteration. Default to 500. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - reward_model (:obj:`str`): The reward model name. Default to None. This argument is not supported yet. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -185,6 +273,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with PPO algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. 
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -249,6 +362,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with PPO algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -274,6 +407,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False, ) -> EvalReturn: + """ + Overview: + Evaluate the agent with PPO algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. 
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+        Returns:
+            - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+        """
+
        if debug:
            logging.getLogger().setLevel(logging.DEBUG)
        # define env and policy
@@ -303,6 +457,20 @@ def _setup_env_manager(
            debug: bool = False,
            caller: str = 'collector'
    ) -> BaseEnvManagerV2:
+        """
+        Overview:
+            Set up the environment manager. The environment manager is used to manage multiple environments.
+        Arguments:
+            - env_num (:obj:`int`): The number of environments.
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+            - caller (:obj:`str`): The caller of the environment manager. Default to 'collector'.
+        Returns:
+            - (:obj:`BaseEnvManagerV2`): The environment manager.
+        """
        assert caller in ['evaluator', 'collector']
        if debug:
            env_cls = BaseEnvManagerV2
@@ -315,7 +483,24 @@ def _setup_env_manager(
        return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg)

    @property
-    def best(self):
+    def best(self) -> 'PPOF':
+        """
+        Overview:
+            Load the best model from the checkpoint directory, \
+            which by default is ``exp_name/ckpt/eval.pth.tar``. \
+            The return value is the agent with the best model.
+        Returns:
+            - (:obj:`PPOF`): The agent with the best model.
+        Examples:
+            >>> agent = PPOF(env_id='LunarLander-v2')
+            >>> agent.train()
+            >>> agent = agent.best
+
+        .. note::
+            The best model is the model with the highest evaluation return. If this property is accessed, \
+            the current model will be replaced by the best model.
+        """
+
        best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
        # Load best model if it exists
        if os.path.exists(best_model_file_path):
diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py
index d12e44086b..cb6046476c 100644
--- a/ding/bonus/sac.py
+++ b/ding/bonus/sac.py
@@ -24,7 +24,23 @@
 class SACAgent:
+    """
+    Overview:
+        Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \
+        Soft Actor-Critic (SAC).
+        For more information about the system design of RL agent, please refer to \
+        .
+    Interface:
+        ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+    """
    supported_env_list = list(supported_env_cfg.keys())
+    """
+    Overview:
+        List of supported envs.
+    Examples:
+        >>> from ding.bonus.sac import SACAgent
+        >>> print(SACAgent.supported_env_list)
+    """

    def __init__(
            self,
@@ -36,6 +52,52 @@ def __init__(
            cfg: Optional[Union[EasyDict, dict]] = None,
            policy_state_dict: str = None,
    ) -> None:
+        """
+        Overview:
+            Initialize agent for SAC algorithm.
+        Arguments:
+            - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium.
\ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of SAC algorithm, which should be an instance of class \ + :class:`ding.model.ContinuousQAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of SAC algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/SAC/gym_lunarlander_v2.py``. + - policy_state_dict (:obj:`str`): The path of policy state dict saved by PyTorch a in local file. \ + If specified, the policy will be loaded from this file. Default to None. + + .. note:: + An RL Agent Instance can be initialized in two basic ways. \ + For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \ + and we want to train an agent with SAC algorithm with default configuration. \ + Then we can initialize the agent in the following ways: + >>> agent = SACAgent(env_id='LunarLanderContinuous-v2') + or, if we want can specify the env_id in the configuration: + >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... } + >>> agent = SACAgent(cfg=cfg) + There are also other arguments to specify the agent when initializing. + For example, if we want to specify the environment instance: + >>> env = CustomizedEnv('LunarLanderContinuous-v2') + >>> agent = SACAgent(cfg=cfg, env=env) + or, if we want to specify the model: + >>> model = ContinuousQAC(**cfg.policy.model) + >>> agent = SACAgent(cfg=cfg, model=model) + or, if we want to reload the policy from a saved policy state dict: + >>> agent = SACAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar') + Make sure that the configuration is consistent with the saved policy state dict. + """ + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +154,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with SAC algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. 
\ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -145,6 +233,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with SAC algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -224,6 +337,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with SAC algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. 
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -255,6 +388,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with SAC algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -277,7 +431,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'SACAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`SACAgent`): The agent with the best model. + Examples: + >>> agent = SACAgent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py index 49ffb0c679..63d26acce2 100644 --- a/ding/bonus/sql.py +++ b/ding/bonus/sql.py @@ -24,7 +24,23 @@ class SQLAgent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Soft Q-Learning(SQL). + For more information about the system design of RL agent, please refer to \ + . 
+    Interface:
+        ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+    """
    supported_env_list = list(supported_env_cfg.keys())
+    """
+    Overview:
+        List of supported envs.
+    Examples:
+        >>> from ding.bonus.sql import SQLAgent
+        >>> print(SQLAgent.supported_env_list)
+    """

    def __init__(
            self,
@@ -36,6 +52,52 @@ def __init__(
            cfg: Optional[Union[EasyDict, dict]] = None,
            policy_state_dict: str = None,
    ) -> None:
+        """
+        Overview:
+            Initialize agent for SQL algorithm.
+        Arguments:
+            - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
+                If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
+                If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
+                ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
+            - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
+                If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
+                ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
+                If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
+            - seed (:obj:`int`): The random seed, which is set before running the program. \
+                Default to 0.
+            - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
+                log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
+            - model (:obj:`torch.nn.Module`): The model of SQL algorithm, which should be an instance of class \
+                :class:`ding.model.DQN`. \
+                If not specified, a default model will be generated according to the configuration.
+            - cfg (:obj:`Union[EasyDict, dict]`): The configuration of SQL algorithm, which is a dict. \
+                Default to None. If not specified, the default configuration will be used. \
+                The default configuration can be found in ``ding/config/example/SQL/gym_lunarlander_v2.py``.
+            - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
+                If specified, the policy will be loaded from this file. Default to None.
+
+        .. note::
+            An RL Agent Instance can be initialized in two basic ways. \
+            For example, we have an environment with id ``LunarLander-v2`` registered in gym, \
+            and we want to train an agent with SQL algorithm with default configuration. \
+            Then we can initialize the agent in the following ways:
+                >>> agent = SQLAgent(env_id='LunarLander-v2')
+            or, if we want to specify the env_id in the configuration:
+                >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
+                >>> agent = SQLAgent(cfg=cfg)
+            There are also other arguments to specify the agent when initializing.
+            For example, if we want to specify the environment instance:
+                >>> env = CustomizedEnv('LunarLander-v2')
+                >>> agent = SQLAgent(cfg=cfg, env=env)
+            or, if we want to specify the model:
+                >>> model = DQN(**cfg.policy.model)
+                >>> agent = SQLAgent(cfg=cfg, model=model)
+            or, if we want to reload the policy from a saved policy state dict:
+                >>> agent = SQLAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
+            Make sure that the configuration is consistent with the saved policy state dict.
+        """
+
        assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
if cfg is not None and not isinstance(cfg, EasyDict): @@ -92,6 +154,32 @@ def train( debug: bool = False, wandb_sweep: bool = False, ) -> TrainingReturn: + """ + Overview: + Train the agent with SQL algorithm for ``step`` iterations with ``collector_env_num`` collector \ + environments and ``evaluator_env_num`` evaluator environments. Information during training will be \ + recorded and saved by wandb. + Arguments: + - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7. + - collector_env_num (:obj:`int`): The collector environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \ + If not specified, it will be set according to the configuration. + - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \ + Default to 1000. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \ + which is a hyper-parameter optimization process for seeking the best configurations. \ + Default to False. If True, the wandb sweep id will be used as the experiment name. + Returns: + - (:obj:`TrainingReturn`): The training result, of which the attributions are: + - wandb_url (:obj:`str`): The weight & biases (wandb) project url of the trainning experiment. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -148,6 +236,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with SQL algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. 
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -207,7 +320,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'SQL deploy is finished, final episode return with {step} steps is: {return_}') returns.append(return_) env.close() @@ -228,6 +341,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with SQL algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -249,7 +382,7 @@ def collect_data( task.use(offline_data_saver(save_data_path, data_type='hdf5')) task.run(max_step=1) logging.info( - f'DQN collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + f'SQL collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' ) def batch_evaluate( @@ -259,6 +392,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with SQL algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. + The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. 
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -281,7 +435,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'SQLAgent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`SQLAgent`): The agent with the best model. + Examples: + >>> agent = SQLAgent(env_id='LunarLander-v2') + >>> agent.train() + >>> agent = agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path): diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 894a7de5b8..a2889a370d 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -23,7 +23,23 @@ class TD3Agent: + """ + Overview: + Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \ + Twin Delayed Deep Deterministic Policy Gradient(TD3). + For more information about the system design of RL agent, please refer to \ + . + Interface: + ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best`` + """ supported_env_list = list(supported_env_cfg.keys()) + """ + Overview: + List of supported envs. + Examples: + >>> from ding.bonus.td3 import TD3Agent + >>> print(TD3Agent.supported_env_list) + """ def __init__( self, @@ -35,6 +51,52 @@ def __init__( cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: + """ + Overview: + Initialize agent for TD3 algorithm. + Arguments: + - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \ + If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \ + If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \ + ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``. + - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \ + If ``env`` is not specified, `env_id`` or ``cfg.env.env_id`` must be specified. \ + ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \ + If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored. + - seed (:obj:`int`): The random seed, which is set before running the program. \ + Default to 0. + - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \ + log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``. + - model (:obj:`torch.nn.Module`): The model of TD3 algorithm, which should be an instance of class \ + :class:`ding.model.ContinuousQAC`. \ + If not specified, a default model will be generated according to the configuration. + - cfg (:obj:Union[EasyDict, dict]): The configuration of TD3 algorithm, which is a dict. \ + Default to None. If not specified, the default configuration will be used. \ + The default configuration can be found in ``ding/config/example/TD3/gym_lunarlander_v2.py``. 
+            - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
+                If specified, the policy will be loaded from this file. Default to None.
+
+        .. note::
+            An RL Agent Instance can be initialized in two basic ways. \
+            For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
+            and we want to train an agent with TD3 algorithm with default configuration. \
+            Then we can initialize the agent in the following ways:
+                >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2')
+            or, if we want to specify the env_id in the configuration:
+                >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
+                >>> agent = TD3Agent(cfg=cfg)
+            There are also other arguments to specify the agent when initializing.
+            For example, if we want to specify the environment instance:
+                >>> env = CustomizedEnv('LunarLanderContinuous-v2')
+                >>> agent = TD3Agent(cfg=cfg, env=env)
+            or, if we want to specify the model:
+                >>> model = ContinuousQAC(**cfg.policy.model)
+                >>> agent = TD3Agent(cfg=cfg, model=model)
+            or, if we want to reload the policy from a saved policy state dict:
+                >>> agent = TD3Agent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
+            Make sure that the configuration is consistent with the saved policy state dict.
+        """
+
        assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
        if cfg is not None and not isinstance(cfg, EasyDict):
@@ -91,6 +153,32 @@ def train(
            debug: bool = False,
            wandb_sweep: bool = False,
    ) -> TrainingReturn:
+        """
+        Overview:
+            Train the agent with TD3 algorithm for ``step`` iterations with ``collector_env_num`` collector \
+            environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
+            recorded and saved by wandb.
+        Arguments:
+            - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
+            - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
+                If not specified, it will be set according to the configuration.
+            - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
+                If not specified, it will be set according to the configuration.
+            - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
+                Default to 1000.
+            - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+            - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                If set True, base environment manager will be used for easy debugging. Otherwise, \
+                subprocess environment manager will be used.
+            - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                which is a hyper-parameter optimization process for seeking the best configurations. \
+                Default to False. If True, the wandb sweep id will be used as the experiment name.
+        Returns:
+            - (:obj:`TrainingReturn`): The training result, of which the attributes are:
+                - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
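+        Examples:
+            A minimal training sketch; the environment id and step count are illustrative only:
+            >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2')
+            >>> train_return = agent.train(step=100000)
+            >>> print(train_return.wandb_url)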
+ """ + if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -144,6 +232,31 @@ def deploy( seed: Optional[Union[int, List]] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Deploy the agent with TD3 algorithm by interacting with the environment, during which the replay video \ + can be saved if ``enable_save_replay`` is True. The evaluation result will be returned. + Arguments: + - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False. + - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \ + Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \ + If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \ + the replay video of each episode will be saved separately. + - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \ + If not specified, the video will be saved in ``exp_name/videos``. + - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \ + Default to None. If not specified, ``self.seed`` will be used. \ + If ``seed`` is an integer, the agent will be deployed once. \ + If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -222,6 +335,26 @@ def collect_data( context: Optional[str] = None, debug: bool = False ) -> None: + """ + Overview: + Collect data with TD3 algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \ + The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \ + ``exp_name/demo_data``. + Arguments: + - env_num (:obj:`int`): The number of collector environments. Default to 8. + - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \ + If not specified, the data will be saved in ``exp_name/demo_data``. + - n_sample (:obj:`int`): The number of samples to collect. Default to None. \ + If not specified, ``n_episode`` must be specified. + - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \ + If not specified, ``n_sample`` must be specified. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) if n_episode is not None: @@ -253,6 +386,27 @@ def batch_evaluate( context: Optional[str] = None, debug: bool = False ) -> EvalReturn: + """ + Overview: + Evaluate the agent with TD3 algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \ + environments. The evaluation result will be returned. 
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \ + multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \ + will only create one evaluator environment to evaluate the agent and save the replay video. + Arguments: + - env_num (:obj:`int`): The number of evaluator environments. Default to 4. + - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4. + - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \ + It can be specified as ``spawn``, ``fork`` or ``forkserver``. + - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \ + If set True, base environment manager will be used for easy debugging. Otherwise, \ + subprocess environment manager will be used. + Returns: + - (:obj:`EvalReturn`): The evaluation result, of which the attributions are: + - eval_value (:obj:`np.float32`): The mean of evaluation return. + - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return. + """ + if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -275,7 +429,24 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) @property - def best(self): + def best(self) -> 'TD3Agent': + """ + Overview: + Load the best model from the checkpoint directory, \ + which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \ + The return value is the agent with the best model. + Returns: + - (:obj:`TD3Agent`): The agent with the best model. + Examples: + >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2') + >>> agent.train() + >>> agent.best + + .. note:: + The best model is the model with the highest evaluation return. If this method is called, the current \ + model will be replaced by the best model. + """ + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") # Load best model if it exists if os.path.exists(best_model_file_path):