Commit a1f0f73

Torch Wrappers to clean out numpy (#294)
* Pre-commit Fix
* Pre-commit Fix (2)
* Create build_deploy_latest.yml
* Create ubuntu.Dockerfile
* Update build_deploy_latest.yml
* Shift dependency to requirements.txt
* TorchWrapper; Update SAC; Update OffPolicyTrainer
* Update OnPolicyTrainer, OffPolicyTrainer, A2C, PPO, VPG, Rollout Storage and DQNs
* Delete build_deploy_latest.yml
* Delete ubuntu.Dockerfile
* Remove extra modules
* Black
* Black
* Update Noise to use torch.Tensor
* Off Policy Base: numpy --> torch.Tensor
* DQN bugs
* Noise
* OffPolicy Base and DDPG
* Noise, Parallel Vecenv wrappers, vecnormalize
* Rollout Storage PR comments
* gym.Wrapper -> genrl.environments.GymWrapper
* Pre-commit Fix
* Merge and bump version
* TD3 Fix
* LGTM errors
* Syntax error
* Bugs
* Use isinstance
* Vecenv runningmean error
* Suppress LGTM
* Remove numpy array conversion from offpolicy
* Suppress LGTM errors and update readme
* Create yml for LGTM error suppression
* LGTM yml lint; Unused Import
* YAML lint (2)
* LGTM lint (3)
* Update .lgtm.yml
* Bayesian imports
1 parent 5286095 commit a1f0f73

34 files changed (+248 -232 lines)

.lgtm.yml

+2

@@ -0,0 +1,2 @@
+queries:
+  exclude: py/import-and-import-from

.pre-commit-config.yaml

+1 -1

@@ -6,7 +6,7 @@ repos:
         args: [--exclude=^((examples|docs)/.*)$]

   - repo: https://github.com/timothycrosley/isort
-    rev: 5.4.2
+    rev: 4.3.2
     hooks:
       - id: isort

README.md

+1 -1

@@ -142,6 +142,6 @@ trainer.plot(episode_rewards)
 - [Gym](https://gym.openai.com/) - Environments
 - [Ray](https://github.com/ray-project/ray)
 - [OpenAI Baselines](https://github.com/openai/baselines) - Logger
-- [Stable Baselines 3](https://github.com/DLR-RM/stable-baselines3): Stable Baselines aims to provide _baselines_ for Deep RL Algorithms. Part of our code (e.g. Rollout Storage) is inspired from Stable Baselines.
+- [Stable Baselines 3](https://github.com/DLR-RM/stable-baselines3): Stable Baselines aims to provide _baselines_ for Deep RL Algorithms.
 - [pytorch-a2c-ppo-acktr](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail)
 - [Deep Contextual Bandits](https://github.com/tensorflow/models/tree/archive/research/deep_contextual_bandits)

genrl/agents/bandits/contextual/common/base_model.py

+2 -2

@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Dict

-import torch
-import torch.nn as nn
+import torch  # noqa
+import torch.nn as nn  # noqa
 import torch.nn.functional as F

 from genrl.agents.bandits.contextual.common.transition import TransitionDB

genrl/agents/bandits/contextual/common/bayesian.py

+2 -2

@@ -1,7 +1,7 @@
 from typing import Dict, Optional, Tuple

-import torch
-import torch.nn as nn
+import torch  # noqa
+import torch.nn as nn  # noqa
 import torch.nn.functional as F

 from genrl.agents.bandits.contextual.common.base_model import Model

genrl/agents/bandits/contextual/common/neural.py

+2 -2

@@ -1,7 +1,7 @@
 from typing import Dict

-import torch
-import torch.nn as nn
+import torch  # noqa
+import torch.nn as nn  # noqa
 import torch.nn.functional as F

 from genrl.agents.bandits.contextual.common.base_model import Model

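Note on the `# noqa` markers above: only `F` is referenced directly in these bandit modules, but `torch` and `torch.nn` are kept imported, which linters tend to flag, and mixing `import torch` with `from torch.nn import ...` elsewhere in the PR is what the `py/import-and-import-from` LGTM query (excluded in `.lgtm.yml` above) complains about. A hypothetical sketch of the pattern, not code from this repo:

```python
# Hypothetical sketch of the import style the linters flag. The `# noqa`
# comments silence flake8-style unused/duplicate-import warnings, while the
# .lgtm.yml exclusion handles LGTM's py/import-and-import-from query.
import torch  # noqa
import torch.nn as nn  # noqa
import torch.nn.functional as F


class TinyValueHead(nn.Module):
    """Two-layer MLP head; uses torch, nn and F so none of the imports is dead."""

    def __init__(self, in_dim: int, hidden: int = 32):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(F.relu(self.fc1(x)))
```
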
genrl/agents/deep/a2c/a2c.py

+7 -10

@@ -1,10 +1,9 @@
 from typing import Any, Dict

 import gym
-import numpy as np
 import torch
-import torch.nn.functional as F
 import torch.optim as opt
+from torch.nn import functional as F

 from genrl.agents.deep.base import OnPolicyAgent
 from genrl.utils import get_env_properties, get_model, safe_mean

@@ -83,35 +82,33 @@ def _create_model(self) -> None:

         if self.noise is not None:
             self.noise = self.noise(
-                np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
+                torch.zeros(action_dim), self.noise_std * torch.ones(action_dim)
             )

         self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy)
         self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value)

     def select_action(
-        self, state: np.ndarray, deterministic: bool = False
-    ) -> np.ndarray:
+        self, state: torch.Tensor, deterministic: bool = False
+    ) -> torch.Tensor:
         """Select action given state

         Action Selection for On Policy Agents with Actor Critic

         Args:
-            state (:obj:`np.ndarray`): Current state of the environment
+            state (:obj:`torch.Tensor`): Current state of the environment
             deterministic (bool): Should the policy be deterministic or stochastic

         Returns:
-            action (:obj:`np.ndarray`): Action taken by the agent
+            action (:obj:`torch.Tensor`): Action taken by the agent
             value (:obj:`torch.Tensor`): Value of given state
             log_prob (:obj:`torch.Tensor`): Log probability of selected action
         """
-        state = torch.as_tensor(state).float().to(self.device)
-
         # create distribution based on actor output
         action, dist = self.ac.get_action(state, deterministic=deterministic)
         value = self.ac.get_value(state)

-        return action.detach().cpu().numpy(), value, dist.log_prob(action).cpu()
+        return action.detach(), value, dist.log_prob(action).cpu()

     def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None:
         """Get loss from trajectory traversed by agent during rollouts

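The practical effect of this hunk is that `select_action` becomes tensor-in/tensor-out: the vectorized env wrappers now hand the agent a `torch.Tensor` state, and the returned action is no longer round-tripped through NumPy. A hedged sketch of how a caller sees that contract, using a made-up `ToyActorCritic` rather than GenRL's actor-critic class:

```python
# Hedged sketch of the tensor-in/tensor-out contract after this change.
# ToyActorCritic is an illustrative stand-in, not GenRL's API.
import torch
import torch.nn as nn
from torch.distributions import Categorical


class ToyActorCritic(nn.Module):
    def __init__(self, state_dim: int, n_actions: int):
        super().__init__()
        self.actor = nn.Linear(state_dim, n_actions)
        self.critic = nn.Linear(state_dim, 1)

    def select_action(self, state: torch.Tensor, deterministic: bool = False):
        # state is already a tensor; no torch.as_tensor(...) conversion needed
        dist = Categorical(logits=self.actor(state))
        action = dist.probs.argmax(-1) if deterministic else dist.sample()
        value = self.critic(state)
        # return tensors directly instead of .cpu().numpy()
        return action.detach(), value, dist.log_prob(action)


ac = ToyActorCritic(state_dim=4, n_actions=2)
action, value, log_prob = ac.select_action(torch.randn(8, 4))  # batch of 8 envs
print(action.shape, value.shape, log_prob.shape)
```
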
genrl/agents/deep/base/base.py

+1

@@ -46,6 +46,7 @@ def __init__(
         self.batch_size = batch_size
         self.gamma = gamma
         self.policy_layers = policy_layers
+        self.rewards = []
         self.value_layers = value_layers
         self.lr_policy = lr_policy
         self.lr_value = lr_value

genrl/agents/deep/base/offpolicy.py

+6 -8

@@ -1,7 +1,6 @@
 import collections
 from typing import List

-import numpy as np
 import torch
 from torch.nn import functional as F

@@ -155,28 +154,27 @@ def __init__(self, *args, polyak=0.995, **kwargs):
         self.doublecritic = False

     def select_action(
-        self, state: np.ndarray, deterministic: bool = True
-    ) -> np.ndarray:
+        self, state: torch.Tensor, deterministic: bool = True
+    ) -> torch.Tensor:
         """Select action given state

         Deterministic Action Selection with Noise

         Args:
-            state (:obj:`np.ndarray`): Current state of the environment
+            state (:obj:`torch.Tensor`): Current state of the environment
             deterministic (bool): Should the policy be deterministic or stochastic

         Returns:
-            action (:obj:`np.ndarray`): Action taken by the agent
+            action (:obj:`torch.Tensor`): Action taken by the agent
         """
-        state = torch.as_tensor(state).float()
         action, _ = self.ac.get_action(state, deterministic)
-        action = action.detach().cpu().numpy()
+        action = action.detach()

         # add noise to output from policy network
         if self.noise is not None:
             action += self.noise()

-        return np.clip(
+        return torch.clamp(
             action, self.env.action_space.low[0], self.env.action_space.high[0]
         )

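With the NumPy round trip gone, the action stays a tensor all the way to the env step, and clipping switches from `np.clip` to `torch.clamp`. A small sketch of the equivalence, with invented action bounds of (-1.0, 1.0):

```python
# Minimal check that torch.clamp mirrors np.clip for action bounding.
# The bounds (-1.0, 1.0) are illustrative, not taken from a real env.
import numpy as np
import torch

action_t = torch.tensor([-2.5, 0.3, 1.7])
action_np = action_t.numpy()

clipped_t = torch.clamp(action_t, -1.0, 1.0)   # tensor path (new)
clipped_np = np.clip(action_np, -1.0, 1.0)     # numpy path (old)

assert np.allclose(clipped_t.numpy(), clipped_np)
print(clipped_t)  # tensor([-1.0000,  0.3000,  1.0000])
```
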
genrl/agents/deep/base/onpolicy.py

+5 -8

@@ -1,6 +1,3 @@
-from typing import List
-
-import numpy as np
 import torch

 from genrl.agents.deep.base import BaseAgent

@@ -48,18 +45,18 @@ def update_params(self) -> None:
         """Update parameters of the model"""
         raise NotImplementedError

-    def collect_rewards(self, dones: List[bool], timestep: int):
+    def collect_rewards(self, dones: torch.Tensor, timestep: int):
         """Helper function to collect rewards

         Runs through all the envs and collects rewards accumulated during rollouts

         Args:
-            dones (:obj:`list` of bool): Game over statuses of each environment
+            dones (:obj:`torch.Tensor`): Game over statuses of each environment
             timestep (int): Timestep during rollout
         """
         for i, done in enumerate(dones):
             if done or timestep == self.rollout_size - 1:
-                self.rewards.append(self.env.episode_reward[i])
+                self.rewards.append(self.env.episode_reward[i].detach().clone())
                 self.env.reset_single_env(i)

     def collect_rollouts(self, state: torch.Tensor):

@@ -73,12 +70,12 @@ def collect_rollouts(self, state: torch.Tensor):

         Returns:
             values (:obj:`torch.Tensor`): Values of states encountered during the rollout
-            dones (:obj:`list` of bool): Game over statuses of each environment
+            dones (:obj:`torch.Tensor`): Game over statuses of each environment
         """
         for i in range(self.rollout_size):
             action, values, old_log_probs = self.select_action(state)

-            next_state, reward, dones, _ = self.env.step(np.array(action))
+            next_state, reward, dones, _ = self.env.step(action)

             if self.render:
                 self.env.render()

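The `.detach().clone()` matters because `episode_reward[i]` is now a tensor element that the vectorized env keeps mutating and resetting in place; appending the raw element would store a view into that buffer. A toy illustration of the aliasing problem (the buffer below is invented, not GenRL's wrapper):

```python
# Why .detach().clone() is needed when logging rewards that live in a shared
# tensor buffer. episode_reward here is a toy stand-in for the env's buffer.
import torch

episode_reward = torch.tensor([3.0, 7.0])  # per-env accumulated reward

aliased = episode_reward[0]                    # view into the buffer
snapshot = episode_reward[0].detach().clone()  # independent copy

episode_reward[0] = 0.0  # env 0 gets reset after its episode ends

print(aliased)   # tensor(0.) -- silently overwritten
print(snapshot)  # tensor(3.) -- the reward we actually wanted to log
```
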
genrl/agents/deep/ddpg/ddpg.py

+2 -2

@@ -1,7 +1,7 @@
 from copy import deepcopy
 from typing import Any, Dict

-import numpy as np
+import torch
 import torch.optim as opt

 from genrl.agents import OffPolicyAgentAC

@@ -59,7 +59,7 @@ def _create_model(self) -> None:
         )
         if self.noise is not None:
             self.noise = self.noise(
-                np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
+                torch.zeros(action_dim), self.noise_std * torch.ones(action_dim)
             )

         if isinstance(self.network, str):

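Both A2C and DDPG now build their exploration noise from `torch.zeros`/`torch.ones` instead of `np.zeros_like`/`np.ones_like`, so the noise samples can be added directly onto tensor actions. A hedged sketch of a Gaussian action-noise helper in that style; `GaussianNoise` is an illustrative name, not GenRL's noise class:

```python
# Illustrative torch-only action noise, mirroring the torch.zeros / torch.ones
# construction above. GaussianNoise is a made-up class, not GenRL's API.
import torch


class GaussianNoise:
    def __init__(self, mean: torch.Tensor, std: torch.Tensor):
        self.mean = mean
        self.std = std

    def __call__(self) -> torch.Tensor:
        # torch.normal broadcasts mean/std and returns a tensor,
        # so the sample adds straight onto a tensor action
        return torch.normal(self.mean, self.std)


action_dim, noise_std = 2, 0.1
noise = GaussianNoise(torch.zeros(action_dim), noise_std * torch.ones(action_dim))

action = torch.tensor([0.5, -0.2])
noisy_action = torch.clamp(action + noise(), -1.0, 1.0)
print(noisy_action)
```
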
genrl/agents/deep/dqn/base.py

+16 -16

@@ -1,9 +1,10 @@
+import math
+import random
 from copy import deepcopy
 from typing import Any, Dict, List

-import numpy as np
-import torch
-import torch.optim as opt
+import torch  # noqa
+import torch.optim as opt  # noqa

 from genrl.agents import OffPolicyAgent
 from genrl.utils import get_env_properties, get_model, safe_mean

@@ -94,38 +95,37 @@ def update_params_before_select_action(self, timestep: int) -> None:
         self.epsilon = self.calculate_epsilon_by_frame()
         self.logs["epsilon"].append(self.epsilon)

-    def get_greedy_action(self, state: torch.Tensor) -> np.ndarray:
+    def get_greedy_action(self, state: torch.Tensor) -> torch.Tensor:
         """Greedy action selection

         Args:
-            state (:obj:`np.ndarray`): Current state of the environment
+            state (:obj:`torch.Tensor`): Current state of the environment

         Returns:
-            action (:obj:`np.ndarray`): Action taken by the agent
+            action (:obj:`torch.Tensor`): Action taken by the agent
         """
-        q_values = self.model(state.unsqueeze(0)).detach().numpy()
-        action = np.argmax(q_values, axis=-1).squeeze(0)
+        q_values = self.model(state.unsqueeze(0))
+        action = torch.argmax(q_values.squeeze(), dim=-1)
         return action

     def select_action(
-        self, state: np.ndarray, deterministic: bool = False
-    ) -> np.ndarray:
+        self, state: torch.Tensor, deterministic: bool = False
+    ) -> torch.Tensor:
         """Select action given state

         Epsilon-greedy action-selection

         Args:
-            state (:obj:`np.ndarray`): Current state of the environment
+            state (:obj:`torch.Tensor`): Current state of the environment
             deterministic (bool): Should the policy be deterministic or stochastic

         Returns:
-            action (:obj:`np.ndarray`): Action taken by the agent
+            action (:obj:`torch.Tensor`): Action taken by the agent
         """
-        state = torch.as_tensor(state).float()
         action = self.get_greedy_action(state)
         if not deterministic:
-            if np.random.rand() < self.epsilon:
-                action = np.asarray(self.env.sample())
+            if random.random() < self.epsilon:
+                action = self.env.sample()
         return action

     def _reshape_batch(self, batch: List):

@@ -208,7 +208,7 @@ def calculate_epsilon_by_frame(self) -> float:
         Exponentially decays exploration rate from max epsilon to min epsilon
         The greater the value of epsilon_decay, the slower the decrease in epsilon
         """
-        return self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(
+        return self.min_epsilon + (self.max_epsilon - self.min_epsilon) * math.exp(
             -1.0 * self.timestep / self.epsilon_decay
         )

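The scalar math here no longer needs NumPy: `math.exp` handles the epsilon decay and `random.random()` the exploration coin flip, while the greedy branch stays in torch. A standalone sketch of the schedule, using made-up hyperparameters rather than GenRL's defaults:

```python
# Sketch of the epsilon schedule with math.exp instead of np.exp.
# max_epsilon, min_epsilon and epsilon_decay are illustrative values.
import math
import random

max_epsilon, min_epsilon, epsilon_decay = 1.0, 0.01, 1000


def epsilon_by_frame(timestep: int) -> float:
    """Exponentially decay epsilon from max_epsilon towards min_epsilon."""
    return min_epsilon + (max_epsilon - min_epsilon) * math.exp(
        -1.0 * timestep / epsilon_decay
    )


for t in (0, 500, 5000):
    eps = epsilon_by_frame(t)
    explore = random.random() < eps  # epsilon-greedy coin flip
    print(f"t={t:5d} epsilon={eps:.3f} explore={explore}")
```
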
genrl/agents/deep/dqn/categorical.py

+5 -6

@@ -1,7 +1,6 @@
 import collections
-from typing import List, Tuple
+from typing import Tuple

-import numpy as np
 import torch

 from genrl.agents.deep.dqn.base import DQN

@@ -67,14 +66,14 @@ def __init__(
         if self.create_model:
             self._create_model(noisy_layers=self.noisy_layers, num_atoms=self.num_atoms)

-    def get_greedy_action(self, state: torch.Tensor) -> np.ndarray:
+    def get_greedy_action(self, state: torch.Tensor) -> torch.Tensor:
         """Greedy action selection

         Args:
-            state (:obj:`np.ndarray`): Current state of the environment
+            state (:obj:`torch.Tensor`): Current state of the environment

         Returns:
-            action (:obj:`np.ndarray`): Action taken by the agent
+            action (:obj:`torch.Tensor`): Action taken by the agent
         """
         return categorical_greedy_action(self, state)

@@ -91,7 +90,7 @@ def get_q_values(self, states: torch.Tensor, actions: torch.Tensor):
         return categorical_q_values(self, states, actions)

     def get_target_q_values(
-        self, next_states: np.ndarray, rewards: List[float], dones: List[bool]
+        self, next_states: torch.Tensor, rewards: torch.Tensor, dones: torch.Tensor
     ):
         """Projected Distribution of Q-values

genrl/agents/deep/dqn/utils.py

+15 -13

@@ -1,7 +1,5 @@
 import collections
-from typing import List

-import numpy as np
 import torch

 from genrl.agents.deep.dqn.base import DQN

@@ -64,25 +62,27 @@ def prioritized_q_loss(agent: DQN, batch: collections.namedtuple):
     return loss


-def categorical_greedy_action(agent: DQN, state: torch.Tensor) -> np.ndarray:
+def categorical_greedy_action(agent: DQN, state: torch.Tensor) -> torch.Tensor:
     """Greedy action selection for Categorical DQN

     Args:
         agent (:obj:`DQN`): The agent
-        state (:obj:`np.ndarray`): Current state of the environment
+        state (:obj:`torch.Tensor`): Current state of the environment

     Returns:
-        action (:obj:`np.ndarray`): Action taken by the agent
+        action (:obj:`torch.Tensor`): Action taken by the agent
     """
-    q_value_dist = agent.model(state.unsqueeze(0)).detach().numpy()
+    q_value_dist = agent.model(state.unsqueeze(0)).detach()  # .numpy()
     # We need to scale and discretise the Q-value distribution obtained above
-    q_value_dist = q_value_dist * np.linspace(agent.v_min, agent.v_max, agent.num_atoms)
+    q_value_dist = q_value_dist * torch.linspace(
+        agent.v_min, agent.v_max, agent.num_atoms
+    )
     # Then we find the action with the highest Q-values for all discrete regions
     # Current shape of the q_value_dist is [1, n_envs, action_dim, num_atoms]
     # So we take the sum of all the individual atom q_values and then take argmax
     # along action dim to get the optimal action. Since batch_size is 1 for this
     # function, we squeeze the first dimension out.
-    action = np.argmax(q_value_dist.sum(-1), axis=-1).squeeze(0)
+    action = torch.argmax(q_value_dist.sum(-1), axis=-1).squeeze(0)
     return action

@@ -119,9 +119,9 @@ def categorical_q_values(agent: DQN, states: torch.Tensor, actions: torch.Tensor):

 def categorical_q_target(
     agent: DQN,
-    next_states: np.ndarray,
-    rewards: List[float],
-    dones: List[bool],
+    next_states: torch.Tensor,
+    rewards: torch.Tensor,
+    dones: torch.Tensor,
 ):
     """Projected Distribution of Q-values

@@ -140,8 +140,10 @@ def categorical_q_target(
     support = torch.linspace(agent.v_min, agent.v_max, agent.num_atoms)

     next_q_value_dist = agent.target_model(next_states) * support
-    next_actions = torch.argmax(next_q_value_dist.sum(-1), axis=-1)
-    next_actions = next_actions[:, :, np.newaxis, np.newaxis]
+    next_actions = (
+        torch.argmax(next_q_value_dist.sum(-1), axis=-1).unsqueeze(-1).unsqueeze(-1)
+    )
+
     next_actions = next_actions.expand(
         agent.batch_size, agent.env.n_envs, 1, agent.num_atoms
     )