From 6b8a0cb12f772813ccb2ae1a49635b167cd4566a Mon Sep 17 00:00:00 2001 From: Yann Bouteiller Date: Sat, 25 Mar 2023 01:41:06 -0400 Subject: [PATCH] Release 0.10 --- README.md | 49 ++++--- rtgym/envs/real_time_env.py | 35 ++++- setup.py | 4 +- tests/test_all.py | 260 +++++++++++++++++++++++++++++------- 4 files changed, 278 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 15f2c01..ccfcb09 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Easily implement your custom [Gymnasium](https://gymnasium.farama.org) environments for real-time applications. Real-Time Gym (```rtgym```) is typically needed when trying to use Reinforcement Learning algorithms in robotics or real-time video games. -Its purpose is to clock your Gym environments in a way that is transparent to the user. +Its purpose is to clock your Gymnasium environments in a way that is transparent to the user. ## Quick links - [Installation](#installation) @@ -37,8 +37,8 @@ Non-abstract methods can be overidden if desired. Then, copy the ```rtgym``` default [configuration dictionary](https://github.com/yannbouteiller/rtgym/blob/969799b596e91808543f781b513901426b88d138/rtgym/envs/real_time_env.py#L96) in your code and replace the ``` 'interface' ``` entry with the class of your custom interface. You probably also want to modify other entries in this dictionary depending on your application. -Once the custom interface is implemented, ```rtgym``` uses it to instantiate a fully-fledged Gym environment that automatically deals with time constraints. -This environment can be used by simply following the usual Gym pattern, therefore compatible with many implemented Reinforcement Learning (RL) algorithms: +Once the custom interface is implemented, ```rtgym``` uses it to instantiate a fully-fledged Gymnasium environment that automatically deals with time constraints. +This environment can be used by simply following the usual Gymnasium pattern, therefore compatible with many implemented Reinforcement Learning (RL) algorithms: ```python from rtgym.envs.real_time_env import DEFAULT_CONFIG_DICT @@ -71,7 +71,7 @@ Once the clock is started, it can be stopped via a call to the `wait()` API to a The following figure illustrates how `rtgym` behaves around `reset` transitions when: - the configuration dictionary has `"wait_on_done": True` - `wait` is customized to execute some arbitrary behavior -- `env.default_action` is `a0` +- The default action is `a0` ![Reset Transitions](https://github.com/yannbouteiller/rtgym/releases/download/v0.9/reset.png "Reset Transitions") @@ -91,12 +91,13 @@ while True: obs, rew, terminated, truncated, info = env.step(act) done = terminated or truncated if done: - env.default_action = act + env.set_default_action(act) obs, info = env.reset() # here, act will be applied ``` +_(NB: you can achieve this behavior without resorting to `set_default_action`. Just set `"last_act_on_reset": True` in your configuration dictionary.)_ _In this code snippet, the action buffer contained in `obs` is the same after `step` and after the second `reset`. 
-Otherwise, the last action in the buffer would be `act` after `step` and would be replaced by the default action in `reset`, as the last `act` would in fact never be applied (see `a2` in the previous figure, imagining that `a1` keeps being applied instead of arbitrary actions being applied by `wait` and `reset`, which should then be much shorter / near-instantaneous)._ +Otherwise, the last action in the buffer would be `act` after `step` and would be replaced by the default action in `reset`, as the last `act` would in fact never be applied (see `a2` in the previous figure, imagining that `a1` keeps being applied instead of arbitrary actions being applied by `wait` and `reset`, which in this case should be much shorter / near-instantaneous)._ _It is worth thinking about this if you wish to replace the action buffer with, e.g., recurrent units of a neural network while artificially splitting a non-episodic problem into finite episodes._ @@ -107,12 +108,12 @@ The complete script for this tutorial is provided [here](https://github.com/yann ### Custom Real-Time Gym environment #### Introduction -Implementing a Gym environment on a real system is not straightforward when time cannot be paused between time-steps for observation capture, inference, transfers and actuation. +Implementing a Gymnasium environment on a real system is not straightforward when time cannot be paused between time-steps for observation capture, inference, transfers and actuation. Real-Time Gym provides a python interface that enables doing this with minimal effort. -In this tutorial, we will see how to use this interface in order to create a Gym environment for your robot, video game, or other real-time application. -From the user's point of view, this environment will work as Gym environments usually do, and therefore will be compatible with many readily implemented Reinforcement Learning (RL) algorithms. +In this tutorial, we will see how to use this interface in order to create a Gymnasium environment for your robot, video game, or other real-time application. +From the user's point of view, this environment will work as Gymnasium environments usually do, and therefore will be compatible with many readily implemented Reinforcement Learning (RL) algorithms. #### Install Real-Time Gym First, we need to install the Real-Time Gym package. @@ -132,7 +133,7 @@ You can import the RealTimeGymInterface class as follows: from rtgym import RealTimeGymInterface ``` -The [RealTimeGymInterface](https://github.com/yannbouteiller/rtgym/blob/969799b596e91808543f781b513901426b88d138/rtgym/envs/real_time_env.py#L12) is all you need to implement in order to create your custom real-time Gym environment. +The [RealTimeGymInterface](https://github.com/yannbouteiller/rtgym/blob/969799b596e91808543f781b513901426b88d138/rtgym/envs/real_time_env.py#L12) is all you need to implement in order to create your custom Real-Time Gym environment. This class has 6 abstract methods that you need to implement: ```get_observation_space```, ```get_action_space```, ```get_default_action```, ```reset```, ```get_obs_rew_terminated_info``` and ```send_control```. It also has a ```wait``` and a ```render``` methods that you may want to override. @@ -285,7 +286,7 @@ def get_action_space(self): --- ```RealTimeGymInterface``` also requires a default action. This is to initialize the action buffer, and optionally to reinitialize it when the environment is reset. 
-In addition, ```send_control``` is called with the default action as parameter when the Gym environment is reset. +In addition, ```send_control``` is called with the default action as parameter when the Gymnasium environment is reset. This default action is returned as a numpy array by the ```get_default_action``` method. Of course, the default action must be within the action space that we defined in ```get_action_space```. @@ -315,14 +316,14 @@ As you know if you are familiar with Reinforcement Learning, the underlying math This means that RL algorithms consider the world as a fixed state, from which an action is taken that leads to a new fixed state, and so on. However, real applications are of course often far from this assumption, which is why we developed the ```rtgym``` framework. -Usually, RL theorists use fake Gym environments that are paused between each call to the step() function. +Usually, RL theorists use fake Gymnasium environments that are paused between each call to the step() function. By contrast, ```rtgym``` environments are never really paused, because you simply cannot pause the real world. Instead, when calling step() in a ```rtgym``` environment, an internal procedure will ensure that the control passed as argument is sent at the beginning of the next real time-step. The step() function will block until this point, when a new observation is retrieved. Then, step() will return the observation so that inference can be performed in parallel to the next time-step, and so on. -This is convenient because the user doesn't have to worry about these kinds of complicated dynamics and simply alternates between inference and calls to step() as they would usually do with any Gym environment. +This is convenient because the user doesn't have to worry about these kinds of complicated dynamics and simply alternates between inference and calls to step() as they would usually do with any Gymnasium environment. However, this needs to be done repeatedly, otherwise step() will time-out. Yet, you may still want to artificially 'pause' the environment occasionally, e.g. because you collected a batch of samples, or because you want to pause the whole experiment. @@ -442,13 +443,13 @@ def reset(self, seed=None, options=None): np.array([self.target[1]], dtype='float32')], {} ``` -We have now fully implemented our custom ```RealTimeGymInterface``` and can use it to instantiate a Gym environment for our real-time application. +We have now fully implemented our custom ```RealTimeGymInterface``` and can use it to instantiate a Gymnasium environment for our real-time application. To do this, we simply pass our custom interface as a parameter to ```gymnasium.make``` in a configuration dictionary, as illustrated in the next section. --- #### Create a configuration dictionary -Now that our custom interface is implemented, we can easily instantiate a fully fledged Gym environment for our dummy RC drone. +Now that our custom interface is implemented, we can easily instantiate a fully fledged Gymnasium environment for our dummy RC drone. This is done by loading the ```rtgym``` ```DEFAULT_CONFIG_DICT``` and replacing the value stored under the ```"interface"``` key by our custom interface: ```python @@ -501,13 +502,13 @@ Therefore we set this to ```False```. #### Instantiate the custom real-time environment We are all done! 
-Instantiating our Gym environment is now as simple as:
+Instantiating our Gymnasium environment is now as simple as:
 
 ```python
 env = gymnasium.make("real-time-gym-v1", config=my_config)
 ```
 
-We can use it as any usual Gym environment:
+We can use it as any usual Gymnasium environment:
 
 ```python
 def model(obs):
@@ -651,7 +652,19 @@ This is to maintain the real-time flow of time-steps during reset transitions.
 
 It may happen that you prefer to repeat the previous action instead, for instance because it is hard in your application to implement a no-op action.
 
-To achieve this behavior, you can simply replace the `default_action` attribute of your environment with the action that you want being sent, right before calling `reset()`.
+To achieve this behavior, you can simply replace the default action of your environment via `set_default_action` with the action that you want to be sent, right before calling `reset()`:
+```python
+env.set_default_action(my_new_default_action)
+obs, info = env.reset()
+
+# Note: alternatively, you can set the "last_act_on_reset" entry to True in your configuration.
+# This would make reset() send the last action instead of the default action.
+# In rtgym, when terminated or truncated is True, the action passed to step() is not sent.
+# Setting "last_act_on_reset" to True sends it on the subsequent reset().
+# Think thoroughly before setting this to True, as this might not be suitable.
+# In Real-Time RL, the last action of an episode has no effect in terms of reward.
+# Thus, it may be entirely random depending on your training algorithm.
+```
 
 ---
 
diff --git a/rtgym/envs/real_time_env.py b/rtgym/envs/real_time_env.py
index fdb2e6e..15c7b3f 100644
--- a/rtgym/envs/real_time_env.py
+++ b/rtgym/envs/real_time_env.py
@@ -151,6 +151,7 @@ def render(self):
     "benchmark": False,  # When True, a simple benchmark will be run to estimate useful timing metrics
     "benchmark_polyak": 0.1,  # Polyak averaging factor for the benchmarks (0.0 < x <= 1); smaller is slower, bigger is noisier
     "wait_on_done": False,  # Whether the wait() method should be called when either terminated or truncated is True
+    "last_act_on_reset": False,  # When False, reset() sends the default action; when True, it sends the last action
 }
 """Default configuration dictionary of Real-Time Gym. 
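For reference, a minimal sketch of how this new entry might be enabled from user code. The abstract `RealTimeGymInterface` class is only a placeholder here: substitute your own interface, as described in the README tutorial.

```python
from rtgym import RealTimeGymInterface
from rtgym.envs.real_time_env import DEFAULT_CONFIG_DICT

my_config = DEFAULT_CONFIG_DICT.copy()
my_config["interface"] = RealTimeGymInterface  # placeholder: use your own RealTimeGymInterface subclass
my_config["reset_act_buf"] = False             # do not reinitialize the action buffer on reset()
my_config["last_act_on_reset"] = True          # reset() re-sends the last action instead of the default action
```
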
@@ -304,9 +305,10 @@ def __init__(self, config: dict=DEFAULT_CONFIG_DICT): # config variables: self.wait_on_done = config["wait_on_done"] if "wait_on_done" in config else False + self.last_act_on_reset = config["last_act_on_reset"] if "last_act_on_reset" in config else False self.act_prepro_func: callable = config["act_prepro_func"] if "act_prepro_func" in config else None self.obs_prepro_func = config["obs_prepro_func"] if "obs_prepro_func" in config else None - self.ep_max_length = config["ep_max_length"] + self.ep_max_length = config["ep_max_length"] - 1 self.time_step_duration = config["time_step_duration"] if "time_step_duration" in config else 0.0 self.time_step_timeout_factor = config["time_step_timeout_factor"] if "time_step_timeout_factor" in config else 1.0 @@ -345,9 +347,10 @@ def __init__(self, config: dict=DEFAULT_CONFIG_DICT): self.observation_space = self._get_observation_space() self.current_step = 0 self.time_initialized = False + self.running = False + # state variables: self.default_action = self.interface.get_default_action() - self.last_action = self.default_action # gymnasium variables: self.seed = None @@ -500,6 +503,7 @@ def reset(self, seed=None, options=None): info: info dictionary """ self._join_thread() + self.running = True self.seed = seed self.options = options self.current_step = 0 @@ -508,7 +512,8 @@ def reset(self, seed=None, options=None): self.init_action_buffer() else: # replace the last (non-applied) action from the previous episode by the action that is going to be applied: - self.act_buf[-1] = self.default_action + if not self.last_act_on_reset: + self.act_buf[-1] = self.default_action elt, info = self.interface.reset(seed=seed, options=options) if self.act_in_obs: elt = elt + list(self.act_buf) @@ -544,12 +549,16 @@ def step(self, action): self.act_buf.append(action) # the action is always appended to the buffer if not self.real_time: self._run_time_step(action) + if not self.running: + raise RuntimeError("The episode is terminated or truncated. Call reset before step.") obs, rew, terminated, truncated, info = self._retrieve_obs_rew_terminated_truncated_info() done = (terminated or truncated) if not done: # apply action only when not done self._run_time_step(action) - elif self.wait_on_done: - self.wait() + else: + self.running = False + if self.wait_on_done: + self.wait() if self.act_in_obs: obs = tuple((*obs, *tuple(self.act_buf),)) if self.benchmark: @@ -589,3 +598,19 @@ def render(self, mode='human', join_thread=False): if join_thread: self._join_thread() self.interface.render() + + def set_default_action(self, default_action): + """Changes the default action. + + Use this method right before calling reset() if you want the environment to send another default_action. + This is useful when you want to maintain the real-time flow around the end of an episode. + For instance, you may want to call set_default_action() with default_action as the action sent to step() right + before the episode got terminated or truncated, because this action was never applied (thus, it will be applied + by reset() - note however that this last action can be random unless you take special care). + + Note: alternatively, you can set the "last_act_on_reset" entry to True in the rtgym configuration. 
+ + Args: + default_action: numpy.array: new default action (make sure it complies with the action space) + """ + self.default_action = default_action diff --git a/setup.py b/setup.py index cda8b1e..12b1b28 100644 --- a/setup.py +++ b/setup.py @@ -7,14 +7,14 @@ setup(name='rtgym', packages=[package for package in find_packages()], - version='0.9', + version='0.10', license='MIT', description='Easily implement custom Gymnasium environments for real-time applications', long_description=long_description, long_description_content_type="text/markdown", author='Yann Bouteiller', url='https://github.com/yannbouteiller/rtgym', - download_url='https://github.com/yannbouteiller/rtgym/archive/refs/tags/v0.9.tar.gz', + download_url='https://github.com/yannbouteiller/rtgym/archive/refs/tags/v0.10.tar.gz', keywords=['gymnasium', 'real', 'time', 'custom', 'environment', 'reinforcement', 'learning', 'random', 'delays'], install_requires=['gymnasium', 'numpy'], classifiers=[ diff --git a/tests/test_all.py b/tests/test_all.py index d077ef2..284ad6a 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -17,7 +17,7 @@ def send_control(self, control): def reset(self, seed=None, options=None): now = time.time() self.control_time = now - self.control = [0.0] + self.control = [-2.0] obs = [np.array([now], dtype=np.float64), np.array(self.control, dtype=np.float64), np.array([self.control_time], dtype=np.float64)] @@ -43,75 +43,245 @@ def get_default_action(self): return np.array([-1.0], dtype=np.float64) -config = DEFAULT_CONFIG_DICT -config["interface"] = DummyInterface -config["time_step_duration"] = 0.1 -config["start_obs_capture"] = 0.1 -config["act_buf_len"] = 1 -config["wait_on_done"] = False -config["reset_act_buf"] = False - - class TestEnv(unittest.TestCase): def test_timing(self): + epsilon = 0.02 + act_buf_len = 3 + time_step_duration = 0.1 + start_obs_capture = 0.08 + + print("--- new environment ---") + + config = DEFAULT_CONFIG_DICT + config["interface"] = DummyInterface + config["time_step_duration"] = time_step_duration + config["start_obs_capture"] = start_obs_capture + config["act_buf_len"] = act_buf_len + config["wait_on_done"] = False + config["reset_act_buf"] = False + config["ep_max_length"] = 10 + config["last_act_on_reset"] = True + env = gymnasium.make("real-time-gym-v1", config=config) + # first reset, the default action (-1) will be sent: obs1, info = env.reset() - elapsed_since_obs1_capture = time.time() - obs1[0] - self.assertGreater(epsilon, elapsed_since_obs1_capture) - # default action (buffer): - self.assertEqual(obs1[3], -1) + # now, action -1 is on its way - # arbitrary value: - self.assertEqual(obs1[1], np.array([0.])) + # what is the difference between now and the moment reset was called? 
+ now = time.time() + elapsed_since_reset = now - obs1[0] + print(f"Call to reset took {elapsed_since_reset} seconds") + self.assertGreater(epsilon, elapsed_since_reset) + + # the actions in the buffer should all be the default (-1): + for j in range(act_buf_len): + print(f"The action buffer is {obs1[3 + j]} at index {j}") + self.assertEqual(obs1[3 + j], -1) + + # the first control is -2 at reset: + print(f"The control is {obs1[1]}") + self.assertEqual(obs1[1], np.array([-2.])) - # elapsed between now and control time: + # now let us step the environment + a = 1 + print(f"--- Step {a} ---") + act = np.array([float(a)], dtype=np.float64) + obs2, _, terminated, truncated, _ = env.step(act) now = time.time() - self.assertGreater(0.1 + epsilon, now - obs1[0]) + print(f"terminated: {terminated}, truncated:{truncated}") - act = np.array([0.0], dtype=np.float64) - obs2, _, _, _, _ = env.step(act) - self.assertEqual(obs2[3], act) - self.assertEqual(obs2[1], -1.0) + # let us look at the action buffer: + for j in range(act_buf_len): + print(f"The action buffer is {obs2[3 + j]} at index {j}") + if j < act_buf_len - 1: + self.assertEqual(obs2[3 + j], -1) + else: + self.assertEqual(obs2[3 + j], a) - # elapsed between beginning of new timestep and previous obs capture: - self.assertGreater(0.1 + epsilon, obs2[2] - obs1[0]) + # Now, we look at the time elapsed between the observation retrieval of step and reset: + elapsed = obs2[0] - obs1[0] + print(f"The two last obs are spaced by {elapsed} seconds") + self.assertGreater(time_step_duration + epsilon, elapsed) + self.assertGreater(elapsed, start_obs_capture - epsilon) - # elapsed between new obs capture and previous obs capture: - self.assertGreater(0.1 + epsilon, obs2[0] - obs1[0]) + # the control applied when obs2 was captured should be the default -1 + print(f"The action applied when obs was captured was {obs2[1]}") + self.assertEqual(obs2[1], np.array([-1.])) + + # the sending timestamp of the control should be the beginning of the last time-step: + elapsed = now - obs2[2] + print(f"This action was sent {elapsed} seconds ago") + self.assertGreater(time_step_duration + epsilon, elapsed) + self.assertGreater(elapsed, time_step_duration - epsilon) + + for i in range(9): - for i in range(10): obs1 = obs2 - act = np.array([float(i + 1)]) - obs2, _, terminated, _, _ = env.step(act) + a += 1 + + print(f"--- Step {a} ---") + act = np.array([float(a)], dtype=np.float64) + obs2, _, terminated, truncated, _ = env.step(act) now = time.time() - self.assertEqual(obs2[3], act) - self.assertEqual(obs2[1], act - 1.0) + print(f"terminated: {terminated}, truncated:{truncated}") + + # let us look at the action buffer: + for j in range(act_buf_len): + print(f"The action buffer is {obs2[3 + j]} at index {j}") + self.assertEqual(obs2[-1], a) + + # Now, we look at the time elapsed between the two observations: + elapsed = obs2[0] - obs1[0] + print(f"The two last obs are spaced by {elapsed} seconds") + self.assertGreater(time_step_duration + epsilon, elapsed) + self.assertGreater(elapsed, time_step_duration - epsilon) - # elapsed between now and start of last timestep: - self.assertGreater(0.1 + epsilon, now - obs2[2]) + # the control applied when obs2 was captured should be the previous a + print(f"The action applied when obs was captured was {obs2[1]}") + self.assertEqual(obs2[1], np.array([float(a - 1)])) - # elapsed between new obs capture and previous obs capture: - self.assertGreater(obs2[0] - obs1[0], 0.1 - epsilon) - self.assertGreater(0.1 + epsilon, obs2[0] - 
obs1[0]) + # the sending timestamp of the control should be the beginning of the last time-step: + elapsed = now - obs2[2] + print(f"This action was sent {elapsed} seconds ago") + self.assertGreater(time_step_duration + epsilon, elapsed) + self.assertGreater(elapsed, time_step_duration - epsilon) - # terminated signal: - if i >= 9: + # end of episode: + if i == 8: + # the terminated signal should override the truncated signal: self.assertTrue(terminated) + self.assertFalse(truncated) + + # let us test the real-time reset mechanism: + print("--- reset ---") + obs1, info = env.reset() + now = time.time() + + # this call to reset should be near-instantaneous: + elapsed_since_reset = now - obs1[0] + print(f"Call to reset took {elapsed_since_reset} seconds") + self.assertGreater(epsilon, elapsed_since_reset) + + # let us look at the action buffer: + for j in range(act_buf_len): + print(f"The action buffer is {obs1[3 + j]} at index {j}") + # since we use "last_act_on_reset":True, the buffer should end with act: + self.assertEqual(obs1[-1], act) + + # Now let us step the environment: + + a = 0 + act = np.array([float(a)], dtype=np.float64) + obs1, _, terminated, truncated, _ = env.step(act) + + # because we sent the previous act (10) on reset(), terminated should now be True: + print(f"terminated: {terminated}, truncated:{truncated}") + self.assertEqual(terminated, True) + self.assertEqual(truncated, False) - # test reset: + # Let us retry: + + print("--- reset ---") + obs1, info = env.reset() + + for i in range(10): + print(f"--- step {i + 1} ---") + obs1, _, terminated, truncated, _ = env.step(act) + print(f"terminated: {terminated}, truncated:{truncated}") + if i < 9: + # reset() sent 0, so we should be good: + self.assertEqual(terminated, False) + self.assertEqual(truncated, False) + else: + # the episode should now be truncated: + self.assertEqual(terminated, False) + self.assertEqual(truncated, True) + + # Now the episode is truncated, we should not be able to call step again: + + try: + obs1, _, terminated, truncated, _ = env.step(act) + assert False, "step did not raise a RuntimeError" + except RuntimeError: + print("step cannot be called here.") + + # Now let us test the default reset behavior: + + print("--- new environment ---") + + config = DEFAULT_CONFIG_DICT + config["interface"] = DummyInterface + config["time_step_duration"] = time_step_duration + config["start_obs_capture"] = start_obs_capture + config["act_buf_len"] = act_buf_len + config["wait_on_done"] = True + config["reset_act_buf"] = True + config["ep_max_length"] = 10 + config["last_act_on_reset"] = False + + env = gymnasium.make("real-time-gym-v1", config=config) + + # reset, the default action (-1) will be sent: obs1, info = env.reset() - # default action (buffer): - self.assertEqual(obs1[3], -1) + # the actions in the buffer should all be the default (-1): + for j in range(act_buf_len): + print(f"The action buffer is {obs1[3 + j]} at index {j}") + self.assertEqual(obs1[3 + j], -1) + + # the first control is -2 at reset: + print(f"The control is {obs1[1]}") + self.assertEqual(obs1[1], np.array([-2.])) + + # let us step: + print("--- step ---") + obs1, _, terminated, truncated, _ = env.step(act) + + # let us look at the action buffer: + for j in range(act_buf_len): + print(f"The action buffer is {obs1[3 + j]} at index {j}") + # the buffer should end with act: + self.assertEqual(obs1[-1], act) + + # let us step again: + print("--- step ---") + obs1, _, terminated, truncated, _ = env.step(act) + + # let us look at the 
action buffer:
+        for j in range(act_buf_len):
+            print(f"The action buffer is {obs1[3 + j]} at index {j}")
+        # the buffer should still end with act:
+        self.assertEqual(obs1[-1], act)
+
+        # now let us reset again:
+        print("--- reset ---")
+        obs1, info = env.reset()
+
+        # the actions in the buffer should now all be the default (-1):
+        for j in range(act_buf_len):
+            print(f"The action buffer is {obs1[3 + j]} at index {j}")
+            self.assertEqual(obs1[3 + j], -1)
+
+        # and the first control is -2 at reset:
+        print(f"The control is {obs1[1]}")
+        self.assertEqual(obs1[1], np.array([-2.]))
+
+        # for good measure, let us step one last time:
+
+        print("--- step ---")
+        obs1, _, terminated, truncated, _ = env.step(act)
-        act = np.array([float(22)])
-        obs1, _, _, _, _ = env.step(act)
+        # the last actions in the buffer should be act:
+        for j in range(act_buf_len):
+            print(f"The action buffer is {obs1[3 + j]} at index {j}")
+        self.assertEqual(obs1[-1], act)
-        # new action (buffer):
-        self.assertEqual(obs1[3], 22)
+        # the applied control should be -1:
+        print(f"The action applied when obs was captured was {obs1[1]}")
+        self.assertEqual(obs1[1], np.array([-1.]))
 
 
 if __name__ == '__main__':
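
For completeness, below is a minimal, self-contained sketch of the reset behavior exercised by the tests above. `MinimalInterface` is a hypothetical stand-in for a real `RealTimeGymInterface` implementation (loosely modeled on the `DummyInterface` of `tests/test_all.py`), and the configuration values are illustrative only.

```python
import gymnasium
import numpy as np
from gymnasium import spaces

from rtgym import RealTimeGymInterface
from rtgym.envs.real_time_env import DEFAULT_CONFIG_DICT


class MinimalInterface(RealTimeGymInterface):
    """Hypothetical stand-in for a real RealTimeGymInterface implementation."""

    def __init__(self):
        self.control = np.array([0.0], dtype=np.float32)

    def get_observation_space(self):
        # a single dummy observation channel
        return spaces.Tuple((spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32),))

    def get_action_space(self):
        return spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

    def get_default_action(self):
        return np.array([0.0], dtype=np.float32)

    def send_control(self, control):
        # a real application would actuate a robot or a video game here
        self.control = control

    def reset(self, seed=None, options=None):
        self.control = self.get_default_action()
        return [np.array([0.0], dtype=np.float32)], {}

    def get_obs_rew_terminated_info(self):
        # a real application would capture an observation and compute a reward here
        return [np.array([0.0], dtype=np.float32)], 0.0, False, {}

    def wait(self):
        pass  # called whenever an episode ends, because "wait_on_done" is True below


my_config = DEFAULT_CONFIG_DICT.copy()
my_config["interface"] = MinimalInterface
my_config["time_step_duration"] = 0.02
my_config["start_obs_capture"] = 0.02
my_config["act_buf_len"] = 2
my_config["ep_max_length"] = 20
my_config["wait_on_done"] = True
my_config["reset_act_buf"] = False
my_config["last_act_on_reset"] = True  # reset() re-sends the last action instead of the default action

env = gymnasium.make("real-time-gym-v1", config=my_config)

obs, info = env.reset()
for _ in range(50):
    act = env.action_space.sample()
    obs, rew, terminated, truncated, info = env.step(act)
    if terminated or truncated:
        # with "last_act_on_reset": True, the action passed to the last step() is applied here:
        obs, info = env.reset()
```

With `"last_act_on_reset": True`, the action passed to the final `step()` of an episode (which is never applied when the episode ends) is re-sent by the subsequent `reset()`, so the real-time flow of actions is preserved across episodes.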