Commit 81eb6f3
add rl code
1 parent c3257c7
File tree: 13 files changed (+1709 / -88 lines)

13 files changed

+1709
-88
lines changed
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+__pycache__

docs/reinforcement_learning/code/algo/multi_armed_bandits.ipynb

Lines changed: 508 additions & 0 deletions
Large diffs are not rendered by default.
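
GitHub does not render this notebook's 508-line diff inline. For orientation, here is a minimal epsilon-greedy loop of the kind a multi-armed bandits notebook typically walks through (a sketch from general knowledge, not taken from the commit; all names are illustrative):

import numpy as np


def epsilon_greedy_bandit(true_means, steps=1000, epsilon=0.1, seed=0):
    """Run an epsilon-greedy agent on a Gaussian k-armed bandit."""
    rng = np.random.default_rng(seed)
    k = len(true_means)
    Q = np.zeros(k)  # estimated value of each arm
    N = np.zeros(k)  # number of pulls per arm
    total = 0.0
    for _ in range(steps):
        if rng.random() < epsilon:
            a = int(rng.integers(k))  # explore: random arm
        else:
            a = int(np.argmax(Q))     # exploit: current best estimate
        r = rng.normal(true_means[a], 1.0)
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]     # incremental sample-mean update
        total += r
    return Q, total / steps


print(epsilon_greedy_bandit([0.1, 0.5, 0.9]))

The incremental update keeps Q[a] equal to the running mean of rewards observed for arm a without storing the full history.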
Lines changed: 39 additions & 0 deletions

import numpy as np
from collections import defaultdict

from gym_env.envs.stochastic_grid_world import StochasticGridWorldEnv


def value_iter(env, theta=0.001, discount_factor=1.0):
    def one_step_lookahead(state, V):
        # Expected return of each action under the current value estimate.
        A = np.zeros(len(env._action_to_direction))
        for a in env._action_to_direction:
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    # Unseen states default to a value of 0.0.
    V = defaultdict(float)
    while True:
        delta = 0.0
        for s in env.P:
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value

        # Converged: no state value moved by more than theta this sweep.
        if delta < theta:
            break

    # Extract the greedy policy from the converged value function.
    policy = defaultdict(int)
    for s in env.P:
        A = one_step_lookahead(s, V)
        policy[s] = int(np.argmax(A))

    return policy, V


if __name__ == "__main__":
    env = StochasticGridWorldEnv()
    policy, V = value_iter(env)
    print(policy)
    print(V)
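
The sweep in value_iter is the standard Bellman optimality backup. In the code's notation, with $\gamma$ = discount_factor and $\theta$ = theta:

$$V_{k+1}(s) \;=\; \max_{a}\; \sum_{(p,\, s',\, r)\,\in\, P[s][a]} p\,\bigl(r + \gamma\, V_k(s')\bigr),$$

iterated until $\max_s \lvert V_{k+1}(s) - V_k(s)\rvert < \theta$; the returned policy is then greedy with respect to the converged $V$.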

docs/reinforcement_learning/code/gym_env/__init__.py

Whitespace-only changes.

docs/reinforcement_learning/code/gym_env/envs/__init__.py

Whitespace-only changes.

docs/reinforcement_learning/code/gym_env/envs/stochastic_grid_world.py

Lines changed: 223 additions & 85 deletions
Large diffs are not rendered by default.
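
This grid-world diff is also too large to render inline, but the value iteration script above pins down the interface the environment must expose: an `_action_to_direction` mapping over discrete actions and a tabular model `P[state][action]` holding `(prob, next_state, reward, done)` tuples. A minimal stand-in honoring that contract (a hypothetical stub for illustration, not the commit's implementation):

class TabularEnvStub:
    """Exposes only the attributes value_iter reads from the real env."""

    def __init__(self):
        # Four discrete actions; value_iter only reads the keys,
        # never the direction vectors themselves.
        self._action_to_direction = {0: (0, 1), 1: (0, -1), 2: (1, 0), 3: (-1, 0)}
        # P[state][action] -> list of (prob, next_state, reward, done).
        # State 0: action 0 reaches terminal state 1 (reward 1) 80% of the time.
        self.P = {
            0: {
                0: [(0.8, 1, 1.0, True), (0.2, 0, 0.0, False)],
                1: [(1.0, 0, 0.0, False)],
                2: [(1.0, 0, 0.0, False)],
                3: [(1.0, 0, 0.0, False)],
            },
            1: {a: [(1.0, 1, 0.0, True)] for a in range(4)},
        }

Running value_iter on this stub converges to V[0] ≈ 1.0 with a policy that always picks action 0 from state 0.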

docs/reinforcement_learning/code/pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -6,5 +6,8 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "gymnasium>=1.1.1",
+    "ipykernel>=6.29.5",
     "pygame>=2.6.1",
+    "torch>=2.7.0",
+    "torchrl>=0.8.0",
 ]
Lines changed: 152 additions & 0 deletions

A new Jupyter notebook (nbformat 4.5, ".venv" kernel, Python 3.13.3) containing the following cells:

In [2]:
from torchrl.envs import GymEnv

In [3]:
env = GymEnv("Pendulum-v1")

In [4]:
reset = env.reset()
reset

Out[4]:
TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        truncated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)

In [6]:
reset_with_action = env.rand_action(reset)
reset_with_action

Out[6]:
TensorDict(
    fields={
        action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        truncated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)

In [7]:
from enum import IntEnum


class ACTIONS(IntEnum):
    NORTH = 0
    SOUTH = 1
    EAST = 2
    WEST = 3

In [8]:
ACTIONS(1)

Out[8]:
<ACTIONS.SOUTH: 1>

(One empty code cell follows at the end of the notebook.)
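
The notebook stops after sampling a random action into the TensorDict. A natural continuation, sketched against the public TorchRL API (env.step() and env.rollout() are real GymEnv methods; treat the snippet itself as illustrative rather than part of the commit):

from torchrl.envs import GymEnv

env = GymEnv("Pendulum-v1")
td = env.rand_action(env.reset())

# step() writes the resulting transition into a "next" sub-tensordict.
stepped = env.step(td)
print(stepped["next", "reward"])       # shape [1] reward tensor
print(stepped["next", "observation"])  # shape [3] next observation

# rollout() chains reset/action/step for a fixed horizon,
# returning a TensorDict stacked along the time dimension.
trajectory = env.rollout(max_steps=10)
print(trajectory.batch_size)           # torch.Size([10])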
