From 6cc734e1e024b6da55d7a156aa4801da0b044e39 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Mon, 5 Jan 2026 11:14:01 +0700 Subject: [PATCH 01/14] feat(local-repl): add multi-context support for persistent REPL --- rlm/environments/local_repl.py | 77 ++++++++++++++++++++++++---------- tests/test_local_repl.py | 35 ++++++++++++++++ 2 files changed, 91 insertions(+), 21 deletions(-) diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index b8183800..0c9ec6f9 100644 --- a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -130,6 +130,7 @@ def __init__( self.original_cwd = os.getcwd() self.temp_dir = tempfile.mkdtemp(prefix=f"repl_env_{uuid.uuid4()}_") self._lock = threading.Lock() + self._context_count: int = 0 # Setup globals, locals, and modules in environment. self.setup() @@ -222,20 +223,55 @@ def _llm_query_batched(self, prompts: list[str], model: str | None = None) -> li return [f"Error: LM query failed - {e}"] * len(prompts) def load_context(self, context_payload: dict | list | str): - """Load context into the environment.""" + """Load context into the environment as context_0 (and 'context' alias).""" + self.add_context(context_payload, 0) + + def add_context( + self, context_payload: dict | list | str, context_index: int | None = None + ) -> int: + """ + Add a context with versioned variable name. + + Args: + context_payload: The context data to add + context_index: Optional explicit index. If None, auto-increments. + + Returns: + The context index used. + """ + if context_index is None: + context_index = self._context_count + + var_name = f"context_{context_index}" + if isinstance(context_payload, str): - context_path = os.path.join(self.temp_dir, "context.txt") + context_path = os.path.join(self.temp_dir, f"context_{context_index}.txt") with open(context_path, "w") as f: f.write(context_payload) - self.execute_code(f"with open(r'{context_path}', 'r') as f:\n context = f.read()") + self.execute_code(f"with open(r'{context_path}', 'r') as f:\n {var_name} = f.read()") else: - context_path = os.path.join(self.temp_dir, "context.json") + context_path = os.path.join(self.temp_dir, f"context_{context_index}.json") with open(context_path, "w") as f: json.dump(context_payload, f) self.execute_code( - f"import json\nwith open(r'{context_path}', 'r') as f:\n context = json.load(f)" + f"import json\nwith open(r'{context_path}', 'r') as f:\n {var_name} = json.load(f)" ) + # Alias context_0 as 'context' for backward compatibility + if context_index == 0: + self.execute_code(f"context = {var_name}") + + self._context_count = max(self._context_count, context_index + 1) + return context_index + + def update_handler_address(self, address: tuple[str, int]) -> None: + """Update the LM handler address for a new completion call.""" + self.lm_handler_address = address + + def get_context_count(self) -> int: + """Return the number of contexts loaded.""" + return self._context_count + @contextmanager def _capture_output(self): """Thread-safe context manager to capture stdout/stderr.""" @@ -265,22 +301,21 @@ def execute_code(self, code: str) -> REPLResult: # Clear pending LLM calls from previous execution self._pending_llm_calls = [] - with self._capture_output() as (stdout_buf, stderr_buf): - with self._temp_cwd(): - try: - combined = {**self.globals, **self.locals} - exec(code, combined, combined) - - # Update locals with new variables - for key, value in combined.items(): - if key not in self.globals and not 
key.startswith("_"): - self.locals[key] = value - - stdout = stdout_buf.getvalue() - stderr = stderr_buf.getvalue() - except Exception as e: - stdout = stdout_buf.getvalue() - stderr = stderr_buf.getvalue() + f"\n{type(e).__name__}: {e}" + with self._capture_output() as (stdout_buf, stderr_buf), self._temp_cwd(): + try: + combined = {**self.globals, **self.locals} + exec(code, combined, combined) + + # Update locals with new variables + for key, value in combined.items(): + if key not in self.globals and not key.startswith("_"): + self.locals[key] = value + + stdout = stdout_buf.getvalue() + stderr = stderr_buf.getvalue() + except Exception as e: + stdout = stdout_buf.getvalue() + stderr = stderr_buf.getvalue() + f"\n{type(e).__name__}: {e}" return REPLResult( stdout=stdout, diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py index f03c523c..ec5fac2c 100644 --- a/tests/test_local_repl.py +++ b/tests/test_local_repl.py @@ -193,3 +193,38 @@ def test_temp_dir_created_and_cleaned(self): assert os.path.exists(temp_dir) repl.cleanup() assert not os.path.exists(temp_dir) + + +class TestLocalREPLMultiContext: + """Tests for multi-context support.""" + + def test_add_context_versioning(self): + """Test that add_context creates versioned variables.""" + repl = LocalREPL() + repl.add_context("First", 0) + repl.add_context("Second", 1) + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.locals["context"] == "First" + assert repl.get_context_count() == 2 + repl.cleanup() + + def test_update_handler_address(self): + """Test handler address can be updated.""" + repl = LocalREPL(lm_handler_address=("127.0.0.1", 5000)) + repl.update_handler_address(("127.0.0.1", 6000)) + assert repl.lm_handler_address == ("127.0.0.1", 6000) + repl.cleanup() + + def test_add_context_auto_increment(self): + """Test that add_context auto-increments when no index provided.""" + repl = LocalREPL() + idx1 = repl.add_context("First") + idx2 = repl.add_context("Second") + assert idx1 == 0 + assert idx2 == 1 + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.get_context_count() == 2 + repl.cleanup() + From de6460878092b3f6b6e9acce12277833af813d60 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Mon, 5 Jan 2026 11:57:08 +0700 Subject: [PATCH 02/14] feat(core): add persistent flag for multi-turn REPL conversations Add persistent=True option to RLM that reuses the environment across completion() calls instead of creating/destroying for each call. This enables multi-turn conversations where variables and contexts persist. - Add persistent parameter to RLM constructor - Reuse environment when persistent=True, store as _persistent_env - Add close() method and context manager support (__enter__/__exit__) - Environment cleanup only on explicit close() when persistent --- rlm/core/rlm.py | 45 ++++++++++++++++++++++++++++++++-------- tests/test_local_repl.py | 1 - 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index 55a3cf58..349c20bf 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -51,6 +51,7 @@ def __init__( other_backend_kwargs: list[dict[str, Any]] | None = None, logger: RLMLogger | None = None, verbose: bool = False, + persistent: bool = False, ): """ Args: @@ -66,6 +67,7 @@ def __init__( other_backend_kwargs: The kwargs to pass to the other client backends (ordered to match other_backends). 
logger: The logger to use for the RLM. verbose: Whether to print verbose output in rich to console. + persistent: If True, reuse the environment across completion() calls for multi-turn conversations. """ # Store config for spawning per-completion self.backend = backend @@ -84,6 +86,10 @@ def __init__( self.logger = logger self.verbose = VerbosePrinter(enabled=verbose) + # Persistence support + self.persistent = persistent + self._persistent_env: BaseEnv | None = None + # Log metadata if logger is provided if self.logger or verbose: metadata = RLMMetadata( @@ -108,7 +114,9 @@ def __init__( def _spawn_completion_context(self, prompt: str | dict[str, Any]): """ Spawn an LM handler and environment for a single completion call. - Cleans up both when the context exits. + + When persistent=True, the environment is reused across calls. + When persistent=False (default), creates fresh environment each call. """ # Create client and wrap in handler client: BaseLM = get_client(self.backend, self.backend_kwargs) @@ -122,20 +130,25 @@ def _spawn_completion_context(self, prompt: str | dict[str, Any]): lm_handler.start() - # Pass handler address to environment so it can make llm_query() calls - env_kwargs = self.environment_kwargs.copy() - env_kwargs["lm_handler_address"] = (lm_handler.host, lm_handler.port) - env_kwargs["context_payload"] = prompt + # Environment: reuse if persistent, otherwise create fresh + if self.persistent and self._persistent_env is not None: + environment = self._persistent_env + environment.update_handler_address((lm_handler.host, lm_handler.port)) + environment.add_context(prompt) + else: + env_kwargs = self.environment_kwargs.copy() + env_kwargs["lm_handler_address"] = (lm_handler.host, lm_handler.port) + env_kwargs["context_payload"] = prompt + environment: BaseEnv = get_environment(self.environment_type, env_kwargs) - # Initialize the environment - environment: BaseEnv = get_environment(self.environment_type, env_kwargs) + if self.persistent: + self._persistent_env = environment try: yield lm_handler, environment finally: - # Cleanup lm_handler.stop() - if hasattr(environment, "cleanup"): + if not self.persistent and hasattr(environment, "cleanup"): environment.cleanup() def _setup_prompt(self, prompt: str | dict[str, Any]) -> list[dict[str, Any]]: @@ -292,3 +305,17 @@ def _fallback_answer(self, message: str | dict[str, Any]) -> str: client: BaseLM = get_client(self.backend, self.backend_kwargs) response = client.completion(message) return response + + def close(self) -> None: + """Clean up persistent environment. 
Call when done with multi-turn conversations."""
+        if self._persistent_env is not None:
+            if hasattr(self._persistent_env, "cleanup"):
+                self._persistent_env.cleanup()
+            self._persistent_env = None
+
+    def __enter__(self) -> "RLM":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
+        self.close()
+        return False
diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py
index ec5fac2c..15c33c01 100644
--- a/tests/test_local_repl.py
+++ b/tests/test_local_repl.py
 def test_add_context_auto_increment(self):
         assert repl.locals["context_1"] == "Second"
         assert repl.get_context_count() == 2
         repl.cleanup()
-

From b6e7c8500b2f44687e41aba456ffdbc6ee1cd3dd Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Mon, 5 Jan 2026 12:18:12 +0700
Subject: [PATCH 03/14] feat(prompts): inform model about multiple contexts in multi-turn sessions

Add context_count parameter to build_user_prompt() so the model knows
when multiple contexts are available (context_0, context_1, etc.)
during persistent REPL sessions.
---
 rlm/core/rlm.py      |  9 ++++++++-
 rlm/utils/prompts.py | 14 +++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py
index 349c20bf..c2dc69d9 100644
--- a/rlm/core/rlm.py
+++ b/rlm/core/rlm.py
 def completion(
         for i in range(self.max_iterations):
             # Current prompt = message history + additional prompt suffix
-            current_prompt = message_history + [build_user_prompt(root_prompt, i)]
+            context_count = (
+                environment.get_context_count()
+                if hasattr(environment, "get_context_count")
+                else 1
+            )
+            current_prompt = message_history + [
+                build_user_prompt(root_prompt, i, context_count)
+            ]

             iteration: RLMIteration = self._completion_turn(
                 prompt=current_prompt,
diff --git a/rlm/utils/prompts.py b/rlm/utils/prompts.py
index 3d0bd624..e47488e2 100644
--- a/rlm/utils/prompts.py
+++ b/rlm/utils/prompts.py
 USER_PROMPT_WITH_ROOT = """Think step-by-step on what to do using the REPL environment (which contains the context) to answer the original prompt: \"{root_prompt}\".\n\nContinue using the REPL environment, which has the `context` variable, and querying sub-LLMs by writing to ```repl``` tags, and determine your answer. Your next action:"""


-def build_user_prompt(root_prompt: str | None = None, iteration: int = 0) -> dict[str, str]:
+def build_user_prompt(
+    root_prompt: str | None = None,
+    iteration: int = 0,
+    context_count: int = 1,
+) -> dict[str, str]:
     if iteration == 0:
         safeguard = "You have not interacted with the REPL environment or seen your prompt / context yet. Your next action should be to look through and figure out how to answer the prompt, so don't just provide a final answer yet.\n\n"
         prompt = safeguard + (
             USER_PROMPT_WITH_ROOT.format(root_prompt=root_prompt) if root_prompt else USER_PROMPT
         )
-        return {"role": "user", "content": prompt}
     else:
         prompt = "The history before is your previous interactions with the REPL environment. " + (
             USER_PROMPT_WITH_ROOT.format(root_prompt=root_prompt) if root_prompt else USER_PROMPT
         )
-        return {"role": "user", "content": prompt}
+
+    # Inform model about multiple contexts if present
+    if context_count > 1:
+        prompt += f"\n\nNote: You have {context_count} contexts available (context_0 through context_{context_count - 1})."
+
+    return {"role": "user", "content": prompt}
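Taken together, patches 01-03 add a multi-turn API: a persistent environment, versioned context variables, and prompt hints about them. A minimal usage sketch (the backend and model names here are illustrative placeholders, not taken from the diffs):

    from rlm import RLM

    # persistent=True reuses one LocalREPL across completion() calls.
    with RLM(
        backend="openai",
        backend_kwargs={"model_name": "gpt-4o-mini"},
        persistent=True,
    ) as rlm:
        # First call creates the environment; the prompt payload becomes
        # context_0 (also aliased to `context`).
        rlm.completion("Summarize this report: ...")
        # Second call reuses the environment; the new payload becomes
        # context_1, and build_user_prompt() tells the model 2 contexts exist.
        rlm.completion("Compare it against this one: ...")
    # Leaving the with-block calls close(), which cleans up the environment.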
From aa6645e269c73bd274c6d09ad67f94713105ba3d Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Tue, 6 Jan 2026 14:26:45 +0700
Subject: [PATCH 04/14] fix(core): validate environment supports persistent mode before use

Add validation to prevent AttributeError when persistent=True is used
with environments that don't implement the required methods
(update_handler_address, add_context, get_context_count).

- Add _validate_persistent_environment_support() called at init time
- Add _env_supports_persistence() for runtime capability checking
- Add defensive runtime check before calling persistence methods
- Raise clear ValueError if unsupported environment is configured
---
 rlm/core/rlm.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py
index c2dc69d9..b1ec2e83 100644
--- a/rlm/core/rlm.py
+++ b/rlm/core/rlm.py
         self.persistent = persistent
         self._persistent_env: BaseEnv | None = None

+        # Validate persistence support at initialization
+        if self.persistent:
+            self._validate_persistent_environment_support()
+
         # Log metadata if logger is provided
         if self.logger or verbose:
             metadata = RLMMetadata(
         # Environment: reuse if persistent, otherwise create fresh
         if self.persistent and self._persistent_env is not None:
             environment = self._persistent_env
+            # Defensive check: ensure environment supports persistence methods
+            if not self._env_supports_persistence(environment):
+                raise RuntimeError(
+                    f"Persistent environment of type '{type(environment).__name__}' does not "
+                    f"implement required methods (update_handler_address, add_context, get_context_count). "
+                    f"This should have been caught at initialization."
+                )
             environment.update_handler_address((lm_handler.host, lm_handler.port))
             environment.add_context(prompt)
         else:
 def _fallback_answer(self, message: str | dict[str, Any]) -> str:
         client: BaseLM = get_client(self.backend, self.backend_kwargs)
         response = client.completion(message)
         return response

+    def _validate_persistent_environment_support(self) -> None:
+        """
+        Validate that the configured environment type supports persistent mode.
+
+        Persistent mode requires environments to implement:
+        - update_handler_address(address): Update LM handler address between calls
+        - add_context(payload, index): Add new context for multi-turn conversations
+        - get_context_count(): Return the number of loaded contexts
+
+        Currently only 'local' (LocalREPL) supports these methods.
+
+        Raises:
+            ValueError: If the environment type does not support persistent mode.
+        """
+        # Known environments that support persistence
+        persistent_supported_environments = {"local"}
+
+        if self.environment_type not in persistent_supported_environments:
+            raise ValueError(
+                f"persistent=True is not supported for environment type '{self.environment_type}'. "
+                f"Persistent mode requires environments that implement update_handler_address(), "
+                f"add_context(), and get_context_count(). 
" + f"Supported environments: {sorted(persistent_supported_environments)}" + ) + + @staticmethod + def _env_supports_persistence(env: BaseEnv) -> bool: + """Check if an environment instance supports persistent mode methods.""" + return ( + hasattr(env, "update_handler_address") + and hasattr(env, "add_context") + and hasattr(env, "get_context_count") + and callable(getattr(env, "update_handler_address", None)) + and callable(getattr(env, "add_context", None)) + and callable(getattr(env, "get_context_count", None)) + ) + def close(self) -> None: """Clean up persistent environment. Call when done with multi-turn conversations.""" if self._persistent_env is not None: From 5d6cefc728d1c0a8f14b71054e6c7182430fad76 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:08:52 +0700 Subject: [PATCH 05/14] feat(local-repl): add message history storage for multi-turn sessions Store conversation histories as versioned variables (history_0, history_1, etc.) in the REPL environment, making them accessible for subsequent queries. This enables models to reference prior conversation context in persistent sessions. --- rlm/core/rlm.py | 21 +++++++++- rlm/environments/local_repl.py | 34 +++++++++++++++++ rlm/utils/prompts.py | 8 ++++ tests/test_local_repl.py | 70 ++++++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 1 deletion(-) diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index b1ec2e83..195837d8 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -206,8 +206,13 @@ def completion( if hasattr(environment, "get_context_count") else 1 ) + history_count = ( + environment.get_history_count() + if hasattr(environment, "get_history_count") + else 0 + ) current_prompt = message_history + [ - build_user_prompt(root_prompt, i, context_count) + build_user_prompt(root_prompt, i, context_count, history_count) ] iteration: RLMIteration = self._completion_turn( @@ -232,6 +237,11 @@ def completion( usage = lm_handler.get_usage_summary() self.verbose.print_final_answer(final_answer) self.verbose.print_summary(i + 1, time_end - time_start, usage.to_dict()) + + # Store message history in persistent environment + if self.persistent and hasattr(environment, "add_history"): + environment.add_history(message_history) + return RLMChatCompletion( root_model=self.backend_kwargs.get("model_name", "unknown") if self.backend_kwargs @@ -254,6 +264,11 @@ def completion( usage = lm_handler.get_usage_summary() self.verbose.print_final_answer(final_answer) self.verbose.print_summary(self.max_iterations, time_end - time_start, usage.to_dict()) + + # Store message history in persistent environment + if self.persistent and hasattr(environment, "add_history"): + environment.add_history(message_history) + return RLMChatCompletion( root_model=self.backend_kwargs.get("model_name", "unknown") if self.backend_kwargs @@ -356,9 +371,13 @@ def _env_supports_persistence(env: BaseEnv) -> bool: hasattr(env, "update_handler_address") and hasattr(env, "add_context") and hasattr(env, "get_context_count") + and hasattr(env, "add_history") + and hasattr(env, "get_history_count") and callable(getattr(env, "update_handler_address", None)) and callable(getattr(env, "add_context", None)) and callable(getattr(env, "get_context_count", None)) + and callable(getattr(env, "add_history", None)) + and callable(getattr(env, "get_history_count", None)) ) def close(self) -> None: diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index 0c9ec6f9..7f1fcd6b 100644 --- 
a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -1,3 +1,4 @@ +import copy import io import json import os @@ -131,6 +132,7 @@ def __init__( self.temp_dir = tempfile.mkdtemp(prefix=f"repl_env_{uuid.uuid4()}_") self._lock = threading.Lock() self._context_count: int = 0 + self._history_count: int = 0 # Setup globals, locals, and modules in environment. self.setup() @@ -272,6 +274,38 @@ def get_context_count(self) -> int: """Return the number of contexts loaded.""" return self._context_count + def add_history( + self, message_history: list[dict[str, Any]], history_index: int | None = None + ) -> int: + """ + Store a conversation's message history as a versioned variable. + + Args: + message_history: The list of message dicts from a completion call + history_index: Optional explicit index. If None, auto-increments. + + Returns: + The history index used. + """ + if history_index is None: + history_index = self._history_count + + var_name = f"history_{history_index}" + + # Store deep copy to avoid reference issues with nested dicts + self.locals[var_name] = copy.deepcopy(message_history) + + # Alias history_0 as 'history' for convenience + if history_index == 0: + self.locals["history"] = self.locals[var_name] + + self._history_count = max(self._history_count, history_index + 1) + return history_index + + def get_history_count(self) -> int: + """Return the number of conversation histories stored.""" + return self._history_count + @contextmanager def _capture_output(self): """Thread-safe context manager to capture stdout/stderr.""" diff --git a/rlm/utils/prompts.py b/rlm/utils/prompts.py index e47488e2..f69b2292 100644 --- a/rlm/utils/prompts.py +++ b/rlm/utils/prompts.py @@ -120,6 +120,7 @@ def build_user_prompt( root_prompt: str | None = None, iteration: int = 0, context_count: int = 1, + history_count: int = 0, ) -> dict[str, str]: if iteration == 0: safeguard = "You have not interacted with the REPL environment or seen your prompt / context yet. Your next action should be to look through and figure out how to answer the prompt, so don't just provide a final answer yet.\n\n" @@ -135,4 +136,11 @@ def build_user_prompt( if context_count > 1: prompt += f"\n\nNote: You have {context_count} contexts available (context_0 through context_{context_count - 1})." + # Inform model about prior conversation histories if present + if history_count > 0: + if history_count == 1: + prompt += "\n\nNote: You have 1 prior conversation history available in the `history` variable." + else: + prompt += f"\n\nNote: You have {history_count} prior conversation histories available (history_0 through history_{history_count - 1})." 
+
+    return {"role": "user", "content": prompt}
diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py
index 15c33c01..9a6ccf42 100644
--- a/tests/test_local_repl.py
+++ b/tests/test_local_repl.py
         assert repl.locals["context_1"] == "Second"
         assert repl.get_context_count() == 2
         repl.cleanup()
+
+
+class TestLocalREPLHistory:
+    """Tests for message history storage in LocalREPL."""
+
+    def test_add_history_basic(self):
+        """Test that add_history stores message history correctly."""
+        repl = LocalREPL()
+
+        history = [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there!"},
+        ]
+
+        index = repl.add_history(history)
+
+        assert index == 0
+        assert "history_0" in repl.locals
+        assert "history" in repl.locals  # alias
+        assert repl.locals["history_0"] == history
+        assert repl.locals["history"] == history
+        assert repl.get_history_count() == 1
+
+        repl.cleanup()
+
+    def test_add_multiple_histories(self):
+        """Test adding multiple conversation histories."""
+        repl = LocalREPL()
+
+        history1 = [{"role": "user", "content": "First conversation"}]
+        history2 = [{"role": "user", "content": "Second conversation"}]
+
+        repl.add_history(history1)
+        repl.add_history(history2)
+
+        assert repl.get_history_count() == 2
+        assert repl.locals["history_0"] == history1
+        assert repl.locals["history_1"] == history2
+        assert repl.locals["history"] == history1  # alias stays on first
+
+        repl.cleanup()
+
+    def test_history_accessible_via_code(self):
+        """Test that stored history is accessible via code execution."""
+        repl = LocalREPL()
+
+        history = [{"role": "user", "content": "Test message"}]
+        repl.add_history(history)
+
+        result = repl.execute_code("msg = history[0]['content']")
+        assert result.stderr == ""
+        assert repl.locals["msg"] == "Test message"
+
+        repl.cleanup()
+
+    def test_history_is_copy(self):
+        """Test that stored history is a copy, not a reference."""
+        repl = LocalREPL()
+
+        history = [{"role": "user", "content": "Original"}]
+        repl.add_history(history)
+
+        # Modify original
+        history[0]["content"] = "Modified"
+
+        # Stored copy should be unchanged
+        assert repl.locals["history_0"][0]["content"] == "Original"
+
+        repl.cleanup()
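With history storage in place, code the model writes in a later turn of a persistent session can read both the versioned contexts and the stored transcripts. A sketch of what a second-turn ```repl``` block could do (the payload values are invented for illustration):

    # Runs inside the persistent LocalREPL during the second completion call.
    print(context_0)   # payload from the first completion (alias: `context`)
    print(context_1)   # payload from the current completion
    # history_0 (alias: `history`) is the first call's message list.
    user_turns = [m for m in history_0 if m["role"] == "user"]
    print(user_turns[-1]["content"])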
""" - def __init__(self, **kwargs): + def __init__(self, persistent: bool = False, **kwargs): + self.persistent = persistent self.kwargs = kwargs @abstractmethod @@ -31,8 +32,8 @@ class IsolatedEnv(BaseEnv, ABC): guaranteeing complete isolation from the LM process. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__(self, persistent: bool = False, **kwargs): + super().__init__(persistent=persistent, **kwargs) @abstractmethod def setup(self): @@ -54,8 +55,8 @@ class NonIsolatedEnv(BaseEnv, ABC): as a subprocess. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__(self, persistent: bool = False, **kwargs): + super().__init__(persistent=persistent, **kwargs) @abstractmethod def setup(self): diff --git a/rlm/environments/docker_repl.py b/rlm/environments/docker_repl.py index 6dd8c00a..19714857 100644 --- a/rlm/environments/docker_repl.py +++ b/rlm/environments/docker_repl.py @@ -180,9 +180,14 @@ def __init__( lm_handler_address: tuple[str, int] | None = None, context_payload: dict | list | str | None = None, setup_code: str | None = None, + persistent: bool = False, **kwargs, ): - super().__init__(**kwargs) + if persistent: + raise NotImplementedError( + "Persistent REPLs are currently not supported for environment: DockerREPL" + ) + super().__init__(persistent=persistent, **kwargs) self.image = image self.lm_handler_address = lm_handler_address @@ -292,13 +297,13 @@ def execute_code(self, code: str) -> REPLResult: ) def cleanup(self): - if self.container_id: + if hasattr(self, "container_id") and self.container_id: subprocess.run(["docker", "stop", self.container_id], capture_output=True) self.container_id = None - if self.proxy_server: + if hasattr(self, "proxy_server") and self.proxy_server: self.proxy_server.shutdown() self.proxy_server = None - if os.path.exists(self.temp_dir): + if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir): import shutil shutil.rmtree(self.temp_dir, ignore_errors=True) diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index 7f1fcd6b..05de1d3d 100644 --- a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -123,9 +123,10 @@ def __init__( lm_handler_address: tuple[str, int] | None = None, context_payload: dict | list | str | None = None, setup_code: str | None = None, + persistent: bool = False, **kwargs, ): - super().__init__(**kwargs) + super().__init__(persistent=persistent, **kwargs) self.lm_handler_address = lm_handler_address self.original_cwd = os.getcwd() diff --git a/rlm/environments/modal_repl.py b/rlm/environments/modal_repl.py index 2acfed18..82fa24cf 100644 --- a/rlm/environments/modal_repl.py +++ b/rlm/environments/modal_repl.py @@ -309,9 +309,14 @@ def __init__( lm_handler_address: tuple[str, int] | None = None, context_payload: dict | list | str | None = None, setup_code: str | None = None, + persistent: bool = False, **kwargs, ): - super().__init__(**kwargs) + if persistent: + raise NotImplementedError( + "Persistent REPLs are currently not supported for environment: ModalREPL" + ) + super().__init__(persistent=persistent, **kwargs) self.app_name = app_name self.timeout = timeout diff --git a/rlm/environments/prime_repl.py b/rlm/environments/prime_repl.py index 19a082ca..0d88ce90 100644 --- a/rlm/environments/prime_repl.py +++ b/rlm/environments/prime_repl.py @@ -8,9 +8,14 @@ def __init__( context_payload: dict | list | str | None = None, sandbox_name: str | None = None, api_key: str | None = None, + persistent: bool = False, **kwargs, 
From 2ac518ad34f6aaaad504c1c0cc87c1fb2fd7abbf Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Thu, 8 Jan 2026 09:53:20 +0700
Subject: [PATCH 07/14] test(local-repl): replace multi-context tests with non-persistent simulation tests

Replace TestLocalREPLMultiContext and TestLocalREPLHistory with
TestLocalREPLSimulatingRLMNoPersistence. New tests verify that
environments reset between RLM completions when persistent=False.
---
 tests/test_local_repl.py | 149 +++++++++++++--------------------------
 1 file changed, 48 insertions(+), 101 deletions(-)

diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py
index 9a6ccf42..115f671c 100644
--- a/tests/test_local_repl.py
+++ b/tests/test_local_repl.py
-class TestLocalREPLMultiContext:
-    """Tests for multi-context support."""
+class TestLocalREPLSimulatingRLMNoPersistence:
+    """
+    Tests simulating RLM's non-persistent completion behavior.
+
+    When RLM is configured without persistent=True (the default), each
+    completion() call spawns a fresh environment and destroys it afterwards.
+    This test suite simulates that behavior to prove variables don't survive
+    across RLM completions.
+
+    Why this matters: This is NOT just testing that two Python objects don't
+    share state (trivially true). This simulates the actual RLM workflow where
+    environments are created and destroyed per completion.
+    """
+
+    def test_simulated_rlm_completions_reset_environment(self):
+        """
+        Simulates 2 RLM completions to show env resets between calls.
+
+        Without persistent=True, RLM creates a fresh environment for each
+        completion, so state doesn't carry over.
+        """
+        completion_1_env = LocalREPL()
+        completion_1_env.execute_code("important_result = 42")
+        assert completion_1_env.locals["important_result"] == 42
+        completion_1_env.cleanup()
+
+        completion_2_env = LocalREPL()
+        result = completion_2_env.execute_code("print(important_result)")
+
+        assert "NameError" in result.stderr
+        assert "important_result" in result.stderr
+        completion_2_env.cleanup()
+
+    def test_simulated_rlm_completions_functions_not_preserved(self):
+        """Simulates 2 RLM completions to show functions don't persist.
+ """ + completion_1_env = LocalREPL() + completion_1_env.execute_code("def my_helper(): return 'useful'") + assert completion_1_env.execute_code("print(my_helper())").stdout.strip() == "useful" + completion_1_env.cleanup() + + completion_2_env = LocalREPL() + result = completion_2_env.execute_code("my_helper()") + + assert "NameError" in result.stderr + assert "my_helper" in result.stderr + completion_2_env.cleanup() - def test_add_context_versioning(self): - """Test that add_context creates versioned variables.""" - repl = LocalREPL() - repl.add_context("First", 0) - repl.add_context("Second", 1) - assert repl.locals["context_0"] == "First" - assert repl.locals["context_1"] == "Second" - assert repl.locals["context"] == "First" - assert repl.get_context_count() == 2 - repl.cleanup() - - def test_update_handler_address(self): - """Test handler address can be updated.""" - repl = LocalREPL(lm_handler_address=("127.0.0.1", 5000)) - repl.update_handler_address(("127.0.0.1", 6000)) - assert repl.lm_handler_address == ("127.0.0.1", 6000) - repl.cleanup() - - def test_add_context_auto_increment(self): - """Test that add_context auto-increments when no index provided.""" - repl = LocalREPL() - idx1 = repl.add_context("First") - idx2 = repl.add_context("Second") - assert idx1 == 0 - assert idx2 == 1 - assert repl.locals["context_0"] == "First" - assert repl.locals["context_1"] == "Second" - assert repl.get_context_count() == 2 - repl.cleanup() - - -class TestLocalREPLHistory: - """Tests for message history storage in LocalREPL.""" - - def test_add_history_basic(self): - """Test that add_history stores message history correctly.""" - repl = LocalREPL() - - history = [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hi there!"}, - ] - - index = repl.add_history(history) - - assert index == 0 - assert "history_0" in repl.locals - assert "history" in repl.locals # alias - assert repl.locals["history_0"] == history - assert repl.locals["history"] == history - assert repl.get_history_count() == 1 - - repl.cleanup() - - def test_add_multiple_histories(self): - """Test adding multiple conversation histories.""" - repl = LocalREPL() - - history1 = [{"role": "user", "content": "First conversation"}] - history2 = [{"role": "user", "content": "Second conversation"}] - - repl.add_history(history1) - repl.add_history(history2) - - assert repl.get_history_count() == 2 - assert repl.locals["history_0"] == history1 - assert repl.locals["history_1"] == history2 - assert repl.locals["history"] == history1 # alias stays on first - - repl.cleanup() - - def test_history_accessible_via_code(self): - """Test that stored history is accessible via code execution.""" - repl = LocalREPL() - - history = [{"role": "user", "content": "Test message"}] - repl.add_history(history) - - result = repl.execute_code("msg = history[0]['content']") - assert result.stderr == "" - assert repl.locals["msg"] == "Test message" - - repl.cleanup() - - def test_history_is_copy(self): - """Test that stored history is a copy, not a reference.""" - repl = LocalREPL() - - history = [{"role": "user", "content": "Original"}] - repl.add_history(history) - - # Modify original - history[0]["content"] = "Modified" - - # Stored copy should be unchanged - assert repl.locals["history_0"][0]["content"] == "Original" - - repl.cleanup() From ac3c2d7f9553647219d34ef46086f8b3b5c6ae58 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 
Jan 2026 09:59:25 +0700 Subject: [PATCH 08/14] test(local-repl): add persistent mode unit tests Add TestLocalREPLMultiContext, TestLocalREPLHistory, and TestLocalREPLPersistentState test classes for LocalREPL's persistent mode features including multi-context versioning and message history storage. --- tests/test_local_repl_persistent.py | 218 ++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 tests/test_local_repl_persistent.py diff --git a/tests/test_local_repl_persistent.py b/tests/test_local_repl_persistent.py new file mode 100644 index 00000000..4624c04a --- /dev/null +++ b/tests/test_local_repl_persistent.py @@ -0,0 +1,218 @@ +"""Tests for LocalREPL persistence features. + +These tests verify LocalREPL's multi-context and multi-history capabilities +which support the persistent=True mode in RLM for multi-turn conversations. +""" + +from rlm.environments.local_repl import LocalREPL + + +class TestLocalREPLMultiContext: + """Tests for multi-context support in persistent mode.""" + + def test_add_context_versioning(self): + """Test that add_context creates versioned variables.""" + repl = LocalREPL() + repl.add_context("First", 0) + repl.add_context("Second", 1) + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.locals["context"] == "First" + assert repl.get_context_count() == 2 + repl.cleanup() + + def test_update_handler_address(self): + """Test handler address can be updated.""" + repl = LocalREPL(lm_handler_address=("127.0.0.1", 5000)) + repl.update_handler_address(("127.0.0.1", 6000)) + assert repl.lm_handler_address == ("127.0.0.1", 6000) + repl.cleanup() + + def test_add_context_auto_increment(self): + """Test that add_context auto-increments when no index provided.""" + repl = LocalREPL() + idx1 = repl.add_context("First") + idx2 = repl.add_context("Second") + assert idx1 == 0 + assert idx2 == 1 + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.get_context_count() == 2 + repl.cleanup() + + def test_contexts_accessible_in_code(self): + """Test that multiple contexts can be accessed in code execution.""" + repl = LocalREPL() + repl.add_context("Document A content") + repl.add_context("Document B content") + + result = repl.execute_code("combined = f'{context_0} + {context_1}'") + assert result.stderr == "" + assert repl.locals["combined"] == "Document A content + Document B content" + repl.cleanup() + + def test_context_alias_points_to_first(self): + """Test that 'context' always aliases context_0.""" + repl = LocalREPL() + repl.add_context("First") + repl.add_context("Second") + repl.add_context("Third") + + result = repl.execute_code("is_first = context == context_0") + assert result.stderr == "" + assert repl.locals["is_first"] is True + repl.cleanup() + + +class TestLocalREPLHistory: + """Tests for message history storage in LocalREPL for persistent sessions.""" + + def test_add_history_basic(self): + """Test that add_history stores message history correctly.""" + repl = LocalREPL() + + history = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + ] + + index = repl.add_history(history) + + assert index == 0 + assert "history_0" in repl.locals + assert "history" in repl.locals # alias + assert repl.locals["history_0"] == history + assert repl.locals["history"] == history + assert repl.get_history_count() == 1 + + repl.cleanup() + + def 
test_add_multiple_histories(self): + """Test adding multiple conversation histories.""" + repl = LocalREPL() + + history1 = [{"role": "user", "content": "First conversation"}] + history2 = [{"role": "user", "content": "Second conversation"}] + + repl.add_history(history1) + repl.add_history(history2) + + assert repl.get_history_count() == 2 + assert repl.locals["history_0"] == history1 + assert repl.locals["history_1"] == history2 + assert repl.locals["history"] == history1 # alias stays on first + + repl.cleanup() + + def test_history_accessible_via_code(self): + """Test that stored history is accessible via code execution.""" + repl = LocalREPL() + + history = [{"role": "user", "content": "Test message"}] + repl.add_history(history) + + result = repl.execute_code("msg = history[0]['content']") + assert result.stderr == "" + assert repl.locals["msg"] == "Test message" + + repl.cleanup() + + def test_history_is_copy(self): + """Test that stored history is a copy, not a reference.""" + repl = LocalREPL() + + history = [{"role": "user", "content": "Original"}] + repl.add_history(history) + + history[0]["content"] = "Modified" + + assert repl.locals["history_0"][0]["content"] == "Original" + + repl.cleanup() + + def test_can_iterate_histories_in_code(self): + """Test iterating through multiple histories in code.""" + repl = LocalREPL() + + repl.add_history([{"role": "user", "content": "Query 1"}]) + repl.add_history([{"role": "user", "content": "Query 2"}]) + repl.add_history([{"role": "user", "content": "Query 3"}]) + + code = """ +all_contents = [ + history_0[0]['content'], + history_1[0]['content'], + history_2[0]['content'], +] +""" + result = repl.execute_code(code) + assert result.stderr == "" + assert repl.locals["all_contents"] == ["Query 1", "Query 2", "Query 3"] + + repl.cleanup() + + +class TestLocalREPLPersistentState: + """Tests for state persistence across multiple operations in a single REPL instance.""" + + def test_variables_persist_with_contexts(self): + """Variables and contexts should coexist.""" + repl = LocalREPL() + + repl.add_context("My context data") + repl.execute_code("summary = context.upper()") + assert repl.locals["summary"] == "MY CONTEXT DATA" + + repl.add_context("New context") + + assert repl.locals["summary"] == "MY CONTEXT DATA" + assert repl.locals["context_1"] == "New context" + + repl.cleanup() + + def test_variables_persist_with_histories(self): + """Variables and histories should coexist.""" + repl = LocalREPL() + + repl.add_history([{"role": "user", "content": "Hello"}]) + repl.execute_code("extracted = history[0]['content']") + assert repl.locals["extracted"] == "Hello" + + repl.add_history([{"role": "user", "content": "World"}]) + + assert repl.locals["extracted"] == "Hello" + assert repl.locals["history_1"][0]["content"] == "World" + + repl.cleanup() + + def test_full_persistent_session_simulation(self): + """Simulate a multi-turn persistent session.""" + repl = LocalREPL() + + repl.add_context("Document: Sales were $1000") + repl.execute_code("sales = 1000") + + repl.add_context("Document: Costs were $400") + result = repl.execute_code("profit = sales - 400") + assert result.stderr == "" + assert repl.locals["profit"] == 600 + + repl.add_history([ + {"role": "user", "content": "What were the sales?"}, + {"role": "assistant", "content": "Sales were $1000"}, + ]) + + code = """ +summary = f"Sales: {context_0}, Costs: {context_1}, Profit: {profit}" +prev_question = history_0[0]['content'] +""" + result = repl.execute_code(code) + assert 
result.stderr == "" + assert "Profit: 600" in repl.locals["summary"] + assert repl.locals["prev_question"] == "What were the sales?" + + assert repl.get_context_count() == 2 + assert repl.get_history_count() == 1 + + repl.cleanup() From 1397cbd1f6d9af667177ebe59eb86452f4216e68 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 Jan 2026 09:59:34 +0700 Subject: [PATCH 09/14] test(rlm): add multi-turn integration tests Add comprehensive integration tests for persistent RLM sessions: - Environment reuse across completion calls - Context and history accumulation - Variable persistence between completions - Prompt awareness of contexts/histories - Resource cleanup on close - Validation of unsupported environments --- tests/test_multi_turn_integration.py | 443 +++++++++++++++++++++++++++ 1 file changed, 443 insertions(+) create mode 100644 tests/test_multi_turn_integration.py diff --git a/tests/test_multi_turn_integration.py b/tests/test_multi_turn_integration.py new file mode 100644 index 00000000..c67b22d4 --- /dev/null +++ b/tests/test_multi_turn_integration.py @@ -0,0 +1,443 @@ +"""Integration tests for multi-turn persistent REPL sessions. + +Tests that multiple LM completion calls in one RLM session: +1. Share the same environment +2. Accumulate contexts (context_0, context_1, ...) +3. Accumulate histories (history_0, history_1, ...) +4. Preserve variables across calls +5. Properly inform the model about available contexts/histories +""" + +import pytest +from unittest.mock import patch +from typing import Any + +from rlm import RLM +from rlm.clients.base_lm import BaseLM +from rlm.core.types import UsageSummary, ModelUsageSummary +import rlm.core.rlm as rlm_module # Import the module for patching + + +class ScriptedMockLM(BaseLM): + """Mock LM that returns scripted responses to enable controlled testing.""" + + def __init__(self, responses: list[str] | None = None): + super().__init__(model_name="scripted-mock") + self.responses = responses or [] + self.call_index = 0 + self.prompts_received: list[Any] = [] + + def completion(self, prompt: Any) -> str: + self.prompts_received.append(prompt) + if self.call_index < len(self.responses): + response = self.responses[self.call_index] + self.call_index += 1 + return response + return "FINAL(default answer)" + + async def acompletion(self, prompt: Any) -> str: + return self.completion(prompt) + + def get_usage_summary(self) -> UsageSummary: + return UsageSummary( + model_usage_summaries={ + "scripted-mock": ModelUsageSummary( + total_calls=self.call_index, + total_input_tokens=100 * self.call_index, + total_output_tokens=50 * self.call_index, + ) + } + ) + + def get_last_usage(self) -> UsageSummary: + return self.get_usage_summary() + + def reset(self) -> None: + """Reset for next completion call (call index resets, but keep history of prompts).""" + self.call_index = 0 + + +class TestMultiTurnPersistentEnvironment: + """Tests for environment persistence across completion calls.""" + + def test_environment_reused_in_persistent_mode(self): + """Verify the same environment instance is reused across completion calls.""" + # Response that immediately provides a final answer + responses = ["FINAL(answer from call)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First context") + 
first_env = rlm._persistent_env + + mock_lm.reset() + + rlm.completion("Second context") + second_env = rlm._persistent_env + + assert first_env is second_env + assert first_env is not None + + def test_context_accumulation_across_calls(self): + """Verify contexts accumulate: context_0, context_1, etc.""" + responses = ["FINAL(got it)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First document") + mock_lm.reset() + rlm.completion("Second document") + mock_lm.reset() + rlm.completion("Third document") + + env = rlm._persistent_env + assert env.get_context_count() == 3 + assert env.locals["context_0"] == "First document" + assert env.locals["context_1"] == "Second document" + assert env.locals["context_2"] == "Third document" + # context alias should point to first context + assert env.locals["context"] == "First document" + + def test_history_accumulation_across_calls(self): + """Verify message histories accumulate: history_0, history_1, etc.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Context A") + mock_lm.reset() + rlm.completion("Context B") + mock_lm.reset() + rlm.completion("Context C") + + env = rlm._persistent_env + # After 3 completions, should have 3 histories stored + assert env.get_history_count() == 3 + assert "history_0" in env.locals + assert "history_1" in env.locals + assert "history_2" in env.locals + # Each history should be a list of message dicts + assert isinstance(env.locals["history_0"], list) + assert len(env.locals["history_0"]) > 0 + # history alias should point to first history + assert env.locals["history"] == env.locals["history_0"] + + def test_variable_persistence_across_completions(self): + """Variables computed in one completion should be available in subsequent ones.""" + # First call: compute a variable + # Second call: use that variable + first_responses = [ + "Let me compute something\n```repl\ncomputed_value = 42 * 2\nprint(computed_value)\n```", + "FINAL(84)", + ] + second_responses = [ + "```repl\nresult = computed_value + 10\nprint(result)\n```", + "FINAL(94)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(first_responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Compute 42 * 2") + assert rlm._persistent_env.locals.get("computed_value") == 84 + + mock_lm.responses = second_responses + mock_lm.reset() + rlm.completion("Add 10 to the previous result") + + assert rlm._persistent_env.locals.get("computed_value") == 84 + assert rlm._persistent_env.locals.get("result") == 94 + + +class TestMultiTurnPromptAwareness: + """Tests that prompts correctly inform the model about contexts/histories.""" + + def test_prompt_includes_context_count(self): + """Model should be informed about available contexts.""" + responses = ["FINAL(ok)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + 
backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First") + mock_lm.reset() + rlm.completion("Second") + + # Check that the second call's prompt mentions multiple contexts + last_prompt = mock_lm.prompts_received[-1] + # last_prompt is a list of message dicts + user_messages = [m for m in last_prompt if m.get("role") == "user"] + user_content = " ".join(m.get("content", "") for m in user_messages) + + assert "2 contexts" in user_content or "context_0" in user_content + + def test_prompt_includes_history_count(self): + """Model should be informed about available histories.""" + responses = ["FINAL(ok)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First task") + mock_lm.reset() + rlm.completion("Second task") # Should have 1 history from first + + # The prompt for second call should mention history + last_prompt = mock_lm.prompts_received[-1] + user_messages = [m for m in last_prompt if m.get("role") == "user"] + user_content = " ".join(m.get("content", "") for m in user_messages) + + assert "history" in user_content.lower() + + +class TestMultiTurnCodeExecution: + """Tests for code execution in multi-turn sessions.""" + + def test_can_access_previous_context_in_code(self): + """Code should be able to reference earlier contexts.""" + responses = [ + "```repl\nprint(f'First: {context_0}, Second: {context_1}')\n```", + "FINAL(printed both)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(["FINAL(first done)"]) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Document A") + + mock_lm.responses = responses + mock_lm.reset() + rlm.completion("Document B") + + # Both contexts should be accessible + env = rlm._persistent_env + assert env.locals["context_0"] == "Document A" + assert env.locals["context_1"] == "Document B" + + def test_can_access_history_in_code(self): + """Code should be able to reference stored histories.""" + responses = [ + "```repl\nprint(f'History entries: {len(history)}')\n```", + "FINAL(accessed history)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(["FINAL(first)"]) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First query") + + mock_lm.responses = responses + mock_lm.reset() + + rlm.completion("Second query") + + env = rlm._persistent_env + assert "history" in env.locals + assert isinstance(env.locals["history"], list) + + +class TestNonPersistentMode: + """Tests to ensure non-persistent mode still works correctly.""" + + def test_non_persistent_creates_fresh_environment(self): + """Non-persistent mode should create new environment each call.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=False, # Explicitly non-persistent + ) + + rlm.completion("First") + # In non-persistent mode, no persistent env should be stored + assert 
rlm._persistent_env is None + + mock_lm.reset() + rlm.completion("Second") + assert rlm._persistent_env is None + + def test_default_is_non_persistent(self): + """Default behavior should be non-persistent.""" + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + ) + assert rlm.persistent is False + + +class TestPersistentModeResourceManagement: + """Tests for proper resource cleanup in persistent mode.""" + + def test_context_manager_cleanup(self): + """Environment should be cleaned up when exiting context manager.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Test") + assert rlm._persistent_env is not None + + # After exiting context manager, env should be cleaned up + assert rlm._persistent_env is None + + def test_explicit_close(self): + """Calling close() should clean up persistent environment.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) + rlm.completion("Test") + assert rlm._persistent_env is not None + + rlm.close() + assert rlm._persistent_env is None + + +class TestPersistentModeValidation: + """Tests for persistent mode validation.""" + + def test_unsupported_environment_raises_error(self): + """Persistent mode should raise error for unsupported environments.""" + with pytest.raises(ValueError, match="persistent=True is not supported"): + RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + environment="docker", # Not supported for persistent + persistent=True, + ) + + def test_local_environment_supported(self): + """Local environment should support persistent mode.""" + # Should not raise + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + environment="local", + persistent=True, + ) + assert rlm.persistent is True + + +class TestMultiTurnEndToEnd: + """End-to-end tests simulating realistic multi-turn usage.""" + + def test_three_turn_conversation(self): + """Simulate a 3-turn conversation with context accumulation.""" + turn1_responses = [ + "Looking at the first document\n```repl\ndoc1_summary = 'Has info about cats'\nprint(doc1_summary)\n```", + "FINAL(Summarized first doc)", + ] + turn2_responses = [ + "Looking at second document and comparing\n```repl\ndoc2_summary = 'Has info about dogs'\nprint(f'Doc1: {doc1_summary}, Doc2: {doc2_summary}')\n```", + "FINAL(Compared both docs)", + ] + turn3_responses = [ + "Final synthesis\n```repl\nfinal = f'Combined: {doc1_summary} and {doc2_summary} from context_2'\nprint(final)\n```", + "FINAL(synthesized all)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(turn1_responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + # Turn 1 + result1 = rlm.completion("First document about cats") + assert "Summarized" in result1.response + + # Turn 2 + mock_lm.responses = turn2_responses + mock_lm.reset() + result2 = rlm.completion("Second document about dogs") + assert "Compared" in result2.response + + # Turn 3 + mock_lm.responses = turn3_responses + 
mock_lm.reset() + result3 = rlm.completion("Synthesize everything") + assert "synthesized" in result3.response + + # Verify state + env = rlm._persistent_env + assert env.get_context_count() == 3 + assert env.get_history_count() == 3 + assert env.locals.get("doc1_summary") == "Has info about cats" + assert env.locals.get("doc2_summary") == "Has info about dogs" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 1e76ac6fd97fd9df0678938daaa93347c18385f2 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 Jan 2026 10:30:43 +0700 Subject: [PATCH 10/14] refactor(tests): replace ScriptedMockLM with standard unittest.mock Remove custom ScriptedMockLM class (58 lines) and replace with create_mock_lm() helper (14 lines) using standard Mock patterns. --- tests/test_multi_turn_integration.py | 143 +++++++++------------------ 1 file changed, 48 insertions(+), 95 deletions(-) diff --git a/tests/test_multi_turn_integration.py b/tests/test_multi_turn_integration.py index c67b22d4..662b3e83 100644 --- a/tests/test_multi_turn_integration.py +++ b/tests/test_multi_turn_integration.py @@ -9,52 +9,26 @@ """ import pytest -from unittest.mock import patch -from typing import Any +from unittest.mock import patch, Mock from rlm import RLM -from rlm.clients.base_lm import BaseLM from rlm.core.types import UsageSummary, ModelUsageSummary -import rlm.core.rlm as rlm_module # Import the module for patching - - -class ScriptedMockLM(BaseLM): - """Mock LM that returns scripted responses to enable controlled testing.""" - - def __init__(self, responses: list[str] | None = None): - super().__init__(model_name="scripted-mock") - self.responses = responses or [] - self.call_index = 0 - self.prompts_received: list[Any] = [] - - def completion(self, prompt: Any) -> str: - self.prompts_received.append(prompt) - if self.call_index < len(self.responses): - response = self.responses[self.call_index] - self.call_index += 1 - return response - return "FINAL(default answer)" - - async def acompletion(self, prompt: Any) -> str: - return self.completion(prompt) - - def get_usage_summary(self) -> UsageSummary: - return UsageSummary( - model_usage_summaries={ - "scripted-mock": ModelUsageSummary( - total_calls=self.call_index, - total_input_tokens=100 * self.call_index, - total_output_tokens=50 * self.call_index, - ) - } - ) +import rlm.core.rlm as rlm_module - def get_last_usage(self) -> UsageSummary: - return self.get_usage_summary() - def reset(self) -> None: - """Reset for next completion call (call index resets, but keep history of prompts).""" - self.call_index = 0 +def create_mock_lm(responses: list[str]) -> Mock: + """Create a mock LM that returns responses in order.""" + mock = Mock() + mock.completion.side_effect = list(responses) + mock.get_usage_summary.return_value = UsageSummary( + model_usage_summaries={ + "mock": ModelUsageSummary( + total_calls=1, total_input_tokens=100, total_output_tokens=50 + ) + } + ) + mock.get_last_usage.return_value = mock.get_usage_summary.return_value + return mock class TestMultiTurnPersistentEnvironment: @@ -62,11 +36,10 @@ class TestMultiTurnPersistentEnvironment: def test_environment_reused_in_persistent_mode(self): """Verify the same environment instance is reused across completion calls.""" - # Response that immediately provides a final answer responses = ["FINAL(answer from call)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) 
mock_get_client.return_value = mock_lm with RLM( @@ -77,7 +50,7 @@ def test_environment_reused_in_persistent_mode(self): rlm.completion("First context") first_env = rlm._persistent_env - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second context") second_env = rlm._persistent_env @@ -90,7 +63,7 @@ def test_context_accumulation_across_calls(self): responses = ["FINAL(got it)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -99,9 +72,9 @@ def test_context_accumulation_across_calls(self): persistent=True, ) as rlm: rlm.completion("First document") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second document") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Third document") env = rlm._persistent_env @@ -109,7 +82,6 @@ def test_context_accumulation_across_calls(self): assert env.locals["context_0"] == "First document" assert env.locals["context_1"] == "Second document" assert env.locals["context_2"] == "Third document" - # context alias should point to first context assert env.locals["context"] == "First document" def test_history_accumulation_across_calls(self): @@ -117,7 +89,7 @@ def test_history_accumulation_across_calls(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -126,27 +98,22 @@ def test_history_accumulation_across_calls(self): persistent=True, ) as rlm: rlm.completion("Context A") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Context B") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Context C") env = rlm._persistent_env - # After 3 completions, should have 3 histories stored assert env.get_history_count() == 3 assert "history_0" in env.locals assert "history_1" in env.locals assert "history_2" in env.locals - # Each history should be a list of message dicts assert isinstance(env.locals["history_0"], list) assert len(env.locals["history_0"]) > 0 - # history alias should point to first history assert env.locals["history"] == env.locals["history_0"] def test_variable_persistence_across_completions(self): """Variables computed in one completion should be available in subsequent ones.""" - # First call: compute a variable - # Second call: use that variable first_responses = [ "Let me compute something\n```repl\ncomputed_value = 42 * 2\nprint(computed_value)\n```", "FINAL(84)", @@ -157,7 +124,7 @@ def test_variable_persistence_across_completions(self): ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(first_responses) + mock_lm = create_mock_lm(first_responses) mock_get_client.return_value = mock_lm with RLM( @@ -168,8 +135,7 @@ def test_variable_persistence_across_completions(self): rlm.completion("Compute 42 * 2") assert rlm._persistent_env.locals.get("computed_value") == 84 - mock_lm.responses = second_responses - mock_lm.reset() + mock_lm.completion.side_effect = list(second_responses) rlm.completion("Add 10 to the previous result") assert rlm._persistent_env.locals.get("computed_value") == 84 @@ -184,7 +150,7 @@ def test_prompt_includes_context_count(self): responses = ["FINAL(ok)"] with patch.object(rlm_module, "get_client") as 
mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -193,12 +159,10 @@ def test_prompt_includes_context_count(self): persistent=True, ) as rlm: rlm.completion("First") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second") - # Check that the second call's prompt mentions multiple contexts - last_prompt = mock_lm.prompts_received[-1] - # last_prompt is a list of message dicts + last_prompt = mock_lm.completion.call_args[0][0] user_messages = [m for m in last_prompt if m.get("role") == "user"] user_content = " ".join(m.get("content", "") for m in user_messages) @@ -209,7 +173,7 @@ def test_prompt_includes_history_count(self): responses = ["FINAL(ok)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -218,11 +182,10 @@ def test_prompt_includes_history_count(self): persistent=True, ) as rlm: rlm.completion("First task") - mock_lm.reset() - rlm.completion("Second task") # Should have 1 history from first + mock_lm.completion.side_effect = list(responses) + rlm.completion("Second task") - # The prompt for second call should mention history - last_prompt = mock_lm.prompts_received[-1] + last_prompt = mock_lm.completion.call_args[0][0] user_messages = [m for m in last_prompt if m.get("role") == "user"] user_content = " ".join(m.get("content", "") for m in user_messages) @@ -234,13 +197,14 @@ class TestMultiTurnCodeExecution: def test_can_access_previous_context_in_code(self): """Code should be able to reference earlier contexts.""" - responses = [ + first_responses = ["FINAL(first done)"] + second_responses = [ "```repl\nprint(f'First: {context_0}, Second: {context_1}')\n```", "FINAL(printed both)", ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(["FINAL(first done)"]) + mock_lm = create_mock_lm(first_responses) mock_get_client.return_value = mock_lm with RLM( @@ -250,24 +214,23 @@ def test_can_access_previous_context_in_code(self): ) as rlm: rlm.completion("Document A") - mock_lm.responses = responses - mock_lm.reset() + mock_lm.completion.side_effect = list(second_responses) rlm.completion("Document B") - # Both contexts should be accessible env = rlm._persistent_env assert env.locals["context_0"] == "Document A" assert env.locals["context_1"] == "Document B" def test_can_access_history_in_code(self): """Code should be able to reference stored histories.""" - responses = [ + first_responses = ["FINAL(first)"] + second_responses = [ "```repl\nprint(f'History entries: {len(history)}')\n```", "FINAL(accessed history)", ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(["FINAL(first)"]) + mock_lm = create_mock_lm(first_responses) mock_get_client.return_value = mock_lm with RLM( @@ -277,9 +240,7 @@ def test_can_access_history_in_code(self): ) as rlm: rlm.completion("First query") - mock_lm.responses = responses - mock_lm.reset() - + mock_lm.completion.side_effect = list(second_responses) rlm.completion("Second query") env = rlm._persistent_env @@ -295,20 +256,19 @@ def test_non_persistent_creates_fresh_environment(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm rlm = RLM( 
backend="openai", backend_kwargs={"model_name": "test"}, - persistent=False, # Explicitly non-persistent + persistent=False, ) rlm.completion("First") - # In non-persistent mode, no persistent env should be stored assert rlm._persistent_env is None - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second") assert rlm._persistent_env is None @@ -329,7 +289,7 @@ def test_context_manager_cleanup(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -340,7 +300,6 @@ def test_context_manager_cleanup(self): rlm.completion("Test") assert rlm._persistent_env is not None - # After exiting context manager, env should be cleaned up assert rlm._persistent_env is None def test_explicit_close(self): @@ -348,7 +307,7 @@ def test_explicit_close(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm rlm = RLM( @@ -407,7 +366,7 @@ def test_three_turn_conversation(self): ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(turn1_responses) + mock_lm = create_mock_lm(turn1_responses) mock_get_client.return_value = mock_lm with RLM( @@ -415,23 +374,17 @@ def test_three_turn_conversation(self): backend_kwargs={"model_name": "test"}, persistent=True, ) as rlm: - # Turn 1 result1 = rlm.completion("First document about cats") assert "Summarized" in result1.response - # Turn 2 - mock_lm.responses = turn2_responses - mock_lm.reset() + mock_lm.completion.side_effect = list(turn2_responses) result2 = rlm.completion("Second document about dogs") assert "Compared" in result2.response - # Turn 3 - mock_lm.responses = turn3_responses - mock_lm.reset() + mock_lm.completion.side_effect = list(turn3_responses) result3 = rlm.completion("Synthesize everything") assert "synthesized" in result3.response - # Verify state env = rlm._persistent_env assert env.get_context_count() == 3 assert env.get_history_count() == 3 From 4be974990e6b66d488b93df8657f902079d2dab9 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 Jan 2026 12:51:10 +0700 Subject: [PATCH 11/14] refactor(environments): add SupportsPersistence Protocol Replace scattered hasattr checks with a runtime-checkable Protocol that defines the persistence capability contract. This provides type checker enforcement and IDE autocomplete support. 
--- rlm/core/rlm.py | 25 +++++++------------------ rlm/environments/__init__.py | 4 +++- rlm/environments/base_env.py | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index 195837d8..1de29776 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -13,7 +13,7 @@ RLMIteration, RLMMetadata, ) -from rlm.environments import BaseEnv, get_environment +from rlm.environments import BaseEnv, SupportsPersistence, get_environment from rlm.logger import RLMLogger, VerbosePrinter from rlm.utils.parsing import ( find_code_blocks, @@ -88,7 +88,7 @@ def __init__( # Persistence support self.persistent = persistent - self._persistent_env: BaseEnv | None = None + self._persistent_env: SupportsPersistence | None = None # Validate persistence support at initialization if self.persistent: @@ -203,12 +203,12 @@ def completion( # Current prompt = message history + additional prompt suffix context_count = ( environment.get_context_count() - if hasattr(environment, "get_context_count") + if isinstance(environment, SupportsPersistence) else 1 ) history_count = ( environment.get_history_count() - if hasattr(environment, "get_history_count") + if isinstance(environment, SupportsPersistence) else 0 ) current_prompt = message_history + [ @@ -239,7 +239,7 @@ def completion( self.verbose.print_summary(i + 1, time_end - time_start, usage.to_dict()) # Store message history in persistent environment - if self.persistent and hasattr(environment, "add_history"): + if self.persistent and isinstance(environment, SupportsPersistence): environment.add_history(message_history) return RLMChatCompletion( @@ -266,7 +266,7 @@ def completion( self.verbose.print_summary(self.max_iterations, time_end - time_start, usage.to_dict()) # Store message history in persistent environment - if self.persistent and hasattr(environment, "add_history"): + if self.persistent and isinstance(environment, SupportsPersistence): environment.add_history(message_history) return RLMChatCompletion( @@ -367,18 +367,7 @@ def _validate_persistent_environment_support(self) -> None: @staticmethod def _env_supports_persistence(env: BaseEnv) -> bool: """Check if an environment instance supports persistent mode methods.""" - return ( - hasattr(env, "update_handler_address") - and hasattr(env, "add_context") - and hasattr(env, "get_context_count") - and hasattr(env, "add_history") - and hasattr(env, "get_history_count") - and callable(getattr(env, "update_handler_address", None)) - and callable(getattr(env, "add_context", None)) - and callable(getattr(env, "get_context_count", None)) - and callable(getattr(env, "add_history", None)) - and callable(getattr(env, "get_history_count", None)) - ) + return isinstance(env, SupportsPersistence) def close(self) -> None: """Clean up persistent environment. 
        Call when done with multi-turn conversations."""
diff --git a/rlm/environments/__init__.py b/rlm/environments/__init__.py
index 9d2719f2..6a70431d 100644
--- a/rlm/environments/__init__.py
+++ b/rlm/environments/__init__.py
@@ -1,8 +1,10 @@
 from typing import Any, Literal
 
-from rlm.environments.base_env import BaseEnv
+from rlm.environments.base_env import BaseEnv, SupportsPersistence
 from rlm.environments.local_repl import LocalREPL
 
+__all__ = ["BaseEnv", "LocalREPL", "SupportsPersistence", "get_environment"]
+
 
 def get_environment(
     environment: Literal["local", "modal", "docker"],
diff --git a/rlm/environments/base_env.py b/rlm/environments/base_env.py
index ae74577b..b185848a 100644
--- a/rlm/environments/base_env.py
+++ b/rlm/environments/base_env.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Any, Protocol, runtime_checkable
 
 from rlm.core.types import REPLResult
 
@@ -69,3 +70,22 @@ def load_context(self, context_payload: dict | list | str):
     @abstractmethod
     def execute_code(self, code: str) -> REPLResult:
         raise NotImplementedError
+
+
+@runtime_checkable
+class SupportsPersistence(Protocol):
+    """Protocol for environments that support persistent multi-turn sessions.
+
+    Use isinstance(env, SupportsPersistence) to check if an environment
+    supports persistence capabilities.
+    """
+
+    def update_handler_address(self, address: tuple[str, int]) -> None: ...
+    def add_context(
+        self, context_payload: dict | list | str, context_index: int | None = None
+    ) -> int: ...
+    def get_context_count(self) -> int: ...
+    def add_history(
+        self, message_history: list[dict[str, Any]], history_index: int | None = None
+    ) -> int: ...
+    def get_history_count(self) -> int: ...

From 86862b41389a8191d9f85e88f20752f2924c22b9 Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Thu, 8 Jan 2026 13:41:36 +0700
Subject: [PATCH 12/14] docs(environments): add comprehensive SupportsPersistence Protocol docs

Document the behavior expected of a persistence implementation:
- Versioning behavior (context_0, context_1, ...)
- Aliasing behavior (context -> context_0)
- Method contracts with detailed docstrings
- References to tests and example implementation
---
 rlm/environments/base_env.py | 104 ++++++++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 7 deletions(-)

diff --git a/rlm/environments/base_env.py b/rlm/environments/base_env.py
index b185848a..963018b4 100644
--- a/rlm/environments/base_env.py
+++ b/rlm/environments/base_env.py
@@ -76,16 +76,106 @@ def execute_code(self, code: str) -> REPLResult:
 class SupportsPersistence(Protocol):
     """Protocol for environments that support persistent multi-turn sessions.
 
-    Use isinstance(env, SupportsPersistence) to check if an environment
-    supports persistence capabilities.
+    CHECKING SUPPORT:
+        Use isinstance(env, SupportsPersistence) to check if an environment
+        supports persistence capabilities.
+
+    IMPLEMENTING THIS PROTOCOL:
+        To add persistence to your environment, implement these 5 methods.
+        See tests/test_local_repl_persistent.py for expected behavior.
+
+    VERSIONING BEHAVIOR:
+        Contexts and histories are versioned with numeric suffixes:
+        - Contexts -> context_0, context_1, context_2, ...
+        - Histories -> history_0, history_1, history_2, ...
+
+    ALIASING BEHAVIOR:
+        The unversioned names always point to index 0:
+        - context -> context_0 (first context)
+        - history -> history_0 (first history)
+
+    EXAMPLE IMPLEMENTATION:
+        See rlm/environments/local_repl.py for a complete reference.
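+
+    USAGE SKETCH:
+        A minimal, illustrative session (mirrors the behavior exercised in
+        the tests below; LocalREPL stands in for any conforming environment):
+
+            env = LocalREPL()
+            assert isinstance(env, SupportsPersistence)
+            env.add_context("doc A")   # stored as context_0, aliased as context
+            env.add_context("doc B")   # stored as context_1
+            assert env.get_context_count() == 2
+            env.cleanup()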
+ + TESTS: + - Unit tests: tests/test_local_repl_persistent.py + - Integration tests: tests/test_multi_turn_integration.py + + Run: uv run pytest tests/test_local_repl_persistent.py -v """ - def update_handler_address(self, address: tuple[str, int]) -> None: ... + def update_handler_address(self, address: tuple[str, int]) -> None: + """Update the LM handler address for nested LLM calls. + + Called by RLM when the handler address changes between completions. + Store the address so llm_query() calls from executed code can reach + the LM handler. + + Args: + address: (host, port) tuple for the LM handler server. + """ + ... + def add_context( self, context_payload: dict | list | str, context_index: int | None = None - ) -> int: ... - def get_context_count(self) -> int: ... + ) -> int: + """Add a context payload, making it available as context_N in code. + + Versioning: + - context_index=None: auto-increment (0, 1, 2, ...) + - context_index=N: use specific index N + + Storage: + Must store so executed code can access: + - context_0, context_1, etc. (versioned) + - context (alias to context_0) + + Args: + context_payload: The context data (string, dict, or list). + context_index: Optional specific index, or None to auto-increment. + + Returns: + The index used (for auto-increment, returns the assigned index). + """ + ... + + def get_context_count(self) -> int: + """Return the number of contexts added so far. + + Used by RLM to inform the model how many contexts are available. + """ + ... + def add_history( self, message_history: list[dict[str, Any]], history_index: int | None = None - ) -> int: ... - def get_history_count(self) -> int: ... + ) -> int: + """Add a message history, making it available as history_N in code. + + Versioning: + - history_index=None: auto-increment (0, 1, 2, ...) + - history_index=N: use specific index N + + Storage: + Must store so executed code can access: + - history_0, history_1, etc. (versioned) + - history (alias to history_0) + + IMPORTANT: Store a deep copy, not a reference. The caller may + modify the list after calling this method. + + Args: + message_history: List of message dicts (role, content). + history_index: Optional specific index, or None to auto-increment. + + Returns: + The index used. + """ + ... + + def get_history_count(self) -> int: + """Return the number of histories added so far. + + Used by RLM to inform the model how many conversation histories + are available. + """ + ... From ea0b91c2b7f8bcb922ea84acf916cbbc5a1bc973 Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Thu, 8 Jan 2026 15:42:22 -0500 Subject: [PATCH 13/14] add ruff lint on test --- tests/test_multi_turn_integration.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_multi_turn_integration.py b/tests/test_multi_turn_integration.py index 662b3e83..f55de39b 100644 --- a/tests/test_multi_turn_integration.py +++ b/tests/test_multi_turn_integration.py @@ -8,12 +8,13 @@ 5. 
Properly inform the model about available contexts/histories """ +from unittest.mock import Mock, patch + import pytest -from unittest.mock import patch, Mock -from rlm import RLM -from rlm.core.types import UsageSummary, ModelUsageSummary import rlm.core.rlm as rlm_module +from rlm import RLM +from rlm.core.types import ModelUsageSummary, UsageSummary def create_mock_lm(responses: list[str]) -> Mock: @@ -22,9 +23,7 @@ def create_mock_lm(responses: list[str]) -> Mock: mock.completion.side_effect = list(responses) mock.get_usage_summary.return_value = UsageSummary( model_usage_summaries={ - "mock": ModelUsageSummary( - total_calls=1, total_input_tokens=100, total_output_tokens=50 - ) + "mock": ModelUsageSummary(total_calls=1, total_input_tokens=100, total_output_tokens=50) } ) mock.get_last_usage.return_value = mock.get_usage_summary.return_value From f9755c8df3c64fd6297145885d37ec5a458511a8 Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Thu, 8 Jan 2026 15:43:52 -0500 Subject: [PATCH 14/14] fix linting on tests --- tests/test_local_repl.py | 1 - tests/test_local_repl_persistent.py | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py index 115f671c..787a5b54 100644 --- a/tests/test_local_repl.py +++ b/tests/test_local_repl.py @@ -243,4 +243,3 @@ def test_simulated_rlm_completions_functions_not_preserved(self): assert "NameError" in result.stderr assert "my_helper" in result.stderr completion_2_env.cleanup() - diff --git a/tests/test_local_repl_persistent.py b/tests/test_local_repl_persistent.py index 4624c04a..f654679d 100644 --- a/tests/test_local_repl_persistent.py +++ b/tests/test_local_repl_persistent.py @@ -198,10 +198,12 @@ def test_full_persistent_session_simulation(self): assert result.stderr == "" assert repl.locals["profit"] == 600 - repl.add_history([ - {"role": "user", "content": "What were the sales?"}, - {"role": "assistant", "content": "Sales were $1000"}, - ]) + repl.add_history( + [ + {"role": "user", "content": "What were the sales?"}, + {"role": "assistant", "content": "Sales were $1000"}, + ] + ) code = """ summary = f"Sales: {context_0}, Costs: {context_1}, Profit: {profit}"