From 6cc734e1e024b6da55d7a156aa4801da0b044e39 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Mon, 5 Jan 2026 11:14:01 +0700 Subject: [PATCH 01/14] feat(local-repl): add multi-context support for persistent REPL --- rlm/environments/local_repl.py | 77 ++++++++++++++++++++++++---------- tests/test_local_repl.py | 35 ++++++++++++++++ 2 files changed, 91 insertions(+), 21 deletions(-) diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index b8183800..0c9ec6f9 100644 --- a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -130,6 +130,7 @@ def __init__( self.original_cwd = os.getcwd() self.temp_dir = tempfile.mkdtemp(prefix=f"repl_env_{uuid.uuid4()}_") self._lock = threading.Lock() + self._context_count: int = 0 # Setup globals, locals, and modules in environment. self.setup() @@ -222,20 +223,55 @@ def _llm_query_batched(self, prompts: list[str], model: str | None = None) -> li return [f"Error: LM query failed - {e}"] * len(prompts) def load_context(self, context_payload: dict | list | str): - """Load context into the environment.""" + """Load context into the environment as context_0 (and 'context' alias).""" + self.add_context(context_payload, 0) + + def add_context( + self, context_payload: dict | list | str, context_index: int | None = None + ) -> int: + """ + Add a context with versioned variable name. + + Args: + context_payload: The context data to add + context_index: Optional explicit index. If None, auto-increments. + + Returns: + The context index used. + """ + if context_index is None: + context_index = self._context_count + + var_name = f"context_{context_index}" + if isinstance(context_payload, str): - context_path = os.path.join(self.temp_dir, "context.txt") + context_path = os.path.join(self.temp_dir, f"context_{context_index}.txt") with open(context_path, "w") as f: f.write(context_payload) - self.execute_code(f"with open(r'{context_path}', 'r') as f:\n context = f.read()") + self.execute_code(f"with open(r'{context_path}', 'r') as f:\n {var_name} = f.read()") else: - context_path = os.path.join(self.temp_dir, "context.json") + context_path = os.path.join(self.temp_dir, f"context_{context_index}.json") with open(context_path, "w") as f: json.dump(context_payload, f) self.execute_code( - f"import json\nwith open(r'{context_path}', 'r') as f:\n context = json.load(f)" + f"import json\nwith open(r'{context_path}', 'r') as f:\n {var_name} = json.load(f)" ) + # Alias context_0 as 'context' for backward compatibility + if context_index == 0: + self.execute_code(f"context = {var_name}") + + self._context_count = max(self._context_count, context_index + 1) + return context_index + + def update_handler_address(self, address: tuple[str, int]) -> None: + """Update the LM handler address for a new completion call.""" + self.lm_handler_address = address + + def get_context_count(self) -> int: + """Return the number of contexts loaded.""" + return self._context_count + @contextmanager def _capture_output(self): """Thread-safe context manager to capture stdout/stderr.""" @@ -265,22 +301,21 @@ def execute_code(self, code: str) -> REPLResult: # Clear pending LLM calls from previous execution self._pending_llm_calls = [] - with self._capture_output() as (stdout_buf, stderr_buf): - with self._temp_cwd(): - try: - combined = {**self.globals, **self.locals} - exec(code, combined, combined) - - # Update locals with new variables - for key, value in combined.items(): - if key not in self.globals and not 
key.startswith("_"): - self.locals[key] = value - - stdout = stdout_buf.getvalue() - stderr = stderr_buf.getvalue() - except Exception as e: - stdout = stdout_buf.getvalue() - stderr = stderr_buf.getvalue() + f"\n{type(e).__name__}: {e}" + with self._capture_output() as (stdout_buf, stderr_buf), self._temp_cwd(): + try: + combined = {**self.globals, **self.locals} + exec(code, combined, combined) + + # Update locals with new variables + for key, value in combined.items(): + if key not in self.globals and not key.startswith("_"): + self.locals[key] = value + + stdout = stdout_buf.getvalue() + stderr = stderr_buf.getvalue() + except Exception as e: + stdout = stdout_buf.getvalue() + stderr = stderr_buf.getvalue() + f"\n{type(e).__name__}: {e}" return REPLResult( stdout=stdout, diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py index f03c523c..ec5fac2c 100644 --- a/tests/test_local_repl.py +++ b/tests/test_local_repl.py @@ -193,3 +193,38 @@ def test_temp_dir_created_and_cleaned(self): assert os.path.exists(temp_dir) repl.cleanup() assert not os.path.exists(temp_dir) + + +class TestLocalREPLMultiContext: + """Tests for multi-context support.""" + + def test_add_context_versioning(self): + """Test that add_context creates versioned variables.""" + repl = LocalREPL() + repl.add_context("First", 0) + repl.add_context("Second", 1) + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.locals["context"] == "First" + assert repl.get_context_count() == 2 + repl.cleanup() + + def test_update_handler_address(self): + """Test handler address can be updated.""" + repl = LocalREPL(lm_handler_address=("127.0.0.1", 5000)) + repl.update_handler_address(("127.0.0.1", 6000)) + assert repl.lm_handler_address == ("127.0.0.1", 6000) + repl.cleanup() + + def test_add_context_auto_increment(self): + """Test that add_context auto-increments when no index provided.""" + repl = LocalREPL() + idx1 = repl.add_context("First") + idx2 = repl.add_context("Second") + assert idx1 == 0 + assert idx2 == 1 + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.get_context_count() == 2 + repl.cleanup() + From de6460878092b3f6b6e9acce12277833af813d60 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Mon, 5 Jan 2026 11:57:08 +0700 Subject: [PATCH 02/14] feat(core): add persistent flag for multi-turn REPL conversations Add persistent=True option to RLM that reuses the environment across completion() calls instead of creating/destroying for each call. This enables multi-turn conversations where variables and contexts persist. - Add persistent parameter to RLM constructor - Reuse environment when persistent=True, store as _persistent_env - Add close() method and context manager support (__enter__/__exit__) - Environment cleanup only on explicit close() when persistent --- rlm/core/rlm.py | 45 ++++++++++++++++++++++++++++++++-------- tests/test_local_repl.py | 1 - 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index 55a3cf58..349c20bf 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -51,6 +51,7 @@ def __init__( other_backend_kwargs: list[dict[str, Any]] | None = None, logger: RLMLogger | None = None, verbose: bool = False, + persistent: bool = False, ): """ Args: @@ -66,6 +67,7 @@ def __init__( other_backend_kwargs: The kwargs to pass to the other client backends (ordered to match other_backends). 
logger: The logger to use for the RLM. verbose: Whether to print verbose output in rich to console. + persistent: If True, reuse the environment across completion() calls for multi-turn conversations. """ # Store config for spawning per-completion self.backend = backend @@ -84,6 +86,10 @@ def __init__( self.logger = logger self.verbose = VerbosePrinter(enabled=verbose) + # Persistence support + self.persistent = persistent + self._persistent_env: BaseEnv | None = None + # Log metadata if logger is provided if self.logger or verbose: metadata = RLMMetadata( @@ -108,7 +114,9 @@ def __init__( def _spawn_completion_context(self, prompt: str | dict[str, Any]): """ Spawn an LM handler and environment for a single completion call. - Cleans up both when the context exits. + + When persistent=True, the environment is reused across calls. + When persistent=False (default), creates fresh environment each call. """ # Create client and wrap in handler client: BaseLM = get_client(self.backend, self.backend_kwargs) @@ -122,20 +130,25 @@ def _spawn_completion_context(self, prompt: str | dict[str, Any]): lm_handler.start() - # Pass handler address to environment so it can make llm_query() calls - env_kwargs = self.environment_kwargs.copy() - env_kwargs["lm_handler_address"] = (lm_handler.host, lm_handler.port) - env_kwargs["context_payload"] = prompt + # Environment: reuse if persistent, otherwise create fresh + if self.persistent and self._persistent_env is not None: + environment = self._persistent_env + environment.update_handler_address((lm_handler.host, lm_handler.port)) + environment.add_context(prompt) + else: + env_kwargs = self.environment_kwargs.copy() + env_kwargs["lm_handler_address"] = (lm_handler.host, lm_handler.port) + env_kwargs["context_payload"] = prompt + environment: BaseEnv = get_environment(self.environment_type, env_kwargs) - # Initialize the environment - environment: BaseEnv = get_environment(self.environment_type, env_kwargs) + if self.persistent: + self._persistent_env = environment try: yield lm_handler, environment finally: - # Cleanup lm_handler.stop() - if hasattr(environment, "cleanup"): + if not self.persistent and hasattr(environment, "cleanup"): environment.cleanup() def _setup_prompt(self, prompt: str | dict[str, Any]) -> list[dict[str, Any]]: @@ -292,3 +305,17 @@ def _fallback_answer(self, message: str | dict[str, Any]) -> str: client: BaseLM = get_client(self.backend, self.backend_kwargs) response = client.completion(message) return response + + def close(self) -> None: + """Clean up persistent environment. 
Call when done with multi-turn conversations."""
+        if self._persistent_env is not None:
+            if hasattr(self._persistent_env, "cleanup"):
+                self._persistent_env.cleanup()
+            self._persistent_env = None
+
+    def __enter__(self) -> "RLM":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
+        self.close()
+        return False
diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py
index ec5fac2c..15c33c01 100644
--- a/tests/test_local_repl.py
+++ b/tests/test_local_repl.py
 def test_add_context_auto_increment(self):
         assert repl.locals["context_1"] == "Second"
         assert repl.get_context_count() == 2
         repl.cleanup()
-

From b6e7c8500b2f44687e41aba456ffdbc6ee1cd3dd Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Mon, 5 Jan 2026 12:18:12 +0700
Subject: [PATCH 03/14] feat(prompts): inform model about multiple contexts in multi-turn sessions

Add context_count parameter to build_user_prompt() so the model knows
when multiple contexts are available (context_0, context_1, etc.)
during persistent REPL sessions.
---
 rlm/core/rlm.py      |  9 ++++++++-
 rlm/utils/prompts.py | 14 +++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py
index 349c20bf..c2dc69d9 100644
--- a/rlm/core/rlm.py
+++ b/rlm/core/rlm.py
 def completion(
         for i in range(self.max_iterations):
             # Current prompt = message history + additional prompt suffix
-            current_prompt = message_history + [build_user_prompt(root_prompt, i)]
+            context_count = (
+                environment.get_context_count()
+                if hasattr(environment, "get_context_count")
+                else 1
+            )
+            current_prompt = message_history + [
+                build_user_prompt(root_prompt, i, context_count)
+            ]

             iteration: RLMIteration = self._completion_turn(
                 prompt=current_prompt,
diff --git a/rlm/utils/prompts.py b/rlm/utils/prompts.py
index 3d0bd624..e47488e2 100644
--- a/rlm/utils/prompts.py
+++ b/rlm/utils/prompts.py
 USER_PROMPT_WITH_ROOT = """Think step-by-step on what to do using the REPL environment (which contains the context) to answer the original prompt: \"{root_prompt}\".\n\nContinue using the REPL environment, which has the `context` variable, and querying sub-LLMs by writing to ```repl``` tags, and determine your answer. Your next action:"""


-def build_user_prompt(root_prompt: str | None = None, iteration: int = 0) -> dict[str, str]:
+def build_user_prompt(
+    root_prompt: str | None = None,
+    iteration: int = 0,
+    context_count: int = 1,
+) -> dict[str, str]:
     if iteration == 0:
         safeguard = "You have not interacted with the REPL environment or seen your prompt / context yet. Your next action should be to look through and figure out how to answer the prompt, so don't just provide a final answer yet.\n\n"
         prompt = safeguard + (
             USER_PROMPT_WITH_ROOT.format(root_prompt=root_prompt) if root_prompt else USER_PROMPT
         )
-        return {"role": "user", "content": prompt}
     else:
         prompt = "The history before is your previous interactions with the REPL environment. " + (
             USER_PROMPT_WITH_ROOT.format(root_prompt=root_prompt) if root_prompt else USER_PROMPT
         )
-        return {"role": "user", "content": prompt}
+
+    # Inform model about multiple contexts if present
+    if context_count > 1:
+        prompt += f"\n\nNote: You have {context_count} contexts available (context_0 through context_{context_count - 1})."
+
+    return {"role": "user", "content": prompt}
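Taken together, patches 01-03 add a multi-turn API: a persistent environment, versioned context variables, and prompt hints about them. A minimal usage sketch (the backend and model names here are illustrative placeholders, not taken from the diffs):

    from rlm import RLM

    # persistent=True reuses one LocalREPL across completion() calls.
    with RLM(
        backend="openai",
        backend_kwargs={"model_name": "gpt-4o-mini"},
        persistent=True,
    ) as rlm:
        # First call creates the environment; the prompt payload becomes
        # context_0 (also aliased to `context`).
        rlm.completion("Summarize this report: ...")
        # Second call reuses the environment; the new payload becomes
        # context_1, and build_user_prompt() tells the model 2 contexts exist.
        rlm.completion("Compare it against this one: ...")
    # Leaving the with-block calls close(), which cleans up the environment.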
From aa6645e269c73bd274c6d09ad67f94713105ba3d Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Tue, 6 Jan 2026 14:26:45 +0700
Subject: [PATCH 04/14] fix(core): validate environment supports persistent mode before use

Add validation to prevent AttributeError when persistent=True is used
with environments that don't implement the required methods
(update_handler_address, add_context, get_context_count).

- Add _validate_persistent_environment_support() called at init time
- Add _env_supports_persistence() for runtime capability checking
- Add defensive runtime check before calling persistence methods
- Raise clear ValueError if unsupported environment is configured
---
 rlm/core/rlm.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py
index c2dc69d9..b1ec2e83 100644
--- a/rlm/core/rlm.py
+++ b/rlm/core/rlm.py
         self.persistent = persistent
         self._persistent_env: BaseEnv | None = None

+        # Validate persistence support at initialization
+        if self.persistent:
+            self._validate_persistent_environment_support()
+
         # Log metadata if logger is provided
         if self.logger or verbose:
             metadata = RLMMetadata(
         # Environment: reuse if persistent, otherwise create fresh
         if self.persistent and self._persistent_env is not None:
             environment = self._persistent_env
+            # Defensive check: ensure environment supports persistence methods
+            if not self._env_supports_persistence(environment):
+                raise RuntimeError(
+                    f"Persistent environment of type '{type(environment).__name__}' does not "
+                    f"implement required methods (update_handler_address, add_context, get_context_count). "
+                    f"This should have been caught at initialization."
+                )
             environment.update_handler_address((lm_handler.host, lm_handler.port))
             environment.add_context(prompt)
         else:
 def _fallback_answer(self, message: str | dict[str, Any]) -> str:
         client: BaseLM = get_client(self.backend, self.backend_kwargs)
         response = client.completion(message)
         return response

+    def _validate_persistent_environment_support(self) -> None:
+        """
+        Validate that the configured environment type supports persistent mode.
+
+        Persistent mode requires environments to implement:
+        - update_handler_address(address): Update LM handler address between calls
+        - add_context(payload, index): Add new context for multi-turn conversations
+        - get_context_count(): Return the number of loaded contexts
+
+        Currently only 'local' (LocalREPL) supports these methods.
+
+        Raises:
+            ValueError: If the environment type does not support persistent mode.
+        """
+        # Known environments that support persistence
+        persistent_supported_environments = {"local"}
+
+        if self.environment_type not in persistent_supported_environments:
+            raise ValueError(
+                f"persistent=True is not supported for environment type '{self.environment_type}'. "
+                f"Persistent mode requires environments that implement update_handler_address(), "
+                f"add_context(), and get_context_count(). 
" + f"Supported environments: {sorted(persistent_supported_environments)}" + ) + + @staticmethod + def _env_supports_persistence(env: BaseEnv) -> bool: + """Check if an environment instance supports persistent mode methods.""" + return ( + hasattr(env, "update_handler_address") + and hasattr(env, "add_context") + and hasattr(env, "get_context_count") + and callable(getattr(env, "update_handler_address", None)) + and callable(getattr(env, "add_context", None)) + and callable(getattr(env, "get_context_count", None)) + ) + def close(self) -> None: """Clean up persistent environment. Call when done with multi-turn conversations.""" if self._persistent_env is not None: From 5d6cefc728d1c0a8f14b71054e6c7182430fad76 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:08:52 +0700 Subject: [PATCH 05/14] feat(local-repl): add message history storage for multi-turn sessions Store conversation histories as versioned variables (history_0, history_1, etc.) in the REPL environment, making them accessible for subsequent queries. This enables models to reference prior conversation context in persistent sessions. --- rlm/core/rlm.py | 21 +++++++++- rlm/environments/local_repl.py | 34 +++++++++++++++++ rlm/utils/prompts.py | 8 ++++ tests/test_local_repl.py | 70 ++++++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 1 deletion(-) diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index b1ec2e83..195837d8 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -206,8 +206,13 @@ def completion( if hasattr(environment, "get_context_count") else 1 ) + history_count = ( + environment.get_history_count() + if hasattr(environment, "get_history_count") + else 0 + ) current_prompt = message_history + [ - build_user_prompt(root_prompt, i, context_count) + build_user_prompt(root_prompt, i, context_count, history_count) ] iteration: RLMIteration = self._completion_turn( @@ -232,6 +237,11 @@ def completion( usage = lm_handler.get_usage_summary() self.verbose.print_final_answer(final_answer) self.verbose.print_summary(i + 1, time_end - time_start, usage.to_dict()) + + # Store message history in persistent environment + if self.persistent and hasattr(environment, "add_history"): + environment.add_history(message_history) + return RLMChatCompletion( root_model=self.backend_kwargs.get("model_name", "unknown") if self.backend_kwargs @@ -254,6 +264,11 @@ def completion( usage = lm_handler.get_usage_summary() self.verbose.print_final_answer(final_answer) self.verbose.print_summary(self.max_iterations, time_end - time_start, usage.to_dict()) + + # Store message history in persistent environment + if self.persistent and hasattr(environment, "add_history"): + environment.add_history(message_history) + return RLMChatCompletion( root_model=self.backend_kwargs.get("model_name", "unknown") if self.backend_kwargs @@ -356,9 +371,13 @@ def _env_supports_persistence(env: BaseEnv) -> bool: hasattr(env, "update_handler_address") and hasattr(env, "add_context") and hasattr(env, "get_context_count") + and hasattr(env, "add_history") + and hasattr(env, "get_history_count") and callable(getattr(env, "update_handler_address", None)) and callable(getattr(env, "add_context", None)) and callable(getattr(env, "get_context_count", None)) + and callable(getattr(env, "add_history", None)) + and callable(getattr(env, "get_history_count", None)) ) def close(self) -> None: diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index 0c9ec6f9..7f1fcd6b 100644 --- 
a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -1,3 +1,4 @@ +import copy import io import json import os @@ -131,6 +132,7 @@ def __init__( self.temp_dir = tempfile.mkdtemp(prefix=f"repl_env_{uuid.uuid4()}_") self._lock = threading.Lock() self._context_count: int = 0 + self._history_count: int = 0 # Setup globals, locals, and modules in environment. self.setup() @@ -272,6 +274,38 @@ def get_context_count(self) -> int: """Return the number of contexts loaded.""" return self._context_count + def add_history( + self, message_history: list[dict[str, Any]], history_index: int | None = None + ) -> int: + """ + Store a conversation's message history as a versioned variable. + + Args: + message_history: The list of message dicts from a completion call + history_index: Optional explicit index. If None, auto-increments. + + Returns: + The history index used. + """ + if history_index is None: + history_index = self._history_count + + var_name = f"history_{history_index}" + + # Store deep copy to avoid reference issues with nested dicts + self.locals[var_name] = copy.deepcopy(message_history) + + # Alias history_0 as 'history' for convenience + if history_index == 0: + self.locals["history"] = self.locals[var_name] + + self._history_count = max(self._history_count, history_index + 1) + return history_index + + def get_history_count(self) -> int: + """Return the number of conversation histories stored.""" + return self._history_count + @contextmanager def _capture_output(self): """Thread-safe context manager to capture stdout/stderr.""" diff --git a/rlm/utils/prompts.py b/rlm/utils/prompts.py index e47488e2..f69b2292 100644 --- a/rlm/utils/prompts.py +++ b/rlm/utils/prompts.py @@ -120,6 +120,7 @@ def build_user_prompt( root_prompt: str | None = None, iteration: int = 0, context_count: int = 1, + history_count: int = 0, ) -> dict[str, str]: if iteration == 0: safeguard = "You have not interacted with the REPL environment or seen your prompt / context yet. Your next action should be to look through and figure out how to answer the prompt, so don't just provide a final answer yet.\n\n" @@ -135,4 +136,11 @@ def build_user_prompt( if context_count > 1: prompt += f"\n\nNote: You have {context_count} contexts available (context_0 through context_{context_count - 1})." + # Inform model about prior conversation histories if present + if history_count > 0: + if history_count == 1: + prompt += "\n\nNote: You have 1 prior conversation history available in the `history` variable." + else: + prompt += f"\n\nNote: You have {history_count} prior conversation histories available (history_0 through history_{history_count - 1})." 
+
+    return {"role": "user", "content": prompt}
diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py
index 15c33c01..9a6ccf42 100644
--- a/tests/test_local_repl.py
+++ b/tests/test_local_repl.py
         assert repl.locals["context_1"] == "Second"
         assert repl.get_context_count() == 2
         repl.cleanup()
+
+
+class TestLocalREPLHistory:
+    """Tests for message history storage in LocalREPL."""
+
+    def test_add_history_basic(self):
+        """Test that add_history stores message history correctly."""
+        repl = LocalREPL()
+
+        history = [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there!"},
+        ]
+
+        index = repl.add_history(history)
+
+        assert index == 0
+        assert "history_0" in repl.locals
+        assert "history" in repl.locals  # alias
+        assert repl.locals["history_0"] == history
+        assert repl.locals["history"] == history
+        assert repl.get_history_count() == 1
+
+        repl.cleanup()
+
+    def test_add_multiple_histories(self):
+        """Test adding multiple conversation histories."""
+        repl = LocalREPL()
+
+        history1 = [{"role": "user", "content": "First conversation"}]
+        history2 = [{"role": "user", "content": "Second conversation"}]
+
+        repl.add_history(history1)
+        repl.add_history(history2)
+
+        assert repl.get_history_count() == 2
+        assert repl.locals["history_0"] == history1
+        assert repl.locals["history_1"] == history2
+        assert repl.locals["history"] == history1  # alias stays on first
+
+        repl.cleanup()
+
+    def test_history_accessible_via_code(self):
+        """Test that stored history is accessible via code execution."""
+        repl = LocalREPL()
+
+        history = [{"role": "user", "content": "Test message"}]
+        repl.add_history(history)
+
+        result = repl.execute_code("msg = history[0]['content']")
+        assert result.stderr == ""
+        assert repl.locals["msg"] == "Test message"
+
+        repl.cleanup()
+
+    def test_history_is_copy(self):
+        """Test that stored history is a copy, not a reference."""
+        repl = LocalREPL()
+
+        history = [{"role": "user", "content": "Original"}]
+        repl.add_history(history)
+
+        # Modify original
+        history[0]["content"] = "Modified"
+
+        # Stored copy should be unchanged
+        assert repl.locals["history_0"][0]["content"] == "Original"
+
+        repl.cleanup()
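With history storage in place, code the model writes in a later turn of a persistent session can read both the versioned contexts and the stored transcripts. A sketch of what a second-turn ```repl``` block could do (the payload values are invented for illustration):

    # Runs inside the persistent LocalREPL during the second completion call.
    print(context_0)   # payload from the first completion (alias: `context`)
    print(context_1)   # payload from the current completion
    # history_0 (alias: `history`) is the first call's message list.
    user_turns = [m for m in history_0 if m["role"] == "user"]
    print(user_turns[-1]["content"])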
""" - def __init__(self, **kwargs): + def __init__(self, persistent: bool = False, **kwargs): + self.persistent = persistent self.kwargs = kwargs @abstractmethod @@ -31,8 +32,8 @@ class IsolatedEnv(BaseEnv, ABC): guaranteeing complete isolation from the LM process. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__(self, persistent: bool = False, **kwargs): + super().__init__(persistent=persistent, **kwargs) @abstractmethod def setup(self): @@ -54,8 +55,8 @@ class NonIsolatedEnv(BaseEnv, ABC): as a subprocess. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__(self, persistent: bool = False, **kwargs): + super().__init__(persistent=persistent, **kwargs) @abstractmethod def setup(self): diff --git a/rlm/environments/docker_repl.py b/rlm/environments/docker_repl.py index 6dd8c00a..19714857 100644 --- a/rlm/environments/docker_repl.py +++ b/rlm/environments/docker_repl.py @@ -180,9 +180,14 @@ def __init__( lm_handler_address: tuple[str, int] | None = None, context_payload: dict | list | str | None = None, setup_code: str | None = None, + persistent: bool = False, **kwargs, ): - super().__init__(**kwargs) + if persistent: + raise NotImplementedError( + "Persistent REPLs are currently not supported for environment: DockerREPL" + ) + super().__init__(persistent=persistent, **kwargs) self.image = image self.lm_handler_address = lm_handler_address @@ -292,13 +297,13 @@ def execute_code(self, code: str) -> REPLResult: ) def cleanup(self): - if self.container_id: + if hasattr(self, "container_id") and self.container_id: subprocess.run(["docker", "stop", self.container_id], capture_output=True) self.container_id = None - if self.proxy_server: + if hasattr(self, "proxy_server") and self.proxy_server: self.proxy_server.shutdown() self.proxy_server = None - if os.path.exists(self.temp_dir): + if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir): import shutil shutil.rmtree(self.temp_dir, ignore_errors=True) diff --git a/rlm/environments/local_repl.py b/rlm/environments/local_repl.py index 7f1fcd6b..05de1d3d 100644 --- a/rlm/environments/local_repl.py +++ b/rlm/environments/local_repl.py @@ -123,9 +123,10 @@ def __init__( lm_handler_address: tuple[str, int] | None = None, context_payload: dict | list | str | None = None, setup_code: str | None = None, + persistent: bool = False, **kwargs, ): - super().__init__(**kwargs) + super().__init__(persistent=persistent, **kwargs) self.lm_handler_address = lm_handler_address self.original_cwd = os.getcwd() diff --git a/rlm/environments/modal_repl.py b/rlm/environments/modal_repl.py index 2acfed18..82fa24cf 100644 --- a/rlm/environments/modal_repl.py +++ b/rlm/environments/modal_repl.py @@ -309,9 +309,14 @@ def __init__( lm_handler_address: tuple[str, int] | None = None, context_payload: dict | list | str | None = None, setup_code: str | None = None, + persistent: bool = False, **kwargs, ): - super().__init__(**kwargs) + if persistent: + raise NotImplementedError( + "Persistent REPLs are currently not supported for environment: ModalREPL" + ) + super().__init__(persistent=persistent, **kwargs) self.app_name = app_name self.timeout = timeout diff --git a/rlm/environments/prime_repl.py b/rlm/environments/prime_repl.py index 19a082ca..0d88ce90 100644 --- a/rlm/environments/prime_repl.py +++ b/rlm/environments/prime_repl.py @@ -8,9 +8,14 @@ def __init__( context_payload: dict | list | str | None = None, sandbox_name: str | None = None, api_key: str | None = None, + persistent: bool = False, **kwargs, 
From 2ac518ad34f6aaaad504c1c0cc87c1fb2fd7abbf Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Thu, 8 Jan 2026 09:53:20 +0700
Subject: [PATCH 07/14] test(local-repl): replace multi-context tests with non-persistent simulation tests

Replace TestLocalREPLMultiContext and TestLocalREPLHistory with
TestLocalREPLSimulatingRLMNoPersistence. New tests verify that
environments reset between RLM completions when persistent=False.
---
 tests/test_local_repl.py | 149 +++++++++++++--------------------------
 1 file changed, 48 insertions(+), 101 deletions(-)

diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py
index 9a6ccf42..115f671c 100644
--- a/tests/test_local_repl.py
+++ b/tests/test_local_repl.py
-class TestLocalREPLMultiContext:
-    """Tests for multi-context support."""
+class TestLocalREPLSimulatingRLMNoPersistence:
+    """
+    Tests simulating RLM's non-persistent completion behavior.
+
+    When RLM is configured without persistent=True (the default), each
+    completion() call spawns a fresh environment and destroys it afterwards.
+    This test suite simulates that behavior to prove variables don't survive
+    across RLM completions.
+
+    Why this matters: This is NOT just testing that two Python objects don't
+    share state (trivially true). This simulates the actual RLM workflow where
+    environments are created and destroyed per completion.
+    """
+
+    def test_simulated_rlm_completions_reset_environment(self):
+        """
+        Simulates 2 RLM completions to show env resets between calls.
+
+        Without persistent=True, RLM creates a fresh environment for each
+        completion, so state doesn't carry over.
+        """
+        completion_1_env = LocalREPL()
+        completion_1_env.execute_code("important_result = 42")
+        assert completion_1_env.locals["important_result"] == 42
+        completion_1_env.cleanup()
+
+        completion_2_env = LocalREPL()
+        result = completion_2_env.execute_code("print(important_result)")
+
+        assert "NameError" in result.stderr
+        assert "important_result" in result.stderr
+        completion_2_env.cleanup()
+
+    def test_simulated_rlm_completions_functions_not_preserved(self):
+        """Simulates 2 RLM completions to show functions don't persist.
+ """ + completion_1_env = LocalREPL() + completion_1_env.execute_code("def my_helper(): return 'useful'") + assert completion_1_env.execute_code("print(my_helper())").stdout.strip() == "useful" + completion_1_env.cleanup() + + completion_2_env = LocalREPL() + result = completion_2_env.execute_code("my_helper()") + + assert "NameError" in result.stderr + assert "my_helper" in result.stderr + completion_2_env.cleanup() - def test_add_context_versioning(self): - """Test that add_context creates versioned variables.""" - repl = LocalREPL() - repl.add_context("First", 0) - repl.add_context("Second", 1) - assert repl.locals["context_0"] == "First" - assert repl.locals["context_1"] == "Second" - assert repl.locals["context"] == "First" - assert repl.get_context_count() == 2 - repl.cleanup() - - def test_update_handler_address(self): - """Test handler address can be updated.""" - repl = LocalREPL(lm_handler_address=("127.0.0.1", 5000)) - repl.update_handler_address(("127.0.0.1", 6000)) - assert repl.lm_handler_address == ("127.0.0.1", 6000) - repl.cleanup() - - def test_add_context_auto_increment(self): - """Test that add_context auto-increments when no index provided.""" - repl = LocalREPL() - idx1 = repl.add_context("First") - idx2 = repl.add_context("Second") - assert idx1 == 0 - assert idx2 == 1 - assert repl.locals["context_0"] == "First" - assert repl.locals["context_1"] == "Second" - assert repl.get_context_count() == 2 - repl.cleanup() - - -class TestLocalREPLHistory: - """Tests for message history storage in LocalREPL.""" - - def test_add_history_basic(self): - """Test that add_history stores message history correctly.""" - repl = LocalREPL() - - history = [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hi there!"}, - ] - - index = repl.add_history(history) - - assert index == 0 - assert "history_0" in repl.locals - assert "history" in repl.locals # alias - assert repl.locals["history_0"] == history - assert repl.locals["history"] == history - assert repl.get_history_count() == 1 - - repl.cleanup() - - def test_add_multiple_histories(self): - """Test adding multiple conversation histories.""" - repl = LocalREPL() - - history1 = [{"role": "user", "content": "First conversation"}] - history2 = [{"role": "user", "content": "Second conversation"}] - - repl.add_history(history1) - repl.add_history(history2) - - assert repl.get_history_count() == 2 - assert repl.locals["history_0"] == history1 - assert repl.locals["history_1"] == history2 - assert repl.locals["history"] == history1 # alias stays on first - - repl.cleanup() - - def test_history_accessible_via_code(self): - """Test that stored history is accessible via code execution.""" - repl = LocalREPL() - - history = [{"role": "user", "content": "Test message"}] - repl.add_history(history) - - result = repl.execute_code("msg = history[0]['content']") - assert result.stderr == "" - assert repl.locals["msg"] == "Test message" - - repl.cleanup() - - def test_history_is_copy(self): - """Test that stored history is a copy, not a reference.""" - repl = LocalREPL() - - history = [{"role": "user", "content": "Original"}] - repl.add_history(history) - - # Modify original - history[0]["content"] = "Modified" - - # Stored copy should be unchanged - assert repl.locals["history_0"][0]["content"] == "Original" - - repl.cleanup() From ac3c2d7f9553647219d34ef46086f8b3b5c6ae58 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 
Jan 2026 09:59:25 +0700 Subject: [PATCH 08/14] test(local-repl): add persistent mode unit tests Add TestLocalREPLMultiContext, TestLocalREPLHistory, and TestLocalREPLPersistentState test classes for LocalREPL's persistent mode features including multi-context versioning and message history storage. --- tests/test_local_repl_persistent.py | 218 ++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 tests/test_local_repl_persistent.py diff --git a/tests/test_local_repl_persistent.py b/tests/test_local_repl_persistent.py new file mode 100644 index 00000000..4624c04a --- /dev/null +++ b/tests/test_local_repl_persistent.py @@ -0,0 +1,218 @@ +"""Tests for LocalREPL persistence features. + +These tests verify LocalREPL's multi-context and multi-history capabilities +which support the persistent=True mode in RLM for multi-turn conversations. +""" + +from rlm.environments.local_repl import LocalREPL + + +class TestLocalREPLMultiContext: + """Tests for multi-context support in persistent mode.""" + + def test_add_context_versioning(self): + """Test that add_context creates versioned variables.""" + repl = LocalREPL() + repl.add_context("First", 0) + repl.add_context("Second", 1) + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.locals["context"] == "First" + assert repl.get_context_count() == 2 + repl.cleanup() + + def test_update_handler_address(self): + """Test handler address can be updated.""" + repl = LocalREPL(lm_handler_address=("127.0.0.1", 5000)) + repl.update_handler_address(("127.0.0.1", 6000)) + assert repl.lm_handler_address == ("127.0.0.1", 6000) + repl.cleanup() + + def test_add_context_auto_increment(self): + """Test that add_context auto-increments when no index provided.""" + repl = LocalREPL() + idx1 = repl.add_context("First") + idx2 = repl.add_context("Second") + assert idx1 == 0 + assert idx2 == 1 + assert repl.locals["context_0"] == "First" + assert repl.locals["context_1"] == "Second" + assert repl.get_context_count() == 2 + repl.cleanup() + + def test_contexts_accessible_in_code(self): + """Test that multiple contexts can be accessed in code execution.""" + repl = LocalREPL() + repl.add_context("Document A content") + repl.add_context("Document B content") + + result = repl.execute_code("combined = f'{context_0} + {context_1}'") + assert result.stderr == "" + assert repl.locals["combined"] == "Document A content + Document B content" + repl.cleanup() + + def test_context_alias_points_to_first(self): + """Test that 'context' always aliases context_0.""" + repl = LocalREPL() + repl.add_context("First") + repl.add_context("Second") + repl.add_context("Third") + + result = repl.execute_code("is_first = context == context_0") + assert result.stderr == "" + assert repl.locals["is_first"] is True + repl.cleanup() + + +class TestLocalREPLHistory: + """Tests for message history storage in LocalREPL for persistent sessions.""" + + def test_add_history_basic(self): + """Test that add_history stores message history correctly.""" + repl = LocalREPL() + + history = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + ] + + index = repl.add_history(history) + + assert index == 0 + assert "history_0" in repl.locals + assert "history" in repl.locals # alias + assert repl.locals["history_0"] == history + assert repl.locals["history"] == history + assert repl.get_history_count() == 1 + + repl.cleanup() + + def 
test_add_multiple_histories(self): + """Test adding multiple conversation histories.""" + repl = LocalREPL() + + history1 = [{"role": "user", "content": "First conversation"}] + history2 = [{"role": "user", "content": "Second conversation"}] + + repl.add_history(history1) + repl.add_history(history2) + + assert repl.get_history_count() == 2 + assert repl.locals["history_0"] == history1 + assert repl.locals["history_1"] == history2 + assert repl.locals["history"] == history1 # alias stays on first + + repl.cleanup() + + def test_history_accessible_via_code(self): + """Test that stored history is accessible via code execution.""" + repl = LocalREPL() + + history = [{"role": "user", "content": "Test message"}] + repl.add_history(history) + + result = repl.execute_code("msg = history[0]['content']") + assert result.stderr == "" + assert repl.locals["msg"] == "Test message" + + repl.cleanup() + + def test_history_is_copy(self): + """Test that stored history is a copy, not a reference.""" + repl = LocalREPL() + + history = [{"role": "user", "content": "Original"}] + repl.add_history(history) + + history[0]["content"] = "Modified" + + assert repl.locals["history_0"][0]["content"] == "Original" + + repl.cleanup() + + def test_can_iterate_histories_in_code(self): + """Test iterating through multiple histories in code.""" + repl = LocalREPL() + + repl.add_history([{"role": "user", "content": "Query 1"}]) + repl.add_history([{"role": "user", "content": "Query 2"}]) + repl.add_history([{"role": "user", "content": "Query 3"}]) + + code = """ +all_contents = [ + history_0[0]['content'], + history_1[0]['content'], + history_2[0]['content'], +] +""" + result = repl.execute_code(code) + assert result.stderr == "" + assert repl.locals["all_contents"] == ["Query 1", "Query 2", "Query 3"] + + repl.cleanup() + + +class TestLocalREPLPersistentState: + """Tests for state persistence across multiple operations in a single REPL instance.""" + + def test_variables_persist_with_contexts(self): + """Variables and contexts should coexist.""" + repl = LocalREPL() + + repl.add_context("My context data") + repl.execute_code("summary = context.upper()") + assert repl.locals["summary"] == "MY CONTEXT DATA" + + repl.add_context("New context") + + assert repl.locals["summary"] == "MY CONTEXT DATA" + assert repl.locals["context_1"] == "New context" + + repl.cleanup() + + def test_variables_persist_with_histories(self): + """Variables and histories should coexist.""" + repl = LocalREPL() + + repl.add_history([{"role": "user", "content": "Hello"}]) + repl.execute_code("extracted = history[0]['content']") + assert repl.locals["extracted"] == "Hello" + + repl.add_history([{"role": "user", "content": "World"}]) + + assert repl.locals["extracted"] == "Hello" + assert repl.locals["history_1"][0]["content"] == "World" + + repl.cleanup() + + def test_full_persistent_session_simulation(self): + """Simulate a multi-turn persistent session.""" + repl = LocalREPL() + + repl.add_context("Document: Sales were $1000") + repl.execute_code("sales = 1000") + + repl.add_context("Document: Costs were $400") + result = repl.execute_code("profit = sales - 400") + assert result.stderr == "" + assert repl.locals["profit"] == 600 + + repl.add_history([ + {"role": "user", "content": "What were the sales?"}, + {"role": "assistant", "content": "Sales were $1000"}, + ]) + + code = """ +summary = f"Sales: {context_0}, Costs: {context_1}, Profit: {profit}" +prev_question = history_0[0]['content'] +""" + result = repl.execute_code(code) + assert 
result.stderr == "" + assert "Profit: 600" in repl.locals["summary"] + assert repl.locals["prev_question"] == "What were the sales?" + + assert repl.get_context_count() == 2 + assert repl.get_history_count() == 1 + + repl.cleanup() From 1397cbd1f6d9af667177ebe59eb86452f4216e68 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 Jan 2026 09:59:34 +0700 Subject: [PATCH 09/14] test(rlm): add multi-turn integration tests Add comprehensive integration tests for persistent RLM sessions: - Environment reuse across completion calls - Context and history accumulation - Variable persistence between completions - Prompt awareness of contexts/histories - Resource cleanup on close - Validation of unsupported environments --- tests/test_multi_turn_integration.py | 443 +++++++++++++++++++++++++++ 1 file changed, 443 insertions(+) create mode 100644 tests/test_multi_turn_integration.py diff --git a/tests/test_multi_turn_integration.py b/tests/test_multi_turn_integration.py new file mode 100644 index 00000000..c67b22d4 --- /dev/null +++ b/tests/test_multi_turn_integration.py @@ -0,0 +1,443 @@ +"""Integration tests for multi-turn persistent REPL sessions. + +Tests that multiple LM completion calls in one RLM session: +1. Share the same environment +2. Accumulate contexts (context_0, context_1, ...) +3. Accumulate histories (history_0, history_1, ...) +4. Preserve variables across calls +5. Properly inform the model about available contexts/histories +""" + +import pytest +from unittest.mock import patch +from typing import Any + +from rlm import RLM +from rlm.clients.base_lm import BaseLM +from rlm.core.types import UsageSummary, ModelUsageSummary +import rlm.core.rlm as rlm_module # Import the module for patching + + +class ScriptedMockLM(BaseLM): + """Mock LM that returns scripted responses to enable controlled testing.""" + + def __init__(self, responses: list[str] | None = None): + super().__init__(model_name="scripted-mock") + self.responses = responses or [] + self.call_index = 0 + self.prompts_received: list[Any] = [] + + def completion(self, prompt: Any) -> str: + self.prompts_received.append(prompt) + if self.call_index < len(self.responses): + response = self.responses[self.call_index] + self.call_index += 1 + return response + return "FINAL(default answer)" + + async def acompletion(self, prompt: Any) -> str: + return self.completion(prompt) + + def get_usage_summary(self) -> UsageSummary: + return UsageSummary( + model_usage_summaries={ + "scripted-mock": ModelUsageSummary( + total_calls=self.call_index, + total_input_tokens=100 * self.call_index, + total_output_tokens=50 * self.call_index, + ) + } + ) + + def get_last_usage(self) -> UsageSummary: + return self.get_usage_summary() + + def reset(self) -> None: + """Reset for next completion call (call index resets, but keep history of prompts).""" + self.call_index = 0 + + +class TestMultiTurnPersistentEnvironment: + """Tests for environment persistence across completion calls.""" + + def test_environment_reused_in_persistent_mode(self): + """Verify the same environment instance is reused across completion calls.""" + # Response that immediately provides a final answer + responses = ["FINAL(answer from call)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First context") + 
first_env = rlm._persistent_env + + mock_lm.reset() + + rlm.completion("Second context") + second_env = rlm._persistent_env + + assert first_env is second_env + assert first_env is not None + + def test_context_accumulation_across_calls(self): + """Verify contexts accumulate: context_0, context_1, etc.""" + responses = ["FINAL(got it)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First document") + mock_lm.reset() + rlm.completion("Second document") + mock_lm.reset() + rlm.completion("Third document") + + env = rlm._persistent_env + assert env.get_context_count() == 3 + assert env.locals["context_0"] == "First document" + assert env.locals["context_1"] == "Second document" + assert env.locals["context_2"] == "Third document" + # context alias should point to first context + assert env.locals["context"] == "First document" + + def test_history_accumulation_across_calls(self): + """Verify message histories accumulate: history_0, history_1, etc.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Context A") + mock_lm.reset() + rlm.completion("Context B") + mock_lm.reset() + rlm.completion("Context C") + + env = rlm._persistent_env + # After 3 completions, should have 3 histories stored + assert env.get_history_count() == 3 + assert "history_0" in env.locals + assert "history_1" in env.locals + assert "history_2" in env.locals + # Each history should be a list of message dicts + assert isinstance(env.locals["history_0"], list) + assert len(env.locals["history_0"]) > 0 + # history alias should point to first history + assert env.locals["history"] == env.locals["history_0"] + + def test_variable_persistence_across_completions(self): + """Variables computed in one completion should be available in subsequent ones.""" + # First call: compute a variable + # Second call: use that variable + first_responses = [ + "Let me compute something\n```repl\ncomputed_value = 42 * 2\nprint(computed_value)\n```", + "FINAL(84)", + ] + second_responses = [ + "```repl\nresult = computed_value + 10\nprint(result)\n```", + "FINAL(94)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(first_responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Compute 42 * 2") + assert rlm._persistent_env.locals.get("computed_value") == 84 + + mock_lm.responses = second_responses + mock_lm.reset() + rlm.completion("Add 10 to the previous result") + + assert rlm._persistent_env.locals.get("computed_value") == 84 + assert rlm._persistent_env.locals.get("result") == 94 + + +class TestMultiTurnPromptAwareness: + """Tests that prompts correctly inform the model about contexts/histories.""" + + def test_prompt_includes_context_count(self): + """Model should be informed about available contexts.""" + responses = ["FINAL(ok)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + 
backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First") + mock_lm.reset() + rlm.completion("Second") + + # Check that the second call's prompt mentions multiple contexts + last_prompt = mock_lm.prompts_received[-1] + # last_prompt is a list of message dicts + user_messages = [m for m in last_prompt if m.get("role") == "user"] + user_content = " ".join(m.get("content", "") for m in user_messages) + + assert "2 contexts" in user_content or "context_0" in user_content + + def test_prompt_includes_history_count(self): + """Model should be informed about available histories.""" + responses = ["FINAL(ok)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First task") + mock_lm.reset() + rlm.completion("Second task") # Should have 1 history from first + + # The prompt for second call should mention history + last_prompt = mock_lm.prompts_received[-1] + user_messages = [m for m in last_prompt if m.get("role") == "user"] + user_content = " ".join(m.get("content", "") for m in user_messages) + + assert "history" in user_content.lower() + + +class TestMultiTurnCodeExecution: + """Tests for code execution in multi-turn sessions.""" + + def test_can_access_previous_context_in_code(self): + """Code should be able to reference earlier contexts.""" + responses = [ + "```repl\nprint(f'First: {context_0}, Second: {context_1}')\n```", + "FINAL(printed both)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(["FINAL(first done)"]) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Document A") + + mock_lm.responses = responses + mock_lm.reset() + rlm.completion("Document B") + + # Both contexts should be accessible + env = rlm._persistent_env + assert env.locals["context_0"] == "Document A" + assert env.locals["context_1"] == "Document B" + + def test_can_access_history_in_code(self): + """Code should be able to reference stored histories.""" + responses = [ + "```repl\nprint(f'History entries: {len(history)}')\n```", + "FINAL(accessed history)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(["FINAL(first)"]) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("First query") + + mock_lm.responses = responses + mock_lm.reset() + + rlm.completion("Second query") + + env = rlm._persistent_env + assert "history" in env.locals + assert isinstance(env.locals["history"], list) + + +class TestNonPersistentMode: + """Tests to ensure non-persistent mode still works correctly.""" + + def test_non_persistent_creates_fresh_environment(self): + """Non-persistent mode should create new environment each call.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=False, # Explicitly non-persistent + ) + + rlm.completion("First") + # In non-persistent mode, no persistent env should be stored + assert 
rlm._persistent_env is None + + mock_lm.reset() + rlm.completion("Second") + assert rlm._persistent_env is None + + def test_default_is_non_persistent(self): + """Default behavior should be non-persistent.""" + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + ) + assert rlm.persistent is False + + +class TestPersistentModeResourceManagement: + """Tests for proper resource cleanup in persistent mode.""" + + def test_context_manager_cleanup(self): + """Environment should be cleaned up when exiting context manager.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + rlm.completion("Test") + assert rlm._persistent_env is not None + + # After exiting context manager, env should be cleaned up + assert rlm._persistent_env is None + + def test_explicit_close(self): + """Calling close() should clean up persistent environment.""" + responses = ["FINAL(done)"] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(responses) + mock_get_client.return_value = mock_lm + + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) + rlm.completion("Test") + assert rlm._persistent_env is not None + + rlm.close() + assert rlm._persistent_env is None + + +class TestPersistentModeValidation: + """Tests for persistent mode validation.""" + + def test_unsupported_environment_raises_error(self): + """Persistent mode should raise error for unsupported environments.""" + with pytest.raises(ValueError, match="persistent=True is not supported"): + RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + environment="docker", # Not supported for persistent + persistent=True, + ) + + def test_local_environment_supported(self): + """Local environment should support persistent mode.""" + # Should not raise + rlm = RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + environment="local", + persistent=True, + ) + assert rlm.persistent is True + + +class TestMultiTurnEndToEnd: + """End-to-end tests simulating realistic multi-turn usage.""" + + def test_three_turn_conversation(self): + """Simulate a 3-turn conversation with context accumulation.""" + turn1_responses = [ + "Looking at the first document\n```repl\ndoc1_summary = 'Has info about cats'\nprint(doc1_summary)\n```", + "FINAL(Summarized first doc)", + ] + turn2_responses = [ + "Looking at second document and comparing\n```repl\ndoc2_summary = 'Has info about dogs'\nprint(f'Doc1: {doc1_summary}, Doc2: {doc2_summary}')\n```", + "FINAL(Compared both docs)", + ] + turn3_responses = [ + "Final synthesis\n```repl\nfinal = f'Combined: {doc1_summary} and {doc2_summary} from context_2'\nprint(final)\n```", + "FINAL(synthesized all)", + ] + + with patch.object(rlm_module, "get_client") as mock_get_client: + mock_lm = ScriptedMockLM(turn1_responses) + mock_get_client.return_value = mock_lm + + with RLM( + backend="openai", + backend_kwargs={"model_name": "test"}, + persistent=True, + ) as rlm: + # Turn 1 + result1 = rlm.completion("First document about cats") + assert "Summarized" in result1.response + + # Turn 2 + mock_lm.responses = turn2_responses + mock_lm.reset() + result2 = rlm.completion("Second document about dogs") + assert "Compared" in result2.response + + # Turn 3 + mock_lm.responses = turn3_responses + 
mock_lm.reset() + result3 = rlm.completion("Synthesize everything") + assert "synthesized" in result3.response + + # Verify state + env = rlm._persistent_env + assert env.get_context_count() == 3 + assert env.get_history_count() == 3 + assert env.locals.get("doc1_summary") == "Has info about cats" + assert env.locals.get("doc2_summary") == "Has info about dogs" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 1e76ac6fd97fd9df0678938daaa93347c18385f2 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 Jan 2026 10:30:43 +0700 Subject: [PATCH 10/14] refactor(tests): replace ScriptedMockLM with standard unittest.mock Remove custom ScriptedMockLM class (58 lines) and replace with create_mock_lm() helper (14 lines) using standard Mock patterns. --- tests/test_multi_turn_integration.py | 143 +++++++++------------------ 1 file changed, 48 insertions(+), 95 deletions(-) diff --git a/tests/test_multi_turn_integration.py b/tests/test_multi_turn_integration.py index c67b22d4..662b3e83 100644 --- a/tests/test_multi_turn_integration.py +++ b/tests/test_multi_turn_integration.py @@ -9,52 +9,26 @@ """ import pytest -from unittest.mock import patch -from typing import Any +from unittest.mock import patch, Mock from rlm import RLM -from rlm.clients.base_lm import BaseLM from rlm.core.types import UsageSummary, ModelUsageSummary -import rlm.core.rlm as rlm_module # Import the module for patching - - -class ScriptedMockLM(BaseLM): - """Mock LM that returns scripted responses to enable controlled testing.""" - - def __init__(self, responses: list[str] | None = None): - super().__init__(model_name="scripted-mock") - self.responses = responses or [] - self.call_index = 0 - self.prompts_received: list[Any] = [] - - def completion(self, prompt: Any) -> str: - self.prompts_received.append(prompt) - if self.call_index < len(self.responses): - response = self.responses[self.call_index] - self.call_index += 1 - return response - return "FINAL(default answer)" - - async def acompletion(self, prompt: Any) -> str: - return self.completion(prompt) - - def get_usage_summary(self) -> UsageSummary: - return UsageSummary( - model_usage_summaries={ - "scripted-mock": ModelUsageSummary( - total_calls=self.call_index, - total_input_tokens=100 * self.call_index, - total_output_tokens=50 * self.call_index, - ) - } - ) +import rlm.core.rlm as rlm_module - def get_last_usage(self) -> UsageSummary: - return self.get_usage_summary() - def reset(self) -> None: - """Reset for next completion call (call index resets, but keep history of prompts).""" - self.call_index = 0 +def create_mock_lm(responses: list[str]) -> Mock: + """Create a mock LM that returns responses in order.""" + mock = Mock() + mock.completion.side_effect = list(responses) + mock.get_usage_summary.return_value = UsageSummary( + model_usage_summaries={ + "mock": ModelUsageSummary( + total_calls=1, total_input_tokens=100, total_output_tokens=50 + ) + } + ) + mock.get_last_usage.return_value = mock.get_usage_summary.return_value + return mock class TestMultiTurnPersistentEnvironment: @@ -62,11 +36,10 @@ class TestMultiTurnPersistentEnvironment: def test_environment_reused_in_persistent_mode(self): """Verify the same environment instance is reused across completion calls.""" - # Response that immediately provides a final answer responses = ["FINAL(answer from call)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) 
mock_get_client.return_value = mock_lm with RLM( @@ -77,7 +50,7 @@ def test_environment_reused_in_persistent_mode(self): rlm.completion("First context") first_env = rlm._persistent_env - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second context") second_env = rlm._persistent_env @@ -90,7 +63,7 @@ def test_context_accumulation_across_calls(self): responses = ["FINAL(got it)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -99,9 +72,9 @@ def test_context_accumulation_across_calls(self): persistent=True, ) as rlm: rlm.completion("First document") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second document") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Third document") env = rlm._persistent_env @@ -109,7 +82,6 @@ def test_context_accumulation_across_calls(self): assert env.locals["context_0"] == "First document" assert env.locals["context_1"] == "Second document" assert env.locals["context_2"] == "Third document" - # context alias should point to first context assert env.locals["context"] == "First document" def test_history_accumulation_across_calls(self): @@ -117,7 +89,7 @@ def test_history_accumulation_across_calls(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -126,27 +98,22 @@ def test_history_accumulation_across_calls(self): persistent=True, ) as rlm: rlm.completion("Context A") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Context B") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Context C") env = rlm._persistent_env - # After 3 completions, should have 3 histories stored assert env.get_history_count() == 3 assert "history_0" in env.locals assert "history_1" in env.locals assert "history_2" in env.locals - # Each history should be a list of message dicts assert isinstance(env.locals["history_0"], list) assert len(env.locals["history_0"]) > 0 - # history alias should point to first history assert env.locals["history"] == env.locals["history_0"] def test_variable_persistence_across_completions(self): """Variables computed in one completion should be available in subsequent ones.""" - # First call: compute a variable - # Second call: use that variable first_responses = [ "Let me compute something\n```repl\ncomputed_value = 42 * 2\nprint(computed_value)\n```", "FINAL(84)", @@ -157,7 +124,7 @@ def test_variable_persistence_across_completions(self): ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(first_responses) + mock_lm = create_mock_lm(first_responses) mock_get_client.return_value = mock_lm with RLM( @@ -168,8 +135,7 @@ def test_variable_persistence_across_completions(self): rlm.completion("Compute 42 * 2") assert rlm._persistent_env.locals.get("computed_value") == 84 - mock_lm.responses = second_responses - mock_lm.reset() + mock_lm.completion.side_effect = list(second_responses) rlm.completion("Add 10 to the previous result") assert rlm._persistent_env.locals.get("computed_value") == 84 @@ -184,7 +150,7 @@ def test_prompt_includes_context_count(self): responses = ["FINAL(ok)"] with patch.object(rlm_module, "get_client") as 
mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -193,12 +159,10 @@ def test_prompt_includes_context_count(self): persistent=True, ) as rlm: rlm.completion("First") - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second") - # Check that the second call's prompt mentions multiple contexts - last_prompt = mock_lm.prompts_received[-1] - # last_prompt is a list of message dicts + last_prompt = mock_lm.completion.call_args[0][0] user_messages = [m for m in last_prompt if m.get("role") == "user"] user_content = " ".join(m.get("content", "") for m in user_messages) @@ -209,7 +173,7 @@ def test_prompt_includes_history_count(self): responses = ["FINAL(ok)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -218,11 +182,10 @@ def test_prompt_includes_history_count(self): persistent=True, ) as rlm: rlm.completion("First task") - mock_lm.reset() - rlm.completion("Second task") # Should have 1 history from first + mock_lm.completion.side_effect = list(responses) + rlm.completion("Second task") - # The prompt for second call should mention history - last_prompt = mock_lm.prompts_received[-1] + last_prompt = mock_lm.completion.call_args[0][0] user_messages = [m for m in last_prompt if m.get("role") == "user"] user_content = " ".join(m.get("content", "") for m in user_messages) @@ -234,13 +197,14 @@ class TestMultiTurnCodeExecution: def test_can_access_previous_context_in_code(self): """Code should be able to reference earlier contexts.""" - responses = [ + first_responses = ["FINAL(first done)"] + second_responses = [ "```repl\nprint(f'First: {context_0}, Second: {context_1}')\n```", "FINAL(printed both)", ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(["FINAL(first done)"]) + mock_lm = create_mock_lm(first_responses) mock_get_client.return_value = mock_lm with RLM( @@ -250,24 +214,23 @@ def test_can_access_previous_context_in_code(self): ) as rlm: rlm.completion("Document A") - mock_lm.responses = responses - mock_lm.reset() + mock_lm.completion.side_effect = list(second_responses) rlm.completion("Document B") - # Both contexts should be accessible env = rlm._persistent_env assert env.locals["context_0"] == "Document A" assert env.locals["context_1"] == "Document B" def test_can_access_history_in_code(self): """Code should be able to reference stored histories.""" - responses = [ + first_responses = ["FINAL(first)"] + second_responses = [ "```repl\nprint(f'History entries: {len(history)}')\n```", "FINAL(accessed history)", ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(["FINAL(first)"]) + mock_lm = create_mock_lm(first_responses) mock_get_client.return_value = mock_lm with RLM( @@ -277,9 +240,7 @@ def test_can_access_history_in_code(self): ) as rlm: rlm.completion("First query") - mock_lm.responses = responses - mock_lm.reset() - + mock_lm.completion.side_effect = list(second_responses) rlm.completion("Second query") env = rlm._persistent_env @@ -295,20 +256,19 @@ def test_non_persistent_creates_fresh_environment(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm rlm = RLM( 
backend="openai", backend_kwargs={"model_name": "test"}, - persistent=False, # Explicitly non-persistent + persistent=False, ) rlm.completion("First") - # In non-persistent mode, no persistent env should be stored assert rlm._persistent_env is None - mock_lm.reset() + mock_lm.completion.side_effect = list(responses) rlm.completion("Second") assert rlm._persistent_env is None @@ -329,7 +289,7 @@ def test_context_manager_cleanup(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm with RLM( @@ -340,7 +300,6 @@ def test_context_manager_cleanup(self): rlm.completion("Test") assert rlm._persistent_env is not None - # After exiting context manager, env should be cleaned up assert rlm._persistent_env is None def test_explicit_close(self): @@ -348,7 +307,7 @@ def test_explicit_close(self): responses = ["FINAL(done)"] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(responses) + mock_lm = create_mock_lm(responses) mock_get_client.return_value = mock_lm rlm = RLM( @@ -407,7 +366,7 @@ def test_three_turn_conversation(self): ] with patch.object(rlm_module, "get_client") as mock_get_client: - mock_lm = ScriptedMockLM(turn1_responses) + mock_lm = create_mock_lm(turn1_responses) mock_get_client.return_value = mock_lm with RLM( @@ -415,23 +374,17 @@ def test_three_turn_conversation(self): backend_kwargs={"model_name": "test"}, persistent=True, ) as rlm: - # Turn 1 result1 = rlm.completion("First document about cats") assert "Summarized" in result1.response - # Turn 2 - mock_lm.responses = turn2_responses - mock_lm.reset() + mock_lm.completion.side_effect = list(turn2_responses) result2 = rlm.completion("Second document about dogs") assert "Compared" in result2.response - # Turn 3 - mock_lm.responses = turn3_responses - mock_lm.reset() + mock_lm.completion.side_effect = list(turn3_responses) result3 = rlm.completion("Synthesize everything") assert "synthesized" in result3.response - # Verify state env = rlm._persistent_env assert env.get_context_count() == 3 assert env.get_history_count() == 3 From 4be974990e6b66d488b93df8657f902079d2dab9 Mon Sep 17 00:00:00 2001 From: thoriq <41317726+thoriqakbar0@users.noreply.github.com> Date: Thu, 8 Jan 2026 12:51:10 +0700 Subject: [PATCH 11/14] refactor(environments): add SupportsPersistence Protocol Replace scattered hasattr checks with a runtime-checkable Protocol that defines the persistence capability contract. This provides type checker enforcement and IDE autocomplete support. 
--- rlm/core/rlm.py | 25 +++++++------------------ rlm/environments/__init__.py | 4 +++- rlm/environments/base_env.py | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py index 195837d8..1de29776 100644 --- a/rlm/core/rlm.py +++ b/rlm/core/rlm.py @@ -13,7 +13,7 @@ RLMIteration, RLMMetadata, ) -from rlm.environments import BaseEnv, get_environment +from rlm.environments import BaseEnv, SupportsPersistence, get_environment from rlm.logger import RLMLogger, VerbosePrinter from rlm.utils.parsing import ( find_code_blocks, @@ -88,7 +88,7 @@ def __init__( # Persistence support self.persistent = persistent - self._persistent_env: BaseEnv | None = None + self._persistent_env: SupportsPersistence | None = None # Validate persistence support at initialization if self.persistent: @@ -203,12 +203,12 @@ def completion( # Current prompt = message history + additional prompt suffix context_count = ( environment.get_context_count() - if hasattr(environment, "get_context_count") + if isinstance(environment, SupportsPersistence) else 1 ) history_count = ( environment.get_history_count() - if hasattr(environment, "get_history_count") + if isinstance(environment, SupportsPersistence) else 0 ) current_prompt = message_history + [ @@ -239,7 +239,7 @@ def completion( self.verbose.print_summary(i + 1, time_end - time_start, usage.to_dict()) # Store message history in persistent environment - if self.persistent and hasattr(environment, "add_history"): + if self.persistent and isinstance(environment, SupportsPersistence): environment.add_history(message_history) return RLMChatCompletion( @@ -266,7 +266,7 @@ def completion( self.verbose.print_summary(self.max_iterations, time_end - time_start, usage.to_dict()) # Store message history in persistent environment - if self.persistent and hasattr(environment, "add_history"): + if self.persistent and isinstance(environment, SupportsPersistence): environment.add_history(message_history) return RLMChatCompletion( @@ -367,18 +367,7 @@ def _validate_persistent_environment_support(self) -> None: @staticmethod def _env_supports_persistence(env: BaseEnv) -> bool: """Check if an environment instance supports persistent mode methods.""" - return ( - hasattr(env, "update_handler_address") - and hasattr(env, "add_context") - and hasattr(env, "get_context_count") - and hasattr(env, "add_history") - and hasattr(env, "get_history_count") - and callable(getattr(env, "update_handler_address", None)) - and callable(getattr(env, "add_context", None)) - and callable(getattr(env, "get_context_count", None)) - and callable(getattr(env, "add_history", None)) - and callable(getattr(env, "get_history_count", None)) - ) + return isinstance(env, SupportsPersistence) def close(self) -> None: """Clean up persistent environment. 
        Call when done with multi-turn conversations."""
diff --git a/rlm/environments/__init__.py b/rlm/environments/__init__.py
index 9d2719f2..6a70431d 100644
--- a/rlm/environments/__init__.py
+++ b/rlm/environments/__init__.py
@@ -1,8 +1,10 @@
 from typing import Any, Literal
 
-from rlm.environments.base_env import BaseEnv
+from rlm.environments.base_env import BaseEnv, SupportsPersistence
 from rlm.environments.local_repl import LocalREPL
 
+__all__ = ["BaseEnv", "LocalREPL", "SupportsPersistence", "get_environment"]
+
 
 def get_environment(
     environment: Literal["local", "modal", "docker"],
diff --git a/rlm/environments/base_env.py b/rlm/environments/base_env.py
index ae74577b..b185848a 100644
--- a/rlm/environments/base_env.py
+++ b/rlm/environments/base_env.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Any, Protocol, runtime_checkable
 
 from rlm.core.types import REPLResult
 
@@ -69,3 +70,22 @@ def load_context(self, context_payload: dict | list | str):
     @abstractmethod
     def execute_code(self, code: str) -> REPLResult:
         raise NotImplementedError
+
+
+@runtime_checkable
+class SupportsPersistence(Protocol):
+    """Protocol for environments that support persistent multi-turn sessions.
+
+    Use isinstance(env, SupportsPersistence) to check if an environment
+    supports persistence capabilities.
+    """
+
+    def update_handler_address(self, address: tuple[str, int]) -> None: ...
+    def add_context(
+        self, context_payload: dict | list | str, context_index: int | None = None
+    ) -> int: ...
+    def get_context_count(self) -> int: ...
+    def add_history(
+        self, message_history: list[dict[str, Any]], history_index: int | None = None
+    ) -> int: ...
+    def get_history_count(self) -> int: ...

From 86862b41389a8191d9f85e88f20752f2924c22b9 Mon Sep 17 00:00:00 2001
From: thoriq <41317726+thoriqakbar0@users.noreply.github.com>
Date: Thu, 8 Jan 2026 13:41:36 +0700
Subject: [PATCH 12/14] docs(environments): add comprehensive SupportsPersistence Protocol docs

Document the behavior expected of a persistence implementation:
- Versioning behavior (context_0, context_1, ...)
- Aliasing behavior (context -> context_0)
- Method contracts with detailed docstrings
- References to tests and example implementation
---
 rlm/environments/base_env.py | 104 ++++++++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 7 deletions(-)

diff --git a/rlm/environments/base_env.py b/rlm/environments/base_env.py
index b185848a..963018b4 100644
--- a/rlm/environments/base_env.py
+++ b/rlm/environments/base_env.py
@@ -76,16 +76,106 @@ def execute_code(self, code: str) -> REPLResult:
 class SupportsPersistence(Protocol):
     """Protocol for environments that support persistent multi-turn sessions.
 
-    Use isinstance(env, SupportsPersistence) to check if an environment
-    supports persistence capabilities.
+    CHECKING SUPPORT:
+        Use isinstance(env, SupportsPersistence) to check if an environment
+        supports persistence capabilities.
+
+    IMPLEMENTING THIS PROTOCOL:
+        To add persistence to your environment, implement these 5 methods.
+        See tests/test_local_repl_persistent.py for expected behavior.
+
+    VERSIONING BEHAVIOR:
+        Contexts and histories are versioned with numeric suffixes:
+        - Contexts -> context_0, context_1, context_2, ...
+        - Histories -> history_0, history_1, history_2, ...
+
+    ALIASING BEHAVIOR:
+        The unversioned names always point to index 0:
+        - context -> context_0 (first context)
+        - history -> history_0 (first history)
+
+    EXAMPLE IMPLEMENTATION:
+        See rlm/environments/local_repl.py for a complete reference.
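+
+    USAGE SKETCH:
+        A minimal, illustrative session (mirrors the behavior exercised in
+        the tests below; LocalREPL stands in for any conforming environment):
+
+            env = LocalREPL()
+            assert isinstance(env, SupportsPersistence)
+            env.add_context("doc A")   # stored as context_0, aliased as context
+            env.add_context("doc B")   # stored as context_1
+            assert env.get_context_count() == 2
+            env.cleanup()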
+ + TESTS: + - Unit tests: tests/test_local_repl_persistent.py + - Integration tests: tests/test_multi_turn_integration.py + + Run: uv run pytest tests/test_local_repl_persistent.py -v """ - def update_handler_address(self, address: tuple[str, int]) -> None: ... + def update_handler_address(self, address: tuple[str, int]) -> None: + """Update the LM handler address for nested LLM calls. + + Called by RLM when the handler address changes between completions. + Store the address so llm_query() calls from executed code can reach + the LM handler. + + Args: + address: (host, port) tuple for the LM handler server. + """ + ... + def add_context( self, context_payload: dict | list | str, context_index: int | None = None - ) -> int: ... - def get_context_count(self) -> int: ... + ) -> int: + """Add a context payload, making it available as context_N in code. + + Versioning: + - context_index=None: auto-increment (0, 1, 2, ...) + - context_index=N: use specific index N + + Storage: + Must store so executed code can access: + - context_0, context_1, etc. (versioned) + - context (alias to context_0) + + Args: + context_payload: The context data (string, dict, or list). + context_index: Optional specific index, or None to auto-increment. + + Returns: + The index used (for auto-increment, returns the assigned index). + """ + ... + + def get_context_count(self) -> int: + """Return the number of contexts added so far. + + Used by RLM to inform the model how many contexts are available. + """ + ... + def add_history( self, message_history: list[dict[str, Any]], history_index: int | None = None - ) -> int: ... - def get_history_count(self) -> int: ... + ) -> int: + """Add a message history, making it available as history_N in code. + + Versioning: + - history_index=None: auto-increment (0, 1, 2, ...) + - history_index=N: use specific index N + + Storage: + Must store so executed code can access: + - history_0, history_1, etc. (versioned) + - history (alias to history_0) + + IMPORTANT: Store a deep copy, not a reference. The caller may + modify the list after calling this method. + + Args: + message_history: List of message dicts (role, content). + history_index: Optional specific index, or None to auto-increment. + + Returns: + The index used. + """ + ... + + def get_history_count(self) -> int: + """Return the number of histories added so far. + + Used by RLM to inform the model how many conversation histories + are available. + """ + ... From ea0b91c2b7f8bcb922ea84acf916cbbc5a1bc973 Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Thu, 8 Jan 2026 15:42:22 -0500 Subject: [PATCH 13/14] add ruff lint on test --- tests/test_multi_turn_integration.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_multi_turn_integration.py b/tests/test_multi_turn_integration.py index 662b3e83..f55de39b 100644 --- a/tests/test_multi_turn_integration.py +++ b/tests/test_multi_turn_integration.py @@ -8,12 +8,13 @@ 5. 
Properly inform the model about available contexts/histories """ +from unittest.mock import Mock, patch + import pytest -from unittest.mock import patch, Mock -from rlm import RLM -from rlm.core.types import UsageSummary, ModelUsageSummary import rlm.core.rlm as rlm_module +from rlm import RLM +from rlm.core.types import ModelUsageSummary, UsageSummary def create_mock_lm(responses: list[str]) -> Mock: @@ -22,9 +23,7 @@ def create_mock_lm(responses: list[str]) -> Mock: mock.completion.side_effect = list(responses) mock.get_usage_summary.return_value = UsageSummary( model_usage_summaries={ - "mock": ModelUsageSummary( - total_calls=1, total_input_tokens=100, total_output_tokens=50 - ) + "mock": ModelUsageSummary(total_calls=1, total_input_tokens=100, total_output_tokens=50) } ) mock.get_last_usage.return_value = mock.get_usage_summary.return_value From f9755c8df3c64fd6297145885d37ec5a458511a8 Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Thu, 8 Jan 2026 15:43:52 -0500 Subject: [PATCH 14/14] fix linting on tests --- tests/test_local_repl.py | 1 - tests/test_local_repl_persistent.py | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_local_repl.py b/tests/test_local_repl.py index 115f671c..787a5b54 100644 --- a/tests/test_local_repl.py +++ b/tests/test_local_repl.py @@ -243,4 +243,3 @@ def test_simulated_rlm_completions_functions_not_preserved(self): assert "NameError" in result.stderr assert "my_helper" in result.stderr completion_2_env.cleanup() - diff --git a/tests/test_local_repl_persistent.py b/tests/test_local_repl_persistent.py index 4624c04a..f654679d 100644 --- a/tests/test_local_repl_persistent.py +++ b/tests/test_local_repl_persistent.py @@ -198,10 +198,12 @@ def test_full_persistent_session_simulation(self): assert result.stderr == "" assert repl.locals["profit"] == 600 - repl.add_history([ - {"role": "user", "content": "What were the sales?"}, - {"role": "assistant", "content": "Sales were $1000"}, - ]) + repl.add_history( + [ + {"role": "user", "content": "What were the sales?"}, + {"role": "assistant", "content": "Sales were $1000"}, + ] + ) code = """ summary = f"Sales: {context_0}, Costs: {context_1}, Profit: {profit}"