Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions hindsight-api-slim/hindsight_api/api/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -1526,6 +1526,27 @@ class MentalModelTrigger(BaseModel):
"Supports nested and/or/not expressions for complex tag-based scoping."
),
)
include_chunks: bool | None = Field(
default=None,
description=(
"Override whether the internal recall used during refresh returns raw chunk text. "
"None means use the bank/global config default (recall_include_chunks)."
),
)
recall_max_tokens: int | None = Field(
default=None,
description=(
"Override the token budget for facts returned by the internal recall during refresh. "
"None means use the bank/global config default (recall_max_tokens)."
),
)
recall_chunks_max_tokens: int | None = Field(
default=None,
description=(
"Override the token budget for raw chunks returned by the internal recall during refresh. "
"None means use the bank/global config default (recall_chunks_max_tokens)."
),
)

@field_validator("fact_types")
@classmethod
Expand Down
21 changes: 21 additions & 0 deletions hindsight-api-slim/hindsight_api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,9 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
ENV_REFLECT_WALL_TIMEOUT = "HINDSIGHT_API_REFLECT_WALL_TIMEOUT"
ENV_REFLECT_MISSION = "HINDSIGHT_API_REFLECT_MISSION"
ENV_REFLECT_SOURCE_FACTS_MAX_TOKENS = "HINDSIGHT_API_REFLECT_SOURCE_FACTS_MAX_TOKENS"
# Recall settings (consumed by internal recall, e.g. during mental model refresh)
ENV_RECALL_INCLUDE_CHUNKS = "HINDSIGHT_API_RECALL_INCLUDE_CHUNKS"
ENV_RECALL_MAX_TOKENS = "HINDSIGHT_API_RECALL_MAX_TOKENS"
ENV_RECALL_CHUNKS_MAX_TOKENS = "HINDSIGHT_API_RECALL_CHUNKS_MAX_TOKENS"

# Audit log settings
ENV_AUDIT_LOG_ENABLED = "HINDSIGHT_API_AUDIT_LOG_ENABLED"
Expand Down Expand Up @@ -587,6 +590,9 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
DEFAULT_REFLECT_MAX_CONTEXT_TOKENS = 100_000  # Max accumulated context tokens before forcing final prompt
DEFAULT_REFLECT_WALL_TIMEOUT = 300  # Wall-clock timeout in seconds for the entire reflect operation (5 minutes)
DEFAULT_REFLECT_SOURCE_FACTS_MAX_TOKENS = -1  # Token budget for source facts in search_observations (-1 = disabled)
# Recall defaults for internal recall (e.g. mental model refresh).
# Resolution order at call time: explicit trigger/caller override -> bank config -> these env defaults.
DEFAULT_RECALL_INCLUDE_CHUNKS = True  # Whether internal recall (e.g. mental model refresh) returns raw chunks
DEFAULT_RECALL_MAX_TOKENS = 2048  # Token budget for facts returned by internal recall
DEFAULT_RECALL_CHUNKS_MAX_TOKENS = 1000  # Token budget for raw chunks returned by internal recall

# Disposition defaults (None = not set, fall back to bank DB value or 3)
DEFAULT_DISPOSITION_SKEPTICISM = None
Expand Down Expand Up @@ -925,6 +931,11 @@ class HindsightConfig:
reflect_mission: str | None
reflect_source_facts_max_tokens: int

# Recall settings (used by internal recall, e.g. during mental model refresh)
recall_include_chunks: bool
recall_max_tokens: int
recall_chunks_max_tokens: int

# Disposition settings (hierarchical - can be overridden per bank; None = fall back to DB)
disposition_skepticism: int | None
disposition_literalism: int | None
Expand Down Expand Up @@ -1038,6 +1049,10 @@ class HindsightConfig:
# Reflect settings
"reflect_mission",
"reflect_source_facts_max_tokens",
# Recall settings (used by internal recall, e.g. mental model refresh)
"recall_include_chunks",
"recall_max_tokens",
"recall_chunks_max_tokens",
# Disposition settings
"disposition_skepticism",
"disposition_literalism",
Expand Down Expand Up @@ -1523,6 +1538,12 @@ def from_env(cls) -> "HindsightConfig":
reflect_source_facts_max_tokens=int(
os.getenv(ENV_REFLECT_SOURCE_FACTS_MAX_TOKENS, str(DEFAULT_REFLECT_SOURCE_FACTS_MAX_TOKENS))
),
recall_include_chunks=os.getenv(ENV_RECALL_INCLUDE_CHUNKS, str(DEFAULT_RECALL_INCLUDE_CHUNKS)).lower()
in ("true", "1", "yes"),
recall_max_tokens=int(os.getenv(ENV_RECALL_MAX_TOKENS, str(DEFAULT_RECALL_MAX_TOKENS))),
recall_chunks_max_tokens=int(
os.getenv(ENV_RECALL_CHUNKS_MAX_TOKENS, str(DEFAULT_RECALL_CHUNKS_MAX_TOKENS))
),
# Disposition settings (None = fall back to DB value)
disposition_skepticism=int(os.getenv(ENV_DISPOSITION_SKEPTICISM))
if os.getenv(ENV_DISPOSITION_SKEPTICISM)
Expand Down
50 changes: 48 additions & 2 deletions hindsight-api-slim/hindsight_api/engine/memory_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@
import httpx
import tiktoken

from ..config import DEFAULT_REFLECT_SOURCE_FACTS_MAX_TOKENS, get_config
from ..config import (
DEFAULT_RECALL_CHUNKS_MAX_TOKENS,
DEFAULT_RECALL_INCLUDE_CHUNKS,
DEFAULT_RECALL_MAX_TOKENS,
DEFAULT_REFLECT_SOURCE_FACTS_MAX_TOKENS,
get_config,
)
from ..metrics import get_metrics_collector
from ..tracing import create_operation_span
from ..utils import mask_network_location
Expand Down Expand Up @@ -952,6 +958,9 @@ async def _handle_refresh_mental_model(self, task_dict: dict[str, Any]):
fact_types = trigger_data.get("fact_types")
exclude_mental_models = trigger_data.get("exclude_mental_models", False)
stored_exclude_ids: list[str] = trigger_data.get("exclude_mental_model_ids") or []
recall_include_chunks_override = trigger_data.get("include_chunks")
recall_max_tokens_override = trigger_data.get("recall_max_tokens")
recall_chunks_max_tokens_override = trigger_data.get("recall_chunks_max_tokens")

tag_filtering = _resolve_refresh_tag_filtering(mental_model.get("tags"), trigger_data)

Expand All @@ -967,6 +976,9 @@ async def _handle_refresh_mental_model(self, task_dict: dict[str, Any]):
fact_types=fact_types,
exclude_mental_models=exclude_mental_models,
exclude_mental_model_ids=list({*stored_exclude_ids, mental_model_id}),
recall_include_chunks=recall_include_chunks_override,
recall_max_tokens_override=recall_max_tokens_override,
recall_chunks_max_tokens_override=recall_chunks_max_tokens_override,
)

generated_content = reflect_result.text or "No content generated"
Expand Down Expand Up @@ -5399,6 +5411,9 @@ async def reflect_async(
exclude_mental_model_ids: list[str] | None = None,
fact_types: list[str] | None = None,
exclude_mental_models: bool = False,
recall_include_chunks: bool | None = None,
recall_max_tokens_override: int | None = None,
recall_chunks_max_tokens_override: int | None = None,
_skip_span: bool = False,
) -> ReflectResult:
"""
Expand Down Expand Up @@ -5521,6 +5536,23 @@ async def search_mental_models_fn(q: str, max_results: int = 5) -> dict[str, Any
"reflect_source_facts_max_tokens", DEFAULT_REFLECT_SOURCE_FACTS_MAX_TOKENS
)

# Resolve recall overrides: caller arg (e.g. mental model trigger) → bank config → env default
effective_recall_include_chunks = (
recall_include_chunks
if recall_include_chunks is not None
else config_dict.get("recall_include_chunks", DEFAULT_RECALL_INCLUDE_CHUNKS)
)
effective_recall_max_tokens = (
recall_max_tokens_override
if recall_max_tokens_override is not None
else config_dict.get("recall_max_tokens", DEFAULT_RECALL_MAX_TOKENS)
)
effective_recall_chunks_max_tokens = (
recall_chunks_max_tokens_override
if recall_chunks_max_tokens_override is not None
else config_dict.get("recall_chunks_max_tokens", DEFAULT_RECALL_CHUNKS_MAX_TOKENS)
)

async def search_observations_fn(q: str, max_tokens: int = 5000) -> dict[str, Any]:
return await tool_search_observations(
self,
Expand All @@ -5541,7 +5573,14 @@ async def search_observations_fn(q: str, max_tokens: int = 5000) -> dict[str, An
recall_fact_types = [ft for ft in (fact_types or ["world", "experience"]) if ft in ("world", "experience")]
include_recall = bool(recall_fact_types)

async def recall_fn(q: str, max_tokens: int = 4096, max_chunk_tokens: int = 1000) -> dict[str, Any]:
# Defaults are bound at closure-definition time (re-evaluated on each
# reflect_async call), so per-bank/per-trigger overrides apply when the
# agent invokes recall without explicit token args.
async def recall_fn(
q: str,
max_tokens: int = effective_recall_max_tokens,
max_chunk_tokens: int = effective_recall_chunks_max_tokens,
) -> dict[str, Any]:
return await tool_recall(
self,
bank_id,
Expand All @@ -5553,6 +5592,7 @@ async def recall_fn(q: str, max_tokens: int = 4096, max_chunk_tokens: int = 1000
tag_groups=tag_groups,
max_chunk_tokens=max_chunk_tokens,
fact_types=recall_fact_types if fact_types is not None else None,
include_chunks=effective_recall_include_chunks,
)

async def expand_fn(memory_ids: list[str], depth: str) -> dict[str, Any]:
Expand Down Expand Up @@ -6770,6 +6810,9 @@ async def refresh_mental_model(
fact_types = trigger_data.get("fact_types")
exclude_mental_models = trigger_data.get("exclude_mental_models", False)
stored_exclude_ids: list[str] = trigger_data.get("exclude_mental_model_ids") or []
recall_include_chunks_override = trigger_data.get("include_chunks")
recall_max_tokens_override = trigger_data.get("recall_max_tokens")
recall_chunks_max_tokens_override = trigger_data.get("recall_chunks_max_tokens")

tag_filtering = _resolve_refresh_tag_filtering(mental_model.get("tags"), trigger_data)

Expand All @@ -6785,6 +6828,9 @@ async def refresh_mental_model(
fact_types=fact_types,
exclude_mental_models=exclude_mental_models,
exclude_mental_model_ids=list({*stored_exclude_ids, mental_model_id}),
recall_include_chunks=recall_include_chunks_override,
recall_max_tokens_override=recall_max_tokens_override,
recall_chunks_max_tokens_override=recall_chunks_max_tokens_override,
_skip_span=True,
)

Expand Down
7 changes: 4 additions & 3 deletions hindsight-api-slim/hindsight_api/engine/reflect/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ async def tool_recall(
connection_budget: int = 1,
max_chunk_tokens: int = 1000,
fact_types: list[str] | None = None,
include_chunks: bool = True,
) -> dict[str, Any]:
"""
Search memories using TEMPR retrieval.
Expand All @@ -230,15 +231,15 @@ async def tool_recall(
tags: Filter by tags (includes untagged memories)
tags_match: How to match tags - "any" (OR), "all" (AND), or "exact"
connection_budget: Max DB connections for this recall (default 1 for internal ops)
max_chunk_tokens: Maximum tokens for raw source chunk text (default 1000, always included)
max_chunk_tokens: Maximum tokens for raw source chunk text (default 1000)
fact_types: Optional filter for fact types to retrieve. Defaults to ["experience", "world"].
include_chunks: Whether to fetch raw chunk text alongside facts (default True).

Returns:
Dict with list of matching memories including raw chunk text
Dict with list of matching memories including raw chunk text (when include_chunks)
"""
# Only world/experience are valid for raw recall (observation is handled by search_observations)
recall_fact_type = [ft for ft in (fact_types or ["experience", "world"]) if ft in ("world", "experience")]
include_chunks = True
internal_ctx = replace(request_context, internal=True)
result = await memory_engine.recall_async(
bank_id=bank_id,
Expand Down
4 changes: 2 additions & 2 deletions hindsight-api-slim/tests/test_hierarchical_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ async def test_hierarchical_fields_categorization():
assert "retain_chunk_batch_size" in configurable

# Verify count is correct
assert len(configurable) == 22
assert len(configurable) == 25

# Verify credential fields (NEVER exposed)
assert "llm_api_key" in credentials
Expand Down Expand Up @@ -458,7 +458,7 @@ async def test_config_get_bank_config_no_static_or_credential_fields_leak(memory
assert field in config, f"Expected configurable field '{field}' missing from config"

# Should have a small number of configurable fields (not hundreds)
assert len(config) < 25, f"Too many fields returned: {len(config)}"
assert len(config) < 30, f"Too many fields returned: {len(config)}"

finally:
await memory.delete_bank(bank_id, request_context=request_context)
Expand Down
Loading
Loading