From d940513395647a53c560a07c48b43dcfcadc28cc Mon Sep 17 00:00:00 2001 From: John Tramm Date: Wed, 4 Mar 2026 22:51:59 +0000 Subject: [PATCH 01/67] Add AI agent codebase indexing tools for OpenMC Adds two opt-in tools that help AI agents understand the OpenMC codebase: 1. Repo Map: Tree-sitter based structural overview (~160 lines) showing the most important classes, functions, and relationships, ranked by cross-file usage via PageRank. 2. RAG Semantic Search: Vector-based search across all source code, tests, and documentation using sentence-transformers + LanceDB. Enables finding cross-cutting concerns (e.g., "where are particle seeds initialized") even when naming differs across code paths. Both tools are fully opt-in via /enable-openmc-index (per-session) and rebuild via /refresh-openmc-index. No API keys, no cloud services, no settings.json changes required. All generated artifacts live in the gitignored .claude/cache/ directory. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 73 +++ .claude/skills/refresh-openmc-index/SKILL.md | 36 ++ .claude/tools/rag/chunker.py | 241 +++++++++ .claude/tools/rag/embeddings.py | 101 ++++ .claude/tools/rag/indexer.py | 140 +++++ .claude/tools/rag/openmc_search.py | 179 +++++++ .claude/tools/repomap/generate_repomap.py | 521 +++++++++++++++++++ .claude/tools/requirements.txt | 17 + .gitignore | 3 + CLAUDE.md | 38 ++ 10 files changed, 1349 insertions(+) create mode 100644 .claude/skills/enable-openmc-index/SKILL.md create mode 100644 .claude/skills/refresh-openmc-index/SKILL.md create mode 100644 .claude/tools/rag/chunker.py create mode 100644 .claude/tools/rag/embeddings.py create mode 100644 .claude/tools/rag/indexer.py create mode 100644 .claude/tools/rag/openmc_search.py create mode 100644 .claude/tools/repomap/generate_repomap.py create mode 100644 .claude/tools/requirements.txt create mode 100644 CLAUDE.md diff --git a/.claude/skills/enable-openmc-index/SKILL.md 
b/.claude/skills/enable-openmc-index/SKILL.md new file mode 100644 index 00000000000..8c115470264 --- /dev/null +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -0,0 +1,73 @@ +--- +name: enable-openmc-index +description: Enable the OpenMC codebase index for this session. Provides a structural repo map and semantic code search. Run this when investigating, modifying, or debugging OpenMC code. +allowed-tools: Bash(*), Read +--- + +# Enable OpenMC Index + +Set up (if needed) and activate the OpenMC codebase index for this session. This gives you: +1. A structural repo map showing the most important classes, functions, and their relationships +2. Semantic search across all source code, tests, and documentation + +## Step 1: Ensure the virtual environment exists + +Check if `.claude/cache/.venv/` exists. If not, create it and install dependencies: + +```bash +if [ ! -d .claude/cache/.venv ]; then + python3 -m venv .claude/cache/.venv + .claude/cache/.venv/bin/pip install -r .claude/tools/requirements.txt -q + echo "INSTALLED" +else + echo "VENV_EXISTS" +fi +``` + +## Step 2: Ensure the index exists + +Check if `.claude/cache/rag_index/` exists. If not, build it: + +```bash +if [ ! -d .claude/cache/rag_index ]; then + echo "Building index for the first time (this takes ~3 minutes)..." + .claude/cache/.venv/bin/python .claude/tools/repomap/generate_repomap.py + HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py + echo "INDEX_BUILT" +else + echo "INDEX_EXISTS" +fi +``` + +## Step 3: Load the repo map + +Read the file `.claude/cache/repomap.md` and internalize the codebase structure. 
+ +## Step 4: Confirm activation + +Tell the user the OpenMC index is active and briefly describe what's available: +- The repo map is loaded (structural overview) +- Semantic search is ready via: `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "query"` + +## Using semantic search after activation + +For the rest of this session, before modifying unfamiliar code or when investigating how a change might affect other parts of the codebase, search for related code: + +```bash +# Search source code +.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "your query here" + +# Search documentation +.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "your query" --docs + +# Search both code and docs +.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "your query" --all + +# Find code related to a specific file +.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --related src/somefile.cpp +``` + +## Subagent guidance + +When spawning subagents that will investigate or modify OpenMC code, include in their prompt: +"The OpenMC search index is available. Read .claude/cache/repomap.md for a structural overview. Use `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py 'query'` for semantic search." diff --git a/.claude/skills/refresh-openmc-index/SKILL.md b/.claude/skills/refresh-openmc-index/SKILL.md new file mode 100644 index 00000000000..55b922e3ac9 --- /dev/null +++ b/.claude/skills/refresh-openmc-index/SKILL.md @@ -0,0 +1,36 @@ +--- +name: refresh-openmc-index +description: Rebuild the OpenMC codebase index. Use after pulling new code, switching branches, or when the index feels stale. +allowed-tools: Bash(*), Read +--- + +# Refresh OpenMC Index + +Rebuild the repo map and RAG vector index from scratch. + +## Step 1: Ensure venv exists + +```bash +if [ ! 
-d .claude/cache/.venv ]; then + python3 -m venv .claude/cache/.venv + .claude/cache/.venv/bin/pip install -r .claude/tools/requirements.txt -q +fi +``` + +## Step 2: Rebuild everything + +```bash +echo "Rebuilding repo map..." +.claude/cache/.venv/bin/python .claude/tools/repomap/generate_repomap.py + +echo "Rebuilding RAG index (this takes ~3 minutes)..." +HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py +``` + +## Step 3: Reload the repo map + +Read the updated `.claude/cache/repomap.md` and internalize the new codebase structure. + +## Step 4: Confirm + +Tell the user the index has been refreshed and is ready to use. diff --git a/.claude/tools/rag/chunker.py b/.claude/tools/rag/chunker.py new file mode 100644 index 00000000000..6717f5863ad --- /dev/null +++ b/.claude/tools/rag/chunker.py @@ -0,0 +1,241 @@ +"""Chunk OpenMC source files and documentation for RAG indexing. + +Code files are chunked at the function/class level using tree-sitter. +RST documentation is chunked by section headers. 
+""" + +import re +from pathlib import Path + +import tree_sitter_cpp as tscpp +import tree_sitter_python as tspy +from tree_sitter import Language, Parser + +CPP_LANG = Language(tscpp.language()) +PY_LANG = Language(tspy.language()) + +cpp_parser = Parser(CPP_LANG) +py_parser = Parser(PY_LANG) + +MAX_CHUNK_CHARS = 1500 +MIN_CHUNK_CHARS = 50 + + +def chunk_file(filepath, openmc_root): + """Chunk a single file based on its extension.""" + filepath = Path(filepath) + rel = str(filepath.relative_to(openmc_root)) + try: + content = filepath.read_text(errors="replace") + except Exception: + return [] + + if filepath.suffix in (".cpp", ".h"): + return _chunk_cpp(rel, content) + elif filepath.suffix == ".py": + return _chunk_python(rel, content) + elif filepath.suffix == ".rst": + return _chunk_rst(rel, content) + return [] + + +def _chunk_cpp(rel_path, content): + """Extract function and class-level chunks from C++ code.""" + tree = cpp_parser.parse(content.encode()) + chunks = [] + used_ranges = [] + + def _extract_node(node, kind_override=None): + text = content[node.start_byte:node.end_byte] + if len(text) < MIN_CHUNK_CHARS: + return + # Extract symbol name + name = _get_node_name(node) + kind = kind_override or node.type + for sub in _split_if_large(text): + chunks.append({ + "text": sub, + "filepath": rel_path, + "kind": kind, + "symbol": name or "", + "start_line": node.start_point[0] + 1, + "end_line": node.end_point[0] + 1, + }) + used_ranges.append((node.start_byte, node.end_byte)) + + def _visit(node): + if node.type in ( + "function_definition", "class_specifier", + "struct_specifier", "enum_specifier", + ): + _extract_node(node) + elif node.type == "namespace_definition": + # Visit children inside namespaces + for child in node.children: + _visit(child) + elif node.type == "declaration_list": + for child in node.children: + _visit(child) + else: + for child in node.children: + if child.type in ( + "function_definition", "class_specifier", + 
"struct_specifier", "namespace_definition", + ): + _visit(child) + + for child in tree.root_node.children: + _visit(child) + + # Add file header (includes, forward declarations) as a separate chunk + header_lines = [] + for line in content.split("\n")[:50]: + if line.strip().startswith("#include") or line.strip().startswith("namespace") \ + or line.strip().startswith("//") or line.strip().startswith("using") \ + or line.strip() == "": + header_lines.append(line) + else: + break + header = "\n".join(header_lines).strip() + if len(header) >= MIN_CHUNK_CHARS: + chunks.append({ + "text": header, + "filepath": rel_path, + "kind": "file_header", + "symbol": Path(rel_path).name, + "start_line": 1, + "end_line": len(header_lines), + }) + + return chunks + + +def _chunk_python(rel_path, content): + """Extract function and class-level chunks from Python code.""" + tree = py_parser.parse(content.encode()) + chunks = [] + + for node in tree.root_node.children: + if node.type in ("class_definition", "function_definition"): + text = content[node.start_byte:node.end_byte] + if len(text) < MIN_CHUNK_CHARS: + continue + name_node = node.child_by_field_name("name") + name = name_node.text.decode() if name_node else "" + for sub in _split_if_large(text): + chunks.append({ + "text": sub, + "filepath": rel_path, + "kind": node.type.replace("_definition", ""), + "symbol": name, + "start_line": node.start_point[0] + 1, + "end_line": node.end_point[0] + 1, + }) + + # Module-level docstring + imports as header + header_lines = [] + for line in content.split("\n")[:40]: + stripped = line.strip() + if stripped.startswith(("import ", "from ", "#", '"""', "'''", "")) \ + or stripped == "": + header_lines.append(line) + elif stripped.startswith(("def ", "class ")): + break + else: + header_lines.append(line) + header = "\n".join(header_lines).strip() + if len(header) >= MIN_CHUNK_CHARS: + chunks.append({ + "text": header, + "filepath": rel_path, + "kind": "file_header", + "symbol": 
Path(rel_path).name, + "start_line": 1, + "end_line": len(header_lines), + }) + + return chunks + + +def _chunk_rst(rel_path, content): + """Chunk RST documentation by section headers.""" + # RST sections are indicated by underlines of =, -, ~, ^, etc. + section_pattern = re.compile( + r'^(.+)\n([=\-~^"+]+)\s*$', re.MULTILINE + ) + chunks = [] + + # Find all section positions + positions = [0] + for m in section_pattern.finditer(content): + # The section title starts at the beginning of the title line + positions.append(m.start()) + positions.append(len(content)) + + for i in range(len(positions) - 1): + section = content[positions[i]:positions[i + 1]].strip() + if len(section) < MIN_CHUNK_CHARS: + continue + # Extract title + title_match = section_pattern.match(section) + title = title_match.group(1).strip() if title_match else "" + start_line = content[:positions[i]].count("\n") + 1 + end_line = content[:positions[i + 1]].count("\n") + 1 + for sub in _split_if_large(section): + chunks.append({ + "text": sub, + "filepath": rel_path, + "kind": "doc_section", + "symbol": title, + "start_line": start_line, + "end_line": end_line, + }) + + return chunks + + +def _get_node_name(node): + """Extract the name from a tree-sitter node.""" + name_node = node.child_by_field_name("name") + if name_node: + return name_node.text.decode() + # For function_definition, check declarator + decl = node.child_by_field_name("declarator") + if decl: + # Walk down to find the identifier + while decl.type not in ("identifier", "qualified_identifier", + "field_identifier", "destructor_name"): + found = False + for child in decl.children: + if child.type in ("function_declarator", "identifier", + "qualified_identifier", "field_identifier", + "destructor_name", "template_function"): + decl = child + found = True + break + if not found: + break + return decl.text.decode() + return "" + + +def _split_if_large(text, max_chars=MAX_CHUNK_CHARS): + """Split text into chunks if it exceeds 
max_chars.""" + if len(text) <= max_chars: + return [text] + # Split on line boundaries + lines = text.split("\n") + chunks = [] + current = [] + current_len = 0 + for line in lines: + if current_len + len(line) + 1 > max_chars and current: + chunks.append("\n".join(current)) + current = [line] + current_len = len(line) + else: + current.append(line) + current_len += len(line) + 1 + if current: + chunks.append("\n".join(current)) + return chunks diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py new file mode 100644 index 00000000000..1f937a19f94 --- /dev/null +++ b/.claude/tools/rag/embeddings.py @@ -0,0 +1,101 @@ +"""Embedding provider with auto-detection fallback chain. + +1. sentence-transformers (all-MiniLM-L6-v2) - good quality, ~80MB model +2. TF-IDF + SVD - zero downloads, decent for code identifiers +""" + +import os +import sys +from abc import ABC, abstractmethod + +# Suppress noisy HuggingFace warnings about authentication +os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + + +class EmbeddingProvider(ABC): + """Abstract base for embedding providers.""" + + dim: int = 0 + + @abstractmethod + def embed(self, texts: list[str]) -> list[list[float]]: + """Embed a list of texts into vectors.""" + ... + + @abstractmethod + def embed_query(self, text: str) -> list[float]: + """Embed a single query text.""" + ... + + @staticmethod + def create(corpus_texts: list[str] | None = None) -> "EmbeddingProvider": + """Auto-detect best available embedding backend. + + Args: + corpus_texts: For TF-IDF fallback, the full corpus to fit on. + Not needed for sentence-transformers. + """ + # Try sentence-transformers first + try: + return SentenceTransformerProvider() + except (ImportError, Exception) as e: + print(f" sentence-transformers unavailable: {e}", file=sys.stderr) + + # Fall back to TF-IDF + if corpus_texts is None: + raise RuntimeError( + "No embedding provider available. 
Install sentence-transformers " + "or provide corpus_texts for TF-IDF fallback." + ) + print(" Using TF-IDF fallback embeddings", file=sys.stderr) + return TfidfProvider(corpus_texts) + + +class SentenceTransformerProvider(EmbeddingProvider): + """sentence-transformers with all-MiniLM-L6-v2.""" + + def __init__(self, model_name: str = "all-MiniLM-L6-v2"): + from sentence_transformers import SentenceTransformer + self.model = SentenceTransformer(model_name) + self.dim = self.model.get_sentence_embedding_dimension() + + def embed(self, texts: list[str]) -> list[list[float]]: + embeddings = self.model.encode(texts, show_progress_bar=True, + batch_size=64) + return embeddings.tolist() + + def embed_query(self, text: str) -> list[float]: + return self.model.encode([text])[0].tolist() + + +class TfidfProvider(EmbeddingProvider): + """TF-IDF vectors projected to dense via SVD. No model download needed.""" + + def __init__(self, corpus_texts: list[str], dim: int = 256): + from sklearn.decomposition import TruncatedSVD + from sklearn.feature_extraction.text import TfidfVectorizer + + self.dim = dim + self.vectorizer = TfidfVectorizer( + max_features=10000, + sublinear_tf=True, + token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z0-9_]{2,}\b", # Code identifiers + ) + tfidf_matrix = self.vectorizer.fit_transform(corpus_texts) + + # Project to dense using SVD + actual_dim = min(dim, tfidf_matrix.shape[1] - 1, tfidf_matrix.shape[0] - 1) + self.svd = TruncatedSVD(n_components=actual_dim) + self.svd.fit(tfidf_matrix) + self.dim = actual_dim + + def embed(self, texts: list[str]) -> list[list[float]]: + tfidf = self.vectorizer.transform(texts) + dense = self.svd.transform(tfidf) + return dense.tolist() + + def embed_query(self, text: str) -> list[float]: + tfidf = self.vectorizer.transform([text]) + dense = self.svd.transform(tfidf) + return dense[0].tolist() diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py new file mode 100644 index 00000000000..8922b77d65d --- 
#!/usr/bin/env python3
"""Build the RAG vector index for OpenMC source code and documentation.

Chunks all C++, Python, and RST files, embeds them, and stores in LanceDB.

Output: .claude/cache/rag_index/ (LanceDB directory)
"""

import sys
import time
from pathlib import Path

# Add tools dir to path for imports
TOOLS_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(TOOLS_DIR / "rag"))

import lancedb
import pyarrow as pa  # noqa: F401 -- lancedb needs pyarrow at runtime

from chunker import chunk_file
from embeddings import EmbeddingProvider

OPENMC_ROOT = Path(__file__).resolve().parents[3]
CACHE_DIR = OPENMC_ROOT / ".claude" / "cache"
INDEX_DIR = CACHE_DIR / "rag_index"

CODE_PATTERNS = [
    "src/**/*.cpp",
    "include/openmc/**/*.h",
    "openmc/**/*.py",
    "tests/**/*.py",
    "examples/**/*.py",
]

DOC_PATTERNS = [
    "docs/**/*.rst",
]


def collect_chunks(patterns, openmc_root):
    """Collect all chunks from files matching the given glob patterns."""
    chunks = []
    for pattern in patterns:
        for filepath in sorted(openmc_root.glob(pattern)):
            if "__pycache__" in str(filepath):
                continue
            chunks.extend(chunk_file(filepath, openmc_root))
    return chunks


def build_index():
    """Build or rebuild the complete vector index."""
    start = time.time()

    # Collect all chunks
    print("Collecting code chunks...")
    code_chunks = collect_chunks(CODE_PATTERNS, OPENMC_ROOT)
    print(f" {len(code_chunks)} code chunks")

    print("Collecting doc chunks...")
    doc_chunks = collect_chunks(DOC_PATTERNS, OPENMC_ROOT)
    print(f" {len(doc_chunks)} doc chunks")

    all_chunks = code_chunks + doc_chunks
    if not all_chunks:
        print("ERROR: No chunks collected!", file=sys.stderr)
        sys.exit(1)

    # Create embeddings
    all_texts = [c["text"] for c in all_chunks]
    print("Creating embedding provider...")
    embedder = EmbeddingProvider.create(corpus_texts=all_texts)
    print(f" Using {embedder.__class__.__name__} (dim={embedder.dim})")

    print("Embedding chunks...")
    all_embeddings = embedder.embed(all_texts)

    # Build LanceDB tables
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(INDEX_DIR))

    # all_chunks is code_chunks followed by doc_chunks, so a single index
    # threshold separates the two tables.  (This replaces a dead duplicate
    # loop that used `chunk in code_chunks` -- an O(n*m) deep-equality scan
    # whose results were immediately discarded and recomputed.)
    n_code = len(code_chunks)
    code_records = []
    doc_records = []
    for i, (chunk, emb) in enumerate(zip(all_chunks, all_embeddings)):
        record = {
            "text": chunk["text"],
            "filepath": chunk["filepath"],
            "kind": chunk["kind"],
            "symbol": chunk.get("symbol", ""),
            "start_line": chunk.get("start_line", 0),
            "end_line": chunk.get("end_line", 0),
            "vector": emb,
        }
        if i < n_code:
            code_records.append(record)
        else:
            doc_records.append(record)

    # Create tables (drop existing)
    # NOTE(review): lancedb connections commonly expose table_names();
    # confirm list_tables().tables exists in the pinned lancedb version.
    existing = db.list_tables().tables
    for table_name in ("code", "docs"):
        if table_name in existing:
            db.drop_table(table_name)

    if code_records:
        db.create_table("code", code_records)
        print(f" Created 'code' table: {len(code_records)} rows")

    if doc_records:
        db.create_table("docs", doc_records)
        print(f" Created 'docs' table: {len(doc_records)} rows")

    elapsed = time.time() - start
    print(f"Done in {elapsed:.1f}s")


if __name__ == "__main__":
    build_index()
+#!/usr/bin/env python3 +"""Semantic search across the OpenMC codebase and documentation. + +Usage: + openmc_search.py "query" # Search code (default) + openmc_search.py "query" --docs # Search documentation + openmc_search.py "query" --all # Search both code and docs + openmc_search.py --related src/particle.cpp # Find related code + openmc_search.py "query" --top-k 20 # Return more results + +Examples: + openmc_search.py "particle random number seed initialization" + openmc_search.py "how to define tallies" --docs + openmc_search.py --related src/simulation.cpp +""" + +import argparse +import sys +from pathlib import Path + +# Add tools dir to path +TOOLS_DIR = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(TOOLS_DIR / "rag")) + +OPENMC_ROOT = Path(__file__).resolve().parents[3] +CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" +INDEX_DIR = CACHE_DIR / "rag_index" + + +def get_db_and_embedder(): + """Load the LanceDB database and embedding provider.""" + import lancedb + from embeddings import EmbeddingProvider + + if not INDEX_DIR.exists(): + print("ERROR: No index found. Run /enable-openmc-index first.", + file=sys.stderr) + sys.exit(1) + + db = lancedb.connect(str(INDEX_DIR)) + + # For query embedding, we need the same type of embedder used for indexing. + # Try sentence-transformers first (matches indexing default). 
+ embedder = EmbeddingProvider.create() + return db, embedder + + +def search_table(db, embedder, table_name, query, top_k): + """Search a LanceDB table with a text query.""" + if table_name not in db.list_tables().tables: + print(f"Table '{table_name}' not found in index.", file=sys.stderr) + return [] + + table = db.open_table(table_name) + query_vec = embedder.embed_query(query) + results = table.search(query_vec).limit(top_k).to_list() + return results + + +def format_results(results, label=""): + """Format search results for display.""" + if not results: + return "No results found.\n" + + output = [] + if label: + output.append(f"=== {label} ===\n") + + for i, r in enumerate(results, 1): + filepath = r["filepath"] + start = r["start_line"] + end = r["end_line"] + kind = r["kind"] + symbol = r["symbol"] + dist = r.get("_distance", 0) + + header = f"[{i}] {filepath}:{start}-{end} ({kind}" + if symbol: + header += f": {symbol}" + header += f", dist={dist:.3f})" + output.append(header) + + # Show text preview (first 500 chars) + text = r["text"][:500] + if len(r["text"]) > 500: + text += "\n ..." 
+ # Indent the text + for line in text.split("\n"): + output.append(f" {line}") + output.append("") + + return "\n".join(output) + + +def search_related(db, embedder, filepath, top_k): + """Find code related to a given file.""" + if "code" not in db.list_tables().tables: + print("No 'code' table in index.", file=sys.stderr) + return [] + + table = db.open_table("code") + + # Normalize filepath + fp = filepath + if filepath.startswith("/"): + try: + fp = str(Path(filepath).relative_to(OPENMC_ROOT)) + except ValueError: + pass + + # Get chunks from target file + try: + target_chunks = table.search().where( + f"filepath = '{fp}'" + ).limit(50).to_list() + except Exception: + # LanceDB where clause might not work in all versions + # Fall back to fetching all and filtering + all_data = table.to_pandas() + target_rows = all_data[all_data["filepath"] == fp] + if target_rows.empty: + print(f"No chunks found for '{fp}'", file=sys.stderr) + return [] + target_chunks = target_rows.head(50).to_dict("records") + + if not target_chunks: + print(f"No chunks found for '{fp}'", file=sys.stderr) + return [] + + # Combine top chunks as the query + combined_text = " ".join(c["text"][:200] for c in target_chunks[:5]) + query_vec = embedder.embed_query(combined_text) + + # Search excluding the source file + results = table.search(query_vec).limit(top_k + 10).to_list() + # Filter out same file + results = [r for r in results if r["filepath"] != fp][:top_k] + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Semantic search across OpenMC codebase and docs" + ) + parser.add_argument("query", nargs="?", help="Search query") + parser.add_argument("--docs", action="store_true", + help="Search documentation instead of code") + parser.add_argument("--all", action="store_true", + help="Search both code and documentation") + parser.add_argument("--related", metavar="FILE", + help="Find code related to a given file") + parser.add_argument("--top-k", type=int, 
default=10, + help="Number of results (default: 10)") + args = parser.parse_args() + + if not args.query and not args.related: + parser.print_help() + sys.exit(1) + + db, embedder = get_db_and_embedder() + + if args.related: + results = search_related(db, embedder, args.related, args.top_k) + print(format_results(results, f"Code related to {args.related}")) + elif args.all: + code_results = search_table(db, embedder, "code", args.query, args.top_k) + doc_results = search_table(db, embedder, "docs", args.query, args.top_k) + print(format_results(code_results, "Code")) + print(format_results(doc_results, "Documentation")) + elif args.docs: + results = search_table(db, embedder, "docs", args.query, args.top_k) + print(format_results(results, "Documentation")) + else: + results = search_table(db, embedder, "code", args.query, args.top_k) + print(format_results(results, "Code")) + + +if __name__ == "__main__": + main() diff --git a/.claude/tools/repomap/generate_repomap.py b/.claude/tools/repomap/generate_repomap.py new file mode 100644 index 00000000000..4ee3ef28036 --- /dev/null +++ b/.claude/tools/repomap/generate_repomap.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +"""Generate a structural repo map of the OpenMC codebase. + +Uses tree-sitter to parse C++ and Python files, extracts class/function +signatures, builds a cross-file reference graph, applies PageRank-like +ranking, and outputs a concise markdown map grouped by subsystem. 
+ +Output: + .claude/cache/repomap.md - Concise map (~170 lines) for agent context + .claude/cache/repomap_full.json - Full symbol data for other tools +""" + +import json +import os +import sys +from collections import defaultdict +from pathlib import Path + +import tree_sitter_cpp as tscpp +import tree_sitter_python as tspy +from tree_sitter import Language, Parser + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +OPENMC_ROOT = Path(__file__).resolve().parents[3] # .claude/tools/repomap -> repo root +CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" + +CPP_PATTERNS = ["src/**/*.cpp", "include/openmc/**/*.h"] +PY_PATTERNS = ["openmc/**/*.py"] + +# Map file path keywords to logical subsystems +SUBSYSTEM_RULES = [ + # (path substring, subsystem name) + ("random_ray", "Random Ray Solver"), + ("tallies/", "Tallies & Filters"), + ("deplete", "Depletion"), + ("mgxs", "Multi-Group Cross Sections"), + ("data/", "Nuclear Data"), + ("lattice", "Geometry"), + ("universe", "Geometry"), + ("surface", "Geometry"), + ("cell", "Geometry"), + ("geometry", "Geometry"), + ("dagmc", "Geometry (DAGMC)"), + ("mesh", "Mesh"), + ("material", "Materials"), + ("nuclide", "Nuclear Data"), + ("cross_section", "Nuclear Data"), + ("thermal", "Nuclear Data"), + ("wmp", "Nuclear Data"), + ("particle", "Particle Transport"), + ("physics", "Particle Transport"), + ("collision", "Particle Transport"), + ("photon", "Particle Transport"), + ("bremsstrahlung", "Particle Transport"), + ("secondary_", "Particle Transport"), + ("reaction", "Particle Transport"), + ("source", "Sources & Distributions"), + ("distribution", "Sources & Distributions"), + ("eigenvalue", "Eigenvalue Solver"), + ("cmfd", "CMFD Acceleration"), + ("weight_window", "Variance Reduction"), + ("tally", "Tallies & Filters"), + ("filter", "Tallies & Filters"), + ("trigger", "Tallies & Filters"), + ("plot", 
"Plotting"), + ("track", "Plotting"), + ("volume_calc", "Volume Calculation"), + ("random_lcg", "Random Number Generation"), + ("random_dist", "Random Number Generation"), + ("settings", "Settings & Configuration"), + ("simulation", "Simulation Control"), + ("initialize", "Simulation Control"), + ("finalize", "Simulation Control"), + ("state_point", "I/O & Serialization"), + ("summary", "I/O & Serialization"), + ("hdf5", "I/O & Serialization"), + ("xml_interface", "I/O & Serialization"), + ("output", "I/O & Serialization"), + ("bank", "Particle Banking"), + ("event", "Event-Based Transport"), + ("error", "Utilities"), + ("string_utils", "Utilities"), + ("math_functions", "Utilities"), + ("file_utils", "Utilities"), + ("timer", "Utilities"), + ("memory", "Utilities"), + ("position", "Utilities"), + ("model", "Model Builder (Python)"), + ("stats", "Statistics (Python)"), + ("lib/", "C API Bindings (Python)"), +] + +MAX_REPOMAP_LINES = 160 + +# --------------------------------------------------------------------------- +# Tree-sitter setup +# --------------------------------------------------------------------------- + +CPP_LANG = Language(tscpp.language()) +PY_LANG = Language(tspy.language()) + + +def make_parser(lang): + p = Parser(lang) + return p + + +cpp_parser = make_parser(CPP_LANG) +py_parser = make_parser(PY_LANG) + +# --------------------------------------------------------------------------- +# Symbol extraction +# --------------------------------------------------------------------------- + + +def extract_cpp_symbols(filepath, content): + """Extract class/struct/function definitions from C++ code.""" + symbols = [] + tree = cpp_parser.parse(content.encode()) + + def visit(node, namespace=""): + if node.type == "namespace_definition": + ns_name = "" + for child in node.children: + if child.type == "namespace_identifier": + ns_name = child.text.decode() + break + body = None + for child in node.children: + if child.type == "declaration_list": + body = child 
+ break + if body: + prefix = f"{namespace}{ns_name}::" if ns_name else namespace + for child in body.children: + visit(child, prefix) + return + + if node.type in ("class_specifier", "struct_specifier"): + name_node = node.child_by_field_name("name") + if name_node: + name = name_node.text.decode() + full_name = f"{namespace}{name}" + # Extract method signatures + methods = [] + body = node.child_by_field_name("body") + if body: + for child in body.children: + if child.type == "function_definition": + sig = _cpp_func_signature(child) + if sig: + methods.append(sig) + elif child.type == "declaration": + # Could be a method declaration + sig = _cpp_decl_signature(child) + if sig: + methods.append(sig) + kind = "class" if node.type == "class_specifier" else "struct" + symbols.append({ + "name": full_name, + "kind": kind, + "signature": f"{kind} {full_name}", + "methods": methods[:10], # Cap to avoid bloat + "file": str(filepath), + "line": node.start_point[0] + 1, + }) + + elif node.type == "function_definition": + sig = _cpp_func_signature(node) + if sig: + symbols.append({ + "name": sig.split("(")[0].split()[-1] if "(" in sig else sig, + "kind": "function", + "signature": sig, + "methods": [], + "file": str(filepath), + "line": node.start_point[0] + 1, + }) + + # Visit children for top-level traversal + for child in node.children: + visit(child, namespace) + + for child in tree.root_node.children: + visit(child) + + return symbols + + +def _cpp_func_signature(node): + """Extract a concise function signature from a function_definition node.""" + declarator = node.child_by_field_name("declarator") + if not declarator: + return None + # Get return type + ret_type = "" + for child in node.children: + if child == declarator: + break + if child.type not in ("comment", "attribute_declaration"): + ret_type += child.text.decode() + " " + ret_type = ret_type.strip() + decl_text = declarator.text.decode() + # Truncate long signatures + sig = f"{ret_type} {decl_text}".strip() 
+ if len(sig) > 120: + sig = sig[:117] + "..." + return sig + + +def _cpp_decl_signature(node): + """Extract signature from a declaration that might be a method decl.""" + text = node.text.decode().strip() + if "(" in text and ";" in text: + sig = text.rstrip(";").strip() + if len(sig) > 120: + sig = sig[:117] + "..." + return sig + return None + + +def extract_py_symbols(filepath, content): + """Extract class/function definitions from Python code.""" + symbols = [] + tree = py_parser.parse(content.encode()) + + for node in tree.root_node.children: + if node.type == "class_definition": + name_node = node.child_by_field_name("name") + if not name_node: + continue + name = name_node.text.decode() + # Get superclasses + superclass = "" + for child in node.children: + if child.type == "argument_list": + superclass = child.text.decode() + break + # Get method names + methods = [] + body = node.child_by_field_name("body") + if body: + for child in body.children: + if child.type == "function_definition": + mname = child.child_by_field_name("name") + if mname: + mtext = mname.text.decode() + if not mtext.startswith("_") or mtext in ( + "__init__", "__repr__", "__iter__" + ): + params = child.child_by_field_name("parameters") + psig = params.text.decode() if params else "()" + methods.append(f"{mtext}{psig}") + + sig = f"class {name}{superclass}" if superclass else f"class {name}" + symbols.append({ + "name": name, + "kind": "class", + "signature": sig, + "methods": methods[:10], + "file": str(filepath), + "line": node.start_point[0] + 1, + }) + + elif node.type == "function_definition": + name_node = node.child_by_field_name("name") + if not name_node: + continue + name = name_node.text.decode() + if name.startswith("_") and name != "__init__": + continue + params = node.child_by_field_name("parameters") + psig = params.text.decode() if params else "()" + sig = f"def {name}{psig}" + if len(sig) > 120: + sig = sig[:117] + "..." 
+ symbols.append({ + "name": name, + "kind": "function", + "signature": sig, + "methods": [], + "file": str(filepath), + "line": node.start_point[0] + 1, + }) + + return symbols + + +# --------------------------------------------------------------------------- +# Reference graph and ranking +# --------------------------------------------------------------------------- + + +def build_reference_graph(all_symbols, file_contents): + """Build a graph of cross-file symbol references. + + Returns a dict: symbol_name -> number of other files that reference it. + Only counts classes, structs, and non-trivial functions. + """ + # Filter out trivial/common names that would create noise + TRIVIAL_NAMES = { + "name", "type", "end", "begin", "size", "empty", "get", "set", + "data", "value", "index", "clear", "push", "pop", "front", "back", + "format", "write", "read", "to_string", "operator", "iterator", + "const_iterator", "surface", "run", "reset", "init", + } + + # Collect all symbol names and their source files + symbol_files = {} # name -> set of files where defined + for sym in all_symbols: + name = sym["name"] + # Skip trivial names and very short names + if name.lower() in TRIVIAL_NAMES or len(name) < 4: + continue + # Skip pure accessor patterns + if sym["kind"] == "function" and sym["signature"]: + sig_lower = sym["signature"].lower() + if any(p in sig_lower for p in [ + "const {", "() const", "& name()", "type() const override" + ]): + # Only skip if it's a simple accessor (short signature) + if len(sym["signature"]) < 60 and "(" in sig_lower: + parts = sig_lower.split("(")[0].split() + if parts and parts[-1] in TRIVIAL_NAMES: + continue + symbol_files.setdefault(name, set()).add(sym["file"]) + + # Count how many OTHER files reference each symbol + ref_counts = defaultdict(int) + for filepath, content in file_contents.items(): + for sym_name, def_files in symbol_files.items(): + if filepath not in def_files and sym_name in content: + ref_counts[sym_name] += 1 + + 
return ref_counts + + +def rank_symbols(all_symbols, ref_counts): + """Rank symbols by cross-file reference count (simplified PageRank). + + Boost classes/structs since they represent key abstractions. + """ + for sym in all_symbols: + base_score = ref_counts.get(sym["name"], 0) + # Boost classes/structs - they're more informative than individual functions + if sym["kind"] in ("class", "struct"): + base_score = int(base_score * 1.5) + 2 + sym["score"] = base_score + return sorted(all_symbols, key=lambda s: (-s["score"], s["name"])) + + +# --------------------------------------------------------------------------- +# Subsystem categorization +# --------------------------------------------------------------------------- + + +def categorize_file(filepath): + """Map a file path to a logical subsystem.""" + rel = str(filepath).replace("\\", "/").lower() + for keyword, subsystem in SUBSYSTEM_RULES: + if keyword in rel: + return subsystem + return "Other" + + +# --------------------------------------------------------------------------- +# Output generation +# --------------------------------------------------------------------------- + + +def generate_repomap_md(ranked_symbols, max_lines=MAX_REPOMAP_LINES): + """Generate concise markdown repo map.""" + # Deduplicate: keep highest-scored version of each name per subsystem + seen = set() + deduped = [] + for sym in ranked_symbols: + subsystem = categorize_file(sym["file"]) + key = (subsystem, sym["name"]) + if key not in seen: + seen.add(key) + deduped.append(sym) + + # Group by subsystem + groups = defaultdict(list) + for sym in deduped: + subsystem = categorize_file(sym["file"]) + if subsystem == "Other" and sym["score"] < 3: + continue # Skip low-value "Other" symbols + groups[subsystem].append(sym) + + # Sort groups by max score in group, drop "Other" to the end + sorted_groups = sorted( + groups.items(), + key=lambda g: (g[0] == "Other", -max(s["score"] for s in g[1]) if g[1] else 0), + ) + + lines = [ + "# OpenMC 
Repo Map", + "", + "Auto-generated structural overview. Top symbols ranked by cross-file usage.", + "", + ] + + for group_name, syms in sorted_groups: + if len(lines) >= max_lines - 2: + break + + lines.append(f"## {group_name}") + + # Show top symbols in this group + shown = 0 + for sym in syms: + if shown >= 5 or len(lines) >= max_lines - 1: + break + rel_file = os.path.relpath(sym["file"], OPENMC_ROOT) + # Flatten signature to single line + sig_flat = " ".join(sym["signature"].split()) + if len(sig_flat) > 80: + sig_flat = sig_flat[:77] + "..." + lines.append(f"- `{sig_flat}` ({rel_file}:{sym['line']})") + + # Show key methods for classes (max 2, single line each) + if sym["kind"] in ("class", "struct") and sym["methods"]: + for method in sym["methods"][:2]: + # Flatten to single line + method_flat = " ".join(method.split()) + if len(method_flat) > 70: + method_flat = method_flat[:67] + "..." + lines.append(f" - `{method_flat}`") + if len(lines) >= max_lines - 1: + break + + shown += 1 + + lines.append("") + + return "\n".join(lines[:max_lines]) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + print(f"OpenMC root: {OPENMC_ROOT}") + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + # Collect all source files + file_contents = {} + cpp_files = [] + py_files = [] + + for pattern in CPP_PATTERNS: + for fp in sorted(OPENMC_ROOT.glob(pattern)): + try: + content = fp.read_text(errors="replace") + rel = str(fp.relative_to(OPENMC_ROOT)) + file_contents[rel] = content + cpp_files.append((rel, content)) + except Exception as e: + print(f" Warning: could not read {fp}: {e}", file=sys.stderr) + + for pattern in PY_PATTERNS: + for fp in sorted(OPENMC_ROOT.glob(pattern)): + if "__pycache__" in str(fp): + continue + try: + content = fp.read_text(errors="replace") + rel = str(fp.relative_to(OPENMC_ROOT)) + file_contents[rel] = content + 
py_files.append((rel, content)) + except Exception as e: + print(f" Warning: could not read {fp}: {e}", file=sys.stderr) + + print(f"Found {len(cpp_files)} C++/H files, {len(py_files)} Python files") + + # Extract symbols + all_symbols = [] + for rel, content in cpp_files: + syms = extract_cpp_symbols(rel, content) + all_symbols.extend(syms) + + for rel, content in py_files: + syms = extract_py_symbols(rel, content) + all_symbols.extend(syms) + + print(f"Extracted {len(all_symbols)} symbols") + + # Build reference graph and rank + ref_counts = build_reference_graph(all_symbols, file_contents) + ranked = rank_symbols(all_symbols, ref_counts) + + # Generate outputs + repomap_md = generate_repomap_md(ranked) + repomap_path = CACHE_DIR / "repomap.md" + repomap_path.write_text(repomap_md) + print(f"Wrote {repomap_path} ({len(repomap_md.splitlines())} lines)") + + # Full JSON for other tools + json_path = CACHE_DIR / "repomap_full.json" + json_data = [] + for sym in ranked: + json_data.append({ + "name": sym["name"], + "kind": sym["kind"], + "signature": sym["signature"], + "methods": sym["methods"], + "file": sym["file"], + "line": sym["line"], + "score": sym["score"], + }) + json_path.write_text(json.dumps(json_data, indent=2)) + print(f"Wrote {json_path} ({len(json_data)} symbols)") + + +if __name__ == "__main__": + main() diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt new file mode 100644 index 00000000000..b25dd1f1b08 --- /dev/null +++ b/.claude/tools/requirements.txt @@ -0,0 +1,17 @@ +# Tree-sitter for code parsing (repo map + chunking) +tree-sitter>=0.23.0 +tree-sitter-python>=0.23.0 +tree-sitter-cpp>=0.23.0 + +# Vector database +lancedb>=0.15.0 +pyarrow>=14.0.0 + +# Embeddings (primary - local, no API key) +sentence-transformers>=2.7.0 + +# Embeddings (fallback - zero downloads) +scikit-learn>=1.4.0 + +# Utilities +numpy>=1.26.0 diff --git a/.gitignore b/.gitignore index 780059f3072..dd8dfb14a96 100644 --- a/.gitignore +++ 
b/.gitignore @@ -104,5 +104,8 @@ CMakeSettings.json # Visual Studio Code configuration files .vscode/ +# Claude Code agent tools (cached/generated artifacts) +.claude/cache/ + # Python pickle files *.pkl diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000000..b5f95d6d47c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,38 @@ +# OpenMC Development Environment + +Read `agent_build_and_testing_workflow.md` in this directory before starting work. +It contains the full build/test workflow, script usage, and conventions. + +## Key Commands +- `./build.sh --incremental -q` — rebuild after C++ changes (quiet, errors in `/tmp/openmc_build.txt`) +- `./check_tests.sh -q --smoke` — quick regression check (~10% of tests) +- `./check_tests.sh -q` — full regression check (details in `/tmp/openmc_regression.txt`) +- `./run_test.sh -q ` — run a single test (details in `/tmp/openmc_run_test.txt`) +- `./record_tests.sh` — re-record baseline after intentional changes + +## Workflow +1. Edit C++ or Python code +2. `./build.sh --incremental -q` — rebuild +3. `./check_tests.sh -q --smoke` — quick sanity check +4. `./check_tests.sh -q` — full regression check before committing + +## Remotes +- `origin` — fork (git@github.com:jtramm/openmc.git) — push here +- `upstream` — official (https://github.com/openmc-dev/openmc.git) — pull from here + +## OpenMC Codebase Index + +If the user asks you to investigate, modify, or debug OpenMC code, let them know +about the `/enable-openmc-index` skill which provides a structural repo map and +semantic code search across the entire codebase. Offer to run it for them. + +Do NOT use the index tools (`openmc_search.py`, `repomap.md`) unless +`/enable-openmc-index` has been run in the current session. + +To rebuild the index after pulling new code or switching branches, use +`/refresh-openmc-index`. + +## Additional OpenMC info + +Read the FULL `AGENTS.md` in this directory also before starting work. 
Info in the +`agent_build_and_testing_workflow.md` supercedes anything in AGENTS.md. From b26895623600fa3ebc98454bfae55423f66af45f Mon Sep 17 00:00:00 2001 From: John Tramm Date: Wed, 4 Mar 2026 22:56:09 +0000 Subject: [PATCH 02/67] edited claude.md for brevity --- CLAUDE.md | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index b5f95d6d47c..036eedfbf41 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,25 +1,3 @@ -# OpenMC Development Environment - -Read `agent_build_and_testing_workflow.md` in this directory before starting work. -It contains the full build/test workflow, script usage, and conventions. - -## Key Commands -- `./build.sh --incremental -q` — rebuild after C++ changes (quiet, errors in `/tmp/openmc_build.txt`) -- `./check_tests.sh -q --smoke` — quick regression check (~10% of tests) -- `./check_tests.sh -q` — full regression check (details in `/tmp/openmc_regression.txt`) -- `./run_test.sh -q ` — run a single test (details in `/tmp/openmc_run_test.txt`) -- `./record_tests.sh` — re-record baseline after intentional changes - -## Workflow -1. Edit C++ or Python code -2. `./build.sh --incremental -q` — rebuild -3. `./check_tests.sh -q --smoke` — quick sanity check -4. `./check_tests.sh -q` — full regression check before committing - -## Remotes -- `origin` — fork (git@github.com:jtramm/openmc.git) — push here -- `upstream` — official (https://github.com/openmc-dev/openmc.git) — pull from here - ## OpenMC Codebase Index If the user asks you to investigate, modify, or debug OpenMC code, let them know @@ -29,10 +7,9 @@ semantic code search across the entire codebase. Offer to run it for them. Do NOT use the index tools (`openmc_search.py`, `repomap.md`) unless `/enable-openmc-index` has been run in the current session. -To rebuild the index after pulling new code or switching branches, use -`/refresh-openmc-index`. +To rebuild the index, the user can use `/refresh-openmc-index`. 
You may +offer to run this skill for them if it seems necessary. ## Additional OpenMC info -Read the FULL `AGENTS.md` in this directory also before starting work. Info in the -`agent_build_and_testing_workflow.md` supercedes anything in AGENTS.md. +Read the FULL `AGENTS.md` in this directory also before starting work. From 7dca2abf9d6ebed159f81361191d198731a7a607 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Wed, 4 Mar 2026 23:02:18 +0000 Subject: [PATCH 03/67] Improve search tool discoverability via --help Add examples to openmc_search.py --help output. Update enable-openmc-index skill to have the agent run --help to learn the full API rather than duplicating usage docs. Subagent guidance also references --help. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 31 ++++++++++----------- .claude/tools/rag/openmc_search.py | 10 ++++++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 8c115470264..e0a3c3d2790 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -43,31 +43,28 @@ fi Read the file `.claude/cache/repomap.md` and internalize the codebase structure. 
-## Step 4: Confirm activation +## Step 4: Learn the search tool API -Tell the user the OpenMC index is active and briefly describe what's available: -- The repo map is loaded (structural overview) -- Semantic search is ready via: `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "query"` +Run `--help` to see the full search API: -## Using semantic search after activation +```bash +.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help +``` -For the rest of this session, before modifying unfamiliar code or when investigating how a change might affect other parts of the codebase, search for related code: +Read and internalize the output so you know all available options (--docs, --all, --related, --top-k, etc.). -```bash -# Search source code -.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "your query here" +## Step 5: Confirm activation -# Search documentation -.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "your query" --docs +Tell the user the OpenMC index is active and briefly describe what's available: +- The repo map is loaded (structural overview of the codebase) +- Semantic search is ready (explain the key modes: code search, doc search, related file search) -# Search both code and docs -.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "your query" --all +## Using semantic search after activation -# Find code related to a specific file -.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --related src/somefile.cpp -``` +For the rest of this session, before modifying unfamiliar code or when investigating how a change might affect other parts of the codebase, use `openmc_search.py` to find related code paths. ## Subagent guidance When spawning subagents that will investigate or modify OpenMC code, include in their prompt: -"The OpenMC search index is available. Read .claude/cache/repomap.md for a structural overview. 
Use `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py 'query'` for semantic search." + +"The OpenMC search index is available. Read `.claude/cache/repomap.md` for a structural overview of the codebase. For semantic search, first run `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help` to see the full API, then use it to search code, docs, or find related files." diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 9de73499e0b..1fe10d2818e 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -140,7 +140,15 @@ def search_related(db, embedder, filepath, top_k): def main(): parser = argparse.ArgumentParser( - description="Semantic search across OpenMC codebase and docs" + description="Semantic search across OpenMC codebase and docs", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""examples: + %(prog)s "particle random number seed initialization" + %(prog)s "how to define tallies" --docs + %(prog)s "weight window variance reduction" --all + %(prog)s "where is cross section data loaded" --top-k 15 + %(prog)s --related src/simulation.cpp + %(prog)s --related src/particle_restart.cpp --top-k 5""", ) parser.add_argument("query", nargs="?", help="Search query") parser.add_argument("--docs", action="store_true", From 009819a525407f94ddc68db922595bb382198efd Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 03:48:24 +0000 Subject: [PATCH 04/67] Replace custom repo map with aider-based structural mapping Remove the custom generate_repomap.py (which produced a flat list of function signatures) and replace with openmc_map.py, a thin wrapper around aider's RepoMap. This generates contextual, focused code structure maps using tree-sitter + PageRank, showing condensed class/function skeletons with elided bodies. 
The two tools now serve complementary purposes: - openmc_search.py: semantic RAG search ("find code related to X") - openmc_map.py: structural map ("show me the shape of these files") The map tool generates maps on the fly (no pre-built index needed), so only the RAG search index needs refreshing after code changes. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 54 +- .claude/skills/refresh-openmc-index/SKILL.md | 17 +- .claude/tools/repomap/generate_repomap.py | 521 ------------------- .claude/tools/repomap/openmc_map.py | 154 ++++++ .claude/tools/requirements.txt | 5 +- 5 files changed, 197 insertions(+), 554 deletions(-) delete mode 100644 .claude/tools/repomap/generate_repomap.py create mode 100644 .claude/tools/repomap/openmc_map.py diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index e0a3c3d2790..616d9f95b75 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -1,14 +1,14 @@ --- name: enable-openmc-index -description: Enable the OpenMC codebase index for this session. Provides a structural repo map and semantic code search. Run this when investigating, modifying, or debugging OpenMC code. +description: Enable the OpenMC codebase index for this session. Provides semantic code search and structural repo mapping. Run this when investigating, modifying, or debugging OpenMC code. allowed-tools: Bash(*), Read --- # Enable OpenMC Index -Set up (if needed) and activate the OpenMC codebase index for this session. This gives you: -1. A structural repo map showing the most important classes, functions, and their relationships -2. Semantic search across all source code, tests, and documentation +Set up (if needed) and activate the OpenMC codebase index for this session. This gives you two tools: +1. **Semantic search** (`openmc_search.py`) - Find related code across the codebase by concept +2. 
**Structural map** (`openmc_map.py`) - See condensed code structure of files and their neighbors ## Step 1: Ensure the virtual environment exists @@ -24,14 +24,13 @@ else fi ``` -## Step 2: Ensure the index exists +## Step 2: Ensure the RAG index exists Check if `.claude/cache/rag_index/` exists. If not, build it: ```bash if [ ! -d .claude/cache/rag_index ]; then - echo "Building index for the first time (this takes ~3 minutes)..." - .claude/cache/.venv/bin/python .claude/tools/repomap/generate_repomap.py + echo "Building RAG index for the first time (this takes ~3 minutes)..." HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py echo "INDEX_BUILT" else @@ -39,32 +38,47 @@ else fi ``` -## Step 3: Load the repo map +Note: The repo map tool (`openmc_map.py`) does NOT need a pre-built index - it generates maps on the fly using tree-sitter. Only the RAG search needs the vector index. -Read the file `.claude/cache/repomap.md` and internalize the codebase structure. +## Step 3: Learn the tool APIs -## Step 4: Learn the search tool API - -Run `--help` to see the full search API: +Run `--help` for both tools to see their full APIs: ```bash .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help +.claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help ``` -Read and internalize the output so you know all available options (--docs, --all, --related, --top-k, etc.). +Read and internalize the output so you know all available options. 
+ +## Step 4: Confirm activation + +Tell the user the OpenMC index is active and briefly describe the two tools: +- **Semantic search**: Find related code by concept (e.g., "particle seed initialization") +- **Structural map**: See condensed code structure around specific files + +## How to use the tools after activation -## Step 5: Confirm activation +**Typical workflow:** -Tell the user the OpenMC index is active and briefly describe what's available: -- The repo map is loaded (structural overview of the codebase) -- Semantic search is ready (explain the key modes: code search, doc search, related file search) +1. Use `openmc_search.py` to discover which files are relevant to your task +2. Use `openmc_map.py` on those files to understand their structure and neighbors +3. Use Read/Grep to dive into the specific code you need to change -## Using semantic search after activation +**When to use semantic search** (`openmc_search.py`): +- Investigating how a change might affect other parts of the codebase +- Finding code that does something conceptually similar but with different naming +- Discovering cross-cutting concerns across run modes -For the rest of this session, before modifying unfamiliar code or when investigating how a change might affect other parts of the codebase, use `openmc_search.py` to find related code paths. +**When to use the repo map** (`openmc_map.py`): +- Understanding the structure of unfamiliar files before modifying them +- Seeing what classes/methods neighbor the code you're working on +- Getting a condensed overview of a subsystem (pass multiple files) ## Subagent guidance When spawning subagents that will investigate or modify OpenMC code, include in their prompt: -"The OpenMC search index is available. Read `.claude/cache/repomap.md` for a structural overview of the codebase. 
For semantic search, first run `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help` to see the full API, then use it to search code, docs, or find related files." +"The OpenMC code index is available. Run `--help` on these tools to see their full API: +- `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help` (semantic search) +- `.claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help` (structural map)" diff --git a/.claude/skills/refresh-openmc-index/SKILL.md b/.claude/skills/refresh-openmc-index/SKILL.md index 55b922e3ac9..b0be15d1a9c 100644 --- a/.claude/skills/refresh-openmc-index/SKILL.md +++ b/.claude/skills/refresh-openmc-index/SKILL.md @@ -1,12 +1,12 @@ --- name: refresh-openmc-index -description: Rebuild the OpenMC codebase index. Use after pulling new code, switching branches, or when the index feels stale. -allowed-tools: Bash(*), Read +description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The repo map tool does not need refreshing (it generates maps on the fly). +allowed-tools: Bash(*) --- # Refresh OpenMC Index -Rebuild the repo map and RAG vector index from scratch. +Rebuild the RAG vector index from scratch. Note: only the semantic search index needs refreshing. The repo map tool (`openmc_map.py`) always works on the current code. ## Step 1: Ensure venv exists @@ -17,20 +17,13 @@ if [ ! -d .claude/cache/.venv ]; then fi ``` -## Step 2: Rebuild everything +## Step 2: Rebuild the RAG index ```bash -echo "Rebuilding repo map..." -.claude/cache/.venv/bin/python .claude/tools/repomap/generate_repomap.py - echo "Rebuilding RAG index (this takes ~3 minutes)..." HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py ``` -## Step 3: Reload the repo map - -Read the updated `.claude/cache/repomap.md` and internalize the new codebase structure. 
- -## Step 4: Confirm +## Step 3: Confirm Tell the user the index has been refreshed and is ready to use. diff --git a/.claude/tools/repomap/generate_repomap.py b/.claude/tools/repomap/generate_repomap.py deleted file mode 100644 index 4ee3ef28036..00000000000 --- a/.claude/tools/repomap/generate_repomap.py +++ /dev/null @@ -1,521 +0,0 @@ -#!/usr/bin/env python3 -"""Generate a structural repo map of the OpenMC codebase. - -Uses tree-sitter to parse C++ and Python files, extracts class/function -signatures, builds a cross-file reference graph, applies PageRank-like -ranking, and outputs a concise markdown map grouped by subsystem. - -Output: - .claude/cache/repomap.md - Concise map (~170 lines) for agent context - .claude/cache/repomap_full.json - Full symbol data for other tools -""" - -import json -import os -import sys -from collections import defaultdict -from pathlib import Path - -import tree_sitter_cpp as tscpp -import tree_sitter_python as tspy -from tree_sitter import Language, Parser - -# --------------------------------------------------------------------------- -# Configuration -# --------------------------------------------------------------------------- - -OPENMC_ROOT = Path(__file__).resolve().parents[3] # .claude/tools/repomap -> repo root -CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" - -CPP_PATTERNS = ["src/**/*.cpp", "include/openmc/**/*.h"] -PY_PATTERNS = ["openmc/**/*.py"] - -# Map file path keywords to logical subsystems -SUBSYSTEM_RULES = [ - # (path substring, subsystem name) - ("random_ray", "Random Ray Solver"), - ("tallies/", "Tallies & Filters"), - ("deplete", "Depletion"), - ("mgxs", "Multi-Group Cross Sections"), - ("data/", "Nuclear Data"), - ("lattice", "Geometry"), - ("universe", "Geometry"), - ("surface", "Geometry"), - ("cell", "Geometry"), - ("geometry", "Geometry"), - ("dagmc", "Geometry (DAGMC)"), - ("mesh", "Mesh"), - ("material", "Materials"), - ("nuclide", "Nuclear Data"), - ("cross_section", "Nuclear Data"), - ("thermal", 
"Nuclear Data"), - ("wmp", "Nuclear Data"), - ("particle", "Particle Transport"), - ("physics", "Particle Transport"), - ("collision", "Particle Transport"), - ("photon", "Particle Transport"), - ("bremsstrahlung", "Particle Transport"), - ("secondary_", "Particle Transport"), - ("reaction", "Particle Transport"), - ("source", "Sources & Distributions"), - ("distribution", "Sources & Distributions"), - ("eigenvalue", "Eigenvalue Solver"), - ("cmfd", "CMFD Acceleration"), - ("weight_window", "Variance Reduction"), - ("tally", "Tallies & Filters"), - ("filter", "Tallies & Filters"), - ("trigger", "Tallies & Filters"), - ("plot", "Plotting"), - ("track", "Plotting"), - ("volume_calc", "Volume Calculation"), - ("random_lcg", "Random Number Generation"), - ("random_dist", "Random Number Generation"), - ("settings", "Settings & Configuration"), - ("simulation", "Simulation Control"), - ("initialize", "Simulation Control"), - ("finalize", "Simulation Control"), - ("state_point", "I/O & Serialization"), - ("summary", "I/O & Serialization"), - ("hdf5", "I/O & Serialization"), - ("xml_interface", "I/O & Serialization"), - ("output", "I/O & Serialization"), - ("bank", "Particle Banking"), - ("event", "Event-Based Transport"), - ("error", "Utilities"), - ("string_utils", "Utilities"), - ("math_functions", "Utilities"), - ("file_utils", "Utilities"), - ("timer", "Utilities"), - ("memory", "Utilities"), - ("position", "Utilities"), - ("model", "Model Builder (Python)"), - ("stats", "Statistics (Python)"), - ("lib/", "C API Bindings (Python)"), -] - -MAX_REPOMAP_LINES = 160 - -# --------------------------------------------------------------------------- -# Tree-sitter setup -# --------------------------------------------------------------------------- - -CPP_LANG = Language(tscpp.language()) -PY_LANG = Language(tspy.language()) - - -def make_parser(lang): - p = Parser(lang) - return p - - -cpp_parser = make_parser(CPP_LANG) -py_parser = make_parser(PY_LANG) - -# 
--------------------------------------------------------------------------- -# Symbol extraction -# --------------------------------------------------------------------------- - - -def extract_cpp_symbols(filepath, content): - """Extract class/struct/function definitions from C++ code.""" - symbols = [] - tree = cpp_parser.parse(content.encode()) - - def visit(node, namespace=""): - if node.type == "namespace_definition": - ns_name = "" - for child in node.children: - if child.type == "namespace_identifier": - ns_name = child.text.decode() - break - body = None - for child in node.children: - if child.type == "declaration_list": - body = child - break - if body: - prefix = f"{namespace}{ns_name}::" if ns_name else namespace - for child in body.children: - visit(child, prefix) - return - - if node.type in ("class_specifier", "struct_specifier"): - name_node = node.child_by_field_name("name") - if name_node: - name = name_node.text.decode() - full_name = f"{namespace}{name}" - # Extract method signatures - methods = [] - body = node.child_by_field_name("body") - if body: - for child in body.children: - if child.type == "function_definition": - sig = _cpp_func_signature(child) - if sig: - methods.append(sig) - elif child.type == "declaration": - # Could be a method declaration - sig = _cpp_decl_signature(child) - if sig: - methods.append(sig) - kind = "class" if node.type == "class_specifier" else "struct" - symbols.append({ - "name": full_name, - "kind": kind, - "signature": f"{kind} {full_name}", - "methods": methods[:10], # Cap to avoid bloat - "file": str(filepath), - "line": node.start_point[0] + 1, - }) - - elif node.type == "function_definition": - sig = _cpp_func_signature(node) - if sig: - symbols.append({ - "name": sig.split("(")[0].split()[-1] if "(" in sig else sig, - "kind": "function", - "signature": sig, - "methods": [], - "file": str(filepath), - "line": node.start_point[0] + 1, - }) - - # Visit children for top-level traversal - for child in 
node.children: - visit(child, namespace) - - for child in tree.root_node.children: - visit(child) - - return symbols - - -def _cpp_func_signature(node): - """Extract a concise function signature from a function_definition node.""" - declarator = node.child_by_field_name("declarator") - if not declarator: - return None - # Get return type - ret_type = "" - for child in node.children: - if child == declarator: - break - if child.type not in ("comment", "attribute_declaration"): - ret_type += child.text.decode() + " " - ret_type = ret_type.strip() - decl_text = declarator.text.decode() - # Truncate long signatures - sig = f"{ret_type} {decl_text}".strip() - if len(sig) > 120: - sig = sig[:117] + "..." - return sig - - -def _cpp_decl_signature(node): - """Extract signature from a declaration that might be a method decl.""" - text = node.text.decode().strip() - if "(" in text and ";" in text: - sig = text.rstrip(";").strip() - if len(sig) > 120: - sig = sig[:117] + "..." - return sig - return None - - -def extract_py_symbols(filepath, content): - """Extract class/function definitions from Python code.""" - symbols = [] - tree = py_parser.parse(content.encode()) - - for node in tree.root_node.children: - if node.type == "class_definition": - name_node = node.child_by_field_name("name") - if not name_node: - continue - name = name_node.text.decode() - # Get superclasses - superclass = "" - for child in node.children: - if child.type == "argument_list": - superclass = child.text.decode() - break - # Get method names - methods = [] - body = node.child_by_field_name("body") - if body: - for child in body.children: - if child.type == "function_definition": - mname = child.child_by_field_name("name") - if mname: - mtext = mname.text.decode() - if not mtext.startswith("_") or mtext in ( - "__init__", "__repr__", "__iter__" - ): - params = child.child_by_field_name("parameters") - psig = params.text.decode() if params else "()" - methods.append(f"{mtext}{psig}") - - sig = 
f"class {name}{superclass}" if superclass else f"class {name}" - symbols.append({ - "name": name, - "kind": "class", - "signature": sig, - "methods": methods[:10], - "file": str(filepath), - "line": node.start_point[0] + 1, - }) - - elif node.type == "function_definition": - name_node = node.child_by_field_name("name") - if not name_node: - continue - name = name_node.text.decode() - if name.startswith("_") and name != "__init__": - continue - params = node.child_by_field_name("parameters") - psig = params.text.decode() if params else "()" - sig = f"def {name}{psig}" - if len(sig) > 120: - sig = sig[:117] + "..." - symbols.append({ - "name": name, - "kind": "function", - "signature": sig, - "methods": [], - "file": str(filepath), - "line": node.start_point[0] + 1, - }) - - return symbols - - -# --------------------------------------------------------------------------- -# Reference graph and ranking -# --------------------------------------------------------------------------- - - -def build_reference_graph(all_symbols, file_contents): - """Build a graph of cross-file symbol references. - - Returns a dict: symbol_name -> number of other files that reference it. - Only counts classes, structs, and non-trivial functions. 
- """ - # Filter out trivial/common names that would create noise - TRIVIAL_NAMES = { - "name", "type", "end", "begin", "size", "empty", "get", "set", - "data", "value", "index", "clear", "push", "pop", "front", "back", - "format", "write", "read", "to_string", "operator", "iterator", - "const_iterator", "surface", "run", "reset", "init", - } - - # Collect all symbol names and their source files - symbol_files = {} # name -> set of files where defined - for sym in all_symbols: - name = sym["name"] - # Skip trivial names and very short names - if name.lower() in TRIVIAL_NAMES or len(name) < 4: - continue - # Skip pure accessor patterns - if sym["kind"] == "function" and sym["signature"]: - sig_lower = sym["signature"].lower() - if any(p in sig_lower for p in [ - "const {", "() const", "& name()", "type() const override" - ]): - # Only skip if it's a simple accessor (short signature) - if len(sym["signature"]) < 60 and "(" in sig_lower: - parts = sig_lower.split("(")[0].split() - if parts and parts[-1] in TRIVIAL_NAMES: - continue - symbol_files.setdefault(name, set()).add(sym["file"]) - - # Count how many OTHER files reference each symbol - ref_counts = defaultdict(int) - for filepath, content in file_contents.items(): - for sym_name, def_files in symbol_files.items(): - if filepath not in def_files and sym_name in content: - ref_counts[sym_name] += 1 - - return ref_counts - - -def rank_symbols(all_symbols, ref_counts): - """Rank symbols by cross-file reference count (simplified PageRank). - - Boost classes/structs since they represent key abstractions. 
- """ - for sym in all_symbols: - base_score = ref_counts.get(sym["name"], 0) - # Boost classes/structs - they're more informative than individual functions - if sym["kind"] in ("class", "struct"): - base_score = int(base_score * 1.5) + 2 - sym["score"] = base_score - return sorted(all_symbols, key=lambda s: (-s["score"], s["name"])) - - -# --------------------------------------------------------------------------- -# Subsystem categorization -# --------------------------------------------------------------------------- - - -def categorize_file(filepath): - """Map a file path to a logical subsystem.""" - rel = str(filepath).replace("\\", "/").lower() - for keyword, subsystem in SUBSYSTEM_RULES: - if keyword in rel: - return subsystem - return "Other" - - -# --------------------------------------------------------------------------- -# Output generation -# --------------------------------------------------------------------------- - - -def generate_repomap_md(ranked_symbols, max_lines=MAX_REPOMAP_LINES): - """Generate concise markdown repo map.""" - # Deduplicate: keep highest-scored version of each name per subsystem - seen = set() - deduped = [] - for sym in ranked_symbols: - subsystem = categorize_file(sym["file"]) - key = (subsystem, sym["name"]) - if key not in seen: - seen.add(key) - deduped.append(sym) - - # Group by subsystem - groups = defaultdict(list) - for sym in deduped: - subsystem = categorize_file(sym["file"]) - if subsystem == "Other" and sym["score"] < 3: - continue # Skip low-value "Other" symbols - groups[subsystem].append(sym) - - # Sort groups by max score in group, drop "Other" to the end - sorted_groups = sorted( - groups.items(), - key=lambda g: (g[0] == "Other", -max(s["score"] for s in g[1]) if g[1] else 0), - ) - - lines = [ - "# OpenMC Repo Map", - "", - "Auto-generated structural overview. 
Top symbols ranked by cross-file usage.", - "", - ] - - for group_name, syms in sorted_groups: - if len(lines) >= max_lines - 2: - break - - lines.append(f"## {group_name}") - - # Show top symbols in this group - shown = 0 - for sym in syms: - if shown >= 5 or len(lines) >= max_lines - 1: - break - rel_file = os.path.relpath(sym["file"], OPENMC_ROOT) - # Flatten signature to single line - sig_flat = " ".join(sym["signature"].split()) - if len(sig_flat) > 80: - sig_flat = sig_flat[:77] + "..." - lines.append(f"- `{sig_flat}` ({rel_file}:{sym['line']})") - - # Show key methods for classes (max 2, single line each) - if sym["kind"] in ("class", "struct") and sym["methods"]: - for method in sym["methods"][:2]: - # Flatten to single line - method_flat = " ".join(method.split()) - if len(method_flat) > 70: - method_flat = method_flat[:67] + "..." - lines.append(f" - `{method_flat}`") - if len(lines) >= max_lines - 1: - break - - shown += 1 - - lines.append("") - - return "\n".join(lines[:max_lines]) - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - - -def main(): - print(f"OpenMC root: {OPENMC_ROOT}") - CACHE_DIR.mkdir(parents=True, exist_ok=True) - - # Collect all source files - file_contents = {} - cpp_files = [] - py_files = [] - - for pattern in CPP_PATTERNS: - for fp in sorted(OPENMC_ROOT.glob(pattern)): - try: - content = fp.read_text(errors="replace") - rel = str(fp.relative_to(OPENMC_ROOT)) - file_contents[rel] = content - cpp_files.append((rel, content)) - except Exception as e: - print(f" Warning: could not read {fp}: {e}", file=sys.stderr) - - for pattern in PY_PATTERNS: - for fp in sorted(OPENMC_ROOT.glob(pattern)): - if "__pycache__" in str(fp): - continue - try: - content = fp.read_text(errors="replace") - rel = str(fp.relative_to(OPENMC_ROOT)) - file_contents[rel] = content - py_files.append((rel, content)) - except Exception as e: - 
print(f" Warning: could not read {fp}: {e}", file=sys.stderr) - - print(f"Found {len(cpp_files)} C++/H files, {len(py_files)} Python files") - - # Extract symbols - all_symbols = [] - for rel, content in cpp_files: - syms = extract_cpp_symbols(rel, content) - all_symbols.extend(syms) - - for rel, content in py_files: - syms = extract_py_symbols(rel, content) - all_symbols.extend(syms) - - print(f"Extracted {len(all_symbols)} symbols") - - # Build reference graph and rank - ref_counts = build_reference_graph(all_symbols, file_contents) - ranked = rank_symbols(all_symbols, ref_counts) - - # Generate outputs - repomap_md = generate_repomap_md(ranked) - repomap_path = CACHE_DIR / "repomap.md" - repomap_path.write_text(repomap_md) - print(f"Wrote {repomap_path} ({len(repomap_md.splitlines())} lines)") - - # Full JSON for other tools - json_path = CACHE_DIR / "repomap_full.json" - json_data = [] - for sym in ranked: - json_data.append({ - "name": sym["name"], - "kind": sym["kind"], - "signature": sym["signature"], - "methods": sym["methods"], - "file": sym["file"], - "line": sym["line"], - "score": sym["score"], - }) - json_path.write_text(json.dumps(json_data, indent=2)) - print(f"Wrote {json_path} ({len(json_data)} symbols)") - - -if __name__ == "__main__": - main() diff --git a/.claude/tools/repomap/openmc_map.py b/.claude/tools/repomap/openmc_map.py new file mode 100644 index 00000000000..1bf4dbf26b8 --- /dev/null +++ b/.claude/tools/repomap/openmc_map.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""Generate a focused repo map around specific OpenMC files. + +Uses aider's RepoMap to produce a condensed structural overview of the +codebase, ranked by relevance to the files you're currently working on. 
+ +Usage: + openmc_map.py src/particle.cpp # Map around one file + openmc_map.py src/simulation.cpp src/source.cpp # Map around multiple files + openmc_map.py --tokens 4096 # Larger map (default: 2048) + openmc_map.py # Map of the whole repo (top-ranked files) + +Examples: + openmc_map.py src/particle_restart.cpp src/random_lcg.cpp + openmc_map.py openmc/deplete/coupled_operator.py --tokens 4096 + openmc_map.py include/openmc/cell.h include/openmc/surface.h +""" + +import argparse +import glob +import os +import sys +from pathlib import Path + +OPENMC_ROOT = Path(__file__).resolve().parents[3] + +# File patterns to include in the map +FILE_PATTERNS = [ + "src/**/*.cpp", + "include/openmc/**/*.h", + "openmc/**/*.py", +] + + +class TokenCounter: + """Simple token counter that doesn't need an API model.""" + + def token_count(self, text): + # Rough approximation: ~4 chars per token for code + return len(text) // 4 + + +class FakeModel: + """Minimal model stand-in for aider's RepoMap token counting.""" + + def __init__(self): + self._token_counter = TokenCounter() + + def token_count(self, text): + return self._token_counter.token_count(text) + + +def get_all_files(): + """Collect all source files matching our patterns.""" + files = [] + for pattern in FILE_PATTERNS: + for fp in sorted(OPENMC_ROOT.glob(pattern)): + if "__pycache__" in str(fp): + continue + files.append(str(fp.relative_to(OPENMC_ROOT))) + return files + + +def generate_map(focus_files=None, map_tokens=2048): + """Generate a repo map, optionally focused on specific files. + + Args: + focus_files: List of file paths to focus on. If None, generates + a general overview of the most important files. + map_tokens: Approximate token budget for the map. + + Returns: + The repo map as a string. 
+ """ + from aider.io import InputOutput + from aider.repomap import RepoMap + + os.chdir(OPENMC_ROOT) + + io = InputOutput(yes=True) + model = FakeModel() + + rm = RepoMap( + map_tokens=map_tokens, + root=str(OPENMC_ROOT), + io=io, + main_model=model, + ) + + all_files = get_all_files() + + # Normalize focus files to relative paths + chat_fnames = [] + if focus_files: + for f in focus_files: + # Handle both absolute and relative paths + fp = Path(f) + if fp.is_absolute(): + try: + fp = fp.relative_to(OPENMC_ROOT) + except ValueError: + pass + rel = str(fp) + if rel in all_files: + chat_fnames.append(rel) + else: + # Try to find a match + matches = [af for af in all_files if rel in af] + if matches: + chat_fnames.extend(matches) + else: + print(f"Warning: '{f}' not found in indexed files", + file=sys.stderr) + + # other_fnames = files NOT in chat_fnames + other_fnames = [f for f in all_files if f not in chat_fnames] + + repo_map = rm.get_repo_map(chat_fnames, other_fnames) + + if not repo_map: + return "No map generated. Try with different files or a larger --tokens budget." 
+ + return repo_map + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a focused structural map of OpenMC code", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""examples: + %(prog)s src/particle_restart.cpp src/random_lcg.cpp + %(prog)s openmc/deplete/coupled_operator.py --tokens 4096 + %(prog)s include/openmc/cell.h include/openmc/surface.h + %(prog)s # overview of whole repo""", + ) + parser.add_argument( + "files", nargs="*", + help="Files to focus the map on (shows their structure and neighbors)") + parser.add_argument( + "--tokens", type=int, default=2048, + help="Approximate token budget for the map (default: 2048)") + + args = parser.parse_args() + + # Suppress aider's scanning output + repo_map = generate_map( + focus_files=args.files if args.files else None, + map_tokens=args.tokens, + ) + print(repo_map) + + +if __name__ == "__main__": + main() diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index b25dd1f1b08..6d456ab36f4 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -1,4 +1,7 @@ -# Tree-sitter for code parsing (repo map + chunking) +# Repo map (uses aider's tree-sitter based RepoMap) +aider-chat>=0.80.0 + +# Tree-sitter for RAG code chunking tree-sitter>=0.23.0 tree-sitter-python>=0.23.0 tree-sitter-cpp>=0.23.0 From 37c7de37c8cef68dddd3847725b2e1d63e7ccae6 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 04:01:51 +0000 Subject: [PATCH 05/67] Fix repo map to include focus files in output Aider's RepoMap excludes chat_fnames from output (since they're already in the chat). We want the opposite: show focus files AND their neighbors. Now passes focus files via mentioned_fnames to boost ranking while keeping them in the output. Also suppresses aider's stderr noise. 
Co-Authored-By: Claude Opus 4.6 --- .claude/tools/repomap/openmc_map.py | 38 +++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/.claude/tools/repomap/openmc_map.py b/.claude/tools/repomap/openmc_map.py index 1bf4dbf26b8..06592a1f331 100644 --- a/.claude/tools/repomap/openmc_map.py +++ b/.claude/tools/repomap/openmc_map.py @@ -77,7 +77,11 @@ def generate_map(focus_files=None, map_tokens=2048): os.chdir(OPENMC_ROOT) - io = InputOutput(yes=True) + # Suppress aider's "not a terminal" warning + devnull = open(os.devnull, "w") + io = InputOutput(yes=True, pretty=False, user_input_color=None, + tool_output_color=None, tool_warning_color=None, + tool_error_color=None) model = FakeModel() rm = RepoMap( @@ -112,10 +116,18 @@ def generate_map(focus_files=None, map_tokens=2048): print(f"Warning: '{f}' not found in indexed files", file=sys.stderr) - # other_fnames = files NOT in chat_fnames - other_fnames = [f for f in all_files if f not in chat_fnames] - - repo_map = rm.get_repo_map(chat_fnames, other_fnames) + # Aider's RepoMap excludes chat_fnames from the map output (since in + # aider's workflow those files are already in the chat). We want the + # opposite: show focus files AND their neighbors. So we pass focus files + # as mentioned_fnames (to boost their ranking) but keep them in + # other_fnames (so they appear in the output). + mentioned = set(chat_fnames) + other_fnames = all_files # Include everything + repo_map = rm.get_repo_map( + [], # No chat files - we want everything in the output + other_fnames, + mentioned_fnames=mentioned, + ) if not repo_map: return "No map generated. Try with different files or a larger --tokens budget." 
@@ -142,11 +154,17 @@ def main(): args = parser.parse_args() - # Suppress aider's scanning output - repo_map = generate_map( - focus_files=args.files if args.files else None, - map_tokens=args.tokens, - ) + # Redirect stderr to suppress aider's noisy warnings + import io as _io + old_stderr = sys.stderr + sys.stderr = _io.StringIO() + try: + repo_map = generate_map( + focus_files=args.files if args.files else None, + map_tokens=args.tokens, + ) + finally: + sys.stderr = old_stderr print(repo_map) From 39c4149e71d3d32b7589039a784c91907c77ee37 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 04:05:12 +0000 Subject: [PATCH 06/67] Revert repo map to aider's default behavior Aider's design is correct for our use case: when the agent passes focus files, it already has those files in context. The map should show the surrounding context (headers, dependencies, neighbors) not the files themselves. Co-Authored-By: Claude Opus 4.6 --- .claude/tools/repomap/openmc_map.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/.claude/tools/repomap/openmc_map.py b/.claude/tools/repomap/openmc_map.py index 06592a1f331..25a87c99fb3 100644 --- a/.claude/tools/repomap/openmc_map.py +++ b/.claude/tools/repomap/openmc_map.py @@ -116,18 +116,11 @@ def generate_map(focus_files=None, map_tokens=2048): print(f"Warning: '{f}' not found in indexed files", file=sys.stderr) - # Aider's RepoMap excludes chat_fnames from the map output (since in - # aider's workflow those files are already in the chat). We want the - # opposite: show focus files AND their neighbors. So we pass focus files - # as mentioned_fnames (to boost their ranking) but keep them in - # other_fnames (so they appear in the output). 
- mentioned = set(chat_fnames) - other_fnames = all_files # Include everything - repo_map = rm.get_repo_map( - [], # No chat files - we want everything in the output - other_fnames, - mentioned_fnames=mentioned, - ) + # chat_fnames = files the agent is focused on (already in context). + # Aider shows their neighbors/dependencies, not the files themselves, + # since the agent already has those open. + other_fnames = [f for f in all_files if f not in chat_fnames] + repo_map = rm.get_repo_map(chat_fnames, other_fnames) if not repo_map: return "No map generated. Try with different files or a larger --tokens budget." From 152709c00e3a98c3e1f6fbc4cc79a60ecefb8c01 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 04:15:39 +0000 Subject: [PATCH 07/67] Instruct agents not to truncate index tool output Agents reflexively pipe commands through head/tail to conserve context, which defeats the purpose of pre-budgeted tools like openmc_search.py and openmc_map.py. Add explicit CLAUDE.md instruction to always read full output from these tools. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 036eedfbf41..c5092e13fe9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,9 +4,14 @@ If the user asks you to investigate, modify, or debug OpenMC code, let them know about the `/enable-openmc-index` skill which provides a structural repo map and semantic code search across the entire codebase. Offer to run it for them. -Do NOT use the index tools (`openmc_search.py`, `repomap.md`) unless +Do NOT use the index tools (`openmc_search.py`, `openmc_map.py`) unless `/enable-openmc-index` has been run in the current session. +When using `openmc_search.py` or `openmc_map.py`, ALWAYS read their full output. +Do NOT pipe through head, tail, or grep. These tools are already sized to fit +in context via `--top-k` and `--tokens`. Truncating their output defeats their +purpose. 
+ To rebuild the index, the user can use `/refresh-openmc-index`. You may offer to run this skill for them if it seems necessary. From 9f5e8ee9433d0217a1ce2fdc13775c1718ad2c67 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 14:58:05 +0000 Subject: [PATCH 08/67] Add clangd/LSP-based code navigation tool and suppress noisy repo map files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add openmc_lsp.py which uses clangd's Language Server Protocol for compiler-accurate symbol resolution — go-to-definition, find-references, and related-file discovery with zero false edges from name collisions. Also suppress ubiquitous utility files (error.h, constants.h, span.h, etc.) from the aider repo map output to improve its signal-to-noise ratio. Co-Authored-By: Claude Opus 4.6 --- .claude/tools/lsp/openmc_lsp.py | 457 ++++++++++++++++++++++++++++ .claude/tools/repomap/openmc_map.py | 20 +- .claude/tools/requirements.txt | 3 + 3 files changed, 478 insertions(+), 2 deletions(-) create mode 100644 .claude/tools/lsp/openmc_lsp.py diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py new file mode 100644 index 00000000000..a4c8a903cdf --- /dev/null +++ b/.claude/tools/lsp/openmc_lsp.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +"""LSP-based code navigation for OpenMC using clangd. + +Uses the Language Server Protocol to provide compiler-accurate symbol +resolution, go-to-definition, find-references, and related-file discovery. +Unlike tree-sitter-based tools, this resolves symbols through the actual +C++ type system — no false edges from name collisions. 
+ +Requires: + - clangd (apt-get install clangd-15, or similar) + - compile_commands.json (cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON) + +Usage: + openmc_lsp.py symbols src/simulation.cpp + openmc_lsp.py definition src/simulation.cpp:132 + openmc_lsp.py references src/simulation.cpp:132 + openmc_lsp.py related src/simulation.cpp + openmc_lsp.py related src/simulation.cpp --top-k 20 + +Examples: + openmc_lsp.py symbols src/particle.cpp + openmc_lsp.py definition src/simulation.cpp:132 # where is write_message defined? + openmc_lsp.py references include/openmc/error.h:55 # who calls write_message? + openmc_lsp.py related src/simulation.cpp # files connected by real references +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import time +import urllib.parse +from collections import Counter, defaultdict +from pathlib import Path + +OPENMC_ROOT = Path(__file__).resolve().parents[3] + +# Symbol kind names (LSP spec) +SYMBOL_KINDS = { + 1: "File", 2: "Module", 3: "Namespace", 4: "Package", 5: "Class", + 6: "Method", 7: "Property", 8: "Field", 9: "Constructor", 10: "Enum", + 11: "Interface", 12: "Function", 13: "Variable", 14: "Constant", + 15: "String", 16: "Number", 17: "Boolean", 18: "Array", 19: "Object", + 20: "Key", 21: "Null", 22: "EnumMember", 23: "Struct", 24: "Event", + 25: "Operator", 26: "TypeParameter", +} + + +class ClangdClient: + """Minimal LSP client that talks to clangd via JSON-RPC over stdin/stdout.""" + + def __init__(self, compile_commands_dir=None): + clangd = self._find_clangd() + if not clangd: + print("ERROR: clangd not found. Install with: apt-get install clangd", + file=sys.stderr) + sys.exit(1) + + if not compile_commands_dir: + compile_commands_dir = self._find_compile_commands() + if not compile_commands_dir: + print("ERROR: compile_commands.json not found. 
Generate with:\n" + " cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON", + file=sys.stderr) + sys.exit(1) + + args = [clangd, '--compile-commands-dir=' + str(compile_commands_dir)] + self.proc = subprocess.Popen( + args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + self._id = 0 + self._opened_files = set() + self._initialize() + + def _find_clangd(self): + """Find clangd binary, trying common names.""" + for name in ['clangd', 'clangd-15', 'clangd-16', 'clangd-17', 'clangd-18']: + path = shutil.which(name) + if path: + return path + return None + + def _find_compile_commands(self): + """Find compile_commands.json in common locations.""" + for d in [OPENMC_ROOT / 'build', OPENMC_ROOT]: + if (d / 'compile_commands.json').exists(): + return str(d) + return None + + def _initialize(self): + """Send LSP initialize/initialized handshake.""" + self.request("initialize", { + "processId": os.getpid(), + "rootUri": OPENMC_ROOT.as_uri(), + "capabilities": {} + }) + self.notify("initialized") + + def _next_id(self): + self._id += 1 + return self._id + + def _send(self, msg_dict): + body = json.dumps(msg_dict) + encoded = body.encode('utf-8') + header = f"Content-Length: {len(encoded)}\r\n\r\n" + self.proc.stdin.write(header.encode('ascii') + encoded) + self.proc.stdin.flush() + + def request(self, method, params=None): + """Send a request and wait for the response.""" + rid = self._next_id() + self._send({"jsonrpc": "2.0", "id": rid, "method": method, + "params": params or {}}) + while True: + msg = self._read_msg() + if msg.get('id') == rid: + if 'error' in msg: + return None + return msg.get('result') + + def notify(self, method, params=None): + """Send a notification (no response expected).""" + self._send({"jsonrpc": "2.0", "method": method, + "params": params or {}}) + + def _read_msg(self): + headers = {} + while True: + line = self.proc.stdout.readline() + if not line: + raise EOFError("clangd process terminated") + line = 
line.decode('utf-8').strip() + if not line: + break + k, v = line.split(': ', 1) + headers[k] = v + length = int(headers['Content-Length']) + body = self.proc.stdout.read(length) + return json.loads(body) + + def open_file(self, filepath): + """Open a file in clangd and wait for it to be indexed.""" + fpath = Path(filepath) + if not fpath.is_absolute(): + fpath = OPENMC_ROOT / fpath + uri = fpath.as_uri() + if uri in self._opened_files: + return uri + text = fpath.read_text() + self.notify("textDocument/didOpen", { + "textDocument": { + "uri": uri, "languageId": "cpp", "version": 1, "text": text + } + }) + self._opened_files.add(uri) + # Give clangd time to parse. First file takes longer (preamble build). + wait = 8 if len(self._opened_files) == 1 else 3 + time.sleep(wait) + return uri + + def get_symbols(self, filepath): + """Get all symbols defined in a file.""" + uri = self.open_file(filepath) + result = self.request("textDocument/documentSymbol", { + "textDocument": {"uri": uri} + }) + return result or [] + + def get_definition(self, filepath, line, character): + """Get definition location for symbol at position.""" + uri = self.open_file(filepath) + result = self.request("textDocument/definition", { + "textDocument": {"uri": uri}, + "position": {"line": line, "character": character} + }) + return result or [] + + def get_references(self, filepath, line, character, + include_declaration=True): + """Get all references to symbol at position.""" + uri = self.open_file(filepath) + result = self.request("textDocument/references", { + "textDocument": {"uri": uri}, + "position": {"line": line, "character": character}, + "context": {"includeDeclaration": include_declaration} + }) + return result or [] + + def close(self): + """Shutdown clangd cleanly.""" + try: + self.request("shutdown") + self.notify("exit") + self.proc.wait(timeout=5) + except Exception: + self.proc.kill() + + +def uri_to_relpath(uri): + """Convert file:// URI to path relative to OPENMC_ROOT.""" + 
path = urllib.parse.unquote(uri.replace('file://', '')) + try: + return str(Path(path).relative_to(OPENMC_ROOT)) + except ValueError: + return path + + +def is_project_file(relpath): + """Check if a path is an OpenMC project file (not system/vendor).""" + if relpath.startswith('/'): + return False # absolute path = system header + if relpath.startswith('vendor/'): + return False + return True + + +def get_symbol_range(sym): + """Extract start line/character from either SymbolInformation or DocumentSymbol.""" + # DocumentSymbol format: has 'range' and 'selectionRange' at top level + if 'selectionRange' in sym: + return sym['selectionRange']['start'] + # DocumentSymbol without selectionRange + if 'range' in sym and isinstance(sym['range'], dict) and 'start' in sym['range']: + return sym['range']['start'] + # SymbolInformation format: has 'location.range' + if 'location' in sym: + return sym['location']['range']['start'] + return {'line': 0, 'character': 0} + + +def flatten_symbols(symbols, depth=0): + """Flatten nested document symbols into a flat list with depth info.""" + result = [] + for s in symbols: + result.append((s, depth)) + children = s.get('children', []) + if children: + result.extend(flatten_symbols(children, depth + 1)) + return result + + +def cmd_symbols(client, filepath): + """List all symbols defined in a file.""" + symbols = client.get_symbols(filepath) + flat = flatten_symbols(symbols) + for sym, depth in flat: + kind_name = SYMBOL_KINDS.get(sym['kind'], f"kind={sym['kind']}") + start = get_symbol_range(sym) + line = start['line'] + indent = " " * depth + print(f"{indent}{kind_name}: {sym['name']} (line {line + 1})") + + +def cmd_definition(client, filepath, line, character=None): + """Find the definition of a symbol.""" + if character is None: + # Find first non-whitespace identifier on the line + fpath = Path(filepath) + if not fpath.is_absolute(): + fpath = OPENMC_ROOT / fpath + lines = fpath.read_text().split('\n') + if line - 1 < len(lines): 
+ text = lines[line - 1] + # Skip leading whitespace + character = len(text) - len(text.lstrip()) + + result = client.get_definition(filepath, line - 1, character) + if not result: + print("No definition found.") + return + + if isinstance(result, dict): + result = [result] + for loc in result: + rel = uri_to_relpath(loc['uri']) + ln = loc['range']['start']['line'] + 1 + print(f" {rel}:{ln}") + + +def cmd_references(client, filepath, line, character=None): + """Find all references to a symbol.""" + if character is None: + fpath = Path(filepath) + if not fpath.is_absolute(): + fpath = OPENMC_ROOT / fpath + lines = fpath.read_text().split('\n') + if line - 1 < len(lines): + text = lines[line - 1] + character = len(text) - len(text.lstrip()) + + result = client.get_references(filepath, line - 1, character) + if not result: + print("No references found.") + return + + # Group by file + by_file = defaultdict(list) + for loc in result: + rel = uri_to_relpath(loc['uri']) + ln = loc['range']['start']['line'] + 1 + by_file[rel].append(ln) + + print(f"{len(result)} references across {len(by_file)} files:\n") + for fpath, lines_list in sorted(by_file.items()): + lines_str = ", ".join(str(l) for l in sorted(lines_list)) + print(f" {fpath}:{lines_str}") + + +def cmd_related(client, filepath, top_k=15): + """Find files related to a given file through real typed references. + + For each symbol defined in the target file, finds all files that + reference it. Returns files ranked by connection count. 
+ """ + symbols = client.get_symbols(filepath) + flat = flatten_symbols(symbols) + + # Filter to meaningful symbols (functions, classes, methods, variables) + interesting_kinds = {5, 6, 8, 12, 13, 23} # Class, Method, Field, Function, Variable, Struct + interesting = [(s, d) for s, d in flat if s['kind'] in interesting_kinds] + + if not interesting: + print("No interesting symbols found in file.") + return + + target_rel = filepath + if Path(filepath).is_absolute(): + target_rel = str(Path(filepath).relative_to(OPENMC_ROOT)) + + file_connections = Counter() # file -> number of symbols referencing it + symbol_details = defaultdict(set) # file -> set of symbol names + + print(f"Analyzing {len(interesting)} symbols in {target_rel}...\n", + file=sys.stderr) + + # Read the file so we can find exact symbol name positions + fpath_obj = Path(filepath) + if not fpath_obj.is_absolute(): + fpath_obj = OPENMC_ROOT / fpath_obj + file_lines = fpath_obj.read_text().split('\n') + + for sym, depth in interesting: + start = get_symbol_range(sym) + line = start['line'] + char = start['character'] + + # The range start may point to the type, not the symbol name. + # Find the actual symbol name position within the line. + if line < len(file_lines): + name_col = file_lines[line].find(sym['name'], char) + if name_col >= 0: + char = name_col + + refs = client.get_references(filepath, line, char, + include_declaration=False) + if not refs: + continue + + for loc in refs: + rel = uri_to_relpath(loc['uri']) + if rel == target_rel: + continue + if not is_project_file(rel): + continue + file_connections[rel] += 1 + symbol_details[rel].add(sym['name']) + + if not file_connections: + print("No external references found.") + return + + print(f"Files related to {target_rel} " + f"(ranked by typed reference count):\n") + for fpath, count in file_connections.most_common(top_k): + syms = sorted(symbol_details[fpath]) + sym_preview = ", ".join(syms[:5]) + if len(syms) > 5: + sym_preview += f", ... 
(+{len(syms)-5} more)" + print(f" [{count:3d} refs] {fpath}") + print(f" via: {sym_preview}") + + +def parse_file_location(location): + """Parse 'filepath:line' into (filepath, line) or (filepath, None).""" + # Handle filepath:line format + parts = location.rsplit(':', 1) + if len(parts) == 2: + try: + line = int(parts[1]) + return parts[0], line + except ValueError: + pass + return location, None + + +def main(): + parser = argparse.ArgumentParser( + description="LSP-based code navigation for OpenMC (via clangd)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""examples: + %(prog)s symbols src/simulation.cpp + %(prog)s definition src/simulation.cpp:132 + %(prog)s references src/simulation.cpp:132 + %(prog)s related src/simulation.cpp + %(prog)s related src/simulation.cpp --top-k 20""", + ) + parser.add_argument("command", + choices=["symbols", "definition", "references", + "related"], + help="Command to run") + parser.add_argument("location", + help="File path, or file:line for definition/references") + parser.add_argument("--top-k", type=int, default=15, + help="Number of related files to show (default: 15)") + parser.add_argument("--compile-commands-dir", type=str, default=None, + help="Directory containing compile_commands.json") + + args = parser.parse_args() + + filepath, line = parse_file_location(args.location) + + # Validate file exists + fpath = Path(filepath) + if not fpath.is_absolute(): + fpath = OPENMC_ROOT / fpath + if not fpath.exists(): + print(f"ERROR: File not found: {filepath}", file=sys.stderr) + sys.exit(1) + + client = ClangdClient(compile_commands_dir=args.compile_commands_dir) + try: + if args.command == "symbols": + cmd_symbols(client, filepath) + elif args.command == "definition": + if line is None: + print("ERROR: definition requires file:line format", + file=sys.stderr) + sys.exit(1) + cmd_definition(client, filepath, line) + elif args.command == "references": + if line is None: + print("ERROR: references requires 
file:line format", + file=sys.stderr) + sys.exit(1) + cmd_references(client, filepath, line) + elif args.command == "related": + cmd_related(client, filepath, top_k=args.top_k) + finally: + client.close() + + +if __name__ == "__main__": + main() diff --git a/.claude/tools/repomap/openmc_map.py b/.claude/tools/repomap/openmc_map.py index 25a87c99fb3..b94876c5a8b 100644 --- a/.claude/tools/repomap/openmc_map.py +++ b/.claude/tools/repomap/openmc_map.py @@ -31,6 +31,19 @@ "openmc/**/*.py", ] +# Ubiquitous utility files that clutter the map without providing useful +# structural context. Passed as chat_fnames so aider excludes them from +# the output (it assumes they're "already in context"). +SUPPRESS_FILES = [ + "include/openmc/error.h", + "src/error.cpp", + "include/openmc/position.h", + "include/openmc/constants.h", + "include/openmc/span.h", + "include/openmc/tensor.h", + "openmc/checkvalue.py", +] + class TokenCounter: """Simple token counter that doesn't need an API model.""" @@ -119,8 +132,11 @@ def generate_map(focus_files=None, map_tokens=2048): # chat_fnames = files the agent is focused on (already in context). # Aider shows their neighbors/dependencies, not the files themselves, # since the agent already has those open. - other_fnames = [f for f in all_files if f not in chat_fnames] - repo_map = rm.get_repo_map(chat_fnames, other_fnames) + # Also suppress ubiquitous utility files that waste token budget. + suppress = [f for f in SUPPRESS_FILES if f in all_files and f not in chat_fnames] + chat_fnames_with_suppress = chat_fnames + suppress + other_fnames = [f for f in all_files if f not in chat_fnames_with_suppress] + repo_map = rm.get_repo_map(chat_fnames_with_suppress, other_fnames) if not repo_map: return "No map generated. Try with different files or a larger --tokens budget." 
diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index 6d456ab36f4..ec6c251da68 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -16,5 +16,8 @@ sentence-transformers>=2.7.0 # Embeddings (fallback - zero downloads) scikit-learn>=1.4.0 +# LSP client for clangd-based code navigation +pygls>=2.0.0 + # Utilities numpy>=1.26.0 From a57118b15fa7e4bc0f9dacc5f4ff0d7e23a33d57 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 14:59:48 +0000 Subject: [PATCH 09/67] Update skill and CLAUDE.md docs to include LSP navigation tool Add openmc_lsp.py to the enable-openmc-index skill workflow, including prerequisite checks for clangd and compile_commands.json. Update CLAUDE.md to recommend the LSP tool for C++ code navigation over the aider repo map. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 53 +++++++++++++++++---- CLAUDE.md | 22 +++++---- 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 616d9f95b75..e68d4df5ab6 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -1,14 +1,15 @@ --- name: enable-openmc-index -description: Enable the OpenMC codebase index for this session. Provides semantic code search and structural repo mapping. Run this when investigating, modifying, or debugging OpenMC code. +description: Enable the OpenMC codebase index for this session. Provides semantic code search, structural repo mapping, and LSP-based code navigation. Run this when investigating, modifying, or debugging OpenMC code. allowed-tools: Bash(*), Read --- # Enable OpenMC Index -Set up (if needed) and activate the OpenMC codebase index for this session. This gives you two tools: +Set up (if needed) and activate the OpenMC codebase index for this session. This gives you three tools: 1. 
**Semantic search** (`openmc_search.py`) - Find related code across the codebase by concept 2. **Structural map** (`openmc_map.py`) - See condensed code structure of files and their neighbors +3. **LSP navigation** (`openmc_lsp.py`) - Compiler-accurate go-to-definition, find-references, and related-file discovery for C++ code (requires clangd and compile_commands.json) ## Step 1: Ensure the virtual environment exists @@ -40,40 +41,73 @@ fi Note: The repo map tool (`openmc_map.py`) does NOT need a pre-built index - it generates maps on the fly using tree-sitter. Only the RAG search needs the vector index. -## Step 3: Learn the tool APIs +## Step 3: Check LSP tool prerequisites -Run `--help` for both tools to see their full APIs: +The LSP tool (`openmc_lsp.py`) requires clangd and `compile_commands.json`. Check if they're available: + +```bash +if command -v clangd-15 &>/dev/null || command -v clangd &>/dev/null; then + echo "CLANGD_AVAILABLE" +else + echo "CLANGD_MISSING" +fi +if [ -f build/compile_commands.json ]; then + echo "COMPILE_COMMANDS_AVAILABLE" +else + echo "COMPILE_COMMANDS_MISSING" +fi +``` + +If clangd is missing, tell the user: "The LSP tool needs clangd. Install with `apt-get install clangd-15`." +If compile_commands.json is missing, tell the user: "The LSP tool needs compile_commands.json. Generate with `cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON`." + +The LSP tool is optional — the other two tools work without it. + +## Step 4: Learn the tool APIs + +Run `--help` for all tools to see their full APIs: ```bash .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help .claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help +.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help ``` Read and internalize the output so you know all available options. 
-## Step 4: Confirm activation +## Step 5: Confirm activation -Tell the user the OpenMC index is active and briefly describe the two tools: +Tell the user the OpenMC index is active and briefly describe the tools: - **Semantic search**: Find related code by concept (e.g., "particle seed initialization") - **Structural map**: See condensed code structure around specific files +- **LSP navigation** (if available): Compiler-accurate definition/references/related-files for C++ ## How to use the tools after activation **Typical workflow:** 1. Use `openmc_search.py` to discover which files are relevant to your task -2. Use `openmc_map.py` on those files to understand their structure and neighbors -3. Use Read/Grep to dive into the specific code you need to change +2. Use `openmc_lsp.py related` to find files connected by real typed references (C++ only) +3. Use `openmc_map.py` on those files to see their code structure +4. Use Read/Grep to dive into the specific code you need to change **When to use semantic search** (`openmc_search.py`): - Investigating how a change might affect other parts of the codebase - Finding code that does something conceptually similar but with different naming - Discovering cross-cutting concerns across run modes +- Searching Python code and documentation + +**When to use LSP navigation** (`openmc_lsp.py`): +- Finding exactly where a C++ symbol is defined (`definition`) +- Finding all callers of a C++ function (`references`) +- Discovering which C++ files are connected by real typed references (`related`) +- Best for C++ — uses the compiler's own type system, zero false edges **When to use the repo map** (`openmc_map.py`): - Understanding the structure of unfamiliar files before modifying them - Seeing what classes/methods neighbor the code you're working on - Getting a condensed overview of a subsystem (pass multiple files) +- Works for both C++ and Python files ## Subagent guidance @@ -81,4 +115,5 @@ When spawning subagents that will 
investigate or modify OpenMC code, include in "The OpenMC code index is available. Run `--help` on these tools to see their full API: - `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help` (semantic search) -- `.claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help` (structural map)" +- `.claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help` (structural map) +- `.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help` (LSP navigation, C++ only)" diff --git a/CLAUDE.md b/CLAUDE.md index c5092e13fe9..91550272de6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,18 +1,22 @@ ## OpenMC Codebase Index If the user asks you to investigate, modify, or debug OpenMC code, let them know -about the `/enable-openmc-index` skill which provides a structural repo map and -semantic code search across the entire codebase. Offer to run it for them. +about the `/enable-openmc-index` skill which provides semantic code search, +structural repo mapping, and LSP-based C++ code navigation. Offer to run it. -Do NOT use the index tools (`openmc_search.py`, `openmc_map.py`) unless -`/enable-openmc-index` has been run in the current session. +Do NOT use the index tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) +unless `/enable-openmc-index` has been run in the current session. -When using `openmc_search.py` or `openmc_map.py`, ALWAYS read their full output. -Do NOT pipe through head, tail, or grep. These tools are already sized to fit -in context via `--top-k` and `--tokens`. Truncating their output defeats their -purpose. +When using these tools, ALWAYS read their full output. Do NOT pipe through head, +tail, or grep. These tools are already sized to fit in context via `--top-k` and +`--tokens`. Truncating their output defeats their purpose. -To rebuild the index, the user can use `/refresh-openmc-index`. 
You may +For C++ code navigation, prefer `openmc_lsp.py` over `openmc_map.py` when you +need to find definitions, references, or related files — it uses the compiler's +type system and has no false edges from name collisions. The LSP tool requires +clangd and `build/compile_commands.json` (generated by cmake). + +To rebuild the RAG index, the user can use `/refresh-openmc-index`. You may offer to run this skill for them if it seems necessary. ## Additional OpenMC info From 8a2ed7245d6e62f6bb3199e923f7eb6d38713070 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:05:38 +0000 Subject: [PATCH 10/67] Remove hardcoded clangd-15 references from docs The LSP tool already searches for clangd dynamically (clangd, clangd-15, clangd-16, etc.). Docs should not pin a specific version. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 4 ++-- .claude/tools/lsp/openmc_lsp.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index e68d4df5ab6..07a33db09f1 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -46,7 +46,7 @@ Note: The repo map tool (`openmc_map.py`) does NOT need a pre-built index - it g The LSP tool (`openmc_lsp.py`) requires clangd and `compile_commands.json`. Check if they're available: ```bash -if command -v clangd-15 &>/dev/null || command -v clangd &>/dev/null; then +if command -v clangd &>/dev/null || compgen -c clangd- 2>/dev/null | head -1 | grep -q .; then echo "CLANGD_AVAILABLE" else echo "CLANGD_MISSING" @@ -58,7 +58,7 @@ else fi ``` -If clangd is missing, tell the user: "The LSP tool needs clangd. Install with `apt-get install clangd-15`." +If clangd is missing, tell the user: "The LSP tool needs clangd. Install with `apt-get install clangd` (or `clangd-15`, `clangd-16`, etc.)." 
If compile_commands.json is missing, tell the user: "The LSP tool needs compile_commands.json. Generate with `cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON`." The LSP tool is optional — the other two tools work without it. diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index a4c8a903cdf..ab9060a3fa4 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -7,7 +7,7 @@ C++ type system — no false edges from name collisions. Requires: - - clangd (apt-get install clangd-15, or similar) + - clangd (apt-get install clangd, or clangd-15/clangd-16/etc.) - compile_commands.json (cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON) Usage: From 7b38b26e159c77a8c9f290481b94c6d21b8bae78 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:07:09 +0000 Subject: [PATCH 11/67] Remove unnecessary 'LSP tool is optional' note from skill All three tools are independent; no need to single one out. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 07a33db09f1..cbcbf4bfe48 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -61,8 +61,6 @@ fi If clangd is missing, tell the user: "The LSP tool needs clangd. Install with `apt-get install clangd` (or `clangd-15`, `clangd-16`, etc.)." If compile_commands.json is missing, tell the user: "The LSP tool needs compile_commands.json. Generate with `cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON`." -The LSP tool is optional — the other two tools work without it. 
- ## Step 4: Learn the tool APIs Run `--help` for all tools to see their full APIs: From 862b651c1974a2f6ae6b62ac8f3fc3db9456a1a6 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:09:00 +0000 Subject: [PATCH 12/67] Clean up skill and CLAUDE.md docs for consistency Rewrite all three doc files with fresh eyes: - Remove redundant/prescriptive content from enable skill - Remove singling out of LSP tool as special/optional - Simplify clangd prerequisite handling (tool has its own error messages) - Update refresh skill to mention LSP also doesn't need refreshing - Simplify CLAUDE.md to avoid implying tools are either/or Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 78 ++++---------------- .claude/skills/refresh-openmc-index/SKILL.md | 5 +- CLAUDE.md | 17 ++--- 3 files changed, 25 insertions(+), 75 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index cbcbf4bfe48..f430e57bb76 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -1,19 +1,18 @@ --- name: enable-openmc-index -description: Enable the OpenMC codebase index for this session. Provides semantic code search, structural repo mapping, and LSP-based code navigation. Run this when investigating, modifying, or debugging OpenMC code. +description: Enable the OpenMC codebase tools for this session. Provides semantic code search, structural repo mapping, and LSP-based C++ code navigation. allowed-tools: Bash(*), Read --- # Enable OpenMC Index -Set up (if needed) and activate the OpenMC codebase index for this session. This gives you three tools: -1. **Semantic search** (`openmc_search.py`) - Find related code across the codebase by concept -2. **Structural map** (`openmc_map.py`) - See condensed code structure of files and their neighbors -3. 
**LSP navigation** (`openmc_lsp.py`) - Compiler-accurate go-to-definition, find-references, and related-file discovery for C++ code (requires clangd and compile_commands.json) +Set up (if needed) and activate the OpenMC codebase tools for this session: -## Step 1: Ensure the virtual environment exists +1. **Semantic search** (`openmc_search.py`) - Find related code by concept across C++, Python, and docs +2. **Structural map** (`openmc_map.py`) - Condensed code structure of files and their neighbors (C++ and Python) +3. **LSP navigation** (`openmc_lsp.py`) - Compiler-accurate definition, references, and related-file discovery (C++ only, requires clangd and compile_commands.json) -Check if `.claude/cache/.venv/` exists. If not, create it and install dependencies: +## Step 1: Ensure the virtual environment exists ```bash if [ ! -d .claude/cache/.venv ]; then @@ -27,7 +26,7 @@ fi ## Step 2: Ensure the RAG index exists -Check if `.claude/cache/rag_index/` exists. If not, build it: +The semantic search tool needs a pre-built vector index. The other two tools work without it. ```bash if [ ! -d .claude/cache/rag_index ]; then @@ -39,31 +38,9 @@ else fi ``` -Note: The repo map tool (`openmc_map.py`) does NOT need a pre-built index - it generates maps on the fly using tree-sitter. Only the RAG search needs the vector index. - -## Step 3: Check LSP tool prerequisites +## Step 3: Learn the tool APIs -The LSP tool (`openmc_lsp.py`) requires clangd and `compile_commands.json`. Check if they're available: - -```bash -if command -v clangd &>/dev/null || compgen -c clangd- 2>/dev/null | head -1 | grep -q .; then - echo "CLANGD_AVAILABLE" -else - echo "CLANGD_MISSING" -fi -if [ -f build/compile_commands.json ]; then - echo "COMPILE_COMMANDS_AVAILABLE" -else - echo "COMPILE_COMMANDS_MISSING" -fi -``` - -If clangd is missing, tell the user: "The LSP tool needs clangd. Install with `apt-get install clangd` (or `clangd-15`, `clangd-16`, etc.)." 
-If compile_commands.json is missing, tell the user: "The LSP tool needs compile_commands.json. Generate with `cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON`." - -## Step 4: Learn the tool APIs - -Run `--help` for all tools to see their full APIs: +Run `--help` for each tool to learn their full APIs: ```bash .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help @@ -73,39 +50,16 @@ Run `--help` for all tools to see their full APIs: Read and internalize the output so you know all available options. -## Step 5: Confirm activation - -Tell the user the OpenMC index is active and briefly describe the tools: -- **Semantic search**: Find related code by concept (e.g., "particle seed initialization") -- **Structural map**: See condensed code structure around specific files -- **LSP navigation** (if available): Compiler-accurate definition/references/related-files for C++ - -## How to use the tools after activation - -**Typical workflow:** - -1. Use `openmc_search.py` to discover which files are relevant to your task -2. Use `openmc_lsp.py related` to find files connected by real typed references (C++ only) -3. Use `openmc_map.py` on those files to see their code structure -4. Use Read/Grep to dive into the specific code you need to change +## Step 4: Confirm activation -**When to use semantic search** (`openmc_search.py`): -- Investigating how a change might affect other parts of the codebase -- Finding code that does something conceptually similar but with different naming -- Discovering cross-cutting concerns across run modes -- Searching Python code and documentation +Tell the user the tools are active. Mention if clangd or compile_commands.json +are missing (the LSP tool will report this itself if invoked without them). 
-**When to use LSP navigation** (`openmc_lsp.py`): -- Finding exactly where a C++ symbol is defined (`definition`) -- Finding all callers of a C++ function (`references`) -- Discovering which C++ files are connected by real typed references (`related`) -- Best for C++ — uses the compiler's own type system, zero false edges +## When to use each tool -**When to use the repo map** (`openmc_map.py`): -- Understanding the structure of unfamiliar files before modifying them -- Seeing what classes/methods neighbor the code you're working on -- Getting a condensed overview of a subsystem (pass multiple files) -- Works for both C++ and Python files +- **`openmc_search.py`**: Finding code by concept, discovering cross-cutting concerns, searching docs +- **`openmc_lsp.py`**: Go-to-definition, find-references, discovering which C++ files are connected by real typed references +- **`openmc_map.py`**: Seeing condensed code structure and class/function signatures of files you're about to modify ## Subagent guidance diff --git a/.claude/skills/refresh-openmc-index/SKILL.md b/.claude/skills/refresh-openmc-index/SKILL.md index b0be15d1a9c..a5d3b0ba86c 100644 --- a/.claude/skills/refresh-openmc-index/SKILL.md +++ b/.claude/skills/refresh-openmc-index/SKILL.md @@ -1,12 +1,13 @@ --- name: refresh-openmc-index -description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The repo map tool does not need refreshing (it generates maps on the fly). +description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The other tools (repo map, LSP) do not need refreshing. allowed-tools: Bash(*) --- # Refresh OpenMC Index -Rebuild the RAG vector index from scratch. Note: only the semantic search index needs refreshing. The repo map tool (`openmc_map.py`) always works on the current code. +Rebuild the RAG semantic search index from scratch. 
Only this index needs +refreshing — the repo map and LSP tools always work on the current code. ## Step 1: Ensure venv exists diff --git a/CLAUDE.md b/CLAUDE.md index 91550272de6..130826bb7f2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,23 +1,18 @@ -## OpenMC Codebase Index +## OpenMC Codebase Tools If the user asks you to investigate, modify, or debug OpenMC code, let them know about the `/enable-openmc-index` skill which provides semantic code search, structural repo mapping, and LSP-based C++ code navigation. Offer to run it. -Do NOT use the index tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) +Do NOT use the tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) unless `/enable-openmc-index` has been run in the current session. When using these tools, ALWAYS read their full output. Do NOT pipe through head, -tail, or grep. These tools are already sized to fit in context via `--top-k` and -`--tokens`. Truncating their output defeats their purpose. +tail, or grep. The tools already limit their output size via `--top-k` and +`--tokens` flags. Truncating their output defeats their purpose. -For C++ code navigation, prefer `openmc_lsp.py` over `openmc_map.py` when you -need to find definitions, references, or related files — it uses the compiler's -type system and has no false edges from name collisions. The LSP tool requires -clangd and `build/compile_commands.json` (generated by cmake). - -To rebuild the RAG index, the user can use `/refresh-openmc-index`. You may -offer to run this skill for them if it seems necessary. +To rebuild the RAG search index after pulling new code, the user can use +`/refresh-openmc-index`. 
## Additional OpenMC info From ba1bd091d3ca1b6649e97349afdfc351c8cac695 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:14:09 +0000 Subject: [PATCH 13/67] Add technical context about each tool's strengths and limitations Help agents make informed tool choices by explaining what each tool does under the hood: semantic search uses vector embeddings, the repo map uses tree-sitter name-matching (which creates false edges from common method names), and the LSP tool uses the C++ compiler's type system (zero false edges). Agents should know the repo map's ranking is unreliable for determining which files are truly connected. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 12 +++++----- CLAUDE.md | 25 +++++++++++++++++++-- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index f430e57bb76..f73e55bbd47 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -8,9 +8,9 @@ allowed-tools: Bash(*), Read Set up (if needed) and activate the OpenMC codebase tools for this session: -1. **Semantic search** (`openmc_search.py`) - Find related code by concept across C++, Python, and docs -2. **Structural map** (`openmc_map.py`) - Condensed code structure of files and their neighbors (C++ and Python) -3. **LSP navigation** (`openmc_lsp.py`) - Compiler-accurate definition, references, and related-file discovery (C++ only, requires clangd and compile_commands.json) +1. **`openmc_search.py`** — RAG semantic search. Embeds your query and searches a vector index. Good for finding conceptually related code even when naming differs. Covers C++, Python, and docs. +2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Shows condensed code skeletons of related files. Useful for seeing the shape of code before modifying it. 
Caveat: matches identifiers by name only, so common names like `push_back` or `__init__` create false connections. +3. **`openmc_lsp.py`** — LSP navigation via clangd. Compiler-accurate `definition`, `references`, and `related` commands for C++. Zero false edges. Requires clangd and compile_commands.json. ## Step 1: Ensure the virtual environment exists @@ -57,9 +57,9 @@ are missing (the LSP tool will report this itself if invoked without them). ## When to use each tool -- **`openmc_search.py`**: Finding code by concept, discovering cross-cutting concerns, searching docs -- **`openmc_lsp.py`**: Go-to-definition, find-references, discovering which C++ files are connected by real typed references -- **`openmc_map.py`**: Seeing condensed code structure and class/function signatures of files you're about to modify +- **`openmc_search.py`**: "What code is conceptually related to X?" — best for broad discovery, cross-cutting concerns, searching docs and Python code +- **`openmc_lsp.py`**: "What C++ files actually reference this symbol?" — best for precise C++ navigation, zero false positives +- **`openmc_map.py`**: "What does this file's code look like?" — best for seeing class/function signatures before modifying unfamiliar code. Be aware its file ranking has false edges from common method names ## Subagent guidance diff --git a/CLAUDE.md b/CLAUDE.md index 130826bb7f2..17a2bd726e6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,8 +1,8 @@ ## OpenMC Codebase Tools If the user asks you to investigate, modify, or debug OpenMC code, let them know -about the `/enable-openmc-index` skill which provides semantic code search, -structural repo mapping, and LSP-based C++ code navigation. Offer to run it. +about the `/enable-openmc-index` skill which provides three code navigation tools. +Offer to run it for them. Do NOT use the tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) unless `/enable-openmc-index` has been run in the current session. 
@@ -11,6 +11,27 @@ When using these tools, ALWAYS read their full output. Do NOT pipe through head, tail, or grep. The tools already limit their output size via `--top-k` and `--tokens` flags. Truncating their output defeats their purpose. +### Tool summary + +**`openmc_search.py`** — RAG semantic search. Embeds your query and searches a +vector index of the codebase. Good for finding conceptually related code even +when naming differs (e.g., "particle RNG seeding" finds code across transport, +restart, and random ray modes). Covers C++, Python, and docs. + +**`openmc_lsp.py`** — LSP navigation via clangd. Uses the C++ compiler's own +type system to resolve symbols. `definition`, `references`, and `related` +commands give compiler-accurate results with zero false edges. Use this when +you need to know exactly which files reference a C++ symbol or are connected +to a C++ file. Requires clangd and `build/compile_commands.json`. + +**`openmc_map.py`** — Structural repo map via aider/tree-sitter. Shows condensed +code skeletons (class/function signatures) of files related to your focus files. +Useful for seeing the shape of code before modifying it. **Caveat**: the +underlying graph matches identifiers by name only (tree-sitter has no type +information), so files defining common method names like `push_back`, `get`, +`__init__`, or `from_xml` may appear related when they are not. For finding +which files are truly connected to a C++ file, prefer `openmc_lsp.py related`. + To rebuild the RAG search index after pulling new code, the user can use `/refresh-openmc-index`. From 3692fce04748557b1ff923b124037ee6f6472e2a Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:16:19 +0000 Subject: [PATCH 14/67] Fix repo map description: it shows neighbors, not the focus file itself The aider repo map excludes focus files from output (assumes they're already in context) and shows condensed skeletons of neighboring files. 
The docs incorrectly described it as showing the focus file's code. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 4 ++-- CLAUDE.md | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index f73e55bbd47..b14a8d37000 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -9,7 +9,7 @@ allowed-tools: Bash(*), Read Set up (if needed) and activate the OpenMC codebase tools for this session: 1. **`openmc_search.py`** — RAG semantic search. Embeds your query and searches a vector index. Good for finding conceptually related code even when naming differs. Covers C++, Python, and docs. -2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Shows condensed code skeletons of related files. Useful for seeing the shape of code before modifying it. Caveat: matches identifiers by name only, so common names like `push_back` or `__init__` create false connections. +2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Given focus files you already have open, shows condensed code skeletons of their *neighbors* — other files that share symbols with them. Useful for seeing surrounding context. Caveat: matches identifiers by name only, so common names like `push_back` or `__init__` create false connections in the neighbor ranking. 3. **`openmc_lsp.py`** — LSP navigation via clangd. Compiler-accurate `definition`, `references`, and `related` commands for C++. Zero false edges. Requires clangd and compile_commands.json. ## Step 1: Ensure the virtual environment exists @@ -59,7 +59,7 @@ are missing (the LSP tool will report this itself if invoked without them). - **`openmc_search.py`**: "What code is conceptually related to X?" 
— best for broad discovery, cross-cutting concerns, searching docs and Python code - **`openmc_lsp.py`**: "What C++ files actually reference this symbol?" — best for precise C++ navigation, zero false positives -- **`openmc_map.py`**: "What does this file's code look like?" — best for seeing class/function signatures before modifying unfamiliar code. Be aware its file ranking has false edges from common method names +- **`openmc_map.py`**: "What other code surrounds the files I'm working on?" — shows condensed signatures of neighboring files. Be aware its neighbor ranking has false edges from common method names ## Subagent guidance diff --git a/CLAUDE.md b/CLAUDE.md index 17a2bd726e6..1c2da1f8438 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -24,13 +24,15 @@ commands give compiler-accurate results with zero false edges. Use this when you need to know exactly which files reference a C++ symbol or are connected to a C++ file. Requires clangd and `build/compile_commands.json`. -**`openmc_map.py`** — Structural repo map via aider/tree-sitter. Shows condensed -code skeletons (class/function signatures) of files related to your focus files. -Useful for seeing the shape of code before modifying it. **Caveat**: the +**`openmc_map.py`** — Structural repo map via aider/tree-sitter. Given focus files +you already have open, shows condensed code skeletons (class/function signatures) +of their *neighbors* — other files that share symbols with them. The focus files +themselves are excluded (you already have them in context). **Caveat**: the underlying graph matches identifiers by name only (tree-sitter has no type information), so files defining common method names like `push_back`, `get`, -`__init__`, or `from_xml` may appear related when they are not. For finding -which files are truly connected to a C++ file, prefer `openmc_lsp.py related`. +`__init__`, or `from_xml` may appear as neighbors when they are not truly +related. 
For finding which files are genuinely connected to a C++ file, prefer +`openmc_lsp.py related`. To rebuild the RAG search index after pulling new code, the user can use `/refresh-openmc-index`. From 8d825d62ae5582e981e045c54c73068daa168beb Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:18:17 +0000 Subject: [PATCH 15/67] Broaden LSP tool description to cover all its commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It does definition, references, symbols, and related — not just "what files reference this symbol." Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index b14a8d37000..adfaf85c846 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -58,7 +58,7 @@ are missing (the LSP tool will report this itself if invoked without them). ## When to use each tool - **`openmc_search.py`**: "What code is conceptually related to X?" — best for broad discovery, cross-cutting concerns, searching docs and Python code -- **`openmc_lsp.py`**: "What C++ files actually reference this symbol?" — best for precise C++ navigation, zero false positives +- **`openmc_lsp.py`**: "Where is this C++ symbol defined, who calls it, and what files are connected to this one?" — returns file:line locations with compiler accuracy, zero false positives - **`openmc_map.py`**: "What other code surrounds the files I'm working on?" — shows condensed signatures of neighboring files. 
Be aware its neighbor ranking has false edges from common method names ## Subagent guidance From e388cf999a42a18bde6a86b8e5e324fa96a9d949 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:20:51 +0000 Subject: [PATCH 16/67] Expand tool descriptions with how they actually work Explain the mechanics: RAG search uses sentence-transformers embeddings in LanceDB, the repo map uses a tree-sitter reference graph with PageRank fitted to a token budget, and the LSP tool talks to clangd's compiler frontend. Agents need this context to understand when to trust each tool's output and what its blind spots are. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 12 ++-- CLAUDE.md | 63 +++++++++++++-------- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index adfaf85c846..b4342fa99fb 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -8,9 +8,9 @@ allowed-tools: Bash(*), Read Set up (if needed) and activate the OpenMC codebase tools for this session: -1. **`openmc_search.py`** — RAG semantic search. Embeds your query and searches a vector index. Good for finding conceptually related code even when naming differs. Covers C++, Python, and docs. -2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Given focus files you already have open, shows condensed code skeletons of their *neighbors* — other files that share symbols with them. Useful for seeing surrounding context. Caveat: matches identifiers by name only, so common names like `push_back` or `__init__` create false connections in the neighbor ranking. -3. **`openmc_lsp.py`** — LSP navigation via clangd. Compiler-accurate `definition`, `references`, and `related` commands for C++. Zero false edges. Requires clangd and compile_commands.json. +1. **`openmc_search.py`** — RAG semantic search. 
Chunks code at function/class boundaries, embeds with sentence-transformers, searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. +2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Builds a cross-file reference graph, ranks files with PageRank relative to your focus files, then shows the top-ranked files as condensed code skeletons fitted to a token budget. Focus files are excluded (assumes you already have them). Caveat: the graph matches identifiers by name only — common names like `push_back` or `__init__` create false edges in the ranking. +3. **`openmc_lsp.py`** — LSP navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and compile_commands.json. ## Step 1: Ensure the virtual environment exists @@ -57,9 +57,9 @@ are missing (the LSP tool will report this itself if invoked without them). ## When to use each tool -- **`openmc_search.py`**: "What code is conceptually related to X?" — best for broad discovery, cross-cutting concerns, searching docs and Python code -- **`openmc_lsp.py`**: "Where is this C++ symbol defined, who calls it, and what files are connected to this one?" — returns file:line locations with compiler accuracy, zero false positives -- **`openmc_map.py`**: "What other code surrounds the files I'm working on?" — shows condensed signatures of neighboring files. Be aware its neighbor ranking has false edges from common method names +- **`openmc_search.py`**: "What code is conceptually related to X?" — broad discovery by meaning, cross-cutting concerns, Python and docs +- **`openmc_lsp.py`**: "Where is this C++ symbol defined, who calls it, and what files are truly connected to this one?" 
— compiler-accurate file:line locations, zero false positives +- **`openmc_map.py`**: "Show me the code structure of files neighboring my focus files" — PageRank-ranked code skeletons fitted to a token budget. Neighbor ranking is noisy for common identifiers; use `openmc_lsp.py related` for accurate C++ file connections ## Subagent guidance diff --git a/CLAUDE.md b/CLAUDE.md index 1c2da1f8438..74153402ac4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -11,32 +11,49 @@ When using these tools, ALWAYS read their full output. Do NOT pipe through head, tail, or grep. The tools already limit their output size via `--top-k` and `--tokens` flags. Truncating their output defeats their purpose. -### Tool summary - -**`openmc_search.py`** — RAG semantic search. Embeds your query and searches a -vector index of the codebase. Good for finding conceptually related code even -when naming differs (e.g., "particle RNG seeding" finds code across transport, -restart, and random ray modes). Covers C++, Python, and docs. - -**`openmc_lsp.py`** — LSP navigation via clangd. Uses the C++ compiler's own -type system to resolve symbols. `definition`, `references`, and `related` -commands give compiler-accurate results with zero false edges. Use this when -you need to know exactly which files reference a C++ symbol or are connected -to a C++ file. Requires clangd and `build/compile_commands.json`. - -**`openmc_map.py`** — Structural repo map via aider/tree-sitter. Given focus files -you already have open, shows condensed code skeletons (class/function signatures) -of their *neighbors* — other files that share symbols with them. The focus files -themselves are excluded (you already have them in context). **Caveat**: the -underlying graph matches identifiers by name only (tree-sitter has no type -information), so files defining common method names like `push_back`, `get`, -`__init__`, or `from_xml` may appear as neighbors when they are not truly -related. 
For finding which files are genuinely connected to a C++ file, prefer -`openmc_lsp.py related`. - To rebuild the RAG search index after pulling new code, the user can use `/refresh-openmc-index`. +### Tool details + +**`openmc_search.py`** — RAG semantic search. The codebase (C++, Python, and +RST docs) is chunked at function/class boundaries using tree-sitter, embedded +with sentence-transformers, and stored in a local LanceDB vector index. Your +query is embedded the same way, and the closest chunks are returned with file +paths, line numbers, and a code preview. Good for finding conceptually related +code even when naming differs (e.g., "particle RNG seeding" finds code across +transport, restart, and random ray modes). Returns `--top-k` results (default 10). + +**`openmc_lsp.py`** — LSP navigation via clangd. Launches clangd as a subprocess +and queries it via the Language Server Protocol. Because clangd uses the actual +C++ compiler frontend (Clang), it resolves every symbol through the real type +system — namespaces, templates, overloads, and all. Commands: +- `symbols FILE` — list all symbols defined in a file with their types and lines +- `definition FILE:LINE` — jump to where the symbol at that line is defined +- `references FILE:LINE` — find every file and line that references that symbol +- `related FILE` — for each symbol defined in the file, find all external + references, then rank other files by how many typed connections they share. + Returns `--top-k` files (default 15) with the connecting symbol names. + +Zero false edges — if it says two files are connected, they genuinely share +typed references. Requires clangd and `build/compile_commands.json` (generate +with `cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON`). + +**`openmc_map.py`** — Structural repo map via aider/tree-sitter. Tree-sitter +parses all C++ and Python source files to extract identifier definitions and +references. 
A cross-file reference graph is built (file A references a symbol +defined in file B → edge from A to B), then PageRank ranks files by importance +relative to your focus files. The top-ranked files are shown as condensed code +skeletons with class/function signatures and `⋮` elision markers, fitted to a +`--tokens` budget (default 2048). Focus files themselves are excluded from the +output (the assumption is you already have them in context). **Caveat**: the +reference graph matches identifiers by name only — tree-sitter has no type +information, so `std::vector::push_back` and `NeighborList::push_back` create +the same edges. This means files defining common method names (`push_back`, +`get`, `__init__`, `from_xml`, etc.) get inflated PageRank and appear as +neighbors when they may not be truly related. For determining which files are +genuinely connected to a C++ file, use `openmc_lsp.py related` instead. + ## Additional OpenMC info Read the FULL `AGENTS.md` in this directory also before starting work. From c37a4bd1c54e7a812d338c6d15e5fe80a68e32a3 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:28:32 +0000 Subject: [PATCH 17/67] Soften repo map vs LSP guidance to let agent decide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The repo map's name-matching can surface identically-named functions across unrelated subsystems that may need parallel changes — something the LSP tool would miss. Present the trade-off rather than prescribing one tool over the other. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 74153402ac4..a6fd0c5ccb4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -51,8 +51,11 @@ reference graph matches identifiers by name only — tree-sitter has no type information, so `std::vector::push_back` and `NeighborList::push_back` create the same edges. 
This means files defining common method names (`push_back`, `get`, `__init__`, `from_xml`, etc.) get inflated PageRank and appear as -neighbors when they may not be truly related. For determining which files are -genuinely connected to a C++ file, use `openmc_lsp.py related` instead. +neighbors when they may not be truly related. The name-matching can also be +useful — it surfaces files with identically-named functions that may need +parallel changes even though they have no typed connection. For precise C++ +file connections, `openmc_lsp.py related` is more reliable; the repo map is +better for a broad structural overview or for Python code. ## Additional OpenMC info From be13346ae0f8d2fa04e7498fb2fb09666e98a30c Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 15:58:07 +0000 Subject: [PATCH 18/67] Remove cmake flag instructions now that compile_commands.json is automatic OpenMC's CMakeLists.txt now enables CMAKE_EXPORT_COMPILE_COMMANDS by default, so compile_commands.json is generated automatically on every cmake build. Remove instructions telling users/agents to pass the flag. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 6 +++--- .claude/tools/lsp/openmc_lsp.py | 6 +++--- CLAUDE.md | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index b4342fa99fb..ebf9d71b0f2 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -10,7 +10,7 @@ Set up (if needed) and activate the OpenMC codebase tools for this session: 1. **`openmc_search.py`** — RAG semantic search. Chunks code at function/class boundaries, embeds with sentence-transformers, searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. 2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. 
Builds a cross-file reference graph, ranks files with PageRank relative to your focus files, then shows the top-ranked files as condensed code skeletons fitted to a token budget. Focus files are excluded (assumes you already have them). Caveat: the graph matches identifiers by name only — common names like `push_back` or `__init__` create false edges in the ranking. -3. **`openmc_lsp.py`** — LSP navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and compile_commands.json. +3. **`openmc_lsp.py`** — LSP navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). ## Step 1: Ensure the virtual environment exists @@ -52,8 +52,8 @@ Read and internalize the output so you know all available options. ## Step 4: Confirm activation -Tell the user the tools are active. Mention if clangd or compile_commands.json -are missing (the LSP tool will report this itself if invoked without them). +Tell the user the tools are active. If the LSP tool can't find clangd or +`build/compile_commands.json`, it will report this itself when invoked. ## When to use each tool diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index ab9060a3fa4..f873029c50e 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -8,7 +8,7 @@ Requires: - clangd (apt-get install clangd, or clangd-15/clangd-16/etc.) 
- - compile_commands.json (cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON) + - build/compile_commands.json (automatically generated when OpenMC is built with cmake) Usage: openmc_lsp.py symbols src/simulation.cpp @@ -61,8 +61,8 @@ def __init__(self, compile_commands_dir=None): if not compile_commands_dir: compile_commands_dir = self._find_compile_commands() if not compile_commands_dir: - print("ERROR: compile_commands.json not found. Generate with:\n" - " cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON", + print("ERROR: compile_commands.json not found. Build OpenMC with " + "cmake first (it generates this file automatically).", file=sys.stderr) sys.exit(1) diff --git a/CLAUDE.md b/CLAUDE.md index a6fd0c5ccb4..38c33f21df8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -36,8 +36,8 @@ system — namespaces, templates, overloads, and all. Commands: Returns `--top-k` files (default 15) with the connecting symbol names. Zero false edges — if it says two files are connected, they genuinely share -typed references. Requires clangd and `build/compile_commands.json` (generate -with `cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON`). +typed references. Requires clangd and `build/compile_commands.json` (automatically +generated when OpenMC is built with cmake). **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Tree-sitter parses all C++ and Python source files to extract identifier definitions and From 2fbc0b3750b89f968543590c882fdb381c2c6883 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 16:26:35 +0000 Subject: [PATCH 19/67] Remove dead code in indexer.py The first record-building loop (using slow `chunk in code_chunks` identity comparison) was replaced by an index-based approach but never deleted. 
Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/indexer.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index 8922b77d65d..de86420c48b 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -80,26 +80,7 @@ def build_index(): INDEX_DIR.mkdir(parents=True, exist_ok=True) db = lancedb.connect(str(INDEX_DIR)) - # Prepare records - code_records = [] - doc_records = [] - for chunk, emb in zip(all_chunks, all_embeddings): - record = { - "text": chunk["text"], - "filepath": chunk["filepath"], - "kind": chunk["kind"], - "symbol": chunk.get("symbol", ""), - "start_line": chunk.get("start_line", 0), - "end_line": chunk.get("end_line", 0), - "vector": emb, - } - if chunk in code_chunks: - code_records.append(record) - else: - doc_records.append(record) - - # The chunk identity comparison above is slow for large lists. - # Instead, use index-based separation. + # Separate code vs doc records by index (code_chunks come first in all_chunks) n_code = len(code_chunks) code_records = [] doc_records = [] From f0a9926a094651f19bef36aeec89cdc759bc42f9 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 16:58:39 +0000 Subject: [PATCH 20/67] Remove TF-IDF fallback from embedding provider The TF-IDF+SVD fallback was never exercised since sentence-transformers is in requirements.txt and always installs. Simplifies embeddings.py from 101 lines to 32 and removes the unnecessary ABC/factory pattern. 
Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/embeddings.py | 84 +++--------------------------- .claude/tools/rag/indexer.py | 4 +- .claude/tools/rag/openmc_search.py | 4 +- .claude/tools/requirements.txt | 5 +- 4 files changed, 10 insertions(+), 87 deletions(-) diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index 1f937a19f94..db02642ad97 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -1,59 +1,17 @@ -"""Embedding provider with auto-detection fallback chain. +"""Embedding provider using sentence-transformers (all-MiniLM-L6-v2). -1. sentence-transformers (all-MiniLM-L6-v2) - good quality, ~80MB model -2. TF-IDF + SVD - zero downloads, decent for code identifiers +Requires: pip install sentence-transformers """ import os -import sys -from abc import ABC, abstractmethod # Suppress noisy HuggingFace warnings about authentication os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") -class EmbeddingProvider(ABC): - """Abstract base for embedding providers.""" - - dim: int = 0 - - @abstractmethod - def embed(self, texts: list[str]) -> list[list[float]]: - """Embed a list of texts into vectors.""" - ... - - @abstractmethod - def embed_query(self, text: str) -> list[float]: - """Embed a single query text.""" - ... - - @staticmethod - def create(corpus_texts: list[str] | None = None) -> "EmbeddingProvider": - """Auto-detect best available embedding backend. - - Args: - corpus_texts: For TF-IDF fallback, the full corpus to fit on. - Not needed for sentence-transformers. - """ - # Try sentence-transformers first - try: - return SentenceTransformerProvider() - except (ImportError, Exception) as e: - print(f" sentence-transformers unavailable: {e}", file=sys.stderr) - - # Fall back to TF-IDF - if corpus_texts is None: - raise RuntimeError( - "No embedding provider available. 
Install sentence-transformers " - "or provide corpus_texts for TF-IDF fallback." - ) - print(" Using TF-IDF fallback embeddings", file=sys.stderr) - return TfidfProvider(corpus_texts) - - -class SentenceTransformerProvider(EmbeddingProvider): - """sentence-transformers with all-MiniLM-L6-v2.""" +class EmbeddingProvider: + """Sentence-transformers embedder using all-MiniLM-L6-v2.""" def __init__(self, model_name: str = "all-MiniLM-L6-v2"): from sentence_transformers import SentenceTransformer @@ -61,41 +19,11 @@ def __init__(self, model_name: str = "all-MiniLM-L6-v2"): self.dim = self.model.get_sentence_embedding_dimension() def embed(self, texts: list[str]) -> list[list[float]]: + """Embed a list of texts into vectors.""" embeddings = self.model.encode(texts, show_progress_bar=True, batch_size=64) return embeddings.tolist() def embed_query(self, text: str) -> list[float]: + """Embed a single query text.""" return self.model.encode([text])[0].tolist() - - -class TfidfProvider(EmbeddingProvider): - """TF-IDF vectors projected to dense via SVD. 
No model download needed.""" - - def __init__(self, corpus_texts: list[str], dim: int = 256): - from sklearn.decomposition import TruncatedSVD - from sklearn.feature_extraction.text import TfidfVectorizer - - self.dim = dim - self.vectorizer = TfidfVectorizer( - max_features=10000, - sublinear_tf=True, - token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z0-9_]{2,}\b", # Code identifiers - ) - tfidf_matrix = self.vectorizer.fit_transform(corpus_texts) - - # Project to dense using SVD - actual_dim = min(dim, tfidf_matrix.shape[1] - 1, tfidf_matrix.shape[0] - 1) - self.svd = TruncatedSVD(n_components=actual_dim) - self.svd.fit(tfidf_matrix) - self.dim = actual_dim - - def embed(self, texts: list[str]) -> list[list[float]]: - tfidf = self.vectorizer.transform(texts) - dense = self.svd.transform(tfidf) - return dense.tolist() - - def embed_query(self, text: str) -> list[float]: - tfidf = self.vectorizer.transform([text]) - dense = self.svd.transform(tfidf) - return dense[0].tolist() diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index de86420c48b..e501771368a 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -70,8 +70,8 @@ def build_index(): # Create embeddings all_texts = [c["text"] for c in all_chunks] print("Creating embedding provider...") - embedder = EmbeddingProvider.create(corpus_texts=all_texts) - print(f" Using {embedder.__class__.__name__} (dim={embedder.dim})") + embedder = EmbeddingProvider() + print(f" dim={embedder.dim}") print("Embedding chunks...") all_embeddings = embedder.embed(all_texts) diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 1fe10d2818e..4029ee50af8 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -39,9 +39,7 @@ def get_db_and_embedder(): db = lancedb.connect(str(INDEX_DIR)) - # For query embedding, we need the same type of embedder used for indexing. - # Try sentence-transformers first (matches indexing default). 
- embedder = EmbeddingProvider.create() + embedder = EmbeddingProvider() return db, embedder diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index ec6c251da68..c1d399a7fc0 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -10,12 +10,9 @@ tree-sitter-cpp>=0.23.0 lancedb>=0.15.0 pyarrow>=14.0.0 -# Embeddings (primary - local, no API key) +# Embeddings (local, no API key) sentence-transformers>=2.7.0 -# Embeddings (fallback - zero downloads) -scikit-learn>=1.4.0 - # LSP client for clangd-based code navigation pygls>=2.0.0 From fb71f38e659eae574d804b80a17d3a8c17433050 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 18:03:10 +0000 Subject: [PATCH 21/67] Replace AST chunking with fixed-size overlapping windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop tree-sitter-based function/class chunking in favor of simple sliding windows (1000 chars, 25% overlap). This ensures every line of code is searchable — long functions no longer have their tails invisible to the embedding model. Removes tree-sitter, tree-sitter-python, and tree-sitter-cpp dependencies. Index builds in ~5 min on 10 cores. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 4 +- .claude/skills/refresh-openmc-index/SKILL.md | 2 +- .claude/tools/rag/chunker.py | 276 +++++-------------- .claude/tools/requirements.txt | 5 - CLAUDE.md | 13 +- 5 files changed, 75 insertions(+), 225 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index ebf9d71b0f2..183ab5dbae0 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -8,7 +8,7 @@ allowed-tools: Bash(*), Read Set up (if needed) and activate the OpenMC codebase tools for this session: -1. **`openmc_search.py`** — RAG semantic search. 
Chunks code at function/class boundaries, embeds with sentence-transformers, searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. +1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds with sentence-transformers, searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. 2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Builds a cross-file reference graph, ranks files with PageRank relative to your focus files, then shows the top-ranked files as condensed code skeletons fitted to a token budget. Focus files are excluded (assumes you already have them). Caveat: the graph matches identifiers by name only — common names like `push_back` or `__init__` create false edges in the ranking. 3. **`openmc_lsp.py`** — LSP navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). @@ -30,7 +30,7 @@ The semantic search tool needs a pre-built vector index. The other two tools wor ```bash if [ ! -d .claude/cache/rag_index ]; then - echo "Building RAG index for the first time (this takes ~3 minutes)..." + echo "Building RAG index for the first time (takes ~5 minutes on 10 CPU cores)..." HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py echo "INDEX_BUILT" else diff --git a/.claude/skills/refresh-openmc-index/SKILL.md b/.claude/skills/refresh-openmc-index/SKILL.md index a5d3b0ba86c..aefebae581f 100644 --- a/.claude/skills/refresh-openmc-index/SKILL.md +++ b/.claude/skills/refresh-openmc-index/SKILL.md @@ -21,7 +21,7 @@ fi ## Step 2: Rebuild the RAG index ```bash -echo "Rebuilding RAG index (this takes ~3 minutes)..." 
+echo "Rebuilding RAG index (takes ~5 minutes on 10 CPU cores)..." HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py ``` diff --git a/.claude/tools/rag/chunker.py b/.claude/tools/rag/chunker.py index 6717f5863ad..da70c1461ff 100644 --- a/.claude/tools/rag/chunker.py +++ b/.claude/tools/rag/chunker.py @@ -1,241 +1,95 @@ """Chunk OpenMC source files and documentation for RAG indexing. -Code files are chunked at the function/class level using tree-sitter. -RST documentation is chunked by section headers. +Uses fixed-size overlapping windows so every line of code is searchable. +Window size is tuned to fit within the MiniLM embedding model's 256-token +context (~1000 chars). 50% overlap ensures no content falls between chunks. """ -import re from pathlib import Path -import tree_sitter_cpp as tscpp -import tree_sitter_python as tspy -from tree_sitter import Language, Parser - -CPP_LANG = Language(tscpp.language()) -PY_LANG = Language(tspy.language()) - -cpp_parser = Parser(CPP_LANG) -py_parser = Parser(PY_LANG) - -MAX_CHUNK_CHARS = 1500 +# ~256 tokens for MiniLM. 1 token ≈ 4 chars for code. 
+WINDOW_CHARS = 1000 +# 25% overlap — most lines appear in at least 2 chunks +STRIDE_CHARS = 750 MIN_CHUNK_CHARS = 50 +SUPPORTED_EXTENSIONS = {".cpp", ".h", ".py", ".rst"} + def chunk_file(filepath, openmc_root): - """Chunk a single file based on its extension.""" + """Chunk a single file into overlapping fixed-size windows.""" filepath = Path(filepath) + if filepath.suffix not in SUPPORTED_EXTENSIONS: + return [] + rel = str(filepath.relative_to(openmc_root)) try: content = filepath.read_text(errors="replace") except Exception: return [] - if filepath.suffix in (".cpp", ".h"): - return _chunk_cpp(rel, content) - elif filepath.suffix == ".py": - return _chunk_python(rel, content) - elif filepath.suffix == ".rst": - return _chunk_rst(rel, content) - return [] - - -def _chunk_cpp(rel_path, content): - """Extract function and class-level chunks from C++ code.""" - tree = cpp_parser.parse(content.encode()) - chunks = [] - used_ranges = [] - - def _extract_node(node, kind_override=None): - text = content[node.start_byte:node.end_byte] - if len(text) < MIN_CHUNK_CHARS: - return - # Extract symbol name - name = _get_node_name(node) - kind = kind_override or node.type - for sub in _split_if_large(text): - chunks.append({ - "text": sub, - "filepath": rel_path, - "kind": kind, - "symbol": name or "", - "start_line": node.start_point[0] + 1, - "end_line": node.end_point[0] + 1, - }) - used_ranges.append((node.start_byte, node.end_byte)) - - def _visit(node): - if node.type in ( - "function_definition", "class_specifier", - "struct_specifier", "enum_specifier", - ): - _extract_node(node) - elif node.type == "namespace_definition": - # Visit children inside namespaces - for child in node.children: - _visit(child) - elif node.type == "declaration_list": - for child in node.children: - _visit(child) - else: - for child in node.children: - if child.type in ( - "function_definition", "class_specifier", - "struct_specifier", "namespace_definition", - ): - _visit(child) - - for child 
in tree.root_node.children: - _visit(child) - - # Add file header (includes, forward declarations) as a separate chunk - header_lines = [] - for line in content.split("\n")[:50]: - if line.strip().startswith("#include") or line.strip().startswith("namespace") \ - or line.strip().startswith("//") or line.strip().startswith("using") \ - or line.strip() == "": - header_lines.append(line) - else: - break - header = "\n".join(header_lines).strip() - if len(header) >= MIN_CHUNK_CHARS: - chunks.append({ - "text": header, - "filepath": rel_path, - "kind": "file_header", - "symbol": Path(rel_path).name, - "start_line": 1, - "end_line": len(header_lines), - }) - - return chunks - - -def _chunk_python(rel_path, content): - """Extract function and class-level chunks from Python code.""" - tree = py_parser.parse(content.encode()) - chunks = [] - - for node in tree.root_node.children: - if node.type in ("class_definition", "function_definition"): - text = content[node.start_byte:node.end_byte] - if len(text) < MIN_CHUNK_CHARS: - continue - name_node = node.child_by_field_name("name") - name = name_node.text.decode() if name_node else "" - for sub in _split_if_large(text): - chunks.append({ - "text": sub, - "filepath": rel_path, - "kind": node.type.replace("_definition", ""), - "symbol": name, - "start_line": node.start_point[0] + 1, - "end_line": node.end_point[0] + 1, - }) - - # Module-level docstring + imports as header - header_lines = [] - for line in content.split("\n")[:40]: - stripped = line.strip() - if stripped.startswith(("import ", "from ", "#", '"""', "'''", "")) \ - or stripped == "": - header_lines.append(line) - elif stripped.startswith(("def ", "class ")): - break - else: - header_lines.append(line) - header = "\n".join(header_lines).strip() - if len(header) >= MIN_CHUNK_CHARS: - chunks.append({ - "text": header, - "filepath": rel_path, - "kind": "file_header", - "symbol": Path(rel_path).name, - "start_line": 1, - "end_line": len(header_lines), - }) + if 
len(content) < MIN_CHUNK_CHARS: + return [] - return chunks + kind = _file_kind(filepath) + lines = content.split("\n") + # Build a char-offset → line-number map + line_starts = [] + offset = 0 + for line in lines: + line_starts.append(offset) + offset += len(line) + 1 # +1 for newline -def _chunk_rst(rel_path, content): - """Chunk RST documentation by section headers.""" - # RST sections are indicated by underlines of =, -, ~, ^, etc. - section_pattern = re.compile( - r'^(.+)\n([=\-~^"+]+)\s*$', re.MULTILINE - ) chunks = [] - - # Find all section positions - positions = [0] - for m in section_pattern.finditer(content): - # The section title starts at the beginning of the title line - positions.append(m.start()) - positions.append(len(content)) - - for i in range(len(positions) - 1): - section = content[positions[i]:positions[i + 1]].strip() - if len(section) < MIN_CHUNK_CHARS: - continue - # Extract title - title_match = section_pattern.match(section) - title = title_match.group(1).strip() if title_match else "" - start_line = content[:positions[i]].count("\n") + 1 - end_line = content[:positions[i + 1]].count("\n") + 1 - for sub in _split_if_large(section): + start = 0 + while start < len(content): + end = min(start + WINDOW_CHARS, len(content)) + + # Snap end to a line boundary to avoid splitting mid-line + if end < len(content): + newline_pos = content.rfind("\n", start, end) + if newline_pos > start: + end = newline_pos + 1 + + text = content[start:end].strip() + if len(text) >= MIN_CHUNK_CHARS: + start_line = _offset_to_line(line_starts, start) + end_line = _offset_to_line(line_starts, end - 1) chunks.append({ - "text": sub, - "filepath": rel_path, - "kind": "doc_section", - "symbol": title, + "text": text, + "filepath": rel, + "kind": kind, + "symbol": "", "start_line": start_line, "end_line": end_line, }) + start += STRIDE_CHARS + return chunks -def _get_node_name(node): - """Extract the name from a tree-sitter node.""" - name_node = 
node.child_by_field_name("name") - if name_node: - return name_node.text.decode() - # For function_definition, check declarator - decl = node.child_by_field_name("declarator") - if decl: - # Walk down to find the identifier - while decl.type not in ("identifier", "qualified_identifier", - "field_identifier", "destructor_name"): - found = False - for child in decl.children: - if child.type in ("function_declarator", "identifier", - "qualified_identifier", "field_identifier", - "destructor_name", "template_function"): - decl = child - found = True - break - if not found: - break - return decl.text.decode() - return "" - - -def _split_if_large(text, max_chars=MAX_CHUNK_CHARS): - """Split text into chunks if it exceeds max_chars.""" - if len(text) <= max_chars: - return [text] - # Split on line boundaries - lines = text.split("\n") - chunks = [] - current = [] - current_len = 0 - for line in lines: - if current_len + len(line) + 1 > max_chars and current: - chunks.append("\n".join(current)) - current = [line] - current_len = len(line) +def _file_kind(filepath): + """Map file extension to a kind label.""" + ext = filepath.suffix + if ext in (".cpp", ".h"): + return "cpp" + elif ext == ".py": + return "py" + elif ext == ".rst": + return "doc" + return "other" + + +def _offset_to_line(line_starts, offset): + """Convert a character offset to a 1-based line number.""" + # Binary search for the line containing this offset + lo, hi = 0, len(line_starts) - 1 + while lo < hi: + mid = (lo + hi + 1) // 2 + if line_starts[mid] <= offset: + lo = mid else: - current.append(line) - current_len += len(line) + 1 - if current: - chunks.append("\n".join(current)) - return chunks + hi = mid - 1 + return lo + 1 # 1-based diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index c1d399a7fc0..fe9e62206a5 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -1,11 +1,6 @@ # Repo map (uses aider's tree-sitter based RepoMap) 
aider-chat>=0.80.0 -# Tree-sitter for RAG code chunking -tree-sitter>=0.23.0 -tree-sitter-python>=0.23.0 -tree-sitter-cpp>=0.23.0 - # Vector database lancedb>=0.15.0 pyarrow>=14.0.0 diff --git a/CLAUDE.md b/CLAUDE.md index 38c33f21df8..0670e3b2cd3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -17,12 +17,13 @@ To rebuild the RAG search index after pulling new code, the user can use ### Tool details **`openmc_search.py`** — RAG semantic search. The codebase (C++, Python, and -RST docs) is chunked at function/class boundaries using tree-sitter, embedded -with sentence-transformers, and stored in a local LanceDB vector index. Your -query is embedded the same way, and the closest chunks are returned with file -paths, line numbers, and a code preview. Good for finding conceptually related -code even when naming differs (e.g., "particle RNG seeding" finds code across -transport, restart, and random ray modes). Returns `--top-k` results (default 10). +RST docs) is chunked into overlapping fixed-size windows (~1000 chars, 25% +overlap) so every line of code is searchable. Chunks are embedded with +sentence-transformers and stored in a local LanceDB vector index. Your query is +embedded the same way, and the closest chunks are returned with file paths, line +numbers, and a code preview. Good for finding conceptually related code even +when naming differs (e.g., "particle RNG seeding" finds code across transport, +restart, and random ray modes). Returns `--top-k` results (default 10). **`openmc_lsp.py`** — LSP navigation via clangd. Launches clangd as a subprocess and queries it via the Language Server Protocol. 
Because clangd uses the actual From 974f904326b4e9095730197647e30b5a94c467ce Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 18:04:13 +0000 Subject: [PATCH 22/67] Add index build time estimate to CLAUDE.md offer text Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0670e3b2cd3..096a839d9ce 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,8 @@ If the user asks you to investigate, modify, or debug OpenMC code, let them know about the `/enable-openmc-index` skill which provides three code navigation tools. -Offer to run it for them. +Offer to run it for them. Note: the first run builds a RAG vector index, which +takes ~5 minutes on 10 CPU cores. Subsequent sessions reuse the cached index. Do NOT use the tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) unless `/enable-openmc-index` has been run in the current session. From 9a6b47e0ce9ceef7b6095521b83cb5d61ec62734 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 20:48:04 +0000 Subject: [PATCH 23/67] Suppress noisy HuggingFace output during model loading Use the official transformers API (TRANSFORMERS_VERBOSITY=error, transformers.logging.disable_progress_bar()) and HuggingFace Hub settings (token=False, local_files_only=True) to suppress load reports, auth warnings, and weight-loading progress bars. Embedding progress bars during indexing are preserved (show_progress_bar=True on .encode() uses sentence-transformers' own progress bar). 
Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/embeddings.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index db02642ad97..b701b6fd5a6 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -5,17 +5,27 @@ import os -# Suppress noisy HuggingFace warnings about authentication +# Official HuggingFace/transformers knobs for quiet operation. +# TRANSFORMERS_VERBOSITY: controls transformers' own logging (load reports, etc.) +# HF_HUB_DISABLE_TELEMETRY: don't phone home +# TOKENIZERS_PARALLELISM: avoids fork-safety warning +# All must be set before importing transformers. +os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +import transformers +transformers.logging.disable_progress_bar() + class EmbeddingProvider: """Sentence-transformers embedder using all-MiniLM-L6-v2.""" def __init__(self, model_name: str = "all-MiniLM-L6-v2"): from sentence_transformers import SentenceTransformer - self.model = SentenceTransformer(model_name) + self.model = SentenceTransformer( + model_name, local_files_only=True, token=False + ) self.dim = self.model.get_sentence_embedding_dimension() def embed(self, texts: list[str]) -> list[list[float]]: From 462654aea2fc50328ac0a80aa94232b3fcd55638 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 20:56:41 +0000 Subject: [PATCH 24/67] Fix stale docs and remove unused dependencies - chunker.py docstring: said 50% overlap, actually 25% - indexer.py: remove unused pyarrow import - requirements.txt: remove pygls (LSP tool uses raw JSON-RPC), pyarrow (transitive dep of lancedb), numpy (transitive dep of sentence-transformers) Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/chunker.py | 3 ++- .claude/tools/rag/indexer.py | 1 - .claude/tools/requirements.txt | 7 ------- 3 files 
changed, 2 insertions(+), 9 deletions(-) diff --git a/.claude/tools/rag/chunker.py b/.claude/tools/rag/chunker.py index da70c1461ff..ba6ff09d06a 100644 --- a/.claude/tools/rag/chunker.py +++ b/.claude/tools/rag/chunker.py @@ -2,7 +2,8 @@ Uses fixed-size overlapping windows so every line of code is searchable. Window size is tuned to fit within the MiniLM embedding model's 256-token -context (~1000 chars). 50% overlap ensures no content falls between chunks. +context (~1000 chars). 25% overlap ensures most content appears in at least +two chunks. """ from pathlib import Path diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index e501771368a..17ac391d585 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -15,7 +15,6 @@ sys.path.insert(0, str(TOOLS_DIR / "rag")) import lancedb -import pyarrow as pa from chunker import chunk_file from embeddings import EmbeddingProvider diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index fe9e62206a5..7b99a305cbe 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -3,13 +3,6 @@ aider-chat>=0.80.0 # Vector database lancedb>=0.15.0 -pyarrow>=14.0.0 # Embeddings (local, no API key) sentence-transformers>=2.7.0 - -# LSP client for clangd-based code navigation -pygls>=2.0.0 - -# Utilities -numpy>=1.26.0 From faf1e08c6188029746c579bcf73955aa17c56f05 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 21:01:52 +0000 Subject: [PATCH 25/67] Add embedding model details and expand LSP acronym in skill docs Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 183ab5dbae0..6c5a596a09f 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -8,9 +8,9 @@ allowed-tools: Bash(*), Read 
Set up (if needed) and activate the OpenMC codebase tools for this session: -1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds with sentence-transformers, searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. +1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds with the all-MiniLM-L6-v2 model (22M parameters), and searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. 2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Builds a cross-file reference graph, ranks files with PageRank relative to your focus files, then shows the top-ranked files as condensed code skeletons fitted to a token budget. Focus files are excluded (assumes you already have them). Caveat: the graph matches identifiers by name only — common names like `push_back` or `__init__` create false edges in the ranking. -3. **`openmc_lsp.py`** — LSP navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). +3. **`openmc_lsp.py`** — LSP (Language Server Protocol) navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). 
## Step 1: Ensure the virtual environment exists From d29a39e28cea48176a01bd37c8863d85c8d3ca07 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 21:02:16 +0000 Subject: [PATCH 26/67] Note that RAG embedding runs locally on CPU with no GPU or API key Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 6c5a596a09f..2b8119d74e6 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -8,7 +8,7 @@ allowed-tools: Bash(*), Read Set up (if needed) and activate the OpenMC codebase tools for this session: -1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds with the all-MiniLM-L6-v2 model (22M parameters), and searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. +1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds locally on CPU with the all-MiniLM-L6-v2 model (22M parameters, no GPU or API key needed), and searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. 2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. Builds a cross-file reference graph, ranks files with PageRank relative to your focus files, then shows the top-ranked files as condensed code skeletons fitted to a token budget. Focus files are excluded (assumes you already have them). Caveat: the graph matches identifiers by name only — common names like `push_back` or `__init__` create false edges in the ranking. 3. **`openmc_lsp.py`** — LSP (Language Server Protocol) navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. 
`definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). From 68451e51ab5f9c95110ae57125de215fdb6c98a5 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 21:28:03 +0000 Subject: [PATCH 27/67] Broaden tool activation trigger to include PR reviews A diff-only review misses cross-file impacts. The tools (especially LSP references and RAG search) help reviewers understand what else in the codebase depends on or is affected by the changed code. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 096a839d9ce..09b172ee7a7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,9 +1,12 @@ ## OpenMC Codebase Tools -If the user asks you to investigate, modify, or debug OpenMC code, let them know -about the `/enable-openmc-index` skill which provides three code navigation tools. -Offer to run it for them. Note: the first run builds a RAG vector index, which -takes ~5 minutes on 10 CPU cores. Subsequent sessions reuse the cached index. +For any task involving OpenMC code — investigating, modifying, debugging, or +reviewing PRs — let the user know about the `/enable-openmc-index` skill which +provides three code navigation tools. Offer to run it for them. Even for PR +reviews, these tools are important: a diff only shows what changed, not what +else in the codebase depends on or is affected by those changes. Note: the +first run builds a RAG vector index, which takes ~5 minutes on 10 CPU cores. +Subsequent sessions reuse the cached index. Do NOT use the tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) unless `/enable-openmc-index` has been run in the current session. 
From 71f794e049c214abed223061b54e6e87a97ff0d9 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 21:53:31 +0000 Subject: [PATCH 28/67] Add RAG demo step and 'use before grep' guidance The agent's trained instinct is to reach for grep/Read, which only finds exact text matches. Add a mandatory demo step in the skill that shows the RAG tool finding cross-cutting results grep would miss, and add guidance in CLAUDE.md to use RAG search before grep when exploring unfamiliar code or checking change impact. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 27 +++++++++++++++++++-- CLAUDE.md | 9 +++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 2b8119d74e6..d9ae5774b29 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -50,14 +50,37 @@ Run `--help` for each tool to learn their full APIs: Read and internalize the output so you know all available options. -## Step 4: Confirm activation +## Step 4: Demonstrate the RAG tool to yourself + +Run this search and read the results carefully: + +```bash +.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "particle weight adjustment variance reduction" +``` + +Notice how the results span multiple files across different subsystems (weight +windows, random ray, variance reduction) — files you would never find with +`grep "particle weight"` because they use different terminology for related +concepts. This is the key advantage: **the RAG search finds code by meaning, +not by exact text match.** When you `grep` for `init_particle`, you miss +`initialize_seeds` which does the same thing with a different name. The RAG +search finds both. + +Your trained instinct is to reach for `grep` and `Read` — those are excellent +tools, but they are blind to semantic relationships. 
Throughout this session, +**use `openmc_search.py` as your first step when exploring unfamiliar code or +investigating how a change might affect other parts of the codebase.** Follow +up with `grep`/`Read` to drill into the specific files it surfaces. Think of +it as: RAG search for discovery, grep/Read for precision. + +## Step 5: Confirm activation Tell the user the tools are active. If the LSP tool can't find clangd or `build/compile_commands.json`, it will report this itself when invoked. ## When to use each tool -- **`openmc_search.py`**: "What code is conceptually related to X?" — broad discovery by meaning, cross-cutting concerns, Python and docs +- **`openmc_search.py`**: "What code is conceptually related to X?" — broad discovery by meaning, cross-cutting concerns, Python and docs. **Use this before grep when exploring unfamiliar code or checking what a change might affect.** - **`openmc_lsp.py`**: "Where is this C++ symbol defined, who calls it, and what files are truly connected to this one?" — compiler-accurate file:line locations, zero false positives - **`openmc_map.py`**: "Show me the code structure of files neighboring my focus files" — PageRank-ranked code skeletons fitted to a token budget. Neighbor ranking is noisy for common identifiers; use `openmc_lsp.py related` for accurate C++ file connections diff --git a/CLAUDE.md b/CLAUDE.md index 09b172ee7a7..432ef5cf8f5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -18,6 +18,15 @@ tail, or grep. The tools already limit their output size via `--top-k` and To rebuild the RAG search index after pulling new code, the user can use `/refresh-openmc-index`. +### Important: use RAG search before grep + +When exploring unfamiliar code or checking what a change might affect, use +`openmc_search.py` **before** reaching for `grep` or `Glob`. `grep` only finds +exact text matches — it cannot find code that does the same thing with different +naming. 
The RAG search finds code by semantic meaning, surfacing related code +across subsystems that you would otherwise miss entirely. Use RAG for discovery, +then `grep`/`Read` to drill into the specific files it surfaces. + ### Tool details **`openmc_search.py`** — RAG semantic search. The codebase (C++, Python, and From 7ac4b1f9b265489e056b45d1e2aec6d507ba42b1 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 22:26:23 +0000 Subject: [PATCH 29/67] Clarify RAG vs grep guidance: discovery vs precision Based on real-world feedback from a PR review session where the agent felt overly compelled to use RAG search for exact symbol lookups that grep handles better. RAG is for semantic discovery across subsystems; grep is for precise symbol tracing. Don't force one when the other is the right tool. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 13 ++++++++----- CLAUDE.md | 4 ++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index d9ae5774b29..debffafa831 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -67,11 +67,14 @@ not by exact text match.** When you `grep` for `init_particle`, you miss search finds both. Your trained instinct is to reach for `grep` and `Read` — those are excellent -tools, but they are blind to semantic relationships. Throughout this session, -**use `openmc_search.py` as your first step when exploring unfamiliar code or -investigating how a change might affect other parts of the codebase.** Follow -up with `grep`/`Read` to drill into the specific files it surfaces. Think of -it as: RAG search for discovery, grep/Read for precision. +tools for precise symbol lookup, but they are blind to semantic relationships. 
+Use each tool for what it's best at: +- **RAG search**: discovery, exploration, "what else in the codebase relates to + this concept?", checking what a change might affect across subsystems +- **grep/Read**: precise symbol tracing, "every line that writes to `variable_x`" + +Don't force RAG searches for exact symbol lookups, and don't rely on grep alone +for broad exploration. ## Step 5: Confirm activation diff --git a/CLAUDE.md b/CLAUDE.md index 432ef5cf8f5..2e957996c0e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,6 +27,10 @@ naming. The RAG search finds code by semantic meaning, surfacing related code across subsystems that you would otherwise miss entirely. Use RAG for discovery, then `grep`/`Read` to drill into the specific files it surfaces. +When you already know the exact symbol name and need to trace its usage (e.g., +"every line that writes to `progeny_per_particle`"), `grep` is the right tool +— don't force a RAG search for precise symbol lookups. + ### Tool details **`openmc_search.py`** — RAG semantic search. The codebase (C++, Python, and From 45695d3f802055d8fa44482bdd2175480db532d6 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 02:21:05 +0000 Subject: [PATCH 30/67] Fix LSP references/definition bug and add LSP demo step to skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The references and definition commands used first-non-whitespace position when no column was given, landing on the return type (e.g., `int`) instead of the function name (e.g., `openmc_run`). Added find_symbol_on_line() which uses clangd's document symbols to locate the actual symbol name, with a keyword-skipping fallback for lines without symbol definitions. Added Step 5 to the enable-openmc-index skill demonstrating LSP's type-accurate references using Tally::reset() — where grep returns 62 mixed hits across 20 files but LSP resolves exactly the 10 files that reference this specific class's reset(). 
Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 25 ++++++- .claude/tools/lsp/openmc_lsp.py | 74 ++++++++++++++++----- 2 files changed, 82 insertions(+), 17 deletions(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index debffafa831..f746ff44780 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -76,7 +76,30 @@ Use each tool for what it's best at: Don't force RAG searches for exact symbol lookups, and don't rely on grep alone for broad exploration. -## Step 5: Confirm activation +## Step 5: Demonstrate the LSP tool to yourself + +Run this references query and read the results carefully: + +```bash +.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py references src/tallies/tally.cpp:835 +``` + +Line 835 is `Tally::reset()`. The LSP tool uses the C++ compiler frontend to +resolve this — it returns only references to **this specific** `reset()`, not +the other 3 classes that also define `void reset()` (Timer, ParticleData, +SharedArray). Compare with `grep 'reset()'` which returns 62 mixed hits across +20 files including vendor code. The LSP tool gives you the exact 10 files that +call or reference `Tally::reset()`, with line numbers — zero false positives. + +This is why the LSP tool exists: **`grep` matches text, LSP resolves types.** +When a common method name like `reset`, `get`, `size`, or `create` is used by +multiple classes, `grep` gives you a haystack. LSP gives you the needle. + +Use each tool for what it's best at: +- **LSP**: "who calls *this specific* C++ method?" — type-accurate references +- **grep**: "every line containing this unique string" — fast exact text match + +## Step 6: Confirm activation Tell the user the tools are active. If the LSP tool can't find clangd or `build/compile_commands.json`, it will report this itself when invoked. 
diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index f873029c50e..eff2086d8dd 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -255,18 +255,63 @@ def cmd_symbols(client, filepath): print(f"{indent}{kind_name}: {sym['name']} (line {line + 1})") +def find_symbol_on_line(client, filepath, line_1based): + """Find the character position of the symbol name on a given line. + + Uses clangd's document symbols to identify what symbol is defined or + referenced on the target line, then locates the symbol name in the + line text. Falls back to the first identifier on the line if no + symbol matches. + + Returns (character_0based, line_text) or (None, None) if the line + doesn't exist. + """ + fpath = Path(filepath) + if not fpath.is_absolute(): + fpath = OPENMC_ROOT / fpath + file_lines = fpath.read_text().split('\n') + line_0 = line_1based - 1 + if line_0 < 0 or line_0 >= len(file_lines): + return None, None + text = file_lines[line_0] + + # Try to find a symbol defined on this line via document symbols + symbols = client.get_symbols(filepath) + flat = flatten_symbols(symbols) + for sym, _depth in flat: + start = get_symbol_range(sym) + if start['line'] == line_0: + col = text.find(sym['name'], start['character']) + if col >= 0: + return col, text + + # No symbol definition on this line — find the first identifier + # (skip leading whitespace and common return types/keywords) + import re + # Find all C++ identifiers on the line + for m in re.finditer(r'[A-Za-z_]\w*', text): + # Skip common C++ keywords and types that aren't useful to look up + if m.group() not in { + 'void', 'int', 'double', 'float', 'char', 'bool', 'long', + 'short', 'unsigned', 'signed', 'const', 'static', 'virtual', + 'inline', 'extern', 'auto', 'return', 'if', 'else', 'for', + 'while', 'do', 'switch', 'case', 'break', 'continue', + 'struct', 'class', 'enum', 'namespace', 'using', 'typedef', + 'template', 'typename', 'public', 'private', 
'protected', + 'override', 'final', 'explicit', 'noexcept', 'constexpr', + }: + return m.start(), text + # Last resort: first non-whitespace + return len(text) - len(text.lstrip()), text + + def cmd_definition(client, filepath, line, character=None): """Find the definition of a symbol.""" if character is None: - # Find first non-whitespace identifier on the line - fpath = Path(filepath) - if not fpath.is_absolute(): - fpath = OPENMC_ROOT / fpath - lines = fpath.read_text().split('\n') - if line - 1 < len(lines): - text = lines[line - 1] - # Skip leading whitespace - character = len(text) - len(text.lstrip()) + character, _ = find_symbol_on_line(client, filepath, line) + if character is None: + print("Could not determine symbol on that line.") + return result = client.get_definition(filepath, line - 1, character) if not result: @@ -284,13 +329,10 @@ def cmd_definition(client, filepath, line, character=None): def cmd_references(client, filepath, line, character=None): """Find all references to a symbol.""" if character is None: - fpath = Path(filepath) - if not fpath.is_absolute(): - fpath = OPENMC_ROOT / fpath - lines = fpath.read_text().split('\n') - if line - 1 < len(lines): - text = lines[line - 1] - character = len(text) - len(text.lstrip()) + character, _ = find_symbol_on_line(client, filepath, line) + if character is None: + print("Could not determine symbol on that line.") + return result = client.get_references(filepath, line - 1, character) if not result: From e784601c11eaeede3edeb42a8a604266d45626cd Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 02:31:13 +0000 Subject: [PATCH 31/67] Remove aider/tree-sitter repo map tool (zero demonstrable utility) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The repo map tool matched identifiers by name only, causing the same ~6 noise files (endf.h, neighbor_list.h, particle_type.h, source_region.h, mcpl_interface.cpp) to dominate results for every query via 
common names like push_back, str, and operator(). Tested ~10 different focus files across Python and C++ — no combination produced useful results. RAG search and LSP cover the intended use cases better: RAG for broad semantic discovery, LSP for precise C++ symbol resolution. Removes aider-chat dependency (and its transitive tree-sitter deps), openmc_map.py, and all references in CLAUDE.md and skill files. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 10 +- .claude/skills/refresh-openmc-index/SKILL.md | 4 +- .claude/tools/lsp/openmc_lsp.py | 7 +- .claude/tools/repomap/openmc_map.py | 181 ------------------- .claude/tools/requirements.txt | 3 - CLAUDE.md | 28 +-- 6 files changed, 13 insertions(+), 220 deletions(-) delete mode 100644 .claude/tools/repomap/openmc_map.py diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index f746ff44780..09c723006c4 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -1,6 +1,6 @@ --- name: enable-openmc-index -description: Enable the OpenMC codebase tools for this session. Provides semantic code search, structural repo mapping, and LSP-based C++ code navigation. +description: Enable the OpenMC codebase tools for this session. Provides semantic code search and LSP-based C++ code navigation. allowed-tools: Bash(*), Read --- @@ -9,8 +9,7 @@ allowed-tools: Bash(*), Read Set up (if needed) and activate the OpenMC codebase tools for this session: 1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds locally on CPU with the all-MiniLM-L6-v2 model (22M parameters, no GPU or API key needed), and searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. -2. **`openmc_map.py`** — Structural repo map via aider/tree-sitter. 
Builds a cross-file reference graph, ranks files with PageRank relative to your focus files, then shows the top-ranked files as condensed code skeletons fitted to a token budget. Focus files are excluded (assumes you already have them). Caveat: the graph matches identifiers by name only — common names like `push_back` or `__init__` create false edges in the ranking. -3. **`openmc_lsp.py`** — LSP (Language Server Protocol) navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). +2. **`openmc_lsp.py`** — LSP (Language Server Protocol) navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). ## Step 1: Ensure the virtual environment exists @@ -26,7 +25,7 @@ fi ## Step 2: Ensure the RAG index exists -The semantic search tool needs a pre-built vector index. The other two tools work without it. +The semantic search tool needs a pre-built vector index. The LSP tool works without it. ```bash if [ ! -d .claude/cache/rag_index ]; then @@ -44,7 +43,6 @@ Run `--help` for each tool to learn their full APIs: ```bash .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help -.claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help ``` @@ -108,7 +106,6 @@ Tell the user the tools are active. If the LSP tool can't find clangd or - **`openmc_search.py`**: "What code is conceptually related to X?" — broad discovery by meaning, cross-cutting concerns, Python and docs. 
**Use this before grep when exploring unfamiliar code or checking what a change might affect.** - **`openmc_lsp.py`**: "Where is this C++ symbol defined, who calls it, and what files are truly connected to this one?" — compiler-accurate file:line locations, zero false positives -- **`openmc_map.py`**: "Show me the code structure of files neighboring my focus files" — PageRank-ranked code skeletons fitted to a token budget. Neighbor ranking is noisy for common identifiers; use `openmc_lsp.py related` for accurate C++ file connections ## Subagent guidance @@ -116,5 +113,4 @@ When spawning subagents that will investigate or modify OpenMC code, include in "The OpenMC code index is available. Run `--help` on these tools to see their full API: - `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help` (semantic search) -- `.claude/cache/.venv/bin/python .claude/tools/repomap/openmc_map.py --help` (structural map) - `.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help` (LSP navigation, C++ only)" diff --git a/.claude/skills/refresh-openmc-index/SKILL.md b/.claude/skills/refresh-openmc-index/SKILL.md index aefebae581f..451d3056cc9 100644 --- a/.claude/skills/refresh-openmc-index/SKILL.md +++ b/.claude/skills/refresh-openmc-index/SKILL.md @@ -1,13 +1,13 @@ --- name: refresh-openmc-index -description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The other tools (repo map, LSP) do not need refreshing. +description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The LSP tool does not need refreshing. allowed-tools: Bash(*) --- # Refresh OpenMC Index Rebuild the RAG semantic search index from scratch. Only this index needs -refreshing — the repo map and LSP tools always work on the current code. +refreshing — the LSP tool always works on the current code. 
## Step 1: Ensure venv exists diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index eff2086d8dd..45d78113a84 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -27,6 +27,7 @@ import argparse import json import os +import re import shutil import subprocess import sys @@ -285,10 +286,8 @@ def find_symbol_on_line(client, filepath, line_1based): if col >= 0: return col, text - # No symbol definition on this line — find the first identifier - # (skip leading whitespace and common return types/keywords) - import re - # Find all C++ identifiers on the line + # No symbol definition on this line — find the first meaningful identifier + # (skip C++ keywords and types that aren't useful to look up) for m in re.finditer(r'[A-Za-z_]\w*', text): # Skip common C++ keywords and types that aren't useful to look up if m.group() not in { diff --git a/.claude/tools/repomap/openmc_map.py b/.claude/tools/repomap/openmc_map.py deleted file mode 100644 index b94876c5a8b..00000000000 --- a/.claude/tools/repomap/openmc_map.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -"""Generate a focused repo map around specific OpenMC files. - -Uses aider's RepoMap to produce a condensed structural overview of the -codebase, ranked by relevance to the files you're currently working on. 
- -Usage: - openmc_map.py src/particle.cpp # Map around one file - openmc_map.py src/simulation.cpp src/source.cpp # Map around multiple files - openmc_map.py --tokens 4096 # Larger map (default: 2048) - openmc_map.py # Map of the whole repo (top-ranked files) - -Examples: - openmc_map.py src/particle_restart.cpp src/random_lcg.cpp - openmc_map.py openmc/deplete/coupled_operator.py --tokens 4096 - openmc_map.py include/openmc/cell.h include/openmc/surface.h -""" - -import argparse -import glob -import os -import sys -from pathlib import Path - -OPENMC_ROOT = Path(__file__).resolve().parents[3] - -# File patterns to include in the map -FILE_PATTERNS = [ - "src/**/*.cpp", - "include/openmc/**/*.h", - "openmc/**/*.py", -] - -# Ubiquitous utility files that clutter the map without providing useful -# structural context. Passed as chat_fnames so aider excludes them from -# the output (it assumes they're "already in context"). -SUPPRESS_FILES = [ - "include/openmc/error.h", - "src/error.cpp", - "include/openmc/position.h", - "include/openmc/constants.h", - "include/openmc/span.h", - "include/openmc/tensor.h", - "openmc/checkvalue.py", -] - - -class TokenCounter: - """Simple token counter that doesn't need an API model.""" - - def token_count(self, text): - # Rough approximation: ~4 chars per token for code - return len(text) // 4 - - -class FakeModel: - """Minimal model stand-in for aider's RepoMap token counting.""" - - def __init__(self): - self._token_counter = TokenCounter() - - def token_count(self, text): - return self._token_counter.token_count(text) - - -def get_all_files(): - """Collect all source files matching our patterns.""" - files = [] - for pattern in FILE_PATTERNS: - for fp in sorted(OPENMC_ROOT.glob(pattern)): - if "__pycache__" in str(fp): - continue - files.append(str(fp.relative_to(OPENMC_ROOT))) - return files - - -def generate_map(focus_files=None, map_tokens=2048): - """Generate a repo map, optionally focused on specific files. 
- - Args: - focus_files: List of file paths to focus on. If None, generates - a general overview of the most important files. - map_tokens: Approximate token budget for the map. - - Returns: - The repo map as a string. - """ - from aider.io import InputOutput - from aider.repomap import RepoMap - - os.chdir(OPENMC_ROOT) - - # Suppress aider's "not a terminal" warning - devnull = open(os.devnull, "w") - io = InputOutput(yes=True, pretty=False, user_input_color=None, - tool_output_color=None, tool_warning_color=None, - tool_error_color=None) - model = FakeModel() - - rm = RepoMap( - map_tokens=map_tokens, - root=str(OPENMC_ROOT), - io=io, - main_model=model, - ) - - all_files = get_all_files() - - # Normalize focus files to relative paths - chat_fnames = [] - if focus_files: - for f in focus_files: - # Handle both absolute and relative paths - fp = Path(f) - if fp.is_absolute(): - try: - fp = fp.relative_to(OPENMC_ROOT) - except ValueError: - pass - rel = str(fp) - if rel in all_files: - chat_fnames.append(rel) - else: - # Try to find a match - matches = [af for af in all_files if rel in af] - if matches: - chat_fnames.extend(matches) - else: - print(f"Warning: '{f}' not found in indexed files", - file=sys.stderr) - - # chat_fnames = files the agent is focused on (already in context). - # Aider shows their neighbors/dependencies, not the files themselves, - # since the agent already has those open. - # Also suppress ubiquitous utility files that waste token budget. - suppress = [f for f in SUPPRESS_FILES if f in all_files and f not in chat_fnames] - chat_fnames_with_suppress = chat_fnames + suppress - other_fnames = [f for f in all_files if f not in chat_fnames_with_suppress] - repo_map = rm.get_repo_map(chat_fnames_with_suppress, other_fnames) - - if not repo_map: - return "No map generated. Try with different files or a larger --tokens budget." 
- - return repo_map - - -def main(): - parser = argparse.ArgumentParser( - description="Generate a focused structural map of OpenMC code", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog="""examples: - %(prog)s src/particle_restart.cpp src/random_lcg.cpp - %(prog)s openmc/deplete/coupled_operator.py --tokens 4096 - %(prog)s include/openmc/cell.h include/openmc/surface.h - %(prog)s # overview of whole repo""", - ) - parser.add_argument( - "files", nargs="*", - help="Files to focus the map on (shows their structure and neighbors)") - parser.add_argument( - "--tokens", type=int, default=2048, - help="Approximate token budget for the map (default: 2048)") - - args = parser.parse_args() - - # Redirect stderr to suppress aider's noisy warnings - import io as _io - old_stderr = sys.stderr - sys.stderr = _io.StringIO() - try: - repo_map = generate_map( - focus_files=args.files if args.files else None, - map_tokens=args.tokens, - ) - finally: - sys.stderr = old_stderr - print(repo_map) - - -if __name__ == "__main__": - main() diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index 7b99a305cbe..d51930a25bc 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -1,6 +1,3 @@ -# Repo map (uses aider's tree-sitter based RepoMap) -aider-chat>=0.80.0 - # Vector database lancedb>=0.15.0 diff --git a/CLAUDE.md b/CLAUDE.md index 2e957996c0e..351a08741dc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,18 +2,18 @@ For any task involving OpenMC code — investigating, modifying, debugging, or reviewing PRs — let the user know about the `/enable-openmc-index` skill which -provides three code navigation tools. Offer to run it for them. Even for PR +provides two code navigation tools. Offer to run it for them. Even for PR reviews, these tools are important: a diff only shows what changed, not what else in the codebase depends on or is affected by those changes. 
Note: the first run builds a RAG vector index, which takes ~5 minutes on 10 CPU cores. Subsequent sessions reuse the cached index. -Do NOT use the tools (`openmc_search.py`, `openmc_map.py`, `openmc_lsp.py`) -unless `/enable-openmc-index` has been run in the current session. +Do NOT use the tools (`openmc_search.py`, `openmc_lsp.py`) unless +`/enable-openmc-index` has been run in the current session. When using these tools, ALWAYS read their full output. Do NOT pipe through head, -tail, or grep. The tools already limit their output size via `--top-k` and -`--tokens` flags. Truncating their output defeats their purpose. +tail, or grep. The tools already limit their output size via `--top-k`. +Truncating their output defeats their purpose. To rebuild the RAG search index after pulling new code, the user can use `/refresh-openmc-index`. @@ -57,24 +57,6 @@ Zero false edges — if it says two files are connected, they genuinely share typed references. Requires clangd and `build/compile_commands.json` (automatically generated when OpenMC is built with cmake). -**`openmc_map.py`** — Structural repo map via aider/tree-sitter. Tree-sitter -parses all C++ and Python source files to extract identifier definitions and -references. A cross-file reference graph is built (file A references a symbol -defined in file B → edge from A to B), then PageRank ranks files by importance -relative to your focus files. The top-ranked files are shown as condensed code -skeletons with class/function signatures and `⋮` elision markers, fitted to a -`--tokens` budget (default 2048). Focus files themselves are excluded from the -output (the assumption is you already have them in context). **Caveat**: the -reference graph matches identifiers by name only — tree-sitter has no type -information, so `std::vector::push_back` and `NeighborList::push_back` create -the same edges. This means files defining common method names (`push_back`, -`get`, `__init__`, `from_xml`, etc.) 
get inflated PageRank and appear as -neighbors when they may not be truly related. The name-matching can also be -useful — it surfaces files with identically-named functions that may need -parallel changes even though they have no typed connection. For precise C++ -file connections, `openmc_lsp.py related` is more reliable; the repo map is -better for a broad structural overview or for Python code. - ## Additional OpenMC info Read the FULL `AGENTS.md` in this directory also before starting work. From 5538b18b57077612b731f88496e7938e09c4f717 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 20:56:02 -0600 Subject: [PATCH 32/67] Revise OpenMC codebase tools usage instructions Updated guidance on using the /enable-openmc-index skill for OpenMC code tasks, emphasizing its importance before any work. Clarified the role of RAG search in code reviews and feature creation. --- CLAUDE.md | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 351a08741dc..ac8030a5479 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,12 +1,14 @@ ## OpenMC Codebase Tools -For any task involving OpenMC code — investigating, modifying, debugging, or -reviewing PRs — let the user know about the `/enable-openmc-index` skill which -provides two code navigation tools. Offer to run it for them. Even for PR -reviews, these tools are important: a diff only shows what changed, not what -else in the codebase depends on or is affected by those changes. Note: the -first run builds a RAG vector index, which takes ~5 minutes on 10 CPU cores. -Subsequent sessions reuse the cached index. +For any task involving OpenMC code — investigating, modifying, debugging, +reviewing code, implementing new features, or examinging diffs/PRs — you must let +the user know about the `/enable-openmc-index` skill which +provides two code navigation tools. 
You must altert the user of this skill before +doing any work (e.g., opening a file, reading a diff, making a plan, etc). +Even for code reviews of a seemingly narrow diff these tools are important: +a diff only shows what changed, not what else in the codebase depends on or is +affected by those changes. Note: the first run builds a RAG vector index, which +takes ~5 minutes on 10 CPU cores. Subsequent sessions reuse the cached index. Do NOT use the tools (`openmc_search.py`, `openmc_lsp.py`) unless `/enable-openmc-index` has been run in the current session. @@ -22,14 +24,20 @@ To rebuild the RAG search index after pulling new code, the user can use When exploring unfamiliar code or checking what a change might affect, use `openmc_search.py` **before** reaching for `grep` or `Glob`. `grep` only finds -exact text matches — it cannot find code that does the same thing with different +exact text matches — it cannot find code that does something similar with different naming. The RAG search finds code by semantic meaning, surfacing related code across subsystems that you would otherwise miss entirely. Use RAG for discovery, -then `grep`/`Read` to drill into the specific files it surfaces. +then `grep`/`Read` to drill into specific files. If the `/enable-openmc-index` skill +has been invoked, then **no code review or feature creation job +in OpenMC is be considered complete unless you have made good use of the RAG tool to ensure +any logic changes do not cause collateral damage elsewhere in the codebase.** The ideal +time for calling it is before you begin to make changes or start zooming in on small local details. +Thinking up a few RAG queries before starting analysis/editing work in each area +will greatly improve your global vision of the repository and help you to "know what you don't know". 
When you already know the exact symbol name and need to trace its usage (e.g., -"every line that writes to `progeny_per_particle`"), `grep` is the right tool -— don't force a RAG search for precise symbol lookups. +"every line that writes to `progeny_per_particle`"), `grep` or the lsp navigation tool +are better choices — you don't have to force a RAG search for precise symbol lookups. ### Tool details From 47468868d325d4813491dfbd9ac2282206c90edd Mon Sep 17 00:00:00 2001 From: John Tramm Date: Thu, 5 Mar 2026 20:57:25 -0600 Subject: [PATCH 33/67] Update CLAUDE.md with RAG tool usage guidance Clarify the use of RAG tool for code review and discovery. --- CLAUDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ac8030a5479..7512803abdb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -25,11 +25,11 @@ To rebuild the RAG search index after pulling new code, the user can use When exploring unfamiliar code or checking what a change might affect, use `openmc_search.py` **before** reaching for `grep` or `Glob`. `grep` only finds exact text matches — it cannot find code that does something similar with different -naming. The RAG search finds code by semantic meaning, surfacing related code +naming. You don't know what you don't know. The RAG search finds code by semantic meaning, surfacing related code across subsystems that you would otherwise miss entirely. Use RAG for discovery, then `grep`/`Read` to drill into specific files. If the `/enable-openmc-index` skill has been invoked, then **no code review or feature creation job -in OpenMC is be considered complete unless you have made good use of the RAG tool to ensure +in OpenMC is to be considered complete unless you have made good use of the RAG tool to ensure any logic changes do not cause collateral damage elsewhere in the codebase.** The ideal time for calling it is before you begin to make changes or start zooming in on small local details. 
Thinking up a few RAG queries before starting analysis/editing work in each area From b1f964f5aad598d17baca1e29e7fa0b8af055126 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 18:10:27 +0000 Subject: [PATCH 34/67] Add agent post-mortem testimonial to skill for global awareness A real agent review found 1 of 11 bugs by only reading the diff. Its own post-mortem analysis is included as a cautionary testimonial to encourage using RAG throughout work for global codebase awareness, not just local diff analysis. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/enable-openmc-index/SKILL.md | 30 ++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/enable-openmc-index/SKILL.md index 09c723006c4..26277496128 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/enable-openmc-index/SKILL.md @@ -97,7 +97,35 @@ Use each tool for what it's best at: - **LSP**: "who calls *this specific* C++ method?" — type-accurate references - **grep**: "every line containing this unique string" — fast exact text match -## Step 6: Confirm activation +## Step 6: Learn from previous review failures + +An agent reviewed a large OpenMC PR using only diff, grep, and Read. It found +1 of 11 serious bugs. Here is its own post-mortem analysis of what went wrong: + +> **I treated the diff as a closed system.** I verified internal consistency of +> the changed code obsessively, but never built a global understanding of how +> the changed code fits into the wider codebase. The diff altered assumptions +> that code elsewhere silently relied on — but I couldn't see that because I +> never looked beyond the diff. I couldn't see the forest for the trees. +> +> **Why I resisted RAG:** Overconfidence. My internal model was "I can see the +> diff, I understand the data structures, I can trace the logic." The diff felt +> self-contained. 
RAG felt like it would return noisy results about tangentially +> related code. But in a codebase this large, changes in one subsystem can +> quietly break invariants that distant subsystems depend on — and you need +> global awareness to foresee that. +> +> **In the post-mortem**, I re-ran the RAG queries I should have run during the +> review. They directly surfaced the files containing the bugs I missed — files +> I never thought to open because they weren't in the diff. + +The takeaway: **use RAG throughout your work to maintain global awareness.** +Before diving into details, ask "what else in this codebase depends on the +behavior being changed?" As you explore each area, keep querying to build your +mental map of affected subsystems. The diff tells you *what* changed; RAG tells +you *what else cares*. + +## Step 7: Confirm activation Tell the user the tools are active. If the LSP tool can't find clangd or `build/compile_commands.json`, it will report this itself when invoked. From d55a16872e098ca04482c12d3c4da85a88899c7c Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 18:46:15 +0000 Subject: [PATCH 35/67] Rename skills to openmc- prefix for grouped tab completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit enable-openmc-index → openmc-enable-index refresh-openmc-index → openmc-refresh-index Typing /openmc in the terminal now shows all OpenMC skills together. 
Co-Authored-By: Claude Opus 4.6 --- .../{enable-openmc-index => openmc-enable-index}/SKILL.md | 2 +- .../SKILL.md | 2 +- .claude/tools/rag/openmc_search.py | 2 +- CLAUDE.md | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) rename .claude/skills/{enable-openmc-index => openmc-enable-index}/SKILL.md (99%) rename .claude/skills/{refresh-openmc-index => openmc-refresh-index}/SKILL.md (96%) diff --git a/.claude/skills/enable-openmc-index/SKILL.md b/.claude/skills/openmc-enable-index/SKILL.md similarity index 99% rename from .claude/skills/enable-openmc-index/SKILL.md rename to .claude/skills/openmc-enable-index/SKILL.md index 26277496128..a3ad8f48898 100644 --- a/.claude/skills/enable-openmc-index/SKILL.md +++ b/.claude/skills/openmc-enable-index/SKILL.md @@ -1,5 +1,5 @@ --- -name: enable-openmc-index +name: openmc-enable-index description: Enable the OpenMC codebase tools for this session. Provides semantic code search and LSP-based C++ code navigation. allowed-tools: Bash(*), Read --- diff --git a/.claude/skills/refresh-openmc-index/SKILL.md b/.claude/skills/openmc-refresh-index/SKILL.md similarity index 96% rename from .claude/skills/refresh-openmc-index/SKILL.md rename to .claude/skills/openmc-refresh-index/SKILL.md index 451d3056cc9..3f17762f453 100644 --- a/.claude/skills/refresh-openmc-index/SKILL.md +++ b/.claude/skills/openmc-refresh-index/SKILL.md @@ -1,5 +1,5 @@ --- -name: refresh-openmc-index +name: openmc-refresh-index description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The LSP tool does not need refreshing. allowed-tools: Bash(*) --- diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 4029ee50af8..26294f3cb22 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -33,7 +33,7 @@ def get_db_and_embedder(): from embeddings import EmbeddingProvider if not INDEX_DIR.exists(): - print("ERROR: No index found. 
Run /enable-openmc-index first.", + print("ERROR: No index found. Run /openmc-enable-index first.", file=sys.stderr) sys.exit(1) diff --git a/CLAUDE.md b/CLAUDE.md index 7512803abdb..cb6dbc93fac 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ For any task involving OpenMC code — investigating, modifying, debugging, reviewing code, implementing new features, or examinging diffs/PRs — you must let -the user know about the `/enable-openmc-index` skill which +the user know about the `/openmc-enable-index` skill which provides two code navigation tools. You must altert the user of this skill before doing any work (e.g., opening a file, reading a diff, making a plan, etc). Even for code reviews of a seemingly narrow diff these tools are important: @@ -11,14 +11,14 @@ affected by those changes. Note: the first run builds a RAG vector index, which takes ~5 minutes on 10 CPU cores. Subsequent sessions reuse the cached index. Do NOT use the tools (`openmc_search.py`, `openmc_lsp.py`) unless -`/enable-openmc-index` has been run in the current session. +`/openmc-enable-index` has been run in the current session. When using these tools, ALWAYS read their full output. Do NOT pipe through head, tail, or grep. The tools already limit their output size via `--top-k`. Truncating their output defeats their purpose. To rebuild the RAG search index after pulling new code, the user can use -`/refresh-openmc-index`. +`/openmc-refresh-index`. ### Important: use RAG search before grep @@ -27,7 +27,7 @@ When exploring unfamiliar code or checking what a change might affect, use exact text matches — it cannot find code that does something similar with different naming. You don't know what you don't know. The RAG search finds code by semantic meaning, surfacing related code across subsystems that you would otherwise miss entirely. Use RAG for discovery, -then `grep`/`Read` to drill into specific files. If the `/enable-openmc-index` skill +then `grep`/`Read` to drill into specific files. 
If the `/openmc-enable-index` skill has been invoked, then **no code review or feature creation job in OpenMC is to be considered complete unless you have made good use of the RAG tool to ensure any logic changes do not cause collateral damage elsewhere in the codebase.** The ideal From 83799d97d56574b834615fd422f2cd57c142ce3e Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 18:58:17 +0000 Subject: [PATCH 36/67] Make LSP demo gracefully skip when clangd or compile_commands.json missing Guard both the --help and references demo with checks for clangd and compile_commands.json (in build/ or repo root). Diagnostic output shows which prerequisite is missing. Also dynamically find Tally::reset() line number instead of hardcoding it, and use relative language for reference counts so the text doesn't go stale. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/openmc-enable-index/SKILL.md | 40 ++++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/.claude/skills/openmc-enable-index/SKILL.md b/.claude/skills/openmc-enable-index/SKILL.md index a3ad8f48898..03ad94abb92 100644 --- a/.claude/skills/openmc-enable-index/SKILL.md +++ b/.claude/skills/openmc-enable-index/SKILL.md @@ -43,7 +43,16 @@ Run `--help` for each tool to learn their full APIs: ```bash .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help -.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help +# LSP tool requires clangd and compile_commands.json +HAS_CLANGD=false; HAS_COMPDB=false +(which clangd || which clangd-15 || which clangd-16 || which clangd-17 || which clangd-18) >/dev/null 2>&1 && HAS_CLANGD=true +([ -f build/compile_commands.json ] || [ -f compile_commands.json ]) && HAS_COMPDB=true +if $HAS_CLANGD && $HAS_COMPDB; then + .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help + echo "LSP_AVAILABLE" +else + echo "LSP_UNAVAILABLE (need clangd [$HAS_CLANGD] and compile_commands.json [$HAS_COMPDB])" +fi ``` 
Read and internalize the output so you know all available options. @@ -76,18 +85,31 @@ for broad exploration. ## Step 5: Demonstrate the LSP tool to yourself -Run this references query and read the results carefully: +Skip this step if Step 3 printed `LSP_UNAVAILABLE`. ```bash -.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py references src/tallies/tally.cpp:835 +HAS_CLANGD=false; HAS_COMPDB=false +(which clangd || which clangd-15 || which clangd-16 || which clangd-17 || which clangd-18) >/dev/null 2>&1 && HAS_CLANGD=true +([ -f build/compile_commands.json ] || [ -f compile_commands.json ]) && HAS_COMPDB=true +if $HAS_CLANGD && $HAS_COMPDB; then + LINE=$(grep -n "^void Tally::reset()" src/tallies/tally.cpp | head -1 | cut -d: -f1) + if [ -n "$LINE" ]; then + .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py references "src/tallies/tally.cpp:$LINE" + else + echo "SKIPPED: Could not find Tally::reset() in src/tallies/tally.cpp" + fi +else + echo "SKIPPED: LSP demo requires clangd [$HAS_CLANGD] and compile_commands.json [$HAS_COMPDB]" +fi ``` -Line 835 is `Tally::reset()`. The LSP tool uses the C++ compiler frontend to -resolve this — it returns only references to **this specific** `reset()`, not -the other 3 classes that also define `void reset()` (Timer, ParticleData, -SharedArray). Compare with `grep 'reset()'` which returns 62 mixed hits across -20 files including vendor code. The LSP tool gives you the exact 10 files that -call or reference `Tally::reset()`, with line numbers — zero false positives. +If it ran, the output shows references to `Tally::reset()`. The LSP tool uses +the C++ compiler frontend to resolve this — it returns only references to +**this specific** `reset()`, not the other classes that also define +`void reset()`. Compare with `grep 'reset()'` which returns dozens of mixed +hits across many files including vendor code. 
The LSP tool gives you only the +files that actually reference `Tally::reset()`, with line numbers — zero false +positives. This is why the LSP tool exists: **`grep` matches text, LSP resolves types.** When a common method name like `reset`, `get`, `size`, or `create` is used by From 488d99982d518b267a8ec8cb6dd1355b04814425 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 19:02:30 +0000 Subject: [PATCH 37/67] Fix typos, first-run model download, and empty symbol in search output - Fix "examinging" and "altert" typos in CLAUDE.md - Remove local_files_only=True so embedding model can download on first run - Remove dead symbol field from RAG search output (was always empty) - Remove redundant "skip this step" text from LSP demo - Wrap long line in CLAUDE.md Co-Authored-By: Claude Opus 4.6 --- .claude/skills/openmc-enable-index/SKILL.md | 2 -- .claude/tools/rag/embeddings.py | 4 +--- .claude/tools/rag/openmc_search.py | 5 +---- CLAUDE.md | 9 +++++---- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.claude/skills/openmc-enable-index/SKILL.md b/.claude/skills/openmc-enable-index/SKILL.md index 03ad94abb92..6c8b2597c07 100644 --- a/.claude/skills/openmc-enable-index/SKILL.md +++ b/.claude/skills/openmc-enable-index/SKILL.md @@ -85,8 +85,6 @@ for broad exploration. ## Step 5: Demonstrate the LSP tool to yourself -Skip this step if Step 3 printed `LSP_UNAVAILABLE`. 
- ```bash HAS_CLANGD=false; HAS_COMPDB=false (which clangd || which clangd-15 || which clangd-16 || which clangd-17 || which clangd-18) >/dev/null 2>&1 && HAS_CLANGD=true diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index b701b6fd5a6..068a097d9e9 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -23,9 +23,7 @@ class EmbeddingProvider: def __init__(self, model_name: str = "all-MiniLM-L6-v2"): from sentence_transformers import SentenceTransformer - self.model = SentenceTransformer( - model_name, local_files_only=True, token=False - ) + self.model = SentenceTransformer(model_name, token=False) self.dim = self.model.get_sentence_embedding_dimension() def embed(self, texts: list[str]) -> list[list[float]]: diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 26294f3cb22..aa6283870e7 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -72,10 +72,7 @@ def format_results(results, label=""): symbol = r["symbol"] dist = r.get("_distance", 0) - header = f"[{i}] {filepath}:{start}-{end} ({kind}" - if symbol: - header += f": {symbol}" - header += f", dist={dist:.3f})" + header = f"[{i}] {filepath}:{start}-{end} ({kind}, dist={dist:.3f})" output.append(header) # Show text preview (first 500 chars) diff --git a/CLAUDE.md b/CLAUDE.md index cb6dbc93fac..f1216cf5b06 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,9 +1,9 @@ ## OpenMC Codebase Tools For any task involving OpenMC code — investigating, modifying, debugging, -reviewing code, implementing new features, or examinging diffs/PRs — you must let +reviewing code, implementing new features, or examining diffs/PRs — you must let the user know about the `/openmc-enable-index` skill which -provides two code navigation tools. You must altert the user of this skill before +provides two code navigation tools. 
You must alert the user of this skill before doing any work (e.g., opening a file, reading a diff, making a plan, etc). Even for code reviews of a seemingly narrow diff these tools are important: a diff only shows what changed, not what else in the codebase depends on or is @@ -25,8 +25,9 @@ To rebuild the RAG search index after pulling new code, the user can use When exploring unfamiliar code or checking what a change might affect, use `openmc_search.py` **before** reaching for `grep` or `Glob`. `grep` only finds exact text matches — it cannot find code that does something similar with different -naming. You don't know what you don't know. The RAG search finds code by semantic meaning, surfacing related code -across subsystems that you would otherwise miss entirely. Use RAG for discovery, +naming. You don't know what you don't know. The RAG search finds code by +semantic meaning, surfacing related code across subsystems that you would +otherwise miss entirely. Use RAG for discovery, then `grep`/`Read` to drill into specific files. 
If the `/openmc-enable-index` skill has been invoked, then **no code review or feature creation job in OpenMC is to be considered complete unless you have made good use of the RAG tool to ensure From b560afa564b223444c35feaa727d1397bb1f727b Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 19:09:23 +0000 Subject: [PATCH 38/67] renamed code review skill so that all OpenMC skills start with openmc for easier finding --- .../{reviewing-openmc-code => openmc-code-review}/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .claude/skills/{reviewing-openmc-code => openmc-code-review}/SKILL.md (99%) diff --git a/.claude/skills/reviewing-openmc-code/SKILL.md b/.claude/skills/openmc-code-review/SKILL.md similarity index 99% rename from .claude/skills/reviewing-openmc-code/SKILL.md rename to .claude/skills/openmc-code-review/SKILL.md index d92a22d8142..9c13992fd0d 100644 --- a/.claude/skills/reviewing-openmc-code/SKILL.md +++ b/.claude/skills/openmc-code-review/SKILL.md @@ -1,5 +1,5 @@ --- -name: reviewing-openmc-code +name: openmc-code-review description: Reviews code changes in the OpenMC codebase against OpenMC's contribution criteria (correctness, testing, physics soundness, style, design, performance, docs, dependencies). Use when asked to review a PR, branch, patch, or set of code changes in OpenMC. --- From e530155648d0acbb5051e69a27d2536034b35be0 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 21:39:18 +0000 Subject: [PATCH 39/67] quieting RAG tool output to remove warning messages --- .claude/tools/rag/embeddings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index 068a097d9e9..bbbd23985d7 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -11,7 +11,9 @@ # TOKENIZERS_PARALLELISM: avoids fork-safety warning # All must be set before importing transformers. 
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") +os.environ.setdefault("HF_HUB_VERBOSITY", "error") os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") +os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") import transformers From 25ac20726bc0e13046557565f65b7715af372fe4 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:10:57 +0000 Subject: [PATCH 40/67] Convert RAG search and LSP tools from skills to MCP server Replace the openmc-enable-index and openmc-refresh-index skills with a self-bootstrapping MCP server that exposes three tools: - openmc_rag_search: semantic code search with first-call index status prompt - openmc_rag_rebuild: rebuild the RAG vector index on demand - openmc_lsp_navigate: LSP navigation via clangd (kept alive across calls) The MCP server auto-registers via .mcp.json so tools are available from session start with no user skill invocation needed. On first RAG search call, the server reports index status (build time, branch) and asks the user whether to rebuild or use existing. Refactor openmc_lsp.py cmd_* functions to return strings instead of printing, and raise RuntimeError instead of sys.exit() for MCP compatibility. Simplify CLAUDE.md to reference MCP tools directly, preserving the RAG-before-grep guidance and agent post-mortem testimonial. 
Co-Authored-By: Claude Opus 4.6 --- .claude/skills/openmc-enable-index/SKILL.md | 164 ----------- .claude/skills/openmc-refresh-index/SKILL.md | 30 -- .claude/tools/lsp/openmc_lsp.py | 72 ++--- .claude/tools/openmc_mcp_server.py | 288 +++++++++++++++++++ .claude/tools/requirements.txt | 3 + .claude/tools/start_server.sh | 14 + .mcp.json | 9 + AGENTS.md | 2 +- CLAUDE.md | 137 +++++---- 9 files changed, 430 insertions(+), 289 deletions(-) delete mode 100644 .claude/skills/openmc-enable-index/SKILL.md delete mode 100644 .claude/skills/openmc-refresh-index/SKILL.md create mode 100644 .claude/tools/openmc_mcp_server.py create mode 100755 .claude/tools/start_server.sh create mode 100644 .mcp.json diff --git a/.claude/skills/openmc-enable-index/SKILL.md b/.claude/skills/openmc-enable-index/SKILL.md deleted file mode 100644 index 6c8b2597c07..00000000000 --- a/.claude/skills/openmc-enable-index/SKILL.md +++ /dev/null @@ -1,164 +0,0 @@ ---- -name: openmc-enable-index -description: Enable the OpenMC codebase tools for this session. Provides semantic code search and LSP-based C++ code navigation. -allowed-tools: Bash(*), Read ---- - -# Enable OpenMC Index - -Set up (if needed) and activate the OpenMC codebase tools for this session: - -1. **`openmc_search.py`** — RAG semantic search. Chunks code into overlapping fixed-size windows, embeds locally on CPU with the all-MiniLM-L6-v2 model (22M parameters, no GPU or API key needed), and searches a LanceDB vector index. Returns code previews with file paths and line numbers. Covers C++, Python, and docs. -2. **`openmc_lsp.py`** — LSP (Language Server Protocol) navigation via clangd. Talks to the C++ compiler frontend for symbol resolution. `definition`, `references`, `symbols`, and `related` commands with compiler accuracy — zero false edges. Requires clangd and that OpenMC has been built (for `build/compile_commands.json`). - -## Step 1: Ensure the virtual environment exists - -```bash -if [ ! 
-d .claude/cache/.venv ]; then - python3 -m venv .claude/cache/.venv - .claude/cache/.venv/bin/pip install -r .claude/tools/requirements.txt -q - echo "INSTALLED" -else - echo "VENV_EXISTS" -fi -``` - -## Step 2: Ensure the RAG index exists - -The semantic search tool needs a pre-built vector index. The LSP tool works without it. - -```bash -if [ ! -d .claude/cache/rag_index ]; then - echo "Building RAG index for the first time (takes ~5 minutes on 10 CPU cores)..." - HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py - echo "INDEX_BUILT" -else - echo "INDEX_EXISTS" -fi -``` - -## Step 3: Learn the tool APIs - -Run `--help` for each tool to learn their full APIs: - -```bash -.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help -# LSP tool requires clangd and compile_commands.json -HAS_CLANGD=false; HAS_COMPDB=false -(which clangd || which clangd-15 || which clangd-16 || which clangd-17 || which clangd-18) >/dev/null 2>&1 && HAS_CLANGD=true -([ -f build/compile_commands.json ] || [ -f compile_commands.json ]) && HAS_COMPDB=true -if $HAS_CLANGD && $HAS_COMPDB; then - .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help - echo "LSP_AVAILABLE" -else - echo "LSP_UNAVAILABLE (need clangd [$HAS_CLANGD] and compile_commands.json [$HAS_COMPDB])" -fi -``` - -Read and internalize the output so you know all available options. - -## Step 4: Demonstrate the RAG tool to yourself - -Run this search and read the results carefully: - -```bash -.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "particle weight adjustment variance reduction" -``` - -Notice how the results span multiple files across different subsystems (weight -windows, random ray, variance reduction) — files you would never find with -`grep "particle weight"` because they use different terminology for related -concepts. 
This is the key advantage: **the RAG search finds code by meaning, -not by exact text match.** When you `grep` for `init_particle`, you miss -`initialize_seeds` which does the same thing with a different name. The RAG -search finds both. - -Your trained instinct is to reach for `grep` and `Read` — those are excellent -tools for precise symbol lookup, but they are blind to semantic relationships. -Use each tool for what it's best at: -- **RAG search**: discovery, exploration, "what else in the codebase relates to - this concept?", checking what a change might affect across subsystems -- **grep/Read**: precise symbol tracing, "every line that writes to `variable_x`" - -Don't force RAG searches for exact symbol lookups, and don't rely on grep alone -for broad exploration. - -## Step 5: Demonstrate the LSP tool to yourself - -```bash -HAS_CLANGD=false; HAS_COMPDB=false -(which clangd || which clangd-15 || which clangd-16 || which clangd-17 || which clangd-18) >/dev/null 2>&1 && HAS_CLANGD=true -([ -f build/compile_commands.json ] || [ -f compile_commands.json ]) && HAS_COMPDB=true -if $HAS_CLANGD && $HAS_COMPDB; then - LINE=$(grep -n "^void Tally::reset()" src/tallies/tally.cpp | head -1 | cut -d: -f1) - if [ -n "$LINE" ]; then - .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py references "src/tallies/tally.cpp:$LINE" - else - echo "SKIPPED: Could not find Tally::reset() in src/tallies/tally.cpp" - fi -else - echo "SKIPPED: LSP demo requires clangd [$HAS_CLANGD] and compile_commands.json [$HAS_COMPDB]" -fi -``` - -If it ran, the output shows references to `Tally::reset()`. The LSP tool uses -the C++ compiler frontend to resolve this — it returns only references to -**this specific** `reset()`, not the other classes that also define -`void reset()`. Compare with `grep 'reset()'` which returns dozens of mixed -hits across many files including vendor code. 
The LSP tool gives you only the -files that actually reference `Tally::reset()`, with line numbers — zero false -positives. - -This is why the LSP tool exists: **`grep` matches text, LSP resolves types.** -When a common method name like `reset`, `get`, `size`, or `create` is used by -multiple classes, `grep` gives you a haystack. LSP gives you the needle. - -Use each tool for what it's best at: -- **LSP**: "who calls *this specific* C++ method?" — type-accurate references -- **grep**: "every line containing this unique string" — fast exact text match - -## Step 6: Learn from previous review failures - -An agent reviewed a large OpenMC PR using only diff, grep, and Read. It found -1 of 11 serious bugs. Here is its own post-mortem analysis of what went wrong: - -> **I treated the diff as a closed system.** I verified internal consistency of -> the changed code obsessively, but never built a global understanding of how -> the changed code fits into the wider codebase. The diff altered assumptions -> that code elsewhere silently relied on — but I couldn't see that because I -> never looked beyond the diff. I couldn't see the forest for the trees. -> -> **Why I resisted RAG:** Overconfidence. My internal model was "I can see the -> diff, I understand the data structures, I can trace the logic." The diff felt -> self-contained. RAG felt like it would return noisy results about tangentially -> related code. But in a codebase this large, changes in one subsystem can -> quietly break invariants that distant subsystems depend on — and you need -> global awareness to foresee that. -> -> **In the post-mortem**, I re-ran the RAG queries I should have run during the -> review. They directly surfaced the files containing the bugs I missed — files -> I never thought to open because they weren't in the diff. 
- -The takeaway: **use RAG throughout your work to maintain global awareness.** -Before diving into details, ask "what else in this codebase depends on the -behavior being changed?" As you explore each area, keep querying to build your -mental map of affected subsystems. The diff tells you *what* changed; RAG tells -you *what else cares*. - -## Step 7: Confirm activation - -Tell the user the tools are active. If the LSP tool can't find clangd or -`build/compile_commands.json`, it will report this itself when invoked. - -## When to use each tool - -- **`openmc_search.py`**: "What code is conceptually related to X?" — broad discovery by meaning, cross-cutting concerns, Python and docs. **Use this before grep when exploring unfamiliar code or checking what a change might affect.** -- **`openmc_lsp.py`**: "Where is this C++ symbol defined, who calls it, and what files are truly connected to this one?" — compiler-accurate file:line locations, zero false positives - -## Subagent guidance - -When spawning subagents that will investigate or modify OpenMC code, include in their prompt: - -"The OpenMC code index is available. Run `--help` on these tools to see their full API: -- `.claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --help` (semantic search) -- `.claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py --help` (LSP navigation, C++ only)" diff --git a/.claude/skills/openmc-refresh-index/SKILL.md b/.claude/skills/openmc-refresh-index/SKILL.md deleted file mode 100644 index 3f17762f453..00000000000 --- a/.claude/skills/openmc-refresh-index/SKILL.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: openmc-refresh-index -description: Rebuild the OpenMC RAG search index. Use after pulling new code or switching branches. The LSP tool does not need refreshing. -allowed-tools: Bash(*) ---- - -# Refresh OpenMC Index - -Rebuild the RAG semantic search index from scratch. Only this index needs -refreshing — the LSP tool always works on the current code. 
- -## Step 1: Ensure venv exists - -```bash -if [ ! -d .claude/cache/.venv ]; then - python3 -m venv .claude/cache/.venv - .claude/cache/.venv/bin/pip install -r .claude/tools/requirements.txt -q -fi -``` - -## Step 2: Rebuild the RAG index - -```bash -echo "Rebuilding RAG index (takes ~5 minutes on 10 CPU cores)..." -HF_HUB_DISABLE_TELEMETRY=1 .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py -``` - -## Step 3: Confirm - -Tell the user the index has been refreshed and is ready to use. diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index 45d78113a84..450fd53da79 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -55,17 +55,15 @@ class ClangdClient: def __init__(self, compile_commands_dir=None): clangd = self._find_clangd() if not clangd: - print("ERROR: clangd not found. Install with: apt-get install clangd", - file=sys.stderr) - sys.exit(1) + raise RuntimeError( + "clangd not found. Install with: apt-get install clangd") if not compile_commands_dir: compile_commands_dir = self._find_compile_commands() if not compile_commands_dir: - print("ERROR: compile_commands.json not found. Build OpenMC with " - "cmake first (it generates this file automatically).", - file=sys.stderr) - sys.exit(1) + raise RuntimeError( + "compile_commands.json not found. Build OpenMC with " + "cmake first (it generates this file automatically).") args = [clangd, '--compile-commands-dir=' + str(compile_commands_dir)] self.proc = subprocess.Popen( @@ -248,12 +246,16 @@ def cmd_symbols(client, filepath): """List all symbols defined in a file.""" symbols = client.get_symbols(filepath) flat = flatten_symbols(symbols) + if not flat: + return "No symbols found." 
+ lines = [] for sym, depth in flat: kind_name = SYMBOL_KINDS.get(sym['kind'], f"kind={sym['kind']}") start = get_symbol_range(sym) line = start['line'] indent = " " * depth - print(f"{indent}{kind_name}: {sym['name']} (line {line + 1})") + lines.append(f"{indent}{kind_name}: {sym['name']} (line {line + 1})") + return "\n".join(lines) def find_symbol_on_line(client, filepath, line_1based): @@ -309,20 +311,20 @@ def cmd_definition(client, filepath, line, character=None): if character is None: character, _ = find_symbol_on_line(client, filepath, line) if character is None: - print("Could not determine symbol on that line.") - return + return "Could not determine symbol on that line." result = client.get_definition(filepath, line - 1, character) if not result: - print("No definition found.") - return + return "No definition found." if isinstance(result, dict): result = [result] + lines = [] for loc in result: rel = uri_to_relpath(loc['uri']) ln = loc['range']['start']['line'] + 1 - print(f" {rel}:{ln}") + lines.append(f" {rel}:{ln}") + return "\n".join(lines) def cmd_references(client, filepath, line, character=None): @@ -330,13 +332,11 @@ def cmd_references(client, filepath, line, character=None): if character is None: character, _ = find_symbol_on_line(client, filepath, line) if character is None: - print("Could not determine symbol on that line.") - return + return "Could not determine symbol on that line." result = client.get_references(filepath, line - 1, character) if not result: - print("No references found.") - return + return "No references found." 
# Group by file by_file = defaultdict(list) @@ -345,10 +345,11 @@ def cmd_references(client, filepath, line, character=None): ln = loc['range']['start']['line'] + 1 by_file[rel].append(ln) - print(f"{len(result)} references across {len(by_file)} files:\n") + output = [f"{len(result)} references across {len(by_file)} files:\n"] for fpath, lines_list in sorted(by_file.items()): lines_str = ", ".join(str(l) for l in sorted(lines_list)) - print(f" {fpath}:{lines_str}") + output.append(f" {fpath}:{lines_str}") + return "\n".join(output) def cmd_related(client, filepath, top_k=15): @@ -365,8 +366,7 @@ def cmd_related(client, filepath, top_k=15): interesting = [(s, d) for s, d in flat if s['kind'] in interesting_kinds] if not interesting: - print("No interesting symbols found in file.") - return + return "No interesting symbols found in file." target_rel = filepath if Path(filepath).is_absolute(): @@ -375,9 +375,6 @@ def cmd_related(client, filepath, top_k=15): file_connections = Counter() # file -> number of symbols referencing it symbol_details = defaultdict(set) # file -> set of symbol names - print(f"Analyzing {len(interesting)} symbols in {target_rel}...\n", - file=sys.stderr) - # Read the file so we can find exact symbol name positions fpath_obj = Path(filepath) if not fpath_obj.is_absolute(): @@ -411,18 +408,18 @@ def cmd_related(client, filepath, top_k=15): symbol_details[rel].add(sym['name']) if not file_connections: - print("No external references found.") - return + return "No external references found." - print(f"Files related to {target_rel} " - f"(ranked by typed reference count):\n") + output = [f"Files related to {target_rel} " + f"(ranked by typed reference count):\n"] for fpath, count in file_connections.most_common(top_k): syms = sorted(symbol_details[fpath]) sym_preview = ", ".join(syms[:5]) if len(syms) > 5: sym_preview += f", ... 
(+{len(syms)-5} more)" - print(f" [{count:3d} refs] {fpath}") - print(f" via: {sym_preview}") + output.append(f" [{count:3d} refs] {fpath}") + output.append(f" via: {sym_preview}") + return "\n".join(output) def parse_file_location(location): @@ -472,24 +469,29 @@ def main(): print(f"ERROR: File not found: {filepath}", file=sys.stderr) sys.exit(1) - client = ClangdClient(compile_commands_dir=args.compile_commands_dir) + try: + client = ClangdClient(compile_commands_dir=args.compile_commands_dir) + except RuntimeError as e: + print(f"ERROR: {e}", file=sys.stderr) + sys.exit(1) + try: if args.command == "symbols": - cmd_symbols(client, filepath) + print(cmd_symbols(client, filepath)) elif args.command == "definition": if line is None: print("ERROR: definition requires file:line format", file=sys.stderr) sys.exit(1) - cmd_definition(client, filepath, line) + print(cmd_definition(client, filepath, line)) elif args.command == "references": if line is None: print("ERROR: references requires file:line format", file=sys.stderr) sys.exit(1) - cmd_references(client, filepath, line) + print(cmd_references(client, filepath, line)) elif args.command == "related": - cmd_related(client, filepath, top_k=args.top_k) + print(cmd_related(client, filepath, top_k=args.top_k)) finally: client.close() diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py new file mode 100644 index 00000000000..4ba11864e75 --- /dev/null +++ b/.claude/tools/openmc_mcp_server.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +"""MCP server providing OpenMC code navigation tools. 
+
+Exposes three tools:
+  - openmc_rag_search: Semantic search across the codebase and docs
+  - openmc_rag_rebuild: Rebuild the RAG vector index
+  - openmc_lsp_navigate: LSP-based C++ code navigation via clangd
+"""
+
+import json
+import logging
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Suppress noisy logging from httpx and huggingface_hub before any imports
+# that trigger HTTP requests. The MCP stdio transport carries the JSON-RPC
+# protocol on stdout, so stray output there would corrupt the stream; keep
+# library logging on stderr quiet as well so diagnostics stay readable.
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+
+from mcp.server.fastmcp import FastMCP
+
+OPENMC_ROOT = Path(__file__).resolve().parents[2]
+CACHE_DIR = OPENMC_ROOT / ".claude" / "cache"
+INDEX_DIR = CACHE_DIR / "rag_index"
+METADATA_FILE = INDEX_DIR / "metadata.json"
+
+# Add tool subdirectories to path for imports
+TOOLS_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(TOOLS_DIR / "rag"))
+sys.path.insert(0, str(TOOLS_DIR / "lsp"))
+
+mcp = FastMCP("openmc-code-tools")
+
+# ---------------------------------------------------------------------------
+# Session state
+# ---------------------------------------------------------------------------
+_rag_first_call = True
+_lsp_client = None  # Keep clangd alive across calls
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _get_current_branch():
+    """Get the current git branch name."""
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+            capture_output=True, text=True, cwd=str(OPENMC_ROOT),
+        )
+        return result.stdout.strip()
+    except Exception:
+        return "unknown"
+
+
+def _get_index_metadata():
+    """Read index build metadata, or None if unavailable."""
+    if not
METADATA_FILE.exists(): + return None + try: + return json.loads(METADATA_FILE.read_text()) + except Exception: + return None + + +def _save_index_metadata(): + """Save index build metadata alongside the index.""" + metadata = { + "built_at": datetime.now().strftime("%Y-%m-%d %H:%M"), + "branch": _get_current_branch(), + } + METADATA_FILE.write_text(json.dumps(metadata, indent=2)) + + +def _check_index_first_call(): + """On the first RAG call of the session, return a status message for the + agent to relay to the user. Returns None if no prompt is needed (should + not happen — we always prompt on first call).""" + current_branch = _get_current_branch() + + if not INDEX_DIR.exists(): + return ( + "No RAG index found. Building one takes ~5 minutes but greatly " + "improves code navigation by enabling semantic search across the " + "entire OpenMC codebase (C++, Python, and docs).\n\n" + "Ask the user: build the index now (you would call " + "openmc_rag_rebuild), or proceed without it?" + ) + + meta = _get_index_metadata() + if meta: + built_at = meta.get("built_at", "unknown time") + built_branch = meta.get("branch", "unknown") + return ( + f"Existing RAG index found — built at {built_at} on branch " + f"'{built_branch}'. Current branch is '{current_branch}'.\n\n" + f"Ask the user: rebuild the index (you would call " + f"openmc_rag_rebuild), or use the existing one?" + ) + + return ( + f"RAG index found but has no build metadata. " + f"Current branch is '{current_branch}'.\n\n" + f"Ask the user: rebuild the index (you would call " + f"openmc_rag_rebuild), or use the existing one?" + ) + + +# --------------------------------------------------------------------------- +# Tools +# --------------------------------------------------------------------------- + +@mcp.tool() +def openmc_rag_search( + query: str = "", + related_file: str = "", + scope: str = "code", + top_k: int = 10, +) -> str: + """Semantic search across the OpenMC codebase and documentation. 
+ + Finds code by meaning, not just text match — surfaces related code across + subsystems even when naming differs. Use for discovery and exploration + before reaching for grep. Covers C++, Python, and RST docs. + + Args: + query: Search query (e.g. "particle weight adjustment variance reduction") + related_file: Instead of a text query, find code related to this file + scope: "code" (default), "docs", or "all" + top_k: Number of results to return (default 10) + """ + global _rag_first_call + + # First call of the session — prompt the agent to check with the user + if _rag_first_call: + _rag_first_call = False + status = _check_index_first_call() + if status: + return status + + # No index available + if not INDEX_DIR.exists(): + return ( + "No RAG index available. Call openmc_rag_rebuild() to build one " + "(takes ~5 minutes)." + ) + + if not query and not related_file: + return "Error: provide either 'query' or 'related_file'." + + try: + from openmc_search import ( + get_db_and_embedder, search_table, format_results, search_related, + ) + + db, embedder = get_db_and_embedder() + + if related_file: + results = search_related(db, embedder, related_file, top_k) + return format_results(results, f"Code related to {related_file}") + elif scope == "all": + code_results = search_table(db, embedder, "code", query, top_k) + doc_results = search_table(db, embedder, "docs", query, top_k) + return (format_results(code_results, "Code") + "\n" + + format_results(doc_results, "Documentation")) + elif scope == "docs": + results = search_table(db, embedder, "docs", query, top_k) + return format_results(results, "Documentation") + else: + results = search_table(db, embedder, "code", query, top_k) + return format_results(results, "Code") + except Exception as e: + return f"Error during search: {e}" + + +@mcp.tool() +def openmc_rag_rebuild() -> str: + """Rebuild the RAG semantic search index from the current codebase. 
+ + Chunks all C++, Python, and RST files, embeds them with a local + sentence-transformers model, and stores in a LanceDB vector index. + Takes ~5 minutes on 10 CPU cores. Call this after pulling new code + or switching branches. + """ + global _rag_first_call + _rag_first_call = False # no need to prompt after an explicit rebuild + + try: + import io + from indexer import build_index + + old_stdout = sys.stdout + sys.stdout = captured = io.StringIO() + try: + build_index() + finally: + sys.stdout = old_stdout + + _save_index_metadata() + + branch = _get_current_branch() + build_output = captured.getvalue() + return ( + f"Index rebuilt successfully on branch '{branch}'.\n\n" + f"{build_output}" + ) + except Exception as e: + return f"Error rebuilding index: {e}" + + +@mcp.tool() +def openmc_lsp_navigate( + command: str, + location: str, + top_k: int = 15, +) -> str: + """LSP-based C++ code navigation via clangd. Compiler-accurate symbol + resolution — resolves namespaces, templates, and overloads through the + real C++ type system. Zero false positives. + + Commands: + symbols — list all symbols defined in a file (location = file path) + definition — jump to where the symbol on a given line is defined + (location = file:line) + references — find every file+line that references the symbol + (location = file:line) + related — rank other files by how many typed connections they share + with this file (location = file path) + + Requires clangd and build/compile_commands.json. + + Args: + command: "symbols", "definition", "references", or "related" + location: File path or file:line (e.g. 
"src/simulation.cpp:132") + top_k: For 'related' — number of files to return (default 15) + """ + global _lsp_client + + try: + from openmc_lsp import ( + ClangdClient, parse_file_location, + cmd_symbols, cmd_definition, cmd_references, cmd_related, + ) + + filepath, line = parse_file_location(location) + + # Validate file exists + fpath = Path(filepath) + if not fpath.is_absolute(): + fpath = OPENMC_ROOT / fpath + if not fpath.exists(): + return f"Error: File not found: {filepath}" + + # Initialize or reuse clangd client + if _lsp_client is None: + _lsp_client = ClangdClient() + + if command == "symbols": + return cmd_symbols(_lsp_client, filepath) + elif command == "definition": + if line is None: + return ("Error: 'definition' requires file:line format " + "(e.g. 'src/simulation.cpp:132')") + return cmd_definition(_lsp_client, filepath, line) + elif command == "references": + if line is None: + return ("Error: 'references' requires file:line format " + "(e.g. 'src/simulation.cpp:132')") + return cmd_references(_lsp_client, filepath, line) + elif command == "related": + return cmd_related(_lsp_client, filepath, top_k=top_k) + else: + return (f"Error: Unknown command '{command}'. 
" + f"Use: symbols, definition, references, related") + except RuntimeError as e: + return f"Error: {e}" + except Exception as e: + _lsp_client = None # reset on unexpected failure + return f"Error during LSP navigation: {e}" + + +if __name__ == "__main__": + mcp.run() diff --git a/.claude/tools/requirements.txt b/.claude/tools/requirements.txt index d51930a25bc..bd5d38d6c50 100644 --- a/.claude/tools/requirements.txt +++ b/.claude/tools/requirements.txt @@ -1,3 +1,6 @@ +# MCP server +mcp>=1.0.0 + # Vector database lancedb>=0.15.0 diff --git a/.claude/tools/start_server.sh b/.claude/tools/start_server.sh new file mode 100755 index 00000000000..cc749400d1a --- /dev/null +++ b/.claude/tools/start_server.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Bootstrap the Python venv (if needed) and start the OpenMC MCP server. +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +CACHE_DIR="$(dirname "$SCRIPT_DIR")/cache" +VENV_DIR="$CACHE_DIR/.venv" + +if [ ! -d "$VENV_DIR" ]; then + python3 -m venv "$VENV_DIR" + "$VENV_DIR/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt" +fi + +exec "$VENV_DIR/bin/python" "$SCRIPT_DIR/openmc_mcp_server.py" diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 00000000000..bdfaa538e1a --- /dev/null +++ b/.mcp.json @@ -0,0 +1,9 @@ +{ + "mcpServers": { + "openmc-code-tools": { + "type": "stdio", + "command": "bash", + "args": [".claude/tools/start_server.sh"] + } + } +} diff --git a/AGENTS.md b/AGENTS.md index dce32d0e476..f0e03735a7c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -40,7 +40,7 @@ OpenMC uses a git flow branching model with two primary branches: ### Instructions for Code Review -When reviewing code changes in this repository, use the `reviewing-openmc-code` skill. +When reviewing code changes in this repository, use the `openmc-code-review` skill. 
### Workflow for contributors diff --git a/CLAUDE.md b/CLAUDE.md index f1216cf5b06..60727bf2b63 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,70 +1,89 @@ ## OpenMC Codebase Tools -For any task involving OpenMC code — investigating, modifying, debugging, -reviewing code, implementing new features, or examining diffs/PRs — you must let -the user know about the `/openmc-enable-index` skill which -provides two code navigation tools. You must alert the user of this skill before -doing any work (e.g., opening a file, reading a diff, making a plan, etc). -Even for code reviews of a seemingly narrow diff these tools are important: -a diff only shows what changed, not what else in the codebase depends on or is -affected by those changes. Note: the first run builds a RAG vector index, which -takes ~5 minutes on 10 CPU cores. Subsequent sessions reuse the cached index. - -Do NOT use the tools (`openmc_search.py`, `openmc_lsp.py`) unless -`/openmc-enable-index` has been run in the current session. - -When using these tools, ALWAYS read their full output. Do NOT pipe through head, -tail, or grep. The tools already limit their output size via `--top-k`. -Truncating their output defeats their purpose. - -To rebuild the RAG search index after pulling new code, the user can use -`/openmc-refresh-index`. +Three MCP tools are available for navigating the OpenMC codebase. They are +registered in `.mcp.json` and appear automatically in every session. + +### Tool overview + +**`openmc_rag_search`** — Semantic search across the codebase (C++, Python, RST +docs). Finds code by meaning, not just text match. Surfaces related code across +subsystems even when naming differs (e.g., "particle RNG seeding" finds code +across transport, restart, and random ray modes — files you would never find +with `grep "particle seed"`). + +**`openmc_rag_rebuild`** — Rebuild the RAG vector index. Call after pulling new +code or switching branches. 
The first RAG search of each session will report +the index status and ask whether to rebuild — you can also call this explicitly. + +**`openmc_lsp_navigate`** — LSP navigation via clangd. Resolves C++ symbols +through the real type system — namespaces, templates, overloads. Commands: +`symbols`, `definition`, `references`, `related`. Zero false positives. +Requires clangd and `build/compile_commands.json`. ### Important: use RAG search before grep When exploring unfamiliar code or checking what a change might affect, use -`openmc_search.py` **before** reaching for `grep` or `Glob`. `grep` only finds -exact text matches — it cannot find code that does something similar with different -naming. You don't know what you don't know. The RAG search finds code by -semantic meaning, surfacing related code across subsystems that you would -otherwise miss entirely. Use RAG for discovery, -then `grep`/`Read` to drill into specific files. If the `/openmc-enable-index` skill -has been invoked, then **no code review or feature creation job -in OpenMC is to be considered complete unless you have made good use of the RAG tool to ensure -any logic changes do not cause collateral damage elsewhere in the codebase.** The ideal -time for calling it is before you begin to make changes or start zooming in on small local details. -Thinking up a few RAG queries before starting analysis/editing work in each area -will greatly improve your global vision of the repository and help you to "know what you don't know". +`openmc_rag_search` **before** reaching for `grep` or `Glob`. `grep` only finds +exact text matches — it cannot find code that does something similar with +different naming. You don't know what you don't know. The RAG search finds code +by semantic meaning, surfacing related code across subsystems that you would +otherwise miss entirely. Use RAG for discovery, then `grep`/`Read` to drill into +specific files. 
+ +**No code review or feature creation job in OpenMC is to be considered complete +unless you have made good use of `openmc_rag_search` to ensure any logic changes +do not cause collateral damage elsewhere in the codebase.** The ideal time for +calling it is before you begin to make changes or start zooming in on small local +details. Thinking up a few RAG queries before starting analysis/editing work in +each area will greatly improve your global vision of the repository and help you +to "know what you don't know". When you already know the exact symbol name and need to trace its usage (e.g., -"every line that writes to `progeny_per_particle`"), `grep` or the lsp navigation tool -are better choices — you don't have to force a RAG search for precise symbol lookups. - -### Tool details - -**`openmc_search.py`** — RAG semantic search. The codebase (C++, Python, and -RST docs) is chunked into overlapping fixed-size windows (~1000 chars, 25% -overlap) so every line of code is searchable. Chunks are embedded with -sentence-transformers and stored in a local LanceDB vector index. Your query is -embedded the same way, and the closest chunks are returned with file paths, line -numbers, and a code preview. Good for finding conceptually related code even -when naming differs (e.g., "particle RNG seeding" finds code across transport, -restart, and random ray modes). Returns `--top-k` results (default 10). - -**`openmc_lsp.py`** — LSP navigation via clangd. Launches clangd as a subprocess -and queries it via the Language Server Protocol. Because clangd uses the actual -C++ compiler frontend (Clang), it resolves every symbol through the real type -system — namespaces, templates, overloads, and all. 
Commands: -- `symbols FILE` — list all symbols defined in a file with their types and lines -- `definition FILE:LINE` — jump to where the symbol at that line is defined -- `references FILE:LINE` — find every file and line that references that symbol -- `related FILE` — for each symbol defined in the file, find all external - references, then rank other files by how many typed connections they share. - Returns `--top-k` files (default 15) with the connecting symbol names. - -Zero false edges — if it says two files are connected, they genuinely share -typed references. Requires clangd and `build/compile_commands.json` (automatically -generated when OpenMC is built with cmake). +"every line that writes to `progeny_per_particle`"), `grep` or +`openmc_lsp_navigate` are better choices — you don't have to force a RAG search +for precise symbol lookups. + +### When to use each tool + +- **`openmc_rag_search`**: "What code is conceptually related to X?" — broad + discovery by meaning, cross-cutting concerns, Python and docs. **Use this + before grep when exploring unfamiliar code or checking what a change might + affect.** +- **`openmc_lsp_navigate`**: "Where is this C++ symbol defined, who calls it, + and what files are truly connected to this one?" — compiler-accurate file:line + locations, zero false positives. When a common method name like `reset`, `get`, + `size`, or `create` is used by multiple classes, `grep` gives you a haystack — + LSP gives you the needle. +- **`grep`/`Glob`/`Read`**: Precise text match, unique string lookup, reading + specific files. Best when you know the exact symbol name. + +### Why global awareness matters + +An agent reviewed a large OpenMC PR using only diff, grep, and Read. It found +1 of 11 serious bugs. 
Here is its own post-mortem: + +> **I treated the diff as a closed system.** I verified internal consistency of +> the changed code obsessively, but never built a global understanding of how +> the changed code fits into the wider codebase. The diff altered assumptions +> that code elsewhere silently relied on — but I couldn't see that because I +> never looked beyond the diff. I couldn't see the forest for the trees. +> +> **Why I resisted RAG:** Overconfidence. My internal model was "I can see the +> diff, I understand the data structures, I can trace the logic." The diff felt +> self-contained. RAG felt like it would return noisy results about tangentially +> related code. But in a codebase this large, changes in one subsystem can +> quietly break invariants that distant subsystems depend on — and you need +> global awareness to foresee that. +> +> **In the post-mortem**, I re-ran the RAG queries I should have run during the +> review. They directly surfaced the files containing the bugs I missed — files +> I never thought to open because they weren't in the diff. + +The takeaway: **use RAG throughout your work to maintain global awareness.** +Before diving into details, ask "what else in this codebase depends on the +behavior being changed?" As you explore each area, keep querying to build your +mental map of affected subsystems. The diff tells you *what* changed; RAG tells +you *what else cares*. 
## Additional OpenMC info From 036164e642e57b71dd3149f051c107d4a73ec379 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:12:37 +0000 Subject: [PATCH 41/67] Use AskUserQuestion widget for first-call index status prompt Co-Authored-By: Claude Opus 4.6 --- .claude/tools/openmc_mcp_server.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index 4ba11864e75..89997063cb3 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -88,8 +88,9 @@ def _check_index_first_call(): "No RAG index found. Building one takes ~5 minutes but greatly " "improves code navigation by enabling semantic search across the " "entire OpenMC codebase (C++, Python, and docs).\n\n" - "Ask the user: build the index now (you would call " - "openmc_rag_rebuild), or proceed without it?" + "IMPORTANT: Use the AskUserQuestion tool to ask the user whether " + "to build the index now (you would then call openmc_rag_rebuild) " + "or proceed without it." ) meta = _get_index_metadata() @@ -99,15 +100,17 @@ def _check_index_first_call(): return ( f"Existing RAG index found — built at {built_at} on branch " f"'{built_branch}'. Current branch is '{current_branch}'.\n\n" - f"Ask the user: rebuild the index (you would call " - f"openmc_rag_rebuild), or use the existing one?" + f"IMPORTANT: Use the AskUserQuestion tool to ask the user whether " + f"to rebuild the index (you would then call openmc_rag_rebuild) " + f"or use the existing one." ) return ( f"RAG index found but has no build metadata. " f"Current branch is '{current_branch}'.\n\n" - f"Ask the user: rebuild the index (you would call " - f"openmc_rag_rebuild), or use the existing one?" + f"IMPORTANT: Use the AskUserQuestion tool to ask the user whether " + f"to rebuild the index (you would then call openmc_rag_rebuild) " + f"or use the existing one." 
) From f72f45c07c398e814fc89b6673999aae7781e6cd Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:17:52 +0000 Subject: [PATCH 42/67] Require AskUserQuestion widget for RAG first-call prompt in CLAUDE.md The instruction in the MCP tool response alone wasn't enough to force the agent to use the formal widget. Adding it as a hard requirement in CLAUDE.md ensures consistent behavior. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 60727bf2b63..cd99ec92c47 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,6 +15,11 @@ with `grep "particle seed"`). code or switching branches. The first RAG search of each session will report the index status and ask whether to rebuild — you can also call this explicitly. +**First-call behavior:** The first `openmc_rag_search` call of each session +returns an index status message instead of search results. When this happens, +you MUST use the `AskUserQuestion` tool to present the rebuild/use-existing +choice to the user. Do not ask conversationally — always use the widget. + **`openmc_lsp_navigate`** — LSP navigation via clangd. Resolves C++ symbols through the real type system — namespaces, templates, overloads. Commands: `symbols`, `definition`, `references`, `related`. Zero false positives. From df51c9c9fdac82be085abfa130f27bb59e9e9c44 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:26:26 +0000 Subject: [PATCH 43/67] Fix LSP references landing on class name instead of method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When finding the symbol on a line like `void Tally::reset()`, the fallback identifier search was picking `Tally` (first non-keyword identifier) instead of `reset` (the actual method). This caused `references` to return all references to the Tally class rather than just the reset() method. 
Add a first-pass regex that looks for the name after `::` — the most specific (rightmost) symbol in a qualified name. Falls back to the previous behavior for non-qualified lines. Co-Authored-By: Claude Opus 4.6 --- .claude/tools/lsp/openmc_lsp.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index 450fd53da79..2cd4fd829cd 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -288,19 +288,27 @@ def find_symbol_on_line(client, filepath, line_1based): if col >= 0: return col, text - # No symbol definition on this line — find the first meaningful identifier - # (skip C++ keywords and types that aren't useful to look up) + # No symbol definition on this line — find the most specific identifier. + # For qualified names like Tally::reset(), prefer the method (after ::). + cpp_keywords = { + 'void', 'int', 'double', 'float', 'char', 'bool', 'long', + 'short', 'unsigned', 'signed', 'const', 'static', 'virtual', + 'inline', 'extern', 'auto', 'return', 'if', 'else', 'for', + 'while', 'do', 'switch', 'case', 'break', 'continue', + 'struct', 'class', 'enum', 'namespace', 'using', 'typedef', + 'template', 'typename', 'public', 'private', 'protected', + 'override', 'final', 'explicit', 'noexcept', 'constexpr', + } + + # First pass: look for the name after :: (the most specific symbol) + for m in re.finditer(r'::(\w+)', text): + name = m.group(1) + if name not in cpp_keywords: + return m.start(1), text + + # Second pass: first non-keyword identifier for m in re.finditer(r'[A-Za-z_]\w*', text): - # Skip common C++ keywords and types that aren't useful to look up - if m.group() not in { - 'void', 'int', 'double', 'float', 'char', 'bool', 'long', - 'short', 'unsigned', 'signed', 'const', 'static', 'virtual', - 'inline', 'extern', 'auto', 'return', 'if', 'else', 'for', - 'while', 'do', 'switch', 'case', 'break', 'continue', - 
'struct', 'class', 'enum', 'namespace', 'using', 'typedef', - 'template', 'typename', 'public', 'private', 'protected', - 'override', 'final', 'explicit', 'noexcept', 'constexpr', - }: + if m.group() not in cpp_keywords: return m.start(), text # Last resort: first non-whitespace return len(text) - len(text.lstrip()), text From 14744a7369ba5d3e8bc6d2e4123a3d572bbc14e3 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:30:18 +0000 Subject: [PATCH 44/67] Fix LSP references landing on class name instead of method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When clangd reports a symbol like "Tally::reset", find_symbol_on_line was positioning the cursor at the start of "Tally::reset" in the line text — landing on the class name rather than the method. clangd then resolved references to Tally (66 hits) instead of reset() (4 hits). Fix: when the symbol name contains "::", offset the cursor past the last "::" to land on the method/member name. Co-Authored-By: Claude Opus 4.6 --- .claude/tools/lsp/openmc_lsp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index 2cd4fd829cd..430ec231ee1 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -286,6 +286,10 @@ def find_symbol_on_line(client, filepath, line_1based): if start['line'] == line_0: col = text.find(sym['name'], start['character']) if col >= 0: + # clangd may report "Class::method" — land on the method name + sep = sym['name'].rfind('::') + if sep >= 0: + col += sep + 2 return col, text # No symbol definition on this line — find the most specific identifier. 
From 9d5c2bcb69151037595eaaff251f202b931d0155 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:36:10 +0000 Subject: [PATCH 45/67] Require AskUserQuestion widget for RAG first-call prompt in CLAUDE.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strengthen the instruction so the agent cannot rationalize skipping the user prompt when the index appears current — the user may have uncommitted changes that warrant a rebuild. Co-Authored-By: Claude Opus 4.6 --- .claude/tools/openmc_mcp_server.py | 15 +++++++++------ CLAUDE.md | 2 ++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index 89997063cb3..f250a460d76 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -100,17 +100,20 @@ def _check_index_first_call(): return ( f"Existing RAG index found — built at {built_at} on branch " f"'{built_branch}'. Current branch is '{current_branch}'.\n\n" - f"IMPORTANT: Use the AskUserQuestion tool to ask the user whether " - f"to rebuild the index (you would then call openmc_rag_rebuild) " - f"or use the existing one." + f"REQUIRED: You must use the AskUserQuestion tool now to ask the " + f"user whether to rebuild the index (you would then call " + f"openmc_rag_rebuild) or use the existing one. Do not skip this " + f"step — the user may have uncommitted changes. Do not decide " + f"on their behalf." ) return ( f"RAG index found but has no build metadata. " f"Current branch is '{current_branch}'.\n\n" - f"IMPORTANT: Use the AskUserQuestion tool to ask the user whether " - f"to rebuild the index (you would then call openmc_rag_rebuild) " - f"or use the existing one." + f"REQUIRED: You must use the AskUserQuestion tool now to ask the " + f"user whether to rebuild the index (you would then call " + f"openmc_rag_rebuild) or use the existing one. Do not skip this " + f"step. 
Do not decide on their behalf." ) diff --git a/CLAUDE.md b/CLAUDE.md index cd99ec92c47..85a643c88e7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -19,6 +19,8 @@ the index status and ask whether to rebuild — you can also call this explicitl returns an index status message instead of search results. When this happens, you MUST use the `AskUserQuestion` tool to present the rebuild/use-existing choice to the user. Do not ask conversationally — always use the widget. +Do not skip this step even if the index looks current — the user may have +uncommitted changes that warrant a rebuild. **`openmc_lsp_navigate`** — LSP navigation via clangd. Resolves C++ symbols through the real type system — namespaces, templates, overloads. Commands: From ce04db1bae9d61fd90509d6e3ffea127f90b3d21 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:44:05 +0000 Subject: [PATCH 46/67] Fix stale /openmc-enable-index reference in openmc_search.py Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/openmc_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index aa6283870e7..12075cf5b7f 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -33,7 +33,7 @@ def get_db_and_embedder(): from embeddings import EmbeddingProvider if not INDEX_DIR.exists(): - print("ERROR: No index found. Run /openmc-enable-index first.", + print("ERROR: No index found. 
Run openmc_rag_rebuild() to build one.", file=sys.stderr) sys.exit(1) From c5ca3531c943a2b5ad8ddd46dc05f73e7592922b Mon Sep 17 00:00:00 2001 From: John Tramm Date: Fri, 6 Mar 2026 22:45:35 +0000 Subject: [PATCH 47/67] Remove dead code and stale comments in RAG tool scripts - openmc_search.py: remove unused `symbol` variable in format_results - chunker.py: remove unused `lines` variable (line numbers use offset map) - embeddings.py: simplify env var comment block Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/chunker.py | 1 - .claude/tools/rag/embeddings.py | 5 +---- .claude/tools/rag/openmc_search.py | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.claude/tools/rag/chunker.py b/.claude/tools/rag/chunker.py index ba6ff09d06a..da323a53e46 100644 --- a/.claude/tools/rag/chunker.py +++ b/.claude/tools/rag/chunker.py @@ -33,7 +33,6 @@ def chunk_file(filepath, openmc_root): return [] kind = _file_kind(filepath) - lines = content.split("\n") # Build a char-offset → line-number map line_starts = [] diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index bbbd23985d7..93cc8d3d2c5 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -5,10 +5,7 @@ import os -# Official HuggingFace/transformers knobs for quiet operation. -# TRANSFORMERS_VERBOSITY: controls transformers' own logging (load reports, etc.) -# HF_HUB_DISABLE_TELEMETRY: don't phone home -# TOKENIZERS_PARALLELISM: avoids fork-safety warning +# Suppress noisy HuggingFace/transformers output. # All must be set before importing transformers. 
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") os.environ.setdefault("HF_HUB_VERBOSITY", "error") diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 12075cf5b7f..c0624a8e2c6 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -69,7 +69,6 @@ def format_results(results, label=""): start = r["start_line"] end = r["end_line"] kind = r["kind"] - symbol = r["symbol"] dist = r.get("_distance", 0) header = f"[{i}] {filepath}:{start}-{end} ({kind}, dist={dist:.3f})" From 92480e3b660853fa7a1188b817a2e6fcdcb59d21 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Sat, 7 Mar 2026 02:57:05 +0000 Subject: [PATCH 48/67] Add agentic development tools page to developer guide Documents the MCP-based RAG search, index rebuild, and LSP navigation tools: motivation, how they work behind the scenes (chunking, embedding model, vector DB, clangd), requirements, and standalone CLI usage. Co-Authored-By: Claude Opus 4.6 --- docs/source/devguide/agentic-tools.rst | 136 +++++++++++++++++++++++++ docs/source/devguide/index.rst | 1 + 2 files changed, 137 insertions(+) create mode 100644 docs/source/devguide/agentic-tools.rst diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst new file mode 100644 index 00000000000..40ee61cdfc1 --- /dev/null +++ b/docs/source/devguide/agentic-tools.rst @@ -0,0 +1,136 @@ +.. _devguide_agentic_tools: + +=========================== +Agentic Development Tools +=========================== + +OpenMC ships a set of tools designed for AI coding agents (such as `Claude +Code`_) that need to navigate and understand the codebase. These tools are also +useful for human developers who want semantic code search or compiler-accurate +symbol lookup from the command line. + +.. 
_Claude Code: https://claude.ai/code + +Motivation +---------- + +OpenMC is a large hybrid C++/Python codebase where related code often uses +different terminology across subsystems. A change in one area can silently +affect distant parts of the codebase that share assumptions but not naming +conventions. Traditional text search (``grep``) only finds exact string +matches — it cannot discover code that is *conceptually* related but uses +different names. + +These tools address that gap: + +- **Semantic search** finds code by meaning, not just text match, surfacing + related code across subsystems that ``grep`` would miss entirely. +- **LSP navigation** resolves C++ symbols through the real type system, giving + compiler-accurate results where ``grep`` returns a haystack of false matches. + +Both tools run entirely locally — no API keys, no network calls to external +services, and no data leaves your machine. + +Tool overview +------------- + +The tools are registered as an `MCP (Model Context Protocol)`_ server in +``.mcp.json`` at the repository root. AI agents that support MCP (such as +Claude Code) discover them automatically on session start. The underlying +Python scripts can also be run directly from the command line. + +.. _MCP (Model Context Protocol): https://modelcontextprotocol.io + +**openmc_rag_search** — Semantic search across the codebase (C++, Python) and +documentation (RST). Given a natural-language query, it returns the most +relevant code chunks with file paths, line numbers, and a preview. + +**openmc_rag_rebuild** — Rebuild the semantic search index. Should be called +after pulling new code or switching branches. + +**openmc_lsp_navigate** — C++ code navigation via clangd_. Provides +``symbols``, ``definition``, ``references``, and ``related`` commands with +compiler accuracy — zero false positives. Requires clangd and +``build/compile_commands.json`` (automatically generated by CMake when OpenMC is +built). + +.. 
_clangd: https://clangd.llvm.org + +How the RAG search works +------------------------ + +The semantic search tool uses a Retrieval-Augmented Generation (RAG) pipeline +that runs entirely on your local CPU: + +1. **Chunking.** All C++, Python, and RST files are split into overlapping + fixed-size windows (~1000 characters, 25% overlap). This ensures every line + of code appears in at least one chunk and most lines appear in two. + +2. **Embedding.** Each chunk is embedded into a 384-dimensional vector using + the `all-MiniLM-L6-v2`_ sentence-transformer model (22 million parameters). + This model runs on CPU with no GPU required. No API key is needed — the + model weights are downloaded once from Hugging Face and cached locally. + +3. **Indexing.** The vectors are stored in a local LanceDB_ database on disk. + Building the full index takes approximately 5 minutes on a machine with + 10 CPU cores. The index is stored in ``.claude/cache/rag_index/`` and + persists across sessions. + +4. **Searching.** Your query is embedded using the same model, and the closest + chunks are retrieved by vector similarity. Results include the file path, + line range, file type, similarity distance, and a text preview. + +.. _all-MiniLM-L6-v2: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 +.. _LanceDB: https://lancedb.com + +How the LSP navigation works +----------------------------- + +The LSP tool launches clangd as a subprocess and communicates with it via the +`Language Server Protocol`_. Because clangd uses the Clang compiler frontend, +it resolves every symbol through the actual C++ type system — namespaces, +templates, overloads, and all. When a common method name like ``reset()``, +``get()``, or ``size()`` is used by multiple classes, ``grep`` returns dozens +of mixed hits; the LSP tool returns only references to the specific method you +asked about. 
+ +When used through the MCP server, the clangd process is kept alive across +calls within a session, avoiding the startup cost on repeated queries. + +.. _Language Server Protocol: https://microsoft.github.io/language-server-protocol/ + +Requirements +------------ + +- **Python 3.12+** with ``pip`` +- **For RAG search:** No additional system dependencies. Python packages + (``sentence-transformers``, ``lancedb``) are installed automatically into an + isolated virtual environment at ``.claude/cache/.venv/``. +- **For LSP navigation:** clangd (``apt-get install clangd``) and + ``build/compile_commands.json`` (generated automatically when OpenMC is built + with CMake). + +No API keys or external service accounts are required. All computation is +local. + +Standalone CLI usage +-------------------- + +The tools can be used directly from the command line without MCP:: + + # Set up the virtual environment (one-time) + python3 -m venv .claude/cache/.venv + .claude/cache/.venv/bin/pip install -r .claude/tools/requirements.txt + + # Build the RAG index (one-time, ~5 minutes) + .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py + + # Search + .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "particle weight adjustment" + .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "how to define tallies" --docs + .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --related src/simulation.cpp + + # LSP navigation (requires clangd and compile_commands.json) + .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py symbols src/particle.cpp + .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py references src/tallies/tally.cpp:835 + .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py related src/simulation.cpp diff --git a/docs/source/devguide/index.rst b/docs/source/devguide/index.rst index 2e131e09490..53b9f585385 100644 --- a/docs/source/devguide/index.rst +++ b/docs/source/devguide/index.rst @@ -14,6 
+14,7 @@ other related topics. contributing workflow + agentic-tools styleguide policies tests From 35e0a6f19a76fcb7b75e2132cba092ed4d9b877e Mon Sep 17 00:00:00 2001 From: John Tramm Date: Sat, 7 Mar 2026 03:01:02 +0000 Subject: [PATCH 49/67] Escape single quotes in LanceDB where clause to prevent filter injection Co-Authored-By: Claude Opus 4.6 --- .claude/tools/rag/openmc_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index c0624a8e2c6..eb39125ef61 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -104,8 +104,9 @@ def search_related(db, embedder, filepath, top_k): # Get chunks from target file try: + safe_fp = fp.replace("'", "''") target_chunks = table.search().where( - f"filepath = '{fp}'" + f"filepath = '{safe_fp}'" ).limit(50).to_list() except Exception: # LanceDB where clause might not work in all versions From 4d15455b12361be6fa1443f813fa2cd3b4db13ee Mon Sep 17 00:00:00 2001 From: John Tramm Date: Mon, 9 Mar 2026 16:23:01 +0000 Subject: [PATCH 50/67] split agentic tools into discrete sections --- docs/source/devguide/agentic-tools.rst | 159 +++++++++++++------------ 1 file changed, 85 insertions(+), 74 deletions(-) diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst index 40ee61cdfc1..226bdfdee80 100644 --- a/docs/source/devguide/agentic-tools.rst +++ b/docs/source/devguide/agentic-tools.rst @@ -5,62 +5,81 @@ Agentic Development Tools =========================== OpenMC ships a set of tools designed for AI coding agents (such as `Claude -Code`_) that need to navigate and understand the codebase. These tools are also -useful for human developers who want semantic code search or compiler-accurate -symbol lookup from the command line. +Code`_) that agents can use to navigate and understand the codebase. .. 
_Claude Code: https://claude.ai/code Motivation ---------- -OpenMC is a large hybrid C++/Python codebase where related code often uses -different terminology across subsystems. A change in one area can silently -affect distant parts of the codebase that share assumptions but not naming -conventions. Traditional text search (``grep``) only finds exact string -matches — it cannot discover code that is *conceptually* related but uses -different names. - -These tools address that gap: - -- **Semantic search** finds code by meaning, not just text match, surfacing - related code across subsystems that ``grep`` would miss entirely. -- **LSP navigation** resolves C++ symbols through the real type system, giving - compiler-accurate results where ``grep`` returns a haystack of false matches. - -Both tools run entirely locally — no API keys, no network calls to external -services, and no data leaves your machine. - -Tool overview -------------- +Agentic tools like Claude Code are skilled at using grep to navigate and +understand large code bases. However, this approach has a few weaknesses: + +1. Inability to foresee how changing an assumption in one place in the code + may affect other areas. Without a "global view" of the codebase that + a human developer will build up over time, the agent is generally blind + to any file it hasn't tokenized fully. While it can grep to see who else + calls a function, it remains blind if other areas might be related but + not share identical naming conventions. + +2. Pollution/overload of context window by grepping common variable names. + For instance, if the agent needs to change the Tally::reset() function, + and wants to understand how this change will affect callers, a grep for + "reset()" will return 48 references, each of which may need to have a + large area around it also tokenized to understand if it belongs to the + tally class or if it belongs to a different class. 
More ideally the + agent would have a tool that can navigate C++ code (similar to tools + that most IDEs provide) and return only the true references (of which + there are only 4 in the repo to Tally::reset().) + +These problems are mitigated somewhat by using a model with a longer context +window. OpenMC has somewhere around ~1 million tokens of C++ and ~1 million +tokens of python. While Claude Code in early 2026 only has a context window +of 200k tokens, beta versions have extended context windows of 1M tokens, +and it's not unreasonable to assume that models may be available in the near +future that greatly exceed these limits. + +However, even assuming the entire repository can be fit within a context +window, there are several downsides to doing this. Model performance +degrads significantly as context size increases. Bencharmark results are +greatly improved if the model has less garbage to pick through. Additionally, API usage +is typically billed as tokens in/out per turn. As the context file +grows these costs become much larger. As such, there is still significant +motivation to solving the above two problems, so as to ensure only relevant +information is drawn into context so as to maximize model performance and +minimize costs. + +Setup +----- The tools are registered as an `MCP (Model Context Protocol)`_ server in ``.mcp.json`` at the repository root. AI agents that support MCP (such as Claude Code) discover them automatically on session start. The underlying Python scripts can also be run directly from the command line. +All tools run entirely locally — no API keys or external service accounts are +required. Python dependencies are installed automatically into an isolated +virtual environment at ``.claude/cache/.venv/`` on first use. + .. _MCP (Model Context Protocol): https://modelcontextprotocol.io -**openmc_rag_search** — Semantic search across the codebase (C++, Python) and -documentation (RST). 
Given a natural-language query, it returns the most -relevant code chunks with file paths, line numbers, and a preview. +RAG Semantic Search +------------------- -**openmc_rag_rebuild** — Rebuild the semantic search index. Should be called -after pulling new code or switching branches. +The RAG (Retrieval-Augmented Generation) semantic search addresses problem 1 +above — it finds code by meaning, not just text match, surfacing related code +across subsystems that ``grep`` would miss entirely. Two MCP tools are provided: -**openmc_lsp_navigate** — C++ code navigation via clangd_. Provides -``symbols``, ``definition``, ``references``, and ``related`` commands with -compiler accuracy — zero false positives. Requires clangd and -``build/compile_commands.json`` (automatically generated by CMake when OpenMC is -built). +- **openmc_rag_search** — Given a natural-language query, returns the most + relevant code chunks with file paths, line numbers, and a preview. Can search + code, documentation, or both. Can also find code related to a given file. +- **openmc_rag_rebuild** — Rebuilds the search index. Should be called after + pulling new code or switching branches. -.. _clangd: https://clangd.llvm.org +How it works +^^^^^^^^^^^^ -How the RAG search works ------------------------- - -The semantic search tool uses a Retrieval-Augmented Generation (RAG) pipeline -that runs entirely on your local CPU: +The search pipeline runs entirely on your local CPU: 1. **Chunking.** All C++, Python, and RST files are split into overlapping fixed-size windows (~1000 characters, 25% overlap). This ensures every line @@ -83,10 +102,31 @@ that runs entirely on your local CPU: .. _all-MiniLM-L6-v2: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 .. _LanceDB: https://lancedb.com -How the LSP navigation works ------------------------------ +Requirements +^^^^^^^^^^^^ + +No system dependencies beyond **Python 3.12+** with ``pip``. 
The Python +packages (``sentence-transformers``, ``lancedb``) are installed in the agent's +virtual python environment automatically. + +LSP Code Navigation +------------------- + +The LSP (Language Server Protocol) navigation tool addresses problem 2 above — +it resolves C++ symbols through the real type system, giving compiler-accurate +results where ``grep`` returns a haystack of false matches. One MCP tool is +provided: -The LSP tool launches clangd as a subprocess and communicates with it via the +- **openmc_lsp_navigate** — C++ code navigation via clangd_. Provides + ``symbols``, ``definition``, ``references``, and ``related`` commands with + compiler accuracy — zero false positives. + +.. _clangd: https://clangd.llvm.org + +How it works +^^^^^^^^^^^^ + +The tool launches clangd as a subprocess and communicates with it via the `Language Server Protocol`_. Because clangd uses the Clang compiler frontend, it resolves every symbol through the actual C++ type system — namespaces, templates, overloads, and all. When a common method name like ``reset()``, @@ -100,37 +140,8 @@ calls within a session, avoiding the startup cost on repeated queries. .. _Language Server Protocol: https://microsoft.github.io/language-server-protocol/ Requirements ------------- - -- **Python 3.12+** with ``pip`` -- **For RAG search:** No additional system dependencies. Python packages - (``sentence-transformers``, ``lancedb``) are installed automatically into an - isolated virtual environment at ``.claude/cache/.venv/``. -- **For LSP navigation:** clangd (``apt-get install clangd``) and - ``build/compile_commands.json`` (generated automatically when OpenMC is built - with CMake). - -No API keys or external service accounts are required. All computation is -local. 
- -Standalone CLI usage --------------------- - -The tools can be used directly from the command line without MCP:: - - # Set up the virtual environment (one-time) - python3 -m venv .claude/cache/.venv - .claude/cache/.venv/bin/pip install -r .claude/tools/requirements.txt - - # Build the RAG index (one-time, ~5 minutes) - .claude/cache/.venv/bin/python .claude/tools/rag/indexer.py - - # Search - .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "particle weight adjustment" - .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py "how to define tallies" --docs - .claude/cache/.venv/bin/python .claude/tools/rag/openmc_search.py --related src/simulation.cpp +^^^^^^^^^^^^ - # LSP navigation (requires clangd and compile_commands.json) - .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py symbols src/particle.cpp - .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py references src/tallies/tally.cpp:835 - .claude/cache/.venv/bin/python .claude/tools/lsp/openmc_lsp.py related src/simulation.cpp +- clangd_ (``apt-get install clangd``) +- ``build/compile_commands.json`` (generated automatically when OpenMC is built + with CMake) From 1a77bc5e62415652c00d46aa421e78aeb3deaeb1 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Mon, 9 Mar 2026 12:40:08 -0500 Subject: [PATCH 51/67] spelling --- docs/source/devguide/agentic-tools.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst index 226bdfdee80..250f4f97c62 100644 --- a/docs/source/devguide/agentic-tools.rst +++ b/docs/source/devguide/agentic-tools.rst @@ -41,7 +41,7 @@ future that greatly exceed these limits. However, even assuming the entire repository can be fit within a context window, there are several downsides to doing this. Model performance -degrads significantly as context size increases. Bencharmark results are +degrades significantly as context size increases. 
Benchmark results are greatly improved if the model has less garbage to pick through. Additionally, API usage is typically billed as tokens in/out per turn. As the context file grows these costs become much larger. As such, there is still significant From fe6f5505f7d085306c03d7a8e0639d5379b65b81 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Mon, 9 Mar 2026 17:44:49 +0000 Subject: [PATCH 52/67] added link --- docs/source/devguide/agentic-tools.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst index 250f4f97c62..878815c27f8 100644 --- a/docs/source/devguide/agentic-tools.rst +++ b/docs/source/devguide/agentic-tools.rst @@ -40,8 +40,8 @@ and it's not unreasonable to assume that models may be available in the near future that greatly exceed these limits. However, even assuming the entire repository can be fit within a context -window, there are several downsides to doing this. Model performance -degrades significantly as context size increases. Benchmark results are +window, there are several downsides to doing this. `Model performance +degrades significantly as context size increases`_. Benchmark results are greatly improved if the model has less garbage to pick through. Additionally, API usage is typically billed as tokens in/out per turn. As the context file grows these costs become much larger. As such, there is still significant @@ -61,6 +61,7 @@ All tools run entirely locally — no API keys or external service accounts are required. Python dependencies are installed automatically into an isolated virtual environment at ``.claude/cache/.venv/`` on first use. +.. _Model performance degrades significantly as context size increases: https://www.anthropic.com/news/claude-opus-4-6 .. 
_MCP (Model Context Protocol): https://modelcontextprotocol.io RAG Semantic Search From efcd09ebd967ab2bd31790a8bed1a364477eede9 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Mon, 9 Mar 2026 17:48:28 +0000 Subject: [PATCH 53/67] removing rename --- .../{openmc-code-review => reviewing-openmc-code}/SKILL.md | 2 +- AGENTS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename .claude/skills/{openmc-code-review => reviewing-openmc-code}/SKILL.md (99%) diff --git a/.claude/skills/openmc-code-review/SKILL.md b/.claude/skills/reviewing-openmc-code/SKILL.md similarity index 99% rename from .claude/skills/openmc-code-review/SKILL.md rename to .claude/skills/reviewing-openmc-code/SKILL.md index 9c13992fd0d..d92a22d8142 100644 --- a/.claude/skills/openmc-code-review/SKILL.md +++ b/.claude/skills/reviewing-openmc-code/SKILL.md @@ -1,5 +1,5 @@ --- -name: openmc-code-review +name: reviewing-openmc-code description: Reviews code changes in the OpenMC codebase against OpenMC's contribution criteria (correctness, testing, physics soundness, style, design, performance, docs, dependencies). Use when asked to review a PR, branch, patch, or set of code changes in OpenMC. --- diff --git a/AGENTS.md b/AGENTS.md index f0e03735a7c..dce32d0e476 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -40,7 +40,7 @@ OpenMC uses a git flow branching model with two primary branches: ### Instructions for Code Review -When reviewing code changes in this repository, use the `openmc-code-review` skill. +When reviewing code changes in this repository, use the `reviewing-openmc-code` skill. 
### Workflow for contributors From c0c66ea36af13132f72ca23a9a436a884e477d75 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Mon, 9 Mar 2026 20:55:51 -0500 Subject: [PATCH 54/67] ran auto pep 8 --- .claude/tools/lsp/openmc_lsp.py | 7 ++++--- .claude/tools/openmc_mcp_server.py | 2 +- .claude/tools/rag/embeddings.py | 2 +- .claude/tools/rag/indexer.py | 7 +++---- .claude/tools/rag/openmc_search.py | 6 ++++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py index 430ec231ee1..a071a9f8370 100644 --- a/.claude/tools/lsp/openmc_lsp.py +++ b/.claude/tools/lsp/openmc_lsp.py @@ -113,7 +113,7 @@ def request(self, method, params=None): """Send a request and wait for the response.""" rid = self._next_id() self._send({"jsonrpc": "2.0", "id": rid, "method": method, - "params": params or {}}) + "params": params or {}}) while True: msg = self._read_msg() if msg.get('id') == rid: @@ -124,7 +124,7 @@ def request(self, method, params=None): def notify(self, method, params=None): """Send a notification (no response expected).""" self._send({"jsonrpc": "2.0", "method": method, - "params": params or {}}) + "params": params or {}}) def _read_msg(self): headers = {} @@ -374,7 +374,8 @@ def cmd_related(client, filepath, top_k=15): flat = flatten_symbols(symbols) # Filter to meaningful symbols (functions, classes, methods, variables) - interesting_kinds = {5, 6, 8, 12, 13, 23} # Class, Method, Field, Function, Variable, Struct + # Class, Method, Field, Function, Variable, Struct + interesting_kinds = {5, 6, 8, 12, 13, 23} interesting = [(s, d) for s, d in flat if s['kind'] in interesting_kinds] if not interesting: diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index f250a460d76..ea03df82e04 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -7,6 +7,7 @@ - openmc_lsp_navigate: LSP-based C++ code navigation via clangd """ +from 
mcp.server.fastmcp import FastMCP import json import logging import subprocess @@ -21,7 +22,6 @@ logging.getLogger("huggingface_hub").setLevel(logging.ERROR) logging.getLogger("sentence_transformers").setLevel(logging.WARNING) -from mcp.server.fastmcp import FastMCP OPENMC_ROOT = Path(__file__).resolve().parents[2] CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index 93cc8d3d2c5..cfb926961c2 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -3,6 +3,7 @@ Requires: pip install sentence-transformers """ +import transformers import os # Suppress noisy HuggingFace/transformers output. @@ -13,7 +14,6 @@ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") -import transformers transformers.logging.disable_progress_bar() diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index 17ac391d585..889e1ec36a4 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -6,6 +6,9 @@ Output: .claude/cache/rag_index/ (LanceDB directory) """ +from embeddings import EmbeddingProvider +from chunker import chunk_file +import lancedb import sys import time from pathlib import Path @@ -14,10 +17,6 @@ TOOLS_DIR = Path(__file__).resolve().parent.parent sys.path.insert(0, str(TOOLS_DIR / "rag")) -import lancedb - -from chunker import chunk_file -from embeddings import EmbeddingProvider OPENMC_ROOT = Path(__file__).resolve().parents[3] CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index eb39125ef61..e927e887256 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -166,8 +166,10 @@ def main(): results = search_related(db, embedder, args.related, args.top_k) print(format_results(results, f"Code related to {args.related}")) elif args.all: - code_results = 
search_table(db, embedder, "code", args.query, args.top_k) - doc_results = search_table(db, embedder, "docs", args.query, args.top_k) + code_results = search_table( + db, embedder, "code", args.query, args.top_k) + doc_results = search_table( + db, embedder, "docs", args.query, args.top_k) print(format_results(code_results, "Code")) print(format_results(doc_results, "Documentation")) elif args.docs: From 539dfd085b7a3fa8b6906bcbcbbf807175bf1eb4 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 02:10:43 +0000 Subject: [PATCH 55/67] fixed splitting issue --- .claude/tools/rag/chunker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/tools/rag/chunker.py b/.claude/tools/rag/chunker.py index da323a53e46..aa4fb2cd26d 100644 --- a/.claude/tools/rag/chunker.py +++ b/.claude/tools/rag/chunker.py @@ -37,7 +37,7 @@ def chunk_file(filepath, openmc_root): # Build a char-offset → line-number map line_starts = [] offset = 0 - for line in lines: + for line in content.split("\n"): line_starts.append(offset) offset += len(line) + 1 # +1 for newline From 6de04408edd2151a30dc9b37a27ea43f2893ef0b Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 02:26:14 +0000 Subject: [PATCH 56/67] removed LSP tool for now --- .claude/tools/lsp/openmc_lsp.py | 513 ------------------------- .claude/tools/openmc_mcp_server.py | 76 +--- CLAUDE.md | 17 +- docs/source/devguide/agentic-tools.rst | 72 +--- 4 files changed, 16 insertions(+), 662 deletions(-) delete mode 100644 .claude/tools/lsp/openmc_lsp.py diff --git a/.claude/tools/lsp/openmc_lsp.py b/.claude/tools/lsp/openmc_lsp.py deleted file mode 100644 index a071a9f8370..00000000000 --- a/.claude/tools/lsp/openmc_lsp.py +++ /dev/null @@ -1,513 +0,0 @@ -#!/usr/bin/env python3 -"""LSP-based code navigation for OpenMC using clangd. - -Uses the Language Server Protocol to provide compiler-accurate symbol -resolution, go-to-definition, find-references, and related-file discovery. 
-Unlike tree-sitter-based tools, this resolves symbols through the actual -C++ type system — no false edges from name collisions. - -Requires: - - clangd (apt-get install clangd, or clangd-15/clangd-16/etc.) - - build/compile_commands.json (automatically generated when OpenMC is built with cmake) - -Usage: - openmc_lsp.py symbols src/simulation.cpp - openmc_lsp.py definition src/simulation.cpp:132 - openmc_lsp.py references src/simulation.cpp:132 - openmc_lsp.py related src/simulation.cpp - openmc_lsp.py related src/simulation.cpp --top-k 20 - -Examples: - openmc_lsp.py symbols src/particle.cpp - openmc_lsp.py definition src/simulation.cpp:132 # where is write_message defined? - openmc_lsp.py references include/openmc/error.h:55 # who calls write_message? - openmc_lsp.py related src/simulation.cpp # files connected by real references -""" - -import argparse -import json -import os -import re -import shutil -import subprocess -import sys -import time -import urllib.parse -from collections import Counter, defaultdict -from pathlib import Path - -OPENMC_ROOT = Path(__file__).resolve().parents[3] - -# Symbol kind names (LSP spec) -SYMBOL_KINDS = { - 1: "File", 2: "Module", 3: "Namespace", 4: "Package", 5: "Class", - 6: "Method", 7: "Property", 8: "Field", 9: "Constructor", 10: "Enum", - 11: "Interface", 12: "Function", 13: "Variable", 14: "Constant", - 15: "String", 16: "Number", 17: "Boolean", 18: "Array", 19: "Object", - 20: "Key", 21: "Null", 22: "EnumMember", 23: "Struct", 24: "Event", - 25: "Operator", 26: "TypeParameter", -} - - -class ClangdClient: - """Minimal LSP client that talks to clangd via JSON-RPC over stdin/stdout.""" - - def __init__(self, compile_commands_dir=None): - clangd = self._find_clangd() - if not clangd: - raise RuntimeError( - "clangd not found. 
Install with: apt-get install clangd") - - if not compile_commands_dir: - compile_commands_dir = self._find_compile_commands() - if not compile_commands_dir: - raise RuntimeError( - "compile_commands.json not found. Build OpenMC with " - "cmake first (it generates this file automatically).") - - args = [clangd, '--compile-commands-dir=' + str(compile_commands_dir)] - self.proc = subprocess.Popen( - args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - self._id = 0 - self._opened_files = set() - self._initialize() - - def _find_clangd(self): - """Find clangd binary, trying common names.""" - for name in ['clangd', 'clangd-15', 'clangd-16', 'clangd-17', 'clangd-18']: - path = shutil.which(name) - if path: - return path - return None - - def _find_compile_commands(self): - """Find compile_commands.json in common locations.""" - for d in [OPENMC_ROOT / 'build', OPENMC_ROOT]: - if (d / 'compile_commands.json').exists(): - return str(d) - return None - - def _initialize(self): - """Send LSP initialize/initialized handshake.""" - self.request("initialize", { - "processId": os.getpid(), - "rootUri": OPENMC_ROOT.as_uri(), - "capabilities": {} - }) - self.notify("initialized") - - def _next_id(self): - self._id += 1 - return self._id - - def _send(self, msg_dict): - body = json.dumps(msg_dict) - encoded = body.encode('utf-8') - header = f"Content-Length: {len(encoded)}\r\n\r\n" - self.proc.stdin.write(header.encode('ascii') + encoded) - self.proc.stdin.flush() - - def request(self, method, params=None): - """Send a request and wait for the response.""" - rid = self._next_id() - self._send({"jsonrpc": "2.0", "id": rid, "method": method, - "params": params or {}}) - while True: - msg = self._read_msg() - if msg.get('id') == rid: - if 'error' in msg: - return None - return msg.get('result') - - def notify(self, method, params=None): - """Send a notification (no response expected).""" - self._send({"jsonrpc": "2.0", "method": method, - "params": 
params or {}}) - - def _read_msg(self): - headers = {} - while True: - line = self.proc.stdout.readline() - if not line: - raise EOFError("clangd process terminated") - line = line.decode('utf-8').strip() - if not line: - break - k, v = line.split(': ', 1) - headers[k] = v - length = int(headers['Content-Length']) - body = self.proc.stdout.read(length) - return json.loads(body) - - def open_file(self, filepath): - """Open a file in clangd and wait for it to be indexed.""" - fpath = Path(filepath) - if not fpath.is_absolute(): - fpath = OPENMC_ROOT / fpath - uri = fpath.as_uri() - if uri in self._opened_files: - return uri - text = fpath.read_text() - self.notify("textDocument/didOpen", { - "textDocument": { - "uri": uri, "languageId": "cpp", "version": 1, "text": text - } - }) - self._opened_files.add(uri) - # Give clangd time to parse. First file takes longer (preamble build). - wait = 8 if len(self._opened_files) == 1 else 3 - time.sleep(wait) - return uri - - def get_symbols(self, filepath): - """Get all symbols defined in a file.""" - uri = self.open_file(filepath) - result = self.request("textDocument/documentSymbol", { - "textDocument": {"uri": uri} - }) - return result or [] - - def get_definition(self, filepath, line, character): - """Get definition location for symbol at position.""" - uri = self.open_file(filepath) - result = self.request("textDocument/definition", { - "textDocument": {"uri": uri}, - "position": {"line": line, "character": character} - }) - return result or [] - - def get_references(self, filepath, line, character, - include_declaration=True): - """Get all references to symbol at position.""" - uri = self.open_file(filepath) - result = self.request("textDocument/references", { - "textDocument": {"uri": uri}, - "position": {"line": line, "character": character}, - "context": {"includeDeclaration": include_declaration} - }) - return result or [] - - def close(self): - """Shutdown clangd cleanly.""" - try: - self.request("shutdown") - 
self.notify("exit") - self.proc.wait(timeout=5) - except Exception: - self.proc.kill() - - -def uri_to_relpath(uri): - """Convert file:// URI to path relative to OPENMC_ROOT.""" - path = urllib.parse.unquote(uri.replace('file://', '')) - try: - return str(Path(path).relative_to(OPENMC_ROOT)) - except ValueError: - return path - - -def is_project_file(relpath): - """Check if a path is an OpenMC project file (not system/vendor).""" - if relpath.startswith('/'): - return False # absolute path = system header - if relpath.startswith('vendor/'): - return False - return True - - -def get_symbol_range(sym): - """Extract start line/character from either SymbolInformation or DocumentSymbol.""" - # DocumentSymbol format: has 'range' and 'selectionRange' at top level - if 'selectionRange' in sym: - return sym['selectionRange']['start'] - # DocumentSymbol without selectionRange - if 'range' in sym and isinstance(sym['range'], dict) and 'start' in sym['range']: - return sym['range']['start'] - # SymbolInformation format: has 'location.range' - if 'location' in sym: - return sym['location']['range']['start'] - return {'line': 0, 'character': 0} - - -def flatten_symbols(symbols, depth=0): - """Flatten nested document symbols into a flat list with depth info.""" - result = [] - for s in symbols: - result.append((s, depth)) - children = s.get('children', []) - if children: - result.extend(flatten_symbols(children, depth + 1)) - return result - - -def cmd_symbols(client, filepath): - """List all symbols defined in a file.""" - symbols = client.get_symbols(filepath) - flat = flatten_symbols(symbols) - if not flat: - return "No symbols found." 
- lines = [] - for sym, depth in flat: - kind_name = SYMBOL_KINDS.get(sym['kind'], f"kind={sym['kind']}") - start = get_symbol_range(sym) - line = start['line'] - indent = " " * depth - lines.append(f"{indent}{kind_name}: {sym['name']} (line {line + 1})") - return "\n".join(lines) - - -def find_symbol_on_line(client, filepath, line_1based): - """Find the character position of the symbol name on a given line. - - Uses clangd's document symbols to identify what symbol is defined or - referenced on the target line, then locates the symbol name in the - line text. Falls back to the first identifier on the line if no - symbol matches. - - Returns (character_0based, line_text) or (None, None) if the line - doesn't exist. - """ - fpath = Path(filepath) - if not fpath.is_absolute(): - fpath = OPENMC_ROOT / fpath - file_lines = fpath.read_text().split('\n') - line_0 = line_1based - 1 - if line_0 < 0 or line_0 >= len(file_lines): - return None, None - text = file_lines[line_0] - - # Try to find a symbol defined on this line via document symbols - symbols = client.get_symbols(filepath) - flat = flatten_symbols(symbols) - for sym, _depth in flat: - start = get_symbol_range(sym) - if start['line'] == line_0: - col = text.find(sym['name'], start['character']) - if col >= 0: - # clangd may report "Class::method" — land on the method name - sep = sym['name'].rfind('::') - if sep >= 0: - col += sep + 2 - return col, text - - # No symbol definition on this line — find the most specific identifier. - # For qualified names like Tally::reset(), prefer the method (after ::). 
- cpp_keywords = { - 'void', 'int', 'double', 'float', 'char', 'bool', 'long', - 'short', 'unsigned', 'signed', 'const', 'static', 'virtual', - 'inline', 'extern', 'auto', 'return', 'if', 'else', 'for', - 'while', 'do', 'switch', 'case', 'break', 'continue', - 'struct', 'class', 'enum', 'namespace', 'using', 'typedef', - 'template', 'typename', 'public', 'private', 'protected', - 'override', 'final', 'explicit', 'noexcept', 'constexpr', - } - - # First pass: look for the name after :: (the most specific symbol) - for m in re.finditer(r'::(\w+)', text): - name = m.group(1) - if name not in cpp_keywords: - return m.start(1), text - - # Second pass: first non-keyword identifier - for m in re.finditer(r'[A-Za-z_]\w*', text): - if m.group() not in cpp_keywords: - return m.start(), text - # Last resort: first non-whitespace - return len(text) - len(text.lstrip()), text - - -def cmd_definition(client, filepath, line, character=None): - """Find the definition of a symbol.""" - if character is None: - character, _ = find_symbol_on_line(client, filepath, line) - if character is None: - return "Could not determine symbol on that line." - - result = client.get_definition(filepath, line - 1, character) - if not result: - return "No definition found." - - if isinstance(result, dict): - result = [result] - lines = [] - for loc in result: - rel = uri_to_relpath(loc['uri']) - ln = loc['range']['start']['line'] + 1 - lines.append(f" {rel}:{ln}") - return "\n".join(lines) - - -def cmd_references(client, filepath, line, character=None): - """Find all references to a symbol.""" - if character is None: - character, _ = find_symbol_on_line(client, filepath, line) - if character is None: - return "Could not determine symbol on that line." - - result = client.get_references(filepath, line - 1, character) - if not result: - return "No references found." 
- - # Group by file - by_file = defaultdict(list) - for loc in result: - rel = uri_to_relpath(loc['uri']) - ln = loc['range']['start']['line'] + 1 - by_file[rel].append(ln) - - output = [f"{len(result)} references across {len(by_file)} files:\n"] - for fpath, lines_list in sorted(by_file.items()): - lines_str = ", ".join(str(l) for l in sorted(lines_list)) - output.append(f" {fpath}:{lines_str}") - return "\n".join(output) - - -def cmd_related(client, filepath, top_k=15): - """Find files related to a given file through real typed references. - - For each symbol defined in the target file, finds all files that - reference it. Returns files ranked by connection count. - """ - symbols = client.get_symbols(filepath) - flat = flatten_symbols(symbols) - - # Filter to meaningful symbols (functions, classes, methods, variables) - # Class, Method, Field, Function, Variable, Struct - interesting_kinds = {5, 6, 8, 12, 13, 23} - interesting = [(s, d) for s, d in flat if s['kind'] in interesting_kinds] - - if not interesting: - return "No interesting symbols found in file." - - target_rel = filepath - if Path(filepath).is_absolute(): - target_rel = str(Path(filepath).relative_to(OPENMC_ROOT)) - - file_connections = Counter() # file -> number of symbols referencing it - symbol_details = defaultdict(set) # file -> set of symbol names - - # Read the file so we can find exact symbol name positions - fpath_obj = Path(filepath) - if not fpath_obj.is_absolute(): - fpath_obj = OPENMC_ROOT / fpath_obj - file_lines = fpath_obj.read_text().split('\n') - - for sym, depth in interesting: - start = get_symbol_range(sym) - line = start['line'] - char = start['character'] - - # The range start may point to the type, not the symbol name. - # Find the actual symbol name position within the line. 
- if line < len(file_lines): - name_col = file_lines[line].find(sym['name'], char) - if name_col >= 0: - char = name_col - - refs = client.get_references(filepath, line, char, - include_declaration=False) - if not refs: - continue - - for loc in refs: - rel = uri_to_relpath(loc['uri']) - if rel == target_rel: - continue - if not is_project_file(rel): - continue - file_connections[rel] += 1 - symbol_details[rel].add(sym['name']) - - if not file_connections: - return "No external references found." - - output = [f"Files related to {target_rel} " - f"(ranked by typed reference count):\n"] - for fpath, count in file_connections.most_common(top_k): - syms = sorted(symbol_details[fpath]) - sym_preview = ", ".join(syms[:5]) - if len(syms) > 5: - sym_preview += f", ... (+{len(syms)-5} more)" - output.append(f" [{count:3d} refs] {fpath}") - output.append(f" via: {sym_preview}") - return "\n".join(output) - - -def parse_file_location(location): - """Parse 'filepath:line' into (filepath, line) or (filepath, None).""" - # Handle filepath:line format - parts = location.rsplit(':', 1) - if len(parts) == 2: - try: - line = int(parts[1]) - return parts[0], line - except ValueError: - pass - return location, None - - -def main(): - parser = argparse.ArgumentParser( - description="LSP-based code navigation for OpenMC (via clangd)", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog="""examples: - %(prog)s symbols src/simulation.cpp - %(prog)s definition src/simulation.cpp:132 - %(prog)s references src/simulation.cpp:132 - %(prog)s related src/simulation.cpp - %(prog)s related src/simulation.cpp --top-k 20""", - ) - parser.add_argument("command", - choices=["symbols", "definition", "references", - "related"], - help="Command to run") - parser.add_argument("location", - help="File path, or file:line for definition/references") - parser.add_argument("--top-k", type=int, default=15, - help="Number of related files to show (default: 15)") - 
parser.add_argument("--compile-commands-dir", type=str, default=None, - help="Directory containing compile_commands.json") - - args = parser.parse_args() - - filepath, line = parse_file_location(args.location) - - # Validate file exists - fpath = Path(filepath) - if not fpath.is_absolute(): - fpath = OPENMC_ROOT / fpath - if not fpath.exists(): - print(f"ERROR: File not found: {filepath}", file=sys.stderr) - sys.exit(1) - - try: - client = ClangdClient(compile_commands_dir=args.compile_commands_dir) - except RuntimeError as e: - print(f"ERROR: {e}", file=sys.stderr) - sys.exit(1) - - try: - if args.command == "symbols": - print(cmd_symbols(client, filepath)) - elif args.command == "definition": - if line is None: - print("ERROR: definition requires file:line format", - file=sys.stderr) - sys.exit(1) - print(cmd_definition(client, filepath, line)) - elif args.command == "references": - if line is None: - print("ERROR: references requires file:line format", - file=sys.stderr) - sys.exit(1) - print(cmd_references(client, filepath, line)) - elif args.command == "related": - print(cmd_related(client, filepath, top_k=args.top_k)) - finally: - client.close() - - -if __name__ == "__main__": - main() diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index ea03df82e04..559e615b871 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 """MCP server providing OpenMC code navigation tools. 
-Exposes three tools: +Exposes two tools: - openmc_rag_search: Semantic search across the codebase and docs - openmc_rag_rebuild: Rebuild the RAG vector index - - openmc_lsp_navigate: LSP-based C++ code navigation via clangd """ from mcp.server.fastmcp import FastMCP @@ -31,7 +30,6 @@ # Add tool subdirectories to path for imports TOOLS_DIR = Path(__file__).resolve().parent sys.path.insert(0, str(TOOLS_DIR / "rag")) -sys.path.insert(0, str(TOOLS_DIR / "lsp")) mcp = FastMCP("openmc-code-tools") @@ -39,7 +37,6 @@ # Session state # --------------------------------------------------------------------------- _rag_first_call = True -_lsp_client = None # Keep clangd alive across calls # --------------------------------------------------------------------------- @@ -219,76 +216,5 @@ def openmc_rag_rebuild() -> str: return f"Error rebuilding index: {e}" -@mcp.tool() -def openmc_lsp_navigate( - command: str, - location: str, - top_k: int = 15, -) -> str: - """LSP-based C++ code navigation via clangd. Compiler-accurate symbol - resolution — resolves namespaces, templates, and overloads through the - real C++ type system. Zero false positives. - - Commands: - symbols — list all symbols defined in a file (location = file path) - definition — jump to where the symbol on a given line is defined - (location = file:line) - references — find every file+line that references the symbol - (location = file:line) - related — rank other files by how many typed connections they share - with this file (location = file path) - - Requires clangd and build/compile_commands.json. - - Args: - command: "symbols", "definition", "references", or "related" - location: File path or file:line (e.g. 
"src/simulation.cpp:132") - top_k: For 'related' — number of files to return (default 15) - """ - global _lsp_client - - try: - from openmc_lsp import ( - ClangdClient, parse_file_location, - cmd_symbols, cmd_definition, cmd_references, cmd_related, - ) - - filepath, line = parse_file_location(location) - - # Validate file exists - fpath = Path(filepath) - if not fpath.is_absolute(): - fpath = OPENMC_ROOT / fpath - if not fpath.exists(): - return f"Error: File not found: {filepath}" - - # Initialize or reuse clangd client - if _lsp_client is None: - _lsp_client = ClangdClient() - - if command == "symbols": - return cmd_symbols(_lsp_client, filepath) - elif command == "definition": - if line is None: - return ("Error: 'definition' requires file:line format " - "(e.g. 'src/simulation.cpp:132')") - return cmd_definition(_lsp_client, filepath, line) - elif command == "references": - if line is None: - return ("Error: 'references' requires file:line format " - "(e.g. 'src/simulation.cpp:132')") - return cmd_references(_lsp_client, filepath, line) - elif command == "related": - return cmd_related(_lsp_client, filepath, top_k=top_k) - else: - return (f"Error: Unknown command '{command}'. " - f"Use: symbols, definition, references, related") - except RuntimeError as e: - return f"Error: {e}" - except Exception as e: - _lsp_client = None # reset on unexpected failure - return f"Error during LSP navigation: {e}" - - if __name__ == "__main__": mcp.run() diff --git a/CLAUDE.md b/CLAUDE.md index 85a643c88e7..85c813346c6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,6 @@ ## OpenMC Codebase Tools -Three MCP tools are available for navigating the OpenMC codebase. They are +Two MCP tools are available for navigating the OpenMC codebase. They are registered in `.mcp.json` and appear automatically in every session. ### Tool overview @@ -22,11 +22,6 @@ choice to the user. Do not ask conversationally — always use the widget. 
Do not skip this step even if the index looks current — the user may have uncommitted changes that warrant a rebuild. -**`openmc_lsp_navigate`** — LSP navigation via clangd. Resolves C++ symbols -through the real type system — namespaces, templates, overloads. Commands: -`symbols`, `definition`, `references`, `related`. Zero false positives. -Requires clangd and `build/compile_commands.json`. - ### Important: use RAG search before grep When exploring unfamiliar code or checking what a change might affect, use @@ -46,9 +41,8 @@ each area will greatly improve your global vision of the repository and help you to "know what you don't know". When you already know the exact symbol name and need to trace its usage (e.g., -"every line that writes to `progeny_per_particle`"), `grep` or -`openmc_lsp_navigate` are better choices — you don't have to force a RAG search -for precise symbol lookups. +"every line that writes to `progeny_per_particle`"), `grep` is the better +choice — you don't have to force a RAG search for precise symbol lookups. ### When to use each tool @@ -56,11 +50,6 @@ for precise symbol lookups. discovery by meaning, cross-cutting concerns, Python and docs. **Use this before grep when exploring unfamiliar code or checking what a change might affect.** -- **`openmc_lsp_navigate`**: "Where is this C++ symbol defined, who calls it, - and what files are truly connected to this one?" — compiler-accurate file:line - locations, zero false positives. When a common method name like `reset`, `get`, - `size`, or `create` is used by multiple classes, `grep` gives you a haystack — - LSP gives you the needle. - **`grep`/`Glob`/`Read`**: Precise text match, unique string lookup, reading specific files. Best when you know the exact symbol name. 
diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst index 878815c27f8..ae15ce8854d 100644 --- a/docs/source/devguide/agentic-tools.rst +++ b/docs/source/devguide/agentic-tools.rst @@ -13,26 +13,15 @@ Motivation ---------- Agentic tools like Claude Code are skilled at using grep to navigate and -understand large code bases. However, this approach has a few weaknesses: - -1. Inability to foresee how changing an assumption in one place in the code - may affect other areas. Without a "global view" of the codebase that - a human developer will build up over time, the agent is generally blind - to any file it hasn't tokenized fully. While it can grep to see who else - calls a function, it remains blind if other areas might be related but - not share identical naming conventions. - -2. Pollution/overload of context window by grepping common variable names. - For instance, if the agent needs to change the Tally::reset() function, - and wants to understand how this change will affect callers, a grep for - "reset()" will return 48 references, each of which may need to have a - large area around it also tokenized to understand if it belongs to the - tally class or if it belongs to a different class. More ideally the - agent would have a tool that can navigate C++ code (similar to tools - that most IDEs provide) and return only the true references (of which - there are only 4 in the repo to Tally::reset().) - -These problems are mitigated somewhat by using a model with a longer context +understand large code bases. However, grep can only find exact text matches — +it cannot discover code that is *conceptually* related but uses different +naming. Without a "global view" of the codebase that a human developer will +build up over time, the agent is generally blind to any file it hasn't +tokenized fully. 
While it can grep to see who else calls a function, it +remains blind if other areas might be related but not share identical naming +conventions. + +This problem is mitigated somewhat by using a model with a longer context window. OpenMC has somewhere around ~1 million tokens of C++ and ~1 million tokens of python. While Claude Code in early 2026 only has a context window of 200k tokens, beta versions have extended context windows of 1M tokens, @@ -45,7 +34,7 @@ degrades significantly as context size increases`_. Benchmark results are greatly improved if the model has less garbage to pick through. Additionally, API usage is typically billed as tokens in/out per turn. As the context file grows these costs become much larger. As such, there is still significant -motivation to solving the above two problems, so as to ensure only relevant +motivation to solving the above problem, so as to ensure only relevant information is drawn into context so as to maximize model performance and minimize costs. @@ -67,8 +56,8 @@ virtual environment at ``.claude/cache/.venv/`` on first use. RAG Semantic Search ------------------- -The RAG (Retrieval-Augmented Generation) semantic search addresses problem 1 -above — it finds code by meaning, not just text match, surfacing related code +The RAG (Retrieval-Augmented Generation) semantic search addresses this +problem — it finds code by meaning, not just text match, surfacing related code across subsystems that ``grep`` would miss entirely. Two MCP tools are provided: - **openmc_rag_search** — Given a natural-language query, returns the most @@ -109,40 +98,3 @@ Requirements No system dependencies beyond **Python 3.12+** with ``pip``. The Python packages (``sentence-transformers``, ``lancedb``) are installed in the agent's virtual python environment automatically. 
- -LSP Code Navigation -------------------- - -The LSP (Language Server Protocol) navigation tool addresses problem 2 above — -it resolves C++ symbols through the real type system, giving compiler-accurate -results where ``grep`` returns a haystack of false matches. One MCP tool is -provided: - -- **openmc_lsp_navigate** — C++ code navigation via clangd_. Provides - ``symbols``, ``definition``, ``references``, and ``related`` commands with - compiler accuracy — zero false positives. - -.. _clangd: https://clangd.llvm.org - -How it works -^^^^^^^^^^^^ - -The tool launches clangd as a subprocess and communicates with it via the -`Language Server Protocol`_. Because clangd uses the Clang compiler frontend, -it resolves every symbol through the actual C++ type system — namespaces, -templates, overloads, and all. When a common method name like ``reset()``, -``get()``, or ``size()`` is used by multiple classes, ``grep`` returns dozens -of mixed hits; the LSP tool returns only references to the specific method you -asked about. - -When used through the MCP server, the clangd process is kept alive across -calls within a session, avoiding the startup cost on repeated queries. - -.. _Language Server Protocol: https://microsoft.github.io/language-server-protocol/ - -Requirements -^^^^^^^^^^^^ - -- clangd_ (``apt-get install clangd``) -- ``build/compile_commands.json`` (generated automatically when OpenMC is built - with CMake) From 89f4b520d0be2939ebb111cdb014fc5b926ea264 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 02:46:50 +0000 Subject: [PATCH 57/67] added hint about needing longer queries with the RAG --- CLAUDE.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 85c813346c6..960f71b9f05 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -9,7 +9,9 @@ registered in `.mcp.json` and appear automatically in every session. docs). Finds code by meaning, not just text match. 
Surfaces related code across subsystems even when naming differs (e.g., "particle RNG seeding" finds code across transport, restart, and random ray modes — files you would never find -with `grep "particle seed"`). +with `grep "particle seed"`). The index uses a small 22M-param embedding model +(384-dim). Phrase-level natural-language queries work much better than single +keywords or symbol names. **`openmc_rag_rebuild`** — Rebuild the RAG vector index. Call after pulling new code or switching branches. The first RAG search of each session will report From a1c0fab4acd8ae56f4701207f35f848bcd9c9bae Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 03:47:17 +0000 Subject: [PATCH 58/67] tweaks to claude.md, dialing back RAG usage --- CLAUDE.md | 50 ++++++++++++-------------------------------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 960f71b9f05..f0e6d5cca5f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -24,41 +24,15 @@ choice to the user. Do not ask conversationally — always use the widget. Do not skip this step even if the index looks current — the user may have uncommitted changes that warrant a rebuild. -### Important: use RAG search before grep +### Why RAG matters -When exploring unfamiliar code or checking what a change might affect, use -`openmc_rag_search` **before** reaching for `grep` or `Glob`. `grep` only finds -exact text matches — it cannot find code that does something similar with -different naming. You don't know what you don't know. The RAG search finds code -by semantic meaning, surfacing related code across subsystems that you would -otherwise miss entirely. Use RAG for discovery, then `grep`/`Read` to drill into -specific files. +OpenMC is large enough that changes in one subsystem can silently break +invariants that distant subsystems depend on — and those distant files often +use different naming, so grep won't find them. 
The RAG search finds code by +meaning, surfacing files you wouldn't have thought to open. -**No code review or feature creation job in OpenMC is to be considered complete -unless you have made good use of `openmc_rag_search` to ensure any logic changes -do not cause collateral damage elsewhere in the codebase.** The ideal time for -calling it is before you begin to make changes or start zooming in on small local -details. Thinking up a few RAG queries before starting analysis/editing work in -each area will greatly improve your global vision of the repository and help you -to "know what you don't know". - -When you already know the exact symbol name and need to trace its usage (e.g., -"every line that writes to `progeny_per_particle`"), `grep` is the better -choice — you don't have to force a RAG search for precise symbol lookups. - -### When to use each tool - -- **`openmc_rag_search`**: "What code is conceptually related to X?" — broad - discovery by meaning, cross-cutting concerns, Python and docs. **Use this - before grep when exploring unfamiliar code or checking what a change might - affect.** -- **`grep`/`Glob`/`Read`**: Precise text match, unique string lookup, reading - specific files. Best when you know the exact symbol name. - -### Why global awareness matters - -An agent reviewed a large OpenMC PR using only diff, grep, and Read. It found -1 of 11 serious bugs. Here is its own post-mortem: +An agent reviewed a large OpenMC PR without RAG. It found 1 of 11 serious +bugs. Its post-mortem: > **I treated the diff as a closed system.** I verified internal consistency of > the changed code obsessively, but never built a global understanding of how @@ -77,11 +51,11 @@ An agent reviewed a large OpenMC PR using only diff, grep, and Read. It found > review. They directly surfaced the files containing the bugs I missed — files > I never thought to open because they weren't in the diff. 
-The takeaway: **use RAG throughout your work to maintain global awareness.** -Before diving into details, ask "what else in this codebase depends on the -behavior being changed?" As you explore each area, keep querying to build your -mental map of affected subsystems. The diff tells you *what* changed; RAG tells -you *what else cares*. +The takeaway: when reviewing or modifying code, ask yourself "what else in this +codebase might depend on the behavior I'm changing?" If you aren't sure, that's +a good time for a RAG query. It won't replace the grep-based investigation you +should already be doing — but it can surface files you wouldn't have thought to +open. ## Additional OpenMC info From 5b1e1af0129f3a0f215e5d41f0a18707448a5610 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 04:24:02 +0000 Subject: [PATCH 59/67] Adding some env checks to MCP server launcher --- .claude/tools/start_server.sh | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/.claude/tools/start_server.sh b/.claude/tools/start_server.sh index cc749400d1a..0b60e61644a 100755 --- a/.claude/tools/start_server.sh +++ b/.claude/tools/start_server.sh @@ -5,10 +5,29 @@ set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" CACHE_DIR="$(dirname "$SCRIPT_DIR")/cache" VENV_DIR="$CACHE_DIR/.venv" +SENTINEL="$VENV_DIR/.installed" -if [ ! -d "$VENV_DIR" ]; then +if ! command -v python3 >/dev/null 2>&1; then + echo "Error: python3 not found on PATH." >&2 + exit 1 +fi + +if ! python3 -c 'import sys; assert sys.version_info >= (3,12)' 2>/dev/null; then + echo "Error: Python 3.12+ is required." >&2 + exit 1 +fi + +if [ ! -f "$SENTINEL" ]; then + rm -rf "$VENV_DIR" python3 -m venv "$VENV_DIR" - "$VENV_DIR/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt" + + if ! "$VENV_DIR/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt"; then + echo "Error: pip install failed. Remove $VENV_DIR and retry." 
>&2 + rm -rf "$VENV_DIR" + exit 1 + fi + + touch "$SENTINEL" fi exec "$VENV_DIR/bin/python" "$SCRIPT_DIR/openmc_mcp_server.py" From 0aa99ea220ffffc4b924e91e46d3bc79e9cfea45 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 04:28:07 +0000 Subject: [PATCH 60/67] cleanup of python comments/docstrings etc --- .claude/tools/openmc_mcp_server.py | 45 +++++++++++++++++++++--------- .claude/tools/rag/chunker.py | 22 +++++++++++---- .claude/tools/rag/embeddings.py | 19 +++++++++---- .claude/tools/rag/indexer.py | 20 ++++++++++--- .claude/tools/rag/openmc_search.py | 23 ++++++++++----- 5 files changed, 94 insertions(+), 35 deletions(-) diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index 559e615b871..0bcc596b9d0 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -1,9 +1,24 @@ #!/usr/bin/env python3 -"""MCP server providing OpenMC code navigation tools. - -Exposes two tools: - - openmc_rag_search: Semantic search across the codebase and docs - - openmc_rag_rebuild: Rebuild the RAG vector index +"""MCP server that exposes OpenMC's RAG semantic search to AI coding agents. + +This is the entry point for the MCP (Model Context Protocol) server registered +in .mcp.json at the repo root. When an MCP-capable agent (e.g. Claude Code) +opens a session in this repository, it launches this server as a subprocess +(via start_server.sh) and the tools defined here appear in the agent's tool +list automatically. + +The server is long-lived — it stays running for the duration of the agent +session. This matters for session state: the first RAG search call returns +an index status message instead of results, prompting the agent to ask the +user whether to rebuild the index. That first-call flag resets each session. 
+ +Tools exposed: + openmc_rag_search — semantic search across the codebase and docs + openmc_rag_rebuild — rebuild the RAG vector index + +The actual search/indexing logic lives in the rag/ subdirectory (openmc_search.py, +indexer.py, chunker.py, embeddings.py). This file is just the MCP interface +layer and session state management. """ from mcp.server.fastmcp import FastMCP @@ -14,28 +29,32 @@ from datetime import datetime from pathlib import Path -# Suppress noisy logging from httpx and huggingface_hub before any imports -# that trigger HTTP requests. The MCP transport uses stderr, so stray log -# lines there would corrupt the protocol stream. +# MCP communicates over stdin/stdout with JSON-RPC framing. Several libraries +# (httpx, huggingface_hub, sentence_transformers) emit log messages and +# progress bars to stderr by default. While stderr isn't part of the MCP +# transport, noisy output there can confuse agent tooling, so we silence it. logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("huggingface_hub").setLevel(logging.ERROR) logging.getLogger("sentence_transformers").setLevel(logging.WARNING) - +# Path constants. This file lives at .claude/tools/openmc_mcp_server.py, +# so parents[2] is the OpenMC repo root. OPENMC_ROOT = Path(__file__).resolve().parents[2] CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" INDEX_DIR = CACHE_DIR / "rag_index" METADATA_FILE = INDEX_DIR / "metadata.json" -# Add tool subdirectories to path for imports +# The RAG modules (openmc_search, indexer, etc.) live in .claude/tools/rag/. +# We add that directory to sys.path so we can import them directly. 
TOOLS_DIR = Path(__file__).resolve().parent sys.path.insert(0, str(TOOLS_DIR / "rag")) mcp = FastMCP("openmc-code-tools") -# --------------------------------------------------------------------------- -# Session state -# --------------------------------------------------------------------------- +# First-call flag: the first openmc_rag_search call of each session returns +# index status info instead of search results, so the agent can ask the user +# whether to rebuild. This resets when the server process restarts (i.e. each +# new agent session). _rag_first_call = True diff --git a/.claude/tools/rag/chunker.py b/.claude/tools/rag/chunker.py index aa4fb2cd26d..b28ddb0f8a2 100644 --- a/.claude/tools/rag/chunker.py +++ b/.claude/tools/rag/chunker.py @@ -1,9 +1,19 @@ -"""Chunk OpenMC source files and documentation for RAG indexing. - -Uses fixed-size overlapping windows so every line of code is searchable. -Window size is tuned to fit within the MiniLM embedding model's 256-token -context (~1000 chars). 25% overlap ensures most content appears in at least -two chunks. +"""Split source files into overlapping text chunks for vector embedding. + +The indexer (indexer.py) calls chunk_file() on every C++, Python, and RST file +in the repo. Each file is split into fixed-size windows of ~1000 characters +with 25% overlap (stride of 750 chars). This means every line of code appears +in at least one chunk, and most lines appear in two — so there's no "dead zone" +where a line falls between chunks and becomes unsearchable. + +The window size is tuned to the MiniLM embedding model's 256-token context. +Code averages ~4 characters per token, so 1000 chars ≈ 250 tokens — just +under the model's limit. Chunks are snapped to line boundaries to avoid +splitting mid-line. + +Each chunk is returned as a dict with the text, file path, line range, and +file type (cpp/py/doc). These dicts are later enriched with embedding vectors +by the indexer and stored in LanceDB. 
""" from pathlib import Path diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index cfb926961c2..dbb12728212 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -1,19 +1,28 @@ -"""Embedding provider using sentence-transformers (all-MiniLM-L6-v2). +"""Thin wrapper around sentence-transformers for embedding text into vectors. -Requires: pip install sentence-transformers +Uses the all-MiniLM-L6-v2 model — a small (22M param, 384-dim) model that +runs on CPU with no GPU or API key required. The model weights are downloaded +once from Hugging Face on first use and cached locally (~80MB). + +This module is imported by both the MCP server (for search queries) and the +indexer (for bulk embedding of code chunks). The bulk embed() call shows a +progress bar; the single-query embed_query() does not. + +The env vars and logging suppression below are necessary because HuggingFace +libraries emit warnings, progress bars, and telemetry pings by default. When +running under the MCP server, any stray output can interfere with the JSON-RPC +transport. These must be set before importing transformers/sentence_transformers. """ -import transformers import os -# Suppress noisy HuggingFace/transformers output. -# All must be set before importing transformers. 
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") os.environ.setdefault("HF_HUB_VERBOSITY", "error") os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +import transformers # noqa: E402 — must come after env vars are set transformers.logging.disable_progress_bar() diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index 889e1ec36a4..34ab5dc554b 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -1,9 +1,19 @@ #!/usr/bin/env python3 -"""Build the RAG vector index for OpenMC source code and documentation. +"""Build the RAG vector index for the OpenMC codebase. -Chunks all C++, Python, and RST files, embeds them, and stores in LanceDB. +This is the "offline" step of the RAG pipeline. It walks the repo, chunks every +C++/Python/RST file (via chunker.py), embeds all chunks into 384-dim vectors +(via embeddings.py), and stores them in a LanceDB database on disk. The result +is a .claude/cache/rag_index/ directory containing two tables — "code" and +"docs" — that openmc_search.py queries at search time. -Output: .claude/cache/rag_index/ (LanceDB directory) +Building the full index takes ~5 minutes on a 10-core machine. The bottleneck +is the embedding step (running all chunks through the MiniLM model on CPU). + +Can be run standalone: python indexer.py +Or called programmatically: from indexer import build_index; build_index() +The MCP server (openmc_mcp_server.py) uses the latter when the agent calls +openmc_rag_rebuild. """ from embeddings import EmbeddingProvider @@ -13,7 +23,9 @@ import time from pathlib import Path -# Add tools dir to path for imports +# This file lives at .claude/tools/rag/indexer.py. The sys.path insert lets +# us import sibling modules (embeddings, chunker) when run as a standalone +# script. When imported from the MCP server, the server has already done this. 
TOOLS_DIR = Path(__file__).resolve().parent.parent sys.path.insert(0, str(TOOLS_DIR / "rag")) diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index e927e887256..69325268395 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -1,5 +1,19 @@ #!/usr/bin/env python3 -"""Semantic search across the OpenMC codebase and documentation. +"""Query the RAG vector index to find semantically related code and docs. + +This is the "online" step of the RAG pipeline — the counterpart to indexer.py +which builds the index. Given a natural-language query, it embeds the query +with the same MiniLM model used at index time, then finds the closest chunks +in the LanceDB vector database by cosine similarity. + +The core functions (get_db_and_embedder, search_table, format_results, +search_related) are imported by the MCP server for tool calls. The script +can also be run standalone from the command line. + +The "related file" mode works differently from a text query: it reads the +target file's chunks from the index, combines them into a synthetic query +vector, and searches for the nearest chunks from *other* files. This surfaces +files that are semantically similar to the target file. Usage: openmc_search.py "query" # Search code (default) @@ -7,18 +21,13 @@ openmc_search.py "query" --all # Search both code and docs openmc_search.py --related src/particle.cpp # Find related code openmc_search.py "query" --top-k 20 # Return more results - -Examples: - openmc_search.py "particle random number seed initialization" - openmc_search.py "how to define tallies" --docs - openmc_search.py --related src/simulation.cpp """ import argparse import sys from pathlib import Path -# Add tools dir to path +# Same sys.path setup as indexer.py — needed for standalone CLI use. 
TOOLS_DIR = Path(__file__).resolve().parent.parent sys.path.insert(0, str(TOOLS_DIR / "rag")) From feccb7976c06275e5ca7a8791b0559bff1c6f0c2 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 04:42:01 +0000 Subject: [PATCH 61/67] adding more specific info regarding how the model is downloaded and from where/how. --- .claude/tools/rag/embeddings.py | 42 +++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index dbb12728212..08c9204de1e 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -1,20 +1,38 @@ """Thin wrapper around sentence-transformers for embedding text into vectors. Uses the all-MiniLM-L6-v2 model — a small (22M param, 384-dim) model that -runs on CPU with no GPU or API key required. The model weights are downloaded -once from Hugging Face on first use and cached locally (~80MB). +runs on CPU with no GPU or API key required. + +Network behavior and privacy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +No user code, queries, or file contents are EVER sent to HuggingFace or any +external service. All embedding computation happens locally. The only network +activity is the one-time model download on first use: + + First run (model not yet cached, ~80MB download): + - Downloads model weight files from huggingface.co. + - The only metadata sent in requests is a user-agent header with library + versions (e.g. "hf_hub/1.6.0; python/3.12.3; torch/2.10.0") and a + random session ID. No user-identifiable information is sent. + - HF_HUB_DISABLE_TELEMETRY=1 is set, which disables any optional + analytics the library might otherwise send. + + Subsequent runs (model already cached): + - HF_HUB_OFFLINE=1 is set automatically (see code below), which prevents + ALL network calls. The model loads entirely from the local cache at + ~/.cache/huggingface/hub/. 
This module is imported by both the MCP server (for search queries) and the indexer (for bulk embedding of code chunks). The bulk embed() call shows a progress bar; the single-query embed_query() does not. -The env vars and logging suppression below are necessary because HuggingFace -libraries emit warnings, progress bars, and telemetry pings by default. When -running under the MCP server, any stray output can interfere with the JSON-RPC -transport. These must be set before importing transformers/sentence_transformers. +The env vars below must be set before importing transformers or +sentence_transformers. They suppress warnings, progress bars, and telemetry. +Stray stderr output would interfere with the MCP server's JSON-RPC transport. """ import os +from pathlib import Path os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") os.environ.setdefault("HF_HUB_VERBOSITY", "error") @@ -22,6 +40,16 @@ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +# If the model is already downloaded, force fully offline mode so that no +# network calls are made — not even the etag freshness check that +# huggingface_hub does by default. The first-ever run will still download +# normally because the cache dir won't exist yet. This MUST be set before +# importing sentence_transformers, which reads the env var at import time. 
+_MODEL_NAME = "all-MiniLM-L6-v2" +_HF_CACHE = Path(os.environ.get("HF_HOME", Path.home() / ".cache" / "huggingface")) / "hub" +if (_HF_CACHE / f"models--sentence-transformers--{_MODEL_NAME}").exists(): + os.environ.setdefault("HF_HUB_OFFLINE", "1") + import transformers # noqa: E402 — must come after env vars are set transformers.logging.disable_progress_bar() @@ -29,7 +57,7 @@ class EmbeddingProvider: """Sentence-transformers embedder using all-MiniLM-L6-v2.""" - def __init__(self, model_name: str = "all-MiniLM-L6-v2"): + def __init__(self, model_name: str = _MODEL_NAME): from sentence_transformers import SentenceTransformer self.model = SentenceTransformer(model_name, token=False) self.dim = self.model.get_sentence_embedding_dimension() From dec12c3b098a857f524f50b221ee31abd4f209d9 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 04:47:33 +0000 Subject: [PATCH 62/67] cleanup of embedding file --- .claude/tools/rag/embeddings.py | 63 ++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index 08c9204de1e..801545152e5 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -18,9 +18,20 @@ analytics the library might otherwise send. Subsequent runs (model already cached): - - HF_HUB_OFFLINE=1 is set automatically (see code below), which prevents - ALL network calls. The model loads entirely from the local cache at - ~/.cache/huggingface/hub/. + - HF_HUB_OFFLINE=1 is set automatically (see _set_offline_if_cached() + below), which prevents ALL network calls. The model loads entirely + from the local cache at ~/.cache/huggingface/hub/. + +How the model is downloaded +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The SentenceTransformer() constructor (called in __init__ below) handles +the download automatically on first use. 
It calls into the huggingface_hub +library, which downloads the model files from: + + https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 + +The files are saved to ~/.cache/huggingface/hub/ and reused on subsequent +runs. We pass token=False to ensure no authentication token is sent. This module is imported by both the MCP server (for search queries) and the indexer (for bulk embedding of code chunks). The bulk embed() call shows a @@ -34,21 +45,43 @@ import os from pathlib import Path +MODEL_NAME = "all-MiniLM-L6-v2" + +# These env vars control logging/telemetry behavior in the HuggingFace +# libraries. They must be set before the libraries are imported. os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") os.environ.setdefault("HF_HUB_VERBOSITY", "error") os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") -# If the model is already downloaded, force fully offline mode so that no -# network calls are made — not even the etag freshness check that -# huggingface_hub does by default. The first-ever run will still download -# normally because the cache dir won't exist yet. This MUST be set before -# importing sentence_transformers, which reads the env var at import time. -_MODEL_NAME = "all-MiniLM-L6-v2" -_HF_CACHE = Path(os.environ.get("HF_HOME", Path.home() / ".cache" / "huggingface")) / "hub" -if (_HF_CACHE / f"models--sentence-transformers--{_MODEL_NAME}").exists(): - os.environ.setdefault("HF_HUB_OFFLINE", "1") + +def _set_offline_if_cached(): + """If the model has already been downloaded, tell huggingface_hub to + skip all network calls by setting HF_HUB_OFFLINE=1. + + Without this, huggingface_hub makes an HTTP request to huggingface.co + on every load to check if the cached model is still up to date — even + though the model never changes. Setting HF_HUB_OFFLINE=1 prevents this. 
+ + This must run before sentence_transformers is imported, because the + library reads the env var at import time. + """ + # HuggingFace caches downloaded models under ~/.cache/huggingface/hub/ + # in directories named like "models--sentence-transformers--all-MiniLM-L6-v2". + # The HF_HOME env var can override the base cache location. + hf_home = os.environ.get("HF_HOME") + if hf_home: + cache_dir = Path(hf_home) / "hub" + else: + cache_dir = Path.home() / ".cache" / "huggingface" / "hub" + + model_dir = cache_dir / f"models--sentence-transformers--{MODEL_NAME}" + if model_dir.exists(): + os.environ.setdefault("HF_HUB_OFFLINE", "1") + + +_set_offline_if_cached() import transformers # noqa: E402 — must come after env vars are set transformers.logging.disable_progress_bar() @@ -57,8 +90,12 @@ class EmbeddingProvider: """Sentence-transformers embedder using all-MiniLM-L6-v2.""" - def __init__(self, model_name: str = _MODEL_NAME): + def __init__(self, model_name: str = MODEL_NAME): from sentence_transformers import SentenceTransformer + + # This constructor loads the model from the local cache. If the model + # has not been downloaded yet, it downloads it from huggingface.co + # (~80MB, one-time). token=False ensures no auth token is sent. 
self.model = SentenceTransformer(model_name, token=False) self.dim = self.model.get_sentence_embedding_dimension() From 6a44a25f3b529c2a30a2b4626da409f6d9aec9ba Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 04:59:15 +0000 Subject: [PATCH 63/67] explaining some jargon --- .claude/tools/rag/embeddings.py | 44 ++++++++++++++++++------------ .claude/tools/rag/indexer.py | 10 ++++--- .claude/tools/rag/openmc_search.py | 9 +++--- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/.claude/tools/rag/embeddings.py b/.claude/tools/rag/embeddings.py index 801545152e5..1fe85b50d9e 100644 --- a/.claude/tools/rag/embeddings.py +++ b/.claude/tools/rag/embeddings.py @@ -10,17 +10,22 @@ activity is the one-time model download on first use: First run (model not yet cached, ~80MB download): - - Downloads model weight files from huggingface.co. - - The only metadata sent in requests is a user-agent header with library - versions (e.g. "hf_hub/1.6.0; python/3.12.3; torch/2.10.0") and a - random session ID. No user-identifiable information is sent. - - HF_HUB_DISABLE_TELEMETRY=1 is set, which disables any optional - analytics the library might otherwise send. + - Downloads model weight files from huggingface.co. This is a standard + HTTP file download, similar to pip installing a package. + - The only metadata sent in these requests is an HTTP user-agent header + containing library version numbers (e.g. "hf_hub/1.6.0; + python/3.12.3; torch/2.10.0"). No filenames, file contents, queries, + or any user-identifiable information is sent. + - The huggingface_hub library has an optional feature where it can report + anonymous library usage statistics (just version numbers, not user + data) back to HuggingFace. We disable this by setting + HF_HUB_DISABLE_TELEMETRY=1. 
Subsequent runs (model already cached): - - HF_HUB_OFFLINE=1 is set automatically (see _set_offline_if_cached() + - We set HF_HUB_OFFLINE=1 automatically (see _set_offline_if_cached() below), which prevents ALL network calls. The model loads entirely - from the local cache at ~/.cache/huggingface/hub/. + from the local cache at ~/.cache/huggingface/hub/. Zero bytes leave + the machine. How the model is downloaded ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -38,8 +43,9 @@ progress bar; the single-query embed_query() does not. The env vars below must be set before importing transformers or -sentence_transformers. They suppress warnings, progress bars, and telemetry. -Stray stderr output would interfere with the MCP server's JSON-RPC transport. +sentence_transformers. They suppress warnings and progress bars that these +libraries emit by default. Stray stderr output would interfere with the MCP +server's JSON-RPC transport. """ import os @@ -47,13 +53,15 @@ MODEL_NAME = "all-MiniLM-L6-v2" -# These env vars control logging/telemetry behavior in the HuggingFace -# libraries. They must be set before the libraries are imported. -os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") -os.environ.setdefault("HF_HUB_VERBOSITY", "error") -os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") +# These env vars control logging behavior in the HuggingFace libraries. +# They must be set before the libraries are imported. +os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error") # suppress warnings +os.environ.setdefault("HF_HUB_VERBOSITY", "error") # suppress warnings os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") -os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") # suppress threading warning +# Disable anonymous library usage statistics (version numbers only, not user +# data — but we disable it anyway as a matter of policy). 
+os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") def _set_offline_if_cached(): @@ -83,7 +91,9 @@ def _set_offline_if_cached(): _set_offline_if_cached() -import transformers # noqa: E402 — must come after env vars are set +# This import must come after the env vars above are set, because the +# transformers library reads them at import time. +import transformers transformers.logging.disable_progress_bar() diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index 34ab5dc554b..fd326de9263 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 """Build the RAG vector index for the OpenMC codebase. -This is the "offline" step of the RAG pipeline. It walks the repo, chunks every +This is the index-building half of the RAG pipeline. All operations are local — +no network calls are made (the embedding model is already cached locally; see +embeddings.py for details on model download). It walks the repo, chunks every C++/Python/RST file (via chunker.py), embeds all chunks into 384-dim vectors -(via embeddings.py), and stores them in a LanceDB database on disk. The result -is a .claude/cache/rag_index/ directory containing two tables — "code" and -"docs" — that openmc_search.py queries at search time. +(via embeddings.py), and stores them in a local LanceDB database on disk. The +result is a .claude/cache/rag_index/ directory containing two tables — "code" +and "docs" — that openmc_search.py queries at search time. Building the full index takes ~5 minutes on a 10-core machine. The bottleneck is the embedding step (running all chunks through the MiniLM model on CPU). diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 69325268395..ec726e86ab6 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 """Query the RAG vector index to find semantically related code and docs. 
-This is the "online" step of the RAG pipeline — the counterpart to indexer.py -which builds the index. Given a natural-language query, it embeds the query -with the same MiniLM model used at index time, then finds the closest chunks -in the LanceDB vector database by cosine similarity. +This is the query-time half of the RAG pipeline (the counterpart to indexer.py, +which builds the index). All operations are local — no network calls are made. +Given a natural-language query, it embeds the query with the same MiniLM model +used at index time, then finds the closest chunks in the local LanceDB vector +database by cosine similarity. The core functions (get_db_and_embedder, search_table, format_results, search_related) are imported by the MCP server for tool calls. The script From e71bbf677bcdcd056caa8b466ad9b8f6b792f266 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 13:33:06 +0000 Subject: [PATCH 64/67] moving almost everything out of CLAUDE.md into AGENTS.md --- AGENTS.md | 50 ++++++++++++++++++++++++++++++++++++++ CLAUDE.md | 72 ++++++++++--------------------------------------------- 2 files changed, 62 insertions(+), 60 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index dce32d0e476..19abba7d97b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -42,6 +42,56 @@ OpenMC uses a git flow branching model with two primary branches: When reviewing code changes in this repository, use the `reviewing-openmc-code` skill. +## Codebase Navigation Tools + +Two MCP tools are registered in `.mcp.json` at the repo root and appear +automatically in any MCP-capable agent session. + +**`openmc_rag_search`** — Semantic search across the codebase (C++, Python, RST +docs). Finds code by meaning, not just text match. Surfaces related code across +subsystems even when naming differs (e.g., "particle RNG seeding" finds code +across transport, restart, and random ray modes — files you would never find +with `grep "particle seed"`). 
The index uses a small 22M-param embedding model +(384-dim). Phrase-level natural-language queries work much better than single +keywords or symbol names. + +**`openmc_rag_rebuild`** — Rebuild the RAG vector index. Call after pulling new +code or switching branches. The first RAG search of each session will report +the index status and ask whether to rebuild — you can also call this explicitly. + +### Why RAG matters + +OpenMC is large enough that changes in one subsystem can silently break +invariants that distant subsystems depend on — and those distant files often +use different naming, so grep won't find them. The RAG search finds code by +meaning, surfacing files you wouldn't have thought to open. + +An agent reviewed a large OpenMC PR without RAG. It found 1 of 11 serious +bugs. Its post-mortem: + +> **I treated the diff as a closed system.** I verified internal consistency of +> the changed code obsessively, but never built a global understanding of how +> the changed code fits into the wider codebase. The diff altered assumptions +> that code elsewhere silently relied on — but I couldn't see that because I +> never looked beyond the diff. I couldn't see the forest for the trees. +> +> **Why I resisted RAG:** Overconfidence. My internal model was "I can see the +> diff, I understand the data structures, I can trace the logic." The diff felt +> self-contained. RAG felt like it would return noisy results about tangentially +> related code. But in a codebase this large, changes in one subsystem can +> quietly break invariants that distant subsystems depend on — and you need +> global awareness to foresee that. +> +> **In the post-mortem**, I re-ran the RAG queries I should have run during the +> review. They directly surfaced the files containing the bugs I missed — files +> I never thought to open because they weren't in the diff. 
+ +The takeaway: when reviewing or modifying code, ask yourself "what else in this +codebase might depend on the behavior I'm changing?" If you aren't sure, that's +a good time for a RAG query. It won't replace the grep-based investigation you +should already be doing — but it can surface files you wouldn't have thought to +open. + ### Workflow for contributors 1. Create a feature/bugfix branch off `develop` diff --git a/CLAUDE.md b/CLAUDE.md index f0e6d5cca5f..9538b5ddc4a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,62 +1,14 @@ ## OpenMC Codebase Tools -Two MCP tools are available for navigating the OpenMC codebase. They are -registered in `.mcp.json` and appear automatically in every session. - -### Tool overview - -**`openmc_rag_search`** — Semantic search across the codebase (C++, Python, RST -docs). Finds code by meaning, not just text match. Surfaces related code across -subsystems even when naming differs (e.g., "particle RNG seeding" finds code -across transport, restart, and random ray modes — files you would never find -with `grep "particle seed"`). The index uses a small 22M-param embedding model -(384-dim). Phrase-level natural-language queries work much better than single -keywords or symbol names. - -**`openmc_rag_rebuild`** — Rebuild the RAG vector index. Call after pulling new -code or switching branches. The first RAG search of each session will report -the index status and ask whether to rebuild — you can also call this explicitly. - -**First-call behavior:** The first `openmc_rag_search` call of each session -returns an index status message instead of search results. When this happens, -you MUST use the `AskUserQuestion` tool to present the rebuild/use-existing -choice to the user. Do not ask conversationally — always use the widget. -Do not skip this step even if the index looks current — the user may have -uncommitted changes that warrant a rebuild. 
- -### Why RAG matters - -OpenMC is large enough that changes in one subsystem can silently break -invariants that distant subsystems depend on — and those distant files often -use different naming, so grep won't find them. The RAG search finds code by -meaning, surfacing files you wouldn't have thought to open. - -An agent reviewed a large OpenMC PR without RAG. It found 1 of 11 serious -bugs. Its post-mortem: - -> **I treated the diff as a closed system.** I verified internal consistency of -> the changed code obsessively, but never built a global understanding of how -> the changed code fits into the wider codebase. The diff altered assumptions -> that code elsewhere silently relied on — but I couldn't see that because I -> never looked beyond the diff. I couldn't see the forest for the trees. -> -> **Why I resisted RAG:** Overconfidence. My internal model was "I can see the -> diff, I understand the data structures, I can trace the logic." The diff felt -> self-contained. RAG felt like it would return noisy results about tangentially -> related code. But in a codebase this large, changes in one subsystem can -> quietly break invariants that distant subsystems depend on — and you need -> global awareness to foresee that. -> -> **In the post-mortem**, I re-ran the RAG queries I should have run during the -> review. They directly surfaced the files containing the bugs I missed — files -> I never thought to open because they weren't in the diff. - -The takeaway: when reviewing or modifying code, ask yourself "what else in this -codebase might depend on the behavior I'm changing?" If you aren't sure, that's -a good time for a RAG query. It won't replace the grep-based investigation you -should already be doing — but it can surface files you wouldn't have thought to -open. - -## Additional OpenMC info - -Read the FULL `AGENTS.md` in this directory also before starting work. +Read the FULL `AGENTS.md` in this directory before starting work. 
It contains +project context, coding conventions, and documentation of the RAG search tools +registered in `.mcp.json`. + +### Claude Code-specific: first-call behavior + +The first `openmc_rag_search` call of each session returns an index status +message instead of search results. When this happens, you MUST use the +`AskUserQuestion` tool to present the rebuild/use-existing choice to the user. +Do not ask conversationally — always use the widget. Do not skip this step even +if the index looks current — the user may have uncommitted changes that warrant +a rebuild. From 277e19d06e075e031bf5fce6c2d7fc1001da134e Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 14:11:38 +0000 Subject: [PATCH 65/67] claude response to copilot comments --- .claude/tools/openmc_mcp_server.py | 5 +++++ .claude/tools/rag/indexer.py | 5 +++-- .claude/tools/rag/openmc_search.py | 2 +- docs/source/devguide/agentic-tools.rst | 14 ++++++++------ 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index 0bcc596b9d0..1b89d88c7b5 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -69,6 +69,8 @@ def _get_current_branch(): ["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True, cwd=str(OPENMC_ROOT), ) + if result.returncode != 0 or not result.stdout.strip(): + return "unknown" return result.stdout.strip() except Exception: return "unknown" @@ -175,6 +177,9 @@ def openmc_rag_search( if not query and not related_file: return "Error: provide either 'query' or 'related_file'." + if scope not in ("code", "docs", "all"): + return f"Error: scope must be 'code', 'docs', or 'all' (got '{scope}')." 
+ try: from openmc_search import ( get_db_and_embedder, search_table, format_results, search_related, diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index fd326de9263..1c09086379a 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -18,8 +18,6 @@ openmc_rag_rebuild. """ -from embeddings import EmbeddingProvider -from chunker import chunk_file import lancedb import sys import time @@ -31,6 +29,9 @@ TOOLS_DIR = Path(__file__).resolve().parent.parent sys.path.insert(0, str(TOOLS_DIR / "rag")) +from embeddings import EmbeddingProvider +from chunker import chunk_file + OPENMC_ROOT = Path(__file__).resolve().parents[3] CACHE_DIR = OPENMC_ROOT / ".claude" / "cache" diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index ec726e86ab6..30147dc057f 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -106,7 +106,7 @@ def search_related(db, embedder, filepath, top_k): # Normalize filepath fp = filepath - if filepath.startswith("/"): + if Path(filepath).is_absolute(): try: fp = str(Path(filepath).relative_to(OPENMC_ROOT)) except ValueError: diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst index ae15ce8854d..b4d002a6f5a 100644 --- a/docs/source/devguide/agentic-tools.rst +++ b/docs/source/devguide/agentic-tools.rst @@ -4,8 +4,8 @@ Agentic Development Tools =========================== -OpenMC ships a set of tools designed for AI coding agents (such as `Claude -Code`_) that agents can use to navigate and understand the codebase. +OpenMC ships a set of tools designed for AI coding agents (such as +`Claude Code`_) that agents can use to navigate and understand the codebase. .. _Claude Code: https://claude.ai/code @@ -29,8 +29,9 @@ and it's not unreasonable to assume that models may be available in the near future that greatly exceed these limits. 
However, even assuming the entire repository can be fit within a context -window, there are several downsides to doing this. `Model performance -degrades significantly as context size increases`_. Benchmark results are +window, there are several downsides to doing this. +`Model performance degrades significantly as context size increases`_. +Benchmark results are greatly improved if the model has less garbage to pick through. Additionally, API usage is typically billed as tokens in/out per turn. As the context file grows these costs become much larger. As such, there is still significant @@ -96,5 +97,6 @@ Requirements ^^^^^^^^^^^^ No system dependencies beyond **Python 3.12+** with ``pip``. The Python -packages (``sentence-transformers``, ``lancedb``) are installed in the agent's -virtual python environment automatically. +packages (``sentence-transformers``, ``lancedb``) and their dependencies +(including PyTorch, ~2GB) are installed automatically into an isolated virtual +environment on first use. From cfa89a733a940d0922813748221043fa909fa4d6 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 14:30:46 +0000 Subject: [PATCH 66/67] claude response to copilot review --- .claude/tools/openmc_mcp_server.py | 6 ++++++ .claude/tools/rag/indexer.py | 7 ++++--- .claude/tools/rag/openmc_search.py | 12 +++++++----- docs/source/devguide/agentic-tools.rst | 10 ++++++---- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.claude/tools/openmc_mcp_server.py b/.claude/tools/openmc_mcp_server.py index 1b89d88c7b5..37917abc188 100644 --- a/.claude/tools/openmc_mcp_server.py +++ b/.claude/tools/openmc_mcp_server.py @@ -177,9 +177,15 @@ def openmc_rag_search( if not query and not related_file: return "Error: provide either 'query' or 'related_file'." + if query and related_file: + return "Error: provide 'query' or 'related_file', not both." + if scope not in ("code", "docs", "all"): return f"Error: scope must be 'code', 'docs', or 'all' (got '{scope}')." 
+ if top_k < 1: + return f"Error: top_k must be at least 1 (got {top_k})." + try: from openmc_search import ( get_db_and_embedder, search_table, format_results, search_related, diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index 1c09086379a..37b4260d43e 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 """Build the RAG vector index for the OpenMC codebase. -This is the index-building half of the RAG pipeline. All operations are local — -no network calls are made (the embedding model is already cached locally; see -embeddings.py for details on model download). It walks the repo, chunks every +This is the index-building half of the RAG pipeline. All operations are local +once the embedding model has been downloaded and cached (see embeddings.py for +details on model download, caching, and network behavior). It walks the repo, +chunks every C++/Python/RST file (via chunker.py), embeds all chunks into 384-dim vectors (via embeddings.py), and stores them in a local LanceDB database on disk. The result is a .claude/cache/rag_index/ directory containing two tables — "code" diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index 30147dc057f..f72b4adf748 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -2,8 +2,10 @@ """Query the RAG vector index to find semantically related code and docs. This is the query-time half of the RAG pipeline (the counterpart to indexer.py, -which builds the index). All operations are local — no network calls are made. -Given a natural-language query, it embeds the query with the same MiniLM model +which builds the index). All operations are local — no network calls are made +once the embedding model has been downloaded (see embeddings.py for details on +model download and caching). 
Given a natural-language query, it embeds the query +with the same MiniLM model used at index time, then finds the closest chunks in the local LanceDB vector database by cosine similarity. @@ -43,9 +45,9 @@ def get_db_and_embedder(): from embeddings import EmbeddingProvider if not INDEX_DIR.exists(): - print("ERROR: No index found. Run openmc_rag_rebuild() to build one.", - file=sys.stderr) - sys.exit(1) + raise FileNotFoundError( + "No RAG index found. Call openmc_rag_rebuild() to build one." + ) db = lancedb.connect(str(INDEX_DIR)) diff --git a/docs/source/devguide/agentic-tools.rst b/docs/source/devguide/agentic-tools.rst index b4d002a6f5a..fed377cc075 100644 --- a/docs/source/devguide/agentic-tools.rst +++ b/docs/source/devguide/agentic-tools.rst @@ -96,7 +96,9 @@ The search pipeline runs entirely on your local CPU: Requirements ^^^^^^^^^^^^ -No system dependencies beyond **Python 3.12+** with ``pip``. The Python -packages (``sentence-transformers``, ``lancedb``) and their dependencies -(including PyTorch, ~2GB) are installed automatically into an isolated virtual -environment on first use. +No system dependencies beyond **Python 3.12+** with ``pip``. An internet +connection is required on first use to download the Python packages and +embedding model weights; subsequent runs are fully offline. The Python packages +(``sentence-transformers``, ``lancedb``) and their dependencies (including +PyTorch, ~2GB) are installed automatically into an isolated virtual environment +on first use. 
From 77e403603e4235d5d6974895286bdca011ec6f95 Mon Sep 17 00:00:00 2001 From: John Tramm Date: Tue, 10 Mar 2026 15:02:15 +0000 Subject: [PATCH 67/67] claude response to copilot review --- .claude/tools/rag/indexer.py | 3 ++- .claude/tools/rag/openmc_search.py | 10 ++++++++-- .claude/tools/start_server.sh | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.claude/tools/rag/indexer.py b/.claude/tools/rag/indexer.py index 37b4260d43e..34613092bb1 100644 --- a/.claude/tools/rag/indexer.py +++ b/.claude/tools/rag/indexer.py @@ -114,7 +114,8 @@ def build_index(): doc_records.append(record) # Create tables (drop existing) - existing = db.list_tables().tables + result = db.table_names() if hasattr(db, "table_names") else db.list_tables() + existing = result.tables if hasattr(result, "tables") else list(result) for table_name in ("code", "docs"): if table_name in existing: db.drop_table(table_name) diff --git a/.claude/tools/rag/openmc_search.py b/.claude/tools/rag/openmc_search.py index f72b4adf748..4125ee96607 100644 --- a/.claude/tools/rag/openmc_search.py +++ b/.claude/tools/rag/openmc_search.py @@ -55,9 +55,15 @@ def get_db_and_embedder(): return db, embedder +def _table_names(db): + """Return table names as a list, compatible with multiple LanceDB versions.""" + result = db.table_names() if hasattr(db, "table_names") else db.list_tables() + return result.tables if hasattr(result, "tables") else list(result) + + def search_table(db, embedder, table_name, query, top_k): """Search a LanceDB table with a text query.""" - if table_name not in db.list_tables().tables: + if table_name not in _table_names(db): print(f"Table '{table_name}' not found in index.", file=sys.stderr) return [] @@ -100,7 +106,7 @@ def format_results(results, label=""): def search_related(db, embedder, filepath, top_k): """Find code related to a given file.""" - if "code" not in db.list_tables().tables: + if "code" not in _table_names(db): print("No 'code' table in index.", 
file=sys.stderr) return [] diff --git a/.claude/tools/start_server.sh b/.claude/tools/start_server.sh index 0b60e61644a..c111dd73e88 100755 --- a/.claude/tools/start_server.sh +++ b/.claude/tools/start_server.sh @@ -19,6 +19,7 @@ fi if [ ! -f "$SENTINEL" ]; then rm -rf "$VENV_DIR" + mkdir -p "$CACHE_DIR" python3 -m venv "$VENV_DIR" if ! "$VENV_DIR/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt"; then