diff --git a/CHANGELOG.md b/CHANGELOG.md index e621bcd..099c89c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ - Pluggable architecture for chunkers, embedders, and vector databases. - Hybrid storage with Qdrant and MongoDB. +## v0.1.1-beta.6 (2025-11-25) + ## v0.1.1-beta.5 (2025-11-21) ## v0.1.1-beta.4 (2025-11-20) diff --git a/CLAUDE.md b/CLAUDE.md index dbab95f..ae55eed 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -279,10 +279,9 @@ Documents → Extract Content → Graphiti Processing → Neo4j Graph ### Important Limitations & Notes 1. **Async-only** – GraphRAGClient uses async/await exclusively -2. **No collection deletion** – Neo4j doesn't support collection-level deletion via Graphiti; use raw Neo4j queries if needed -3. **Episode-based organization** – Graph data is organized as episodes (document sources); explicit management required -4. **LLM cost** – Entity extraction uses LLM calls; consider batching for large documents -5. **Phase 1 feature** – Hybrid retrieval (merging graph + vector results) is planned for Phase 2 +2. **Episode-based organization** – Graph data is organized as episodes (document sources); explicit management required +3. **LLM cost** – Entity extraction uses LLM calls; consider batching for large documents +4. **Phase 1 feature** – Hybrid retrieval (merging graph + vector results) is planned for Phase 2 ### Custom Group ID for Multi-Tenant & Environment Isolation @@ -346,6 +345,78 @@ POST /graph-rag/retrieve - [GRAPH_RAG_GROUP_ID_GUIDE.md](./GRAPH_RAG_GROUP_ID_GUIDE.md) – Comprehensive guide with examples - [GRAPH_RAG_INTEGRATION_GUIDE.md](./GRAPH_RAG_INTEGRATION_GUIDE.md) – Multi-tenant implementation patterns +### Graph RAG Delete Operations (NEW) + +Delete functionality has been fully implemented for Graph RAG with support for deleting at multiple levels: + +#### Deletion Methods + +1. **Delete Node (Entity)** + ```python + result = await client.delete_node(node_uuid, collection_name) + # Deletes single entity and all connected relationships + ``` + +2. **Delete Edge (Relationship/Fact)** + ```python + result = await client.delete_edge(edge_uuid, collection_name) + # Deletes single relationship without affecting entities + ``` + +3. **Delete Episode (Document)** + ```python + result = await client.delete_episode(episode_uuid, collection_name) + # Deletes all entities/relationships from document + # Automatically removes orphaned nodes (no remaining connections) + ``` + +4. **Delete Collection** + ```python + result = await client.delete_collection(collection_name) + # Deletes ALL data in collection (irreversible) + # Requires explicit confirmation to prevent accidents + ``` + +#### Implementation Details + +- **Files Modified:** + - `src/insta_rag/graph_rag/graph_builder.py` – Core deletion logic + - `src/insta_rag/graph_rag/client.py` – Client API wrappers + - `src/insta_rag/graph_rag/neo4j_driver.py` – Driver reference storage + - `testing_api/graph_rag_routes.py` – API endpoints (4 new + 1 demo endpoint) + +- **API Endpoints:** + - `POST /graph-rag/delete-node` – Delete entity + - `POST /graph-rag/delete-edge` – Delete relationship + - `POST /graph-rag/delete-episode` – Delete document + - `POST /graph-rag/delete-collection` – Delete collection (requires confirmation) + - `POST /graph-rag/test/demo-delete` – Interactive demo + +- **Key Features:** + - Multi-tenant safe via group_id isolation + - Orphan node cleanup automatically after episode deletion + - Error handling with descriptive messages + - Confirmation required for collection deletion + - Full Swagger documentation with examples + +#### Response Format + +All delete endpoints return: +```json +{ + "success": true, + "message": "Deletion status message", + "deleted_items": { + "uuid": "...", + "count": 0 + }, + "error": null +} +``` + +**See also:** +- [GRAPH_RAG_DELETE_EPISODES.md](./GRAPH_RAG_DELETE_EPISODES.md) – Comprehensive deletion guide with examples + ## Async Processing & Celery (NEW) ### Overview diff --git a/README.md b/README.md index a14eb91..2c96ff1 100644 --- a/README.md +++ b/README.md @@ -431,8 +431,70 @@ See [GRAPH_RAG_GROUP_ID_GUIDE.md](./GRAPH_RAG_GROUP_ID_GUIDE.md) for comprehensi | `await client.retrieve(query, collection_name, k)` | Search graph using hybrid semantic + BM25 | | `await client.retrieve_with_reranking(query, collection_name, center_node)` | Retrieve with distance-based reranking from center node | | `await client.get_entity_context(entity_name, collection_name, depth)` | Get entity and related facts (up to depth levels) | +| `await client.delete_node(node_uuid, collection_name)` | Delete entity node and its connected relationships | +| `await client.delete_edge(edge_uuid, collection_name)` | Delete relationship/fact between entities | +| `await client.delete_episode(episode_uuid, collection_name)` | Delete document and orphaned entities | +| `await client.delete_collection(collection_name)` | Delete entire collection (irreversible) | | `await client.close()` | Cleanup Neo4j connection | +### Graph RAG Delete Operations (NEW) + +Delete data from your knowledge graph at multiple levels: + +#### Quick Delete Example + +```python +async with GraphRAGClient() as client: + # Delete a single entity node + await client.delete_node(node_uuid, collection_name="company") + + # Delete a relationship/fact + await client.delete_edge(edge_uuid, collection_name="company") + + # Delete a document and orphaned entities + await client.delete_episode(episode_uuid, collection_name="company") + + # Delete entire collection (with confirmation) + await client.delete_collection(collection_name="company") +``` + +#### Deletion Levels + +| Operation | Scope | Use Case | +|-----------|-------|----------| +| `delete_node()` | Single entity + edges | Remove specific entity | +| `delete_edge()` | Single relationship | Remove specific fact | +| `delete_episode()` | Document + orphaned nodes | Remove document and its data | +| `delete_collection()` | All data in collection | Cleanup entire collection | + +#### REST API + +All delete operations are also available via REST endpoints: + +```bash +# Delete node +curl -X POST http://localhost:8000/graph-rag/delete-node \ + -H "Content-Type: application/json" \ + -d '{"node_uuid": "...", "collection_name": "company"}' + +# Delete edge +curl -X POST http://localhost:8000/graph-rag/delete-edge \ + -H "Content-Type: application/json" \ + -d '{"edge_uuid": "...", "collection_name": "company"}' + +# Delete episode +curl -X POST http://localhost:8000/graph-rag/delete-episode \ + -H "Content-Type: application/json" \ + -d '{"episode_uuid": "...", "collection_name": "company"}' + +# Delete collection (requires confirm=true) +curl -X POST http://localhost:8000/graph-rag/delete-collection \ + -H "Content-Type: application/json" \ + -d '{"collection_name": "company", "confirm": true}' +``` + +See [GRAPH_RAG_DELETE_EPISODES.md](./GRAPH_RAG_DELETE_EPISODES.md) for comprehensive deletion guide. + ### Graph RAG vs Vector RAG | Aspect | Vector RAG | Graph RAG | @@ -443,6 +505,7 @@ See [GRAPH_RAG_GROUP_ID_GUIDE.md](./GRAPH_RAG_GROUP_ID_GUIDE.md) for comprehensi | **Entity Extraction** | Not explicit | LLM-driven, explicit | | **Use Cases** | General similarity search | Structured knowledge discovery | | **Best For** | Content search | Relationship queries | +| **Deletion** | N/A | Full CRUD support | --- diff --git a/pyproject.toml b/pyproject.toml index 4047dd0..e8137a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "insta_rag" -version = "0.1.1-beta.5" +version = "0.1.1-beta.6" description = "A RAG (Retrieval-Augmented Generation) library for document processing and retrieval." authors = [ { name = "Aukik Aurnab", email = "aukikaurnabx@gmail.com" }, diff --git a/src/insta_rag/graph_rag/client.py b/src/insta_rag/graph_rag/client.py index 4937cb5..ade7a3a 100644 --- a/src/insta_rag/graph_rag/client.py +++ b/src/insta_rag/graph_rag/client.py @@ -149,7 +149,13 @@ async def initialize(self) -> "GraphRAGClient": RuntimeError: If initialization fails """ self._graphiti = await self.driver.initialize() - self._builder = GraphBuilder(self._graphiti, self.group_id) + # Pass the Neo4j driver to GraphBuilder for raw query execution (needed for deletion) + neo4j_driver = self.driver.get_neo4j_driver() + self._builder = GraphBuilder( + self._graphiti, + self.group_id, + neo4j_driver=neo4j_driver + ) self._retriever = GraphRetriever(self._graphiti, self.group_id) return self @@ -431,6 +437,138 @@ async def get_entity_context( depth=depth, ) + # ======================== Delete Operations ======================== + + async def delete_node( + self, + node_uuid: str, + collection_name: str = "default", + ) -> dict: + """Delete an entity node from the knowledge graph. + + Removes the specified entity and all its connected relationships. + + Args: + node_uuid: UUID of the entity node to delete + collection_name: Collection context + + Returns: + Dict with deletion result: + - success: bool + - node_uuid: The deleted node UUID + - edges_deleted: Number of connected edges removed + - error: Optional error message + + Raises: + RuntimeError: If not initialized + ValueError: If node_uuid is invalid + """ + if not self._builder: + raise RuntimeError("Client not initialized. Call initialize() first.") + + if not node_uuid or not isinstance(node_uuid, str): + raise ValueError("node_uuid must be a non-empty string") + + return await self._builder.delete_node(node_uuid, collection_name) + + async def delete_edge( + self, + edge_uuid: str, + collection_name: str = "default", + ) -> dict: + """Delete a relationship (edge) from the knowledge graph. + + Removes the specified fact/relationship between entities. + Connected entities are not affected. + + Args: + edge_uuid: UUID of the relationship to delete + collection_name: Collection context + + Returns: + Dict with deletion result: + - success: bool + - edge_uuid: The deleted edge UUID + - error: Optional error message + + Raises: + RuntimeError: If not initialized + ValueError: If edge_uuid is invalid + """ + if not self._builder: + raise RuntimeError("Client not initialized. Call initialize() first.") + + if not edge_uuid or not isinstance(edge_uuid, str): + raise ValueError("edge_uuid must be a non-empty string") + + return await self._builder.delete_edge(edge_uuid, collection_name) + + async def delete_episode( + self, + episode_uuid: str, + collection_name: str = "default", + ) -> dict: + """Delete an entire episode (document) and its extracted data. + + Removes all edges belonging to this episode, then deletes any + orphaned nodes (nodes with no remaining connections). + + Args: + episode_uuid: UUID of the episode/document to delete + collection_name: Collection context + + Returns: + Dict with deletion statistics: + - success: bool + - episode_uuid: The deleted episode UUID + - edges_deleted: Number of edges removed + - orphan_nodes_deleted: Number of orphaned nodes removed + - error: Optional error message + + Raises: + RuntimeError: If not initialized + ValueError: If episode_uuid is invalid + """ + if not self._builder: + raise RuntimeError("Client not initialized. Call initialize() first.") + + if not episode_uuid or not isinstance(episode_uuid, str): + raise ValueError("episode_uuid must be a non-empty string") + + return await self._builder.delete_episode(episode_uuid, collection_name) + + async def delete_collection( + self, + collection_name: str, + ) -> dict: + """Delete entire collection with all its data. + + ⚠️ DESTRUCTIVE OPERATION: Removes all entities and relationships + in the specified collection. This cannot be undone. + + Args: + collection_name: Collection to delete + + Returns: + Dict with deletion statistics: + - success: bool + - collection_name: Collection that was deleted + - edges_deleted: Number of edges removed + - nodes_deleted: Number of nodes removed + - error: Optional error message + + Raises: + RuntimeError: If not initialized + ValueError: If collection_name is invalid + """ + if not self._builder: + raise RuntimeError("Client not initialized. Call initialize() first.") + + if not collection_name or not isinstance(collection_name, str): + raise ValueError("collection_name must be a non-empty string") + + return await self._builder.delete_collection(collection_name) + # ======================== Context Manager Support ======================== async def __aenter__(self): diff --git a/src/insta_rag/graph_rag/graph_builder.py b/src/insta_rag/graph_rag/graph_builder.py index 4b7b40a..be884f7 100644 --- a/src/insta_rag/graph_rag/graph_builder.py +++ b/src/insta_rag/graph_rag/graph_builder.py @@ -19,15 +19,17 @@ class GraphBuilder: entity extraction, and relationship building. """ - def __init__(self, graphiti_client: Graphiti, group_id: str = "default"): + def __init__(self, graphiti_client: Graphiti, group_id: str = "default", neo4j_driver=None): """Initialize GraphBuilder. Args: graphiti_client: Initialized Graphiti instance group_id: Group ID for organizing graph data + neo4j_driver: Optional Neo4j driver for raw query execution """ self.graphiti = graphiti_client self.group_id = group_id + self.neo4j_driver = neo4j_driver async def add_documents( self, @@ -216,24 +218,253 @@ def _convert_episode_result( extracted_edges=edges, ) - async def delete_collection(self, collection_name: str) -> int: - """Delete all episodes in a collection from the graph. + async def delete_node( + self, + node_uuid: str, + collection_name: str, + ) -> dict: + """Delete a single node (entity) and all its connected edges. Args: - collection_name: Collection name to delete + node_uuid: UUID of the entity node to delete + collection_name: Collection context (for reference) Returns: - Number of episodes deleted + Dict with keys: + - success: bool + - node_uuid: The deleted node UUID + - edges_deleted: Number of connected edges removed + - error: Optional error message Raises: RuntimeError: If operation fails """ - # Note: Graphiti doesn't have a built-in delete by collection method - # This is a limitation we document for users - raise NotImplementedError( - "Collection deletion not yet implemented. " - "Please use Neo4j directly or reset the database." - ) + try: + if not self.neo4j_driver: + return {"success": False, "error": "Neo4j driver not initialized"} + + driver = self.neo4j_driver + + # Verify node exists + node_query = "MATCH (n:Entity {uuid: $uuid}) RETURN n.uuid AS uuid" + records, _, _ = await driver.execute_query(node_query, uuid=node_uuid) + + if not records: + return { + "success": False, + "error": f"Node {node_uuid} not found", + } + + # Count connected edges before deletion + edge_count_query = ( + "MATCH (n:Entity {uuid: $uuid})-[e]-(m) RETURN COUNT(e) AS edge_count" + ) + edge_records, _, _ = await driver.execute_query( + edge_count_query, uuid=node_uuid + ) + edge_count = edge_records[0]["edge_count"] if edge_records else 0 + + # Delete the node (DETACH DELETE removes all connected edges) + delete_query = "MATCH (n:Entity {uuid: $uuid}) DETACH DELETE n" + await driver.execute_query(delete_query, uuid=node_uuid) + + return { + "success": True, + "node_uuid": node_uuid, + "edges_deleted": edge_count, + "message": f"Deleted node {node_uuid} and {edge_count} connected edges", + } + + except Exception as e: + return {"success": False, "error": f"Failed to delete node: {str(e)}"} + + async def delete_edge( + self, + edge_uuid: str, + collection_name: str, + ) -> dict: + """Delete a single edge (relationship/fact). + + Args: + edge_uuid: UUID of the edge to delete + collection_name: Collection context (for reference) + + Returns: + Dict with deletion status: + - success: bool + - edge_uuid: The deleted edge UUID + - error: Optional error message + + Raises: + RuntimeError: If operation fails + """ + try: + if not self.neo4j_driver: + return {"success": False, "error": "Neo4j driver not initialized"} + + driver = self.neo4j_driver + + # Verify edge exists (try multiple relationship types) + edge_query = "MATCH ()-[e {uuid: $uuid}]-() RETURN e.uuid AS uuid" + records, _, _ = await driver.execute_query(edge_query, uuid=edge_uuid) + + if not records: + return {"success": False, "error": f"Edge {edge_uuid} not found"} + + # Delete the edge + delete_query = "MATCH ()-[e {uuid: $uuid}]-() DELETE e" + await driver.execute_query(delete_query, uuid=edge_uuid) + + return { + "success": True, + "edge_uuid": edge_uuid, + "message": f"Deleted edge {edge_uuid}", + } + + except Exception as e: + return {"success": False, "error": f"Failed to delete edge: {str(e)}"} + + async def delete_episode( + self, + episode_uuid: str, + collection_name: str, + ) -> dict: + """Delete an episode (document) and all its extracted entities/relationships. + + Removes all edges belonging to this episode first, then deletes any + orphaned nodes (nodes with no remaining connections). + + Args: + episode_uuid: UUID of the episode to delete + collection_name: Collection context + + Returns: + Dict with deletion statistics: + - success: bool + - episode_uuid: The deleted episode UUID + - edges_deleted: Number of edges removed + - orphan_nodes_deleted: Number of orphaned nodes removed + - error: Optional error message + + Raises: + RuntimeError: If operation fails + """ + try: + if not self.neo4j_driver: + return {"success": False, "error": "Neo4j driver not initialized"} + + driver = self.neo4j_driver + group_id = f"{self.group_id}_{collection_name}" + + # Count edges to delete (edges with this episode in their episodes list) + edge_count_query = """ + MATCH ()-[e {group_id: $group_id}]-() + WHERE $episode_uuid IN e.episodes + RETURN COUNT(e) AS edge_count + """ + edge_records, _, _ = await driver.execute_query( + edge_count_query, + group_id=group_id, + episode_uuid=episode_uuid, + ) + edge_count = edge_records[0]["edge_count"] if edge_records else 0 + + # Delete edges from this episode + delete_edges_query = """ + MATCH ()-[e {group_id: $group_id}]-() + WHERE $episode_uuid IN e.episodes + DELETE e + """ + await driver.execute_query( + delete_edges_query, + group_id=group_id, + episode_uuid=episode_uuid, + ) + + # Find orphaned nodes (nodes with no connected edges) + orphan_query = """ + MATCH (n:Entity {group_id: $group_id}) + WHERE NOT (n)-[]-() + RETURN COUNT(n) AS orphan_count + """ + orphan_records, _, _ = await driver.execute_query( + orphan_query, group_id=group_id + ) + orphan_count = orphan_records[0]["orphan_count"] if orphan_records else 0 + + # Delete orphaned nodes + delete_orphans_query = """ + MATCH (n:Entity {group_id: $group_id}) + WHERE NOT (n)-[]-() + DELETE n + """ + await driver.execute_query(delete_orphans_query, group_id=group_id) + + return { + "success": True, + "episode_uuid": episode_uuid, + "edges_deleted": edge_count, + "orphan_nodes_deleted": orphan_count, + "message": f"Deleted episode {episode_uuid}, {edge_count} edges, {orphan_count} orphaned nodes", + } + + except Exception as e: + return {"success": False, "error": f"Failed to delete episode: {str(e)}"} + + async def delete_collection(self, collection_name: str) -> dict: + """Delete entire collection with all documents, entities, and relationships. + + Removes all edges first, then all nodes in the collection. This is a + destructive operation that cannot be undone. + + Args: + collection_name: Collection to delete + + Returns: + Dict with deletion statistics: + - success: bool + - collection_name: Collection that was deleted + - edges_deleted: Number of edges removed + - nodes_deleted: Number of nodes removed + - error: Optional error message + + Raises: + RuntimeError: If operation fails + """ + try: + if not self.neo4j_driver: + return {"success": False, "error": "Neo4j driver not initialized"} + + driver = self.neo4j_driver + group_id = f"{self.group_id}_{collection_name}" + + # Delete all edges in this collection first + delete_edges_query = "MATCH ()-[e {group_id: $group_id}]-() DELETE e" + result_edges = await driver.execute_query( + delete_edges_query, group_id=group_id + ) + edges_affected = result_edges.summary.counters.relationships_deleted + + # Delete all nodes in this collection + delete_nodes_query = "MATCH (n:Entity {group_id: $group_id}) DELETE n" + result_nodes = await driver.execute_query( + delete_nodes_query, group_id=group_id + ) + nodes_affected = result_nodes.summary.counters.nodes_deleted + + return { + "success": True, + "collection_name": collection_name, + "edges_deleted": edges_affected, + "nodes_deleted": nodes_affected, + "message": f"Deleted collection {collection_name}: {edges_affected} edges, {nodes_affected} nodes", + } + + except Exception as e: + return { + "success": False, + "error": f"Failed to delete collection: {str(e)}", + } async def search_graph( self, diff --git a/src/insta_rag/graph_rag/neo4j_driver.py b/src/insta_rag/graph_rag/neo4j_driver.py index e4c619e..fb97619 100644 --- a/src/insta_rag/graph_rag/neo4j_driver.py +++ b/src/insta_rag/graph_rag/neo4j_driver.py @@ -41,6 +41,7 @@ def __init__( self.embedder = embedder self._graphiti: Optional[Graphiti] = None + self._neo4j_driver: Optional[Any] = None # Store Neo4jDriver reference for deletion operations async def initialize(self) -> Graphiti: """Initialize and return Graphiti instance. @@ -60,6 +61,9 @@ async def initialize(self) -> Graphiti: database=self.database, ) + # Store reference for use in deletion operations + self._neo4j_driver = driver + # Create Graphiti instance with the driver and optional LLM/embedder clients graphiti_kwargs = {"graph_driver": driver} if self.llm_client: @@ -89,6 +93,14 @@ def get_graphiti(self) -> Optional[Graphiti]: """ return self._graphiti + def get_neo4j_driver(self) -> Optional[Any]: + """Get the Neo4j driver instance for raw query execution. + + Returns: + Neo4jDriver instance if initialized, None otherwise. + """ + return self._neo4j_driver + async def __aenter__(self): """Async context manager entry.""" await self.initialize() diff --git a/uv.lock b/uv.lock index 2c2905d..485aa8b 100644 --- a/uv.lock +++ b/uv.lock @@ -923,7 +923,7 @@ wheels = [ [[package]] name = "insta-rag" -version = "0.1.1b5" +version = "0.1.1b6" source = { editable = "." } dependencies = [ { name = "celery" },