diff --git a/.gitignore b/.gitignore index 707bc44711..461298263d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -# Python Artifacts + python/*/lib/ dist/ @@ -56,3 +56,9 @@ docsite/ # Jupyter notebook .ipynb_checkpoints/ + + +**/__pycache__/ +workdir/ +sac/ +prompts/ diff --git a/.vscode/launch.json b/.vscode/launch.json index 2167063966..12505a6bed 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -3,37 +3,67 @@ "version": "0.2.0", "configurations": [ { - "name": "Indexer", - "type": "debugpy", - "request": "launch", - "module": "poetry", - "args": [ - "poe", "index", - "--root", "" + "name": "Debug Graphrag init", + "type": "python", + "request": "launch", + "module": "graphrag", + "args": [ + "init", + "--root", "${workspaceFolder}/workdir" ], - }, + "console": "integratedTerminal" + }, { - "name": "Query", - "type": "debugpy", - "request": "launch", - "module": "poetry", - "args": [ - "poe", "query", - "--root", "", - "--method", "global", - "--query", "What are the top themes in this story", - ] - }, - { - "name": "Prompt Tuning", - "type": "debugpy", - "request": "launch", - "module": "poetry", - "args": [ - "poe", "prompt-tune", - "--config", - "/settings.yaml", - ] - } + "name": "Debug Graphrag index", + "type": "python", + "request": "launch", + "module": "graphrag", + "args": [ + "index", + "--root", "${workspaceFolder}/workdir" + ], + "justMyCode": false, // 设置为 false 以调试第三方库 + "console": "integratedTerminal" + }, + { + "name": "Run Graphrag index", + "type": "python", + "request": "launch", + "module": "graphrag", + "args": [ + "index", + "--root", "${workspaceFolder}/workdir" + ], + "justMyCode": false, // 设置为 false 以调试第三方库 + "console": "integratedTerminal", + "noDebug": true + }, + { + "name": "Debug Graphrag prompt-tune", + "type": "python", + "request": "launch", + "module": "graphrag", + "args": [ + "prompt-tune", + "--root", "${workspaceFolder}/workdir", + "--config", "${workspaceFolder}/workdir/settings.yaml", + "--discover-entity-types" + ], + "console": "integratedTerminal" + }, + { + "name": "Run Graphrag prompt-tune", + "type": "python", + "request": "launch", + "module": "graphrag", + "args": [ + "prompt-tune", + "--root", "${workspaceFolder}/workdir", + "--config", "${workspaceFolder}/workdir/settings.yaml", + "--discover-entity-types" + ], + "console": "integratedTerminal", + "noDebug": true + } ] } \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 0b678d5d95..edc1b4294a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -37,7 +37,7 @@ "node_modules{,/**}", ".vscode{,/**}" ], - "python.defaultInterpreterPath": "python/services/.venv/bin/python", + "python.defaultInterpreterPath": ".venv/bin/python", "python.languageServer": "Pylance", "cSpell.customDictionaries": { "project-words": { diff --git a/examples_notebooks/community_contrib/neo4j/graphrag_import_neo4j_cypher.ipynb b/examples_notebooks/community_contrib/neo4j/graphrag_import_neo4j_cypher.ipynb index d246122be6..b7ddc6e912 100644 --- a/examples_notebooks/community_contrib/neo4j/graphrag_import_neo4j_cypher.ipynb +++ b/examples_notebooks/community_contrib/neo4j/graphrag_import_neo4j_cypher.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "id": "b4fea928", "metadata": {}, "outputs": [], @@ -50,12 +50,12 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 73, "id": "adca1803", "metadata": {}, "outputs": [], "source": [ - "GRAPHRAG_FOLDER = \"PATH_TO_OUTPUT/artifacts\"" + "GRAPHRAG_FOLDER = \"/home/fei/git/graphrag/workdir/output\"" ] }, { @@ -70,17 +70,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "id": "b57beec0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24948.57s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install --quiet pandas neo4j-rust-ext" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 75, "id": "3eeee95f-e4f2-4052-94fb-a5dc8ab542ae", "metadata": {}, "outputs": [], @@ -106,20 +121,54 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 76, "id": "b6c15443-4acb-4f91-88ea-4e08abaa4c29", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection established.\n" + ] + } + ], "source": [ - "NEO4J_URI = \"neo4j://localhost\" # or neo4j+s://xxxx.databases.neo4j.io\n", + "NEO4J_URI = \"neo4j+s://e941d51d.databases.neo4j.io\" # or neo4j+s://xxxx.databases.neo4j.io\n", "NEO4J_USERNAME = \"neo4j\"\n", - "NEO4J_PASSWORD = \"\" # your password\n", + "NEO4J_PASSWORD = \"r2Fboc7Ul7R6MTiobAQva-rm0jYuvVtCjvbYJe_xS0A\" # your password\n", + "AUTH = (NEO4J_USERNAME, NEO4J_PASSWORD)\n", "NEO4J_DATABASE = \"neo4j\"\n", "\n", "# Create a Neo4j driver\n", + "with GraphDatabase.driver(NEO4J_URI, auth=AUTH) as driver:\n", + " driver.verify_connectivity()\n", + " print(\"Connection established.\")\n", + "\n", "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))" ] }, + { + "cell_type": "code", + "execution_count": 77, + "id": "a34adcc3", + "metadata": {}, + "outputs": [], + "source": [ + "# NEO4J_URI = \"neo4j://localhost\" # or neo4j+s://xxxx.databases.neo4j.io\n", + "# NEO4J_USERNAME = \"neo4j\"\n", + "# NEO4J_PASSWORD = \"password\" # your password\n", + "# AUTH = (NEO4J_USERNAME, NEO4J_PASSWORD)\n", + "# NEO4J_DATABASE = \"neo4j\"\n", + "\n", + "# # Create a Neo4j driver\n", + "# with GraphDatabase.driver(NEO4J_URI, auth=AUTH) as driver:\n", + "# driver.verify_connectivity()\n", + "# print(\"Connection established.\")\n", + "\n", + "# driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))" + ] + }, { "cell_type": "markdown", "id": "70f37ab6", @@ -133,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 78, "id": "d787bf7b-ac9b-4bfb-b140-a50a3fd205c5", "metadata": {}, "outputs": [], @@ -181,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 79, "id": "ed7f212e-9148-424c-adc6-d81db9f8e5a5", "metadata": {}, "outputs": [ @@ -241,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 80, "id": "1ba023e7", "metadata": {}, "outputs": [ @@ -273,19 +322,25 @@ " \n", " \n", " 0\n", - " c305886e4aa2f6efcf64b57762777055\n", - " book.txt\n", + " 647709a0d0e057670c5b0dbc015bb7ba03d6fedccb9a5f...\n", + " report.c\n", + " \n", + " \n", + " 1\n", + " 9f5a1185c3548f00b01f1686b1255c749c28fe24b4591f...\n", + " hashtable_utils.c\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id title\n", - "0 c305886e4aa2f6efcf64b57762777055 book.txt" + " id title\n", + "0 647709a0d0e057670c5b0dbc015bb7ba03d6fedccb9a5f... report.c\n", + "1 9f5a1185c3548f00b01f1686b1255c749c28fe24b4591f... hashtable_utils.c" ] }, - "execution_count": 65, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -299,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 81, "id": "96391c15", "metadata": {}, "outputs": [ @@ -307,17 +362,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'_contains_updates': True, 'labels_added': 1, 'nodes_created': 1, 'properties_set': 2}\n", - "1 rows in 0.05211496353149414 s.\n" + "{'_contains_updates': True, 'labels_added': 38, 'nodes_created': 38, 'properties_set': 76}\n", + "38 rows in 0.5366926193237305 s.\n" ] }, { "data": { "text/plain": [ - "1" + "38" ] }, - "execution_count": 66, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -345,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 82, "id": "0d825626", "metadata": {}, "outputs": [ @@ -379,37 +434,37 @@ " \n", " \n", " 0\n", - " 680dd6d2a970a49082fa4f34bf63a34e\n", - " The Project Gutenberg eBook of A Christmas Ca...\n", - " 300\n", - " [c305886e4aa2f6efcf64b57762777055]\n", + " dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92b...\n", + " #ifndef _SSDBUFTABLE_H\\n#define _SSDBUFTABLE_H...\n", + " 180\n", + " [0038e02189a4539ec6415c2b5d6be32a8947084c04302...\n", " \n", " \n", " 1\n", - " 95f1f8f5bdbf0bee3a2c6f2f4a4907f6\n", - " THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...\n", - " 300\n", - " [c305886e4aa2f6efcf64b57762777055]\n", + " 9f1e4d47e4e851180c13f165a57ae3143054795d165ed7...\n", + " #ifndef _STRAtEGY_H_\\n#define _STRAtEGY_H_\\n\\n...\n", + " 134\n", + " [0f329dd40b18a57b4daba0923e508b26b3a7682cb90cb...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id \\\n", - "0 680dd6d2a970a49082fa4f34bf63a34e \n", - "1 95f1f8f5bdbf0bee3a2c6f2f4a4907f6 \n", + " id \\\n", + "0 dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92b... \n", + "1 9f1e4d47e4e851180c13f165a57ae3143054795d165ed7... \n", "\n", " text n_tokens \\\n", - "0 The Project Gutenberg eBook of A Christmas Ca... 300 \n", - "1 THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL... 300 \n", + "0 #ifndef _SSDBUFTABLE_H\\n#define _SSDBUFTABLE_H... 180 \n", + "1 #ifndef _STRAtEGY_H_\\n#define _STRAtEGY_H_\\n\\n... 134 \n", "\n", - " document_ids \n", - "0 [c305886e4aa2f6efcf64b57762777055] \n", - "1 [c305886e4aa2f6efcf64b57762777055] " + " document_ids \n", + "0 [0038e02189a4539ec6415c2b5d6be32a8947084c04302... \n", + "1 [0f329dd40b18a57b4daba0923e508b26b3a7682cb90cb... " ] }, - "execution_count": 67, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -424,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 83, "id": "ffd3d380-8710-46f5-b90a-04ed8482192c", "metadata": {}, "outputs": [ @@ -432,17 +487,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'_contains_updates': True, 'relationships_created': 231, 'properties_set': 462}\n", - "231 rows in 0.05993008613586426 s.\n" + "{'_contains_updates': True, 'labels_added': 67, 'relationships_created': 67, 'nodes_created': 67, 'properties_set': 201}\n", + "67 rows in 0.9977202415466309 s.\n" ] }, { "data": { "text/plain": [ - "231" + "67" ] }, - "execution_count": 68, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -472,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 84, "id": "2392f9e9", "metadata": {}, "outputs": [ @@ -497,63 +552,52 @@ " \n", " \n", " \n", - " name\n", + " id\n", + " human_readable_id\n", + " title\n", " type\n", " description\n", - " human_readable_id\n", - " id\n", - " description_embedding\n", " text_unit_ids\n", " \n", " \n", " \n", " \n", " 0\n", - " \"PROJECT GUTENBERG\"\n", - " \"ORGANIZATION\"\n", - " Project Gutenberg is a pioneering organization...\n", + " 87eaa567-cc2e-4f6b-a3a5-3d1ca18eb87c\n", " 0\n", - " b45241d70f0e43fca764df95b2b81f77\n", - " [-0.020793898031115532, 0.02951139025390148, 0...\n", - " [01e84646075b255eab0a34d872336a89, 10bab8e9773...\n", + " SSDBUFHASHBUCKET\n", + " DATA STRUCTURE\n", + " A data structure used to store information in ...\n", + " [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92...\n", " \n", " \n", " 1\n", - " \"UNITED STATES\"\n", - " \"GEO\"\n", - " The United States is prominently recognized fo...\n", + " b170cde1-355e-469e-bb6e-baa9680eba21\n", " 1\n", - " 4119fd06010c494caa07f439b333f4c5\n", - " [-0.009704762138426304, 0.013335365802049637, ...\n", - " [01e84646075b255eab0a34d872336a89, 28f242c4515...\n", + " SSD_BUF_HASHTABLE\n", + " VARIABLE\n", + " A variable that points to the first SSDBufHash...\n", + " [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " name type \\\n", - "0 \"PROJECT GUTENBERG\" \"ORGANIZATION\" \n", - "1 \"UNITED STATES\" \"GEO\" \n", - "\n", - " description human_readable_id \\\n", - "0 Project Gutenberg is a pioneering organization... 0 \n", - "1 The United States is prominently recognized fo... 1 \n", + " id human_readable_id title \\\n", + "0 87eaa567-cc2e-4f6b-a3a5-3d1ca18eb87c 0 SSDBUFHASHBUCKET \n", + "1 b170cde1-355e-469e-bb6e-baa9680eba21 1 SSD_BUF_HASHTABLE \n", "\n", - " id \\\n", - "0 b45241d70f0e43fca764df95b2b81f77 \n", - "1 4119fd06010c494caa07f439b333f4c5 \n", - "\n", - " description_embedding \\\n", - "0 [-0.020793898031115532, 0.02951139025390148, 0... \n", - "1 [-0.009704762138426304, 0.013335365802049637, ... \n", + " type description \\\n", + "0 DATA STRUCTURE A data structure used to store information in ... \n", + "1 VARIABLE A variable that points to the first SSDBufHash... \n", "\n", " text_unit_ids \n", - "0 [01e84646075b255eab0a34d872336a89, 10bab8e9773... \n", - "1 [01e84646075b255eab0a34d872336a89, 28f242c4515... " + "0 [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92... \n", + "1 [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92... " ] }, - "execution_count": 78, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -562,12 +606,11 @@ "entity_df = pd.read_parquet(\n", " f\"{GRAPHRAG_FOLDER}/create_final_entities.parquet\",\n", " columns=[\n", - " \"name\",\n", + " \"id\",\n", + " \"human_readable_id\",\n", + " \"title\",\n", " \"type\",\n", " \"description\",\n", - " \"human_readable_id\",\n", - " \"id\",\n", - " \"description_embedding\",\n", " \"text_unit_ids\",\n", " ],\n", ")\n", @@ -576,25 +619,46 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 85, "id": "1d038114-0714-48ee-a48a-c421cd539661", "metadata": {}, + "outputs": [], + "source": [ + "# entity_statement = \"\"\"\n", + "# MERGE (e:__Entity__ {id:value.id})\n", + "# SET e += value {.human_readable_id, .description, title:replace(value.title,'\"','')}\n", + "# WITH e, value\n", + "# CALL db.create.setNodeVectorProperty(e, \"description\", value.description)\n", + "# CALL apoc.create.addLabels(e, case when coalesce(value.type,\"\") = \"\" then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n", + "# UNWIND value.text_unit_ids AS text_unit\n", + "# MATCH (c:__Chunk__ {id:text_unit})\n", + "# MERGE (c)-[:HAS_ENTITY]->(e)\n", + "# \"\"\"\n", + "\n", + "# batched_import(entity_statement, entity_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "8e5fe82b", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'_contains_updates': True, 'properties_set': 831}\n", - "277 rows in 0.6978070735931396 s.\n" + "{'_contains_updates': True, 'labels_added': 924, 'relationships_created': 1008, 'nodes_created': 474, 'properties_set': 1896}\n", + "474 rows in 1.448024034500122 s.\n" ] }, { "data": { "text/plain": [ - "277" + "474" ] }, - "execution_count": 81, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -602,10 +666,12 @@ "source": [ "entity_statement = \"\"\"\n", "MERGE (e:__Entity__ {id:value.id})\n", - "SET e += value {.human_readable_id, .description, name:replace(value.name,'\"','')}\n", + "SET e += value {.human_readable_id, .description, title:replace(value.title,'\"','')}\n", + "WITH e, value\n", + "FOREACH (label IN CASE WHEN value.type IS NOT NULL AND value.type <> '' \n", + " THEN [value.type] ELSE [] END | \n", + " SET e:`${label}`)\n", "WITH e, value\n", - "CALL db.create.setNodeVectorProperty(e, \"description_embedding\", value.description_embedding)\n", - "CALL apoc.create.addLabels(e, case when coalesce(value.type,\"\") = \"\" then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n", "UNWIND value.text_unit_ids AS text_unit\n", "MATCH (c:__Chunk__ {id:text_unit})\n", "MERGE (c)-[:HAS_ENTITY]->(e)\n", @@ -627,7 +693,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 87, "id": "b347a047", "metadata": {}, "outputs": [ @@ -655,7 +721,7 @@ " source\n", " target\n", " id\n", - " rank\n", + " combined_degree\n", " weight\n", " human_readable_id\n", " description\n", @@ -665,49 +731,49 @@ " \n", " \n", " 0\n", - " \"PROJECT GUTENBERG\"\n", - " \"A CHRISTMAS CAROL\"\n", - " b84d71ed9c3b45819eb3205fd28e13a0\n", - " 20\n", - " 1.0\n", + " SSDBUFHASHBUCKET\n", + " SSD_BUF_HASHTABLE\n", + " af0e92ee-f14c-42aa-af87-eb32d4cdfc90\n", + " 11\n", + " 20.0\n", " 0\n", - " \"Project Gutenberg is responsible for releasin...\n", - " [680dd6d2a970a49082fa4f34bf63a34e]\n", + " ssd_buf_hashtable is a pointer to the first SS...\n", + " [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92...\n", " \n", " \n", " 1\n", - " \"PROJECT GUTENBERG\"\n", - " \"SUZANNE SHELL\"\n", - " b0b464bc92a541e48547fe9738378dab\n", - " 15\n", - " 1.0\n", + " SSDBUFHASHBUCKET\n", + " HASHTAB_GETHASHCODE\n", + " 2509020c-bfe8-4a18-ab83-a01433fab524\n", + " 10\n", + " 7.0\n", " 1\n", - " \"Suzanne Shell produced the eBook version of '...\n", - " [680dd6d2a970a49082fa4f34bf63a34e]\n", + " HashTab_GetHashCode function uses SSDBufHashBu...\n", + " [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " source target id \\\n", - "0 \"PROJECT GUTENBERG\" \"A CHRISTMAS CAROL\" b84d71ed9c3b45819eb3205fd28e13a0 \n", - "1 \"PROJECT GUTENBERG\" \"SUZANNE SHELL\" b0b464bc92a541e48547fe9738378dab \n", + " source target \\\n", + "0 SSDBUFHASHBUCKET SSD_BUF_HASHTABLE \n", + "1 SSDBUFHASHBUCKET HASHTAB_GETHASHCODE \n", "\n", - " rank weight human_readable_id \\\n", - "0 20 1.0 0 \n", - "1 15 1.0 1 \n", + " id combined_degree weight \\\n", + "0 af0e92ee-f14c-42aa-af87-eb32d4cdfc90 11 20.0 \n", + "1 2509020c-bfe8-4a18-ab83-a01433fab524 10 7.0 \n", "\n", - " description \\\n", - "0 \"Project Gutenberg is responsible for releasin... \n", - "1 \"Suzanne Shell produced the eBook version of '... \n", + " human_readable_id description \\\n", + "0 0 ssd_buf_hashtable is a pointer to the first SS... \n", + "1 1 HashTab_GetHashCode function uses SSDBufHashBu... \n", "\n", - " text_unit_ids \n", - "0 [680dd6d2a970a49082fa4f34bf63a34e] \n", - "1 [680dd6d2a970a49082fa4f34bf63a34e] " + " text_unit_ids \n", + "0 [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92... \n", + "1 [dcaa484d1db860309b46a809929f5c1c27e7e53e0ab92... " ] }, - "execution_count": 71, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -719,7 +785,7 @@ " \"source\",\n", " \"target\",\n", " \"id\",\n", - " \"rank\",\n", + " \"combined_degree\",\n", " \"weight\",\n", " \"human_readable_id\",\n", " \"description\",\n", @@ -731,7 +797,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 88, "id": "27900c01-89e1-4dec-9d5c-c07317c68baf", "metadata": {}, "outputs": [ @@ -739,25 +805,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'_contains_updates': True, 'properties_set': 1710}\n", - "342 rows in 0.14740705490112305 s.\n" + "{'_contains_updates': True, 'relationships_created': 653, 'properties_set': 3265}\n", + "653 rows in 2.0415780544281006 s.\n" ] }, { "data": { "text/plain": [ - "342" + "653" ] }, - "execution_count": 72, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rel_statement = \"\"\"\n", - " MATCH (source:__Entity__ {name:replace(value.source,'\"','')})\n", - " MATCH (target:__Entity__ {name:replace(value.target,'\"','')})\n", + " MATCH (source:__Entity__ {title:replace(value.source,'\"','')})\n", + " MATCH (target:__Entity__ {title:replace(value.target,'\"','')})\n", " // not necessary to merge on id as there is only one relationship per pair\n", " MERGE (source)-[rel:RELATED {id: value.id}]->(target)\n", " SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}\n", @@ -782,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 89, "id": "c2fab66c", "metadata": {}, "outputs": [ @@ -817,35 +883,39 @@ " \n", " \n", " 0\n", - " 2\n", + " 73cbeacb-7f76-46d1-a025-e9e70d7c7396\n", " 0\n", - " Community 2\n", - " [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...\n", - " [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b...\n", + " Community 0\n", + " [0806dbf3d6f7481790423036bc2afd6b54647c4282307...\n", + " [0309cdc2-9383-4b4e-be78-0675a75004d2, 0a9b46a...\n", " \n", " \n", " 1\n", - " 4\n", + " 843a8571-eebf-4504-9c21-d84a5aefd271\n", " 0\n", - " Community 4\n", - " [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...\n", - " [929f30875e1744b49e7b416eaf5a790c, 4920fda0318...\n", + " Community 1\n", + " [25dc39b6f5cca06b4b96f4492bfbe343263f5f2770505...\n", + " [38489977-ffba-4057-88fd-cf84c78e3be4, 3a0a40c...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id level title text_unit_ids \\\n", - "0 2 0 Community 2 [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0... \n", - "1 4 0 Community 4 [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb... \n", + " id level title \\\n", + "0 73cbeacb-7f76-46d1-a025-e9e70d7c7396 0 Community 0 \n", + "1 843a8571-eebf-4504-9c21-d84a5aefd271 0 Community 1 \n", + "\n", + " text_unit_ids \\\n", + "0 [0806dbf3d6f7481790423036bc2afd6b54647c4282307... \n", + "1 [25dc39b6f5cca06b4b96f4492bfbe343263f5f2770505... \n", "\n", " relationship_ids \n", - "0 [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b... \n", - "1 [929f30875e1744b49e7b416eaf5a790c, 4920fda0318... " + "0 [0309cdc2-9383-4b4e-be78-0675a75004d2, 0a9b46a... \n", + "1 [38489977-ffba-4057-88fd-cf84c78e3be4, 3a0a40c... " ] }, - "execution_count": 73, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } @@ -861,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 90, "id": "1351f7e3", "metadata": {}, "outputs": [ @@ -869,17 +939,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'_contains_updates': True, 'properties_set': 94}\n", - "47 rows in 0.07877922058105469 s.\n" + "{'_contains_updates': True, 'labels_added': 85, 'relationships_created': 766, 'nodes_created': 85, 'properties_set': 255}\n", + "85 rows in 2.3909716606140137 s.\n" ] }, { "data": { "text/plain": [ - "47" + "85" ] }, - "execution_count": 74, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -918,7 +988,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 91, "id": "1be9e7a9-69ee-406b-bce5-95a9c41ecffe", "metadata": {}, "outputs": [ @@ -957,59 +1027,59 @@ " \n", " \n", " 0\n", - " e7822326-4da8-4954-afa9-be7f4f5791a5\n", - " 42\n", + " ce0cedd9eb654e48b68cf66540d34ad0\n", + " 64\n", " 2\n", - " Scrooge's Supernatural Encounters: Marley's Gh...\n", - " This report delves into the pivotal supernatur...\n", - " [{'explanation': 'Marley's Ghost plays a cruci...\n", - " 8.0\n", - " The impact severity rating is high due to the ...\n", - " # Scrooge's Supernatural Encounters: Marley's ...\n", + " Optimizing SSD Cache Management in Software En...\n", + " This report delves into the intricate network ...\n", + " [{'explanation': 'The HIT_SAC function is pivo...\n", + " 9.5\n", + " The technical depth and direct applicability o...\n", + " # Optimizing SSD Cache Management in Software ...\n", " \n", " \n", " 1\n", - " 8a5afac1-99ef-4f01-a1b1-f044ce392ff9\n", - " 43\n", + " b4691d06dd5e46d996aa06d364bb3656\n", + " 65\n", " 2\n", - " The Ghost's Influence on Scrooge's Transformation\n", - " This report delves into the pivotal role of 'T...\n", - " [{'explanation': 'The Ghost, identified at tim...\n", - " 8.5\n", - " The impact severity rating is high due to the ...\n", - " # The Ghost's Influence on Scrooge's Transform...\n", + " DSCPTR_SAC: A Keystone in Software Engineering...\n", + " The DSCPTR_SAC data structure is pivotal in th...\n", + " [{'explanation': 'The DSCPTR_SAC data structur...\n", + " 9.0\n", + " The detailed structure and relationships of DS...\n", + " # DSCPTR_SAC: A Keystone in Software Engineeri...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id community level \\\n", - "0 e7822326-4da8-4954-afa9-be7f4f5791a5 42 2 \n", - "1 8a5afac1-99ef-4f01-a1b1-f044ce392ff9 43 2 \n", + " id community level \\\n", + "0 ce0cedd9eb654e48b68cf66540d34ad0 64 2 \n", + "1 b4691d06dd5e46d996aa06d364bb3656 65 2 \n", "\n", " title \\\n", - "0 Scrooge's Supernatural Encounters: Marley's Gh... \n", - "1 The Ghost's Influence on Scrooge's Transformation \n", + "0 Optimizing SSD Cache Management in Software En... \n", + "1 DSCPTR_SAC: A Keystone in Software Engineering... \n", "\n", " summary \\\n", - "0 This report delves into the pivotal supernatur... \n", - "1 This report delves into the pivotal role of 'T... \n", + "0 This report delves into the intricate network ... \n", + "1 The DSCPTR_SAC data structure is pivotal in th... \n", "\n", " findings rank \\\n", - "0 [{'explanation': 'Marley's Ghost plays a cruci... 8.0 \n", - "1 [{'explanation': 'The Ghost, identified at tim... 8.5 \n", + "0 [{'explanation': 'The HIT_SAC function is pivo... 9.5 \n", + "1 [{'explanation': 'The DSCPTR_SAC data structur... 9.0 \n", "\n", " rank_explanation \\\n", - "0 The impact severity rating is high due to the ... \n", - "1 The impact severity rating is high due to the ... \n", + "0 The technical depth and direct applicability o... \n", + "1 The detailed structure and relationships of DS... \n", "\n", " full_content \n", - "0 # Scrooge's Supernatural Encounters: Marley's ... \n", - "1 # The Ghost's Influence on Scrooge's Transform... " + "0 # Optimizing SSD Cache Management in Software ... \n", + "1 # DSCPTR_SAC: A Keystone in Software Engineeri... " ] }, - "execution_count": 75, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } @@ -1034,7 +1104,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 92, "id": "5c6ed591-f98c-4403-9fde-8d4cb4c01cca", "metadata": {}, "outputs": [ @@ -1042,17 +1112,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'_contains_updates': True, 'properties_set': 729}\n", - "47 rows in 0.02472519874572754 s.\n" + "{'_contains_updates': True, 'labels_added': 520, 'relationships_created': 435, 'nodes_created': 520, 'properties_set': 1900}\n", + "85 rows in 0.594315767288208 s.\n" ] }, { "data": { "text/plain": [ - "47" + "85" ] }, - "execution_count": 76, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -1083,10 +1153,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 93, "id": "523bed92-d12c-4fc4-aa44-6c62321b36bc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/home/fei/git/graphrag/workdir/output/create_final_covariates.parquet'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[93], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cov_df \u001b[38;5;241m=\u001b[39m (\u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mGRAPHRAG_FOLDER\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m/create_final_covariates.parquet\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m,)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# columns=[\"id\",\"text_unit_id\"])\u001b[39;00m\n\u001b[1;32m 3\u001b[0m cov_df\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m2\u001b[39m)\n", + "File \u001b[0;32m~/git/graphrag/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:667\u001b[0m, in \u001b[0;36mread_parquet\u001b[0;34m(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)\u001b[0m\n\u001b[1;32m 664\u001b[0m use_nullable_dtypes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 665\u001b[0m check_dtype_backend(dtype_backend)\n\u001b[0;32m--> 667\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 671\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 672\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 673\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype_backend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype_backend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/git/graphrag/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:267\u001b[0m, in \u001b[0;36mPyArrowImpl.read\u001b[0;34m(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m manager \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 265\u001b[0m to_pandas_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplit_blocks\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# type: ignore[assignment]\u001b[39;00m\n\u001b[0;32m--> 267\u001b[0m path_or_handle, handles, filesystem \u001b[38;5;241m=\u001b[39m \u001b[43m_get_path_or_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 274\u001b[0m pa_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi\u001b[38;5;241m.\u001b[39mparquet\u001b[38;5;241m.\u001b[39mread_table(\n\u001b[1;32m 275\u001b[0m path_or_handle,\n\u001b[1;32m 276\u001b[0m columns\u001b[38;5;241m=\u001b[39mcolumns,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 280\u001b[0m )\n", + "File \u001b[0;32m~/git/graphrag/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:140\u001b[0m, in \u001b[0;36m_get_path_or_handle\u001b[0;34m(path, fs, storage_options, mode, is_dir)\u001b[0m\n\u001b[1;32m 130\u001b[0m handles \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 132\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m fs\n\u001b[1;32m 133\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dir\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;66;03m# fsspec resources can also point to directories\u001b[39;00m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# this branch is used for example when reading from non-fsspec URLs\u001b[39;00m\n\u001b[0;32m--> 140\u001b[0m handles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_handle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 142\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m fs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 144\u001b[0m path_or_handle \u001b[38;5;241m=\u001b[39m handles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/git/graphrag/.venv/lib/python3.12/site-packages/pandas/io/common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/fei/git/graphrag/workdir/output/create_final_covariates.parquet'" + ] + } + ], "source": [ "cov_df = (pd.read_parquet(f\"{GRAPHRAG_FOLDER}/create_final_covariates.parquet\"),)\n", "# columns=[\"id\",\"text_unit_id\"])\n", @@ -1193,7 +1279,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -1207,7 +1293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/graphrag/config/models/graph_rag_config.py b/graphrag/config/models/graph_rag_config.py index 77ad944053..8c03074658 100644 --- a/graphrag/config/models/graph_rag_config.py +++ b/graphrag/config/models/graph_rag_config.py @@ -3,7 +3,7 @@ """Parameterization settings for the default configuration.""" -from devtools import pformat +from pprint import pformat from pydantic import Field import graphrag.config.defaults as defs diff --git a/graphrag/index/config/pipeline.py b/graphrag/index/config/pipeline.py index 839f2c77d2..448f67e2a6 100644 --- a/graphrag/index/config/pipeline.py +++ b/graphrag/index/config/pipeline.py @@ -5,7 +5,7 @@ from __future__ import annotations -from devtools import pformat +from pprint import pformat from pydantic import BaseModel, Field from graphrag.index.config.cache import PipelineCacheConfigTypes diff --git a/graphrag/index/operations/extract_entities/graph_extractor.py b/graphrag/index/operations/extract_entities/graph_extractor.py index f10b2c83e5..bf5bcce782 100644 --- a/graphrag/index/operations/extract_entities/graph_extractor.py +++ b/graphrag/index/operations/extract_entities/graph_extractor.py @@ -8,6 +8,7 @@ import traceback from collections.abc import Mapping from dataclasses import dataclass +from string import Formatter from typing import Any import networkx as nx @@ -30,7 +31,58 @@ log = logging.getLogger(__name__) - +class SafeFormatter(Formatter): + def __init__(self): + # 匹配嵌套结构占位符的正则表达式 + self.nested_pattern = re.compile(r"{([^{}]+)}") + # 匹配未成对的单独花括号 + self.unpaired_pattern = re.compile(r"(?:{[^{}]*$|^[^{}]*}|{[^{}]*}|})") + + + def format(self, format_string, *args, **kwargs): + # 替换未成对的花括号 + format_string = self._replace_unpaired_braces(format_string) + # 替换嵌套结构占位符 + format_string = self._replace_nested(format_string) + # 使用父类的 format 方法处理非嵌套占位符 + return super().format(format_string, *args, **kwargs) + + def get_value(self, key, args, kwargs) -> Any: + # 仅处理字符串键,如果 key 不存在于 kwargs,则保留原样占位符 + if isinstance(key, str) and key in kwargs: + return kwargs[key] + return f"{{{key}}}" + + def _replace_unpaired_braces(self, format_string): + """ + 替换未成对的花括号为普通字符 '{' 或 '}'。 + """ + # 替换未成对的 `{` 或 `}` 为普通字符 + def replace_unpaired(match): + unmatched = match.group(0) + # 如果是未闭合的 `{` 或单独的 `}`, 替换为普通字符 + if unmatched.startswith("{") and unmatched.endswith("}"): + return unmatched # 保留合法的占位符 + elif unmatched.startswith("{"): + return "{{" # 替换未闭合的 `{` 为普通字符 + elif unmatched.endswith("}"): + return "}}" # 替换单独的 `}` 为普通字符 + return unmatched + return self.unpaired_pattern.sub(replace_unpaired, format_string) + + def _replace_nested(self, format_string): + """ + 替换嵌套结构为普通文本{{}} + """ + def replace_nested(match): + key = match.group(1) + # # 如果是嵌套结构(检测到 "[" 或 "."),转义为 {{...}} + if "[" in key or "." in key: + return f"{{{{{key}}}}}" # 双括号转义,避免解析 + return match.group(0) # 保留原始内容 + return self.nested_pattern.sub(replace_nested, format_string) + + @dataclass class GraphExtractionResult: """Unipartite graph extraction result class definition.""" @@ -152,11 +204,14 @@ async def __call__( async def _process_document( self, text: str, prompt_variables: dict[str, str] ) -> str: + formatter = SafeFormatter() + kwargs = { + **prompt_variables, + self._input_text_key: text + } + formated_prompt = formatter.format(self._extraction_prompt, **kwargs) response = await self._llm( - self._extraction_prompt.format(**{ - **prompt_variables, - self._input_text_key: text, - }), + formated_prompt ) results = response.output.content or "" diff --git a/graphrag/requirements.txt b/graphrag/requirements.txt new file mode 100644 index 0000000000..8ab3579977 --- /dev/null +++ b/graphrag/requirements.txt @@ -0,0 +1,149 @@ +aiofiles==24.1.0 +aiolimiter==1.2.1 +annotated-types==0.7.0 +anyio==4.8.0 +anytree==2.12.1 +asttokens==2.4.1 +attrs==24.3.0 +autograd==1.7.0 +azure-common==1.1.28 +azure-core==1.32.0 +azure-cosmos==4.9.0 +azure-identity==1.19.0 +azure-search-documents==11.5.2 +azure-storage-blob==12.24.0 +backcall==0.2.0 +backlash==0.3.2 +beartype==0.18.5 +beautifulsoup4==4.12.3 +bleach==6.2.0 +certifi==2024.12.14 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +contourpy==1.3.1 +crank==0.8.1 +cryptography==44.0.0 +cycler==0.12.1 +decorator==5.1.1 +defusedxml==0.7.1 +deprecation==2.1.0 +devtools==0.12.2 +distro==1.9.0 +docopt==0.6.2 +environs==11.2.1 +executing==2.1.0 +fastjsonschema==2.21.1 +fnllm==0.0.10 +fonttools==4.55.3 +future==1.0.0 +gearbox==0.2.2 +gensim==4.3.3 +graspologic==3.4.1 +graspologic-native==1.2.1 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +hupper==1.12.1 +hyppo==0.4.0 +idna==3.10 +ipython==8.12.3 +isodate==0.7.2 +jedi==0.19.2 +Jinja2==3.1.5 +jiter==0.8.2 +joblib==1.4.2 +json_repair==0.30.3 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyterlab_pygments==0.3.0 +kiwisolver==1.4.8 +lancedb==0.17.0 +llvmlite==0.44.0 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +marshmallow==3.25.1 +matplotlib==3.10.0 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.0 +msal==1.31.1 +msal-extensions==1.2.0 +nbclient==0.10.2 +nbconvert==7.16.5 +nbformat==5.10.4 +networkx==3.4.2 +nltk==3.9.1 +numba==0.61.0 +numpy==1.26.4 +openai==1.59.9 +overrides==7.7.0 +packaging==24.2 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +PasteDeploy==3.1.0 +patsy==1.0.1 +pexpect==4.9.0 +pickleshare==0.7.5 +pillow==11.1.0 +pipreqs==0.5.0 +platformdirs==4.3.6 +portalocker==2.10.1 +POT==0.9.5 +prompt_toolkit==3.0.50 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==15.0.2 +pycparser==2.22 +pydantic==2.10.5 +pydantic_core==2.27.2 +Pygments==2.19.1 +PyJWT==2.10.1 +pylance==0.20.0 +pynndescent==0.5.13 +pyparsing==3.2.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +pytz==2024.2 +PyYAML==6.0.2 +pyzmq==26.2.0 +referencing==0.36.1 +regex==2024.11.6 +repoze.lru==0.7 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.22.3 +scikit-learn==1.6.1 +scipy==1.12.0 +seaborn==0.13.2 +shellingham==1.5.4 +six==1.17.0 +smart-open==7.1.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +statsmodels==0.14.4 +Tempita==0.6.0 +tenacity==9.0.0 +tg.devtools==2.4.3 +tgext.debugbar==0.5.0 +threadpoolctl==3.5.0 +tiktoken==0.8.0 +tinycss2==1.4.0 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.14.3 +TurboGears2==2.4.3 +typer==0.15.1 +typing_extensions==4.12.2 +tzdata==2024.2 +umap-learn==0.5.7 +urllib3==2.3.0 +wcwidth==0.2.13 +webencodings==0.5.1 +WebOb==1.8.9 +wrapt==1.17.2 +yarg==0.1.9