Skip to content

Commit 3fd7db4

Browse files
authored
refine mindmap (#1817)
### What problem does this PR solve? #1594 ### Type of change - [x] Refactoring
1 parent 5650442 commit 3fd7db4

File tree

3 files changed

+39
-24
lines changed

3 files changed

+39
-24
lines changed

graphrag/index.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import networkx as nx
2222
from api.db import LLMType
2323
from api.db.services.llm_service import LLMBundle
24+
from api.db.services.user_service import TenantService
2425
from graphrag.community_reports_extractor import CommunityReportsExtractor
2526
from graphrag.entity_resolution import EntityResolution
2627
from graphrag.graph_extractor import GraphExtractor
@@ -30,6 +31,11 @@
3031

3132

3233
def be_children(obj: dict, keyset:set):
34+
if isinstance(obj, str):
35+
obj = [obj]
36+
if isinstance(obj, list):
37+
for i in obj: keyset.add(i)
38+
return [{"id": i, "children":[]} for i in obj]
3339
arr = []
3440
for k,v in obj.items():
3541
k = re.sub(r"\*+", "", k)
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
6571

6672

6773
def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
68-
llm_bdl = LLMBundle(tenant_id, LLMType.CHAT)
74+
_, tenant = TenantService.get_by_id(tenant_id)
75+
llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
6976
ext = GraphExtractor(llm_bdl)
7077
left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
7178
left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)

graphrag/mind_map_extractor.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
#
16-
16+
import collections
17+
import logging
18+
import re
1719
import logging
1820
import traceback
1921
from concurrent.futures import ThreadPoolExecutor
@@ -65,7 +67,7 @@ def __call__(
6567
try:
6668
exe = ThreadPoolExecutor(max_workers=12)
6769
threads = []
68-
token_count = self._llm.max_length * 0.7
70+
token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
6971
texts = []
7072
res = []
7173
cnt = 0
@@ -122,6 +124,19 @@ def _list_to_kv(self, data):
122124
continue
123125
return data
124126

127+
def _todict(self, layer:collections.OrderedDict):
128+
to_ret = layer
129+
if isinstance(layer, collections.OrderedDict):
130+
to_ret = dict(layer)
131+
132+
try:
133+
for key, value in to_ret.items():
134+
to_ret[key] = self._todict(value)
135+
except AttributeError:
136+
pass
137+
138+
return self._list_to_kv(to_ret)
139+
125140
def _process_document(
126141
self, text: str, prompt_variables: dict[str, str]
127142
) -> str:
@@ -132,6 +147,7 @@ def _process_document(
132147
text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
133148
gen_conf = {"temperature": 0.5}
134149
response = self._llm.chat(text, [], gen_conf)
150+
response = re.sub(r"```[^\n]*", "", response)
135151
print(response)
136-
print("---------------------------------------------------\n", markdown_to_json.dictify(response))
137-
return dict(markdown_to_json.dictify(response))
152+
print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
153+
return self._todict(markdown_to_json.dictify(response))

graphrag/mind_map_prompt.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,28 +14,20 @@
1414
# limitations under the License.
1515
#
1616
MIND_MAP_EXTRACTION_PROMPT = """
17-
- Role: You're a talent text processor.
17+
- Role: You're a talented text processor to summarize a piece of text into a mind map.
1818
19-
- Step of task:
20-
1. Generate a title for user's 'TEXT'。
21-
2. Classify the 'TEXT' into sections as you see fit.
22-
3. If the subject matter is really complex, split them into sub-sections.
19+
- Step of task:
20+
1. Generate a title for the user's 'TEXT'.
21+
2. Classify the 'TEXT' into sections of a mind map.
22+
3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
23+
4. Add a short content summary of the bottom-level section.
24+
25+
- Output requirement:
26+
- Always try to maximize the number of sub-sections.
27+
- In language of 'Text'
28+
- MUST IN FORMAT OF MARKDOWN
2329
24-
- Output requirement:
25-
- In language of
26-
- MUST IN FORMAT OF MARKDOWN
27-
28-
Output:
29-
## <Title>
30-
<Section Name>
31-
<Section Name>
32-
<Subsection Name>
33-
<Subsection Name>
34-
<Section Name>
35-
<Subsection Name>
36-
3730
-TEXT-
3831
{input_text}
3932
40-
Output:
4133
"""

0 commit comments

Comments
 (0)