forked from microsoft/kernel-memory
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Program.cs
159 lines (137 loc) · 6.32 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
// Copyright (c) Microsoft. All rights reserved.
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.DocumentStorage.DevTools;
using Microsoft.KernelMemory.FileSystem.DevTools;
using Microsoft.KernelMemory.MemoryStorage.DevTools;
/// <summary>
/// This example shows how to retrieve N memory records before and after a relevant memory record.
///
/// Suppose you upload a book and, during the import, KM splits the text into 10,000 partitions of 100 tokens, generating 10,000 memory records.
/// When searching memory by similarity, the system returns a list of relevant memories, containing snippets of text ~100 tokens long.
///
/// Before sending text snippets to an LLM along with a question (RAG), you might want to include extra information, such as the text PRECEDING and FOLLOWING each text snippet, e.g. 100 extra tokens on both sides:
///
/// ----------
/// partition N - 1, memory record
/// text snippet
/// 100 tokens
/// ----------
/// partition N, RELEVANT memory record
/// text snippet
/// 100 tokens
/// ----------
/// partition N + 1, memory record
/// text snippet
/// 100 tokens
/// ---------
///
/// The code below shows how to fetch records before and after each RELEVANT memory record, leveraging the Partition Number property.
///
/// Note: when importing documents, you can set `OverlappingTokens` so that each partition contains a part of the previous and the next partitions.
/// This is another approach to always include a little more context, however this approach is limited by the max number of tokens an
/// embedding generator can work with, and in a way affects the semantics of each text snippet.
/// Also, when using the example below, you should consider setting OverlappingTokens to zero, to avoid text repetitions.
/// </summary>
public static class Program
{
    /// <summary>
    /// Imports "story.docx", searches it by similarity, and for every relevant partition
    /// prints that partition together with the partitions immediately before and after it.
    /// </summary>
    /// <returns>A task that completes when all results have been written to the console.</returns>
    public static async Task Main()
    {
        // Partition input text in chunks of 100 tokens
        const int PartitionSize = 100;

        // Search settings
        const string Query = "astrobiology";
        const float MinRelevance = 0.7f;
        const int Limit = 2;

        // Load OpenAI settings and API key
        var openAIConfig = new OpenAIConfig();
        new ConfigurationBuilder()
            .AddJsonFile("appsettings.json")
            .AddJsonFile("appsettings.Development.json", optional: true)
            .Build()
            .BindSection("KernelMemory:Services:OpenAI", openAIConfig);

        // Customize memory records size (in tokens).
        // Overlapping is disabled so that adjacent partitions don't repeat each other's text.
        var textPartitioningOptions = new TextPartitioningOptions
        {
            MaxTokensPerParagraph = PartitionSize,
            MaxTokensPerLine = PartitionSize,
            OverlappingTokens = 0,
        };

        // Prepare memory instance, keeping both files and vectors on disk
        var memory = new KernelMemoryBuilder()
            .WithOpenAI(openAIConfig)
            .WithCustomTextPartitioningOptions(textPartitioningOptions)
            .WithSimpleFileStorage(new SimpleFileStorageConfig { StorageType = FileSystemTypes.Disk })
            .WithSimpleVectorDb(new SimpleVectorDbConfig { StorageType = FileSystemTypes.Disk })
            .Build();

        // Load text into memory
        Console.WriteLine("Importing memories...");
        await memory.ImportDocumentAsync(filePath: "story.docx", documentId: "example207");

        // Search
        Console.WriteLine("Searching memories...");
        SearchResult relevant = await memory.SearchAsync(query: Query, minRelevance: MinRelevance, limit: Limit);
        Console.WriteLine($"Relevant documents: {relevant.Results.Count}");

        foreach (Citation result in relevant.Results)
        {
            // Show which document matched and which of its partitions are relevant
            Console.WriteLine($"Document ID: {result.DocumentId}");
            Console.WriteLine($"Relevant partitions: {result.Partitions.Count}");
            foreach (Citation.Partition partition in result.Partitions)
            {
                Console.WriteLine($" * Partition {partition.PartitionNumber}, relevance: {partition.Relevance}");
            }

            Console.WriteLine("--------------------------");

            // For each relevant partition fetch the partition before and one after
            foreach (Citation.Partition partition in result.Partitions)
            {
                // Collect partitions in a sorted collection, keyed by partition number,
                // so they are printed in reading order
                var partitions = new SortedDictionary<int, Citation.Partition> { [partition.PartitionNumber] = partition };

                // Filters to fetch adjacent partitions: one filter per neighbour (N - 1 and N + 1)
                var filters = new List<MemoryFilter>
                {
                    MemoryFilters.ByDocument(result.DocumentId).ByTag(Constants.ReservedFilePartitionNumberTag, $"{partition.PartitionNumber - 1}"),
                    MemoryFilters.ByDocument(result.DocumentId).ByTag(Constants.ReservedFilePartitionNumberTag, $"{partition.PartitionNumber + 1}")
                };

                // Fetch adjacent partitions and add them to the sorted collection.
                // The lookup may return no citation at all (e.g. the document consists of a
                // single partition), so don't assume a first result exists: in that case only
                // the relevant partition itself is printed.
                SearchResult adjacentList = await memory.SearchAsync("", filters: filters, limit: 2);
                var adjacentCitation = adjacentList.Results.FirstOrDefault();
                if (adjacentCitation is not null)
                {
                    foreach (Citation.Partition adjacent in adjacentCitation.Partitions)
                    {
                        partitions[adjacent.PartitionNumber] = adjacent;
                    }
                }

                // Print partitions in order
                foreach (var p in partitions)
                {
                    Console.WriteLine($"# Partition {p.Value.PartitionNumber}");
                    Console.WriteLine(p.Value.Text);
                    Console.WriteLine();
                }

                Console.WriteLine("--------------------------");
            }

            Console.WriteLine();
        }
    }
}
/* Result:
Importing memories...
Searching memories...
Relevant documents: 1
Document ID: example207
Relevant partitions: 2
* Partition 27, relevance: 0.8557962
* Partition 13, relevance: 0.85513425
--------------------------
# Partition 26
Dr. Mei Lin, a renowned ...
# Partition 27
As scientific interest in ...
# Partition 28
Meanwhile, back on Earth, the ...
--------------------------
# Partition 12
Appearing as a glowing, translucent ...
# Partition 13
Gerald Marshall, the Chief ...
# Partition 14
While further studies are ...
--------------------------
*/