-
-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
* fix for LlmChain OutputKey. The result of CallAsync was always containing only the ["text"] key, ignoring OutputKey. This was causing an error when used with SequentialChain * implemented verbosity at LLMChain. Implemented ToString for the Message class. * ported CharacterTextSplitter - had to create and use TextSplitterBase class and Document class to match the structure of the original python implementation - i have not implemented methods which rely on an external source of information (from_huggingface_tokenizer, from_tiktoken_encoder) - added comments explaining the logic behind the implementation. it should help new people to understand what it does (the original python implementation is kind of hard to understand in some moments) * changed LengthFunctionDelegate to Func<string, int> * Revert "changed LengthFunctionDelegate to Func<string, int>" This reverts commit 9c340d4. * changed LengthFunctionDelegate to Func<string, int> --------- Co-authored-by: Konstantin S <[email protected]>
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
using LangChain.Docstore; | ||
|
||
namespace LangChain.Base; | ||
|
||
/// <summary> | ||
/// Functionality for splitting text. | ||
/// <remarks> | ||
/// - ported from langchain/text_splitter.py | ||
/// | ||
/// </remarks> | ||
/// </summary> | ||
public abstract class TextSplitter
{
    private readonly int _chunkSize;
    private readonly int _chunkOverlap;
    private readonly Func<string, int> _lengthFunction;

    /// <summary>
    /// Creates a text splitter.
    /// </summary>
    /// <param name="chunkSize">Maximum chunk size, measured by <paramref name="lengthFunction"/>.</param>
    /// <param name="chunkOverlap">How much of the end of one chunk is carried over into the next.</param>
    /// <param name="lengthFunction">Measures text length; defaults to the character count.</param>
    /// <exception cref="ArgumentException">If <paramref name="chunkOverlap"/> is greater than <paramref name="chunkSize"/>.</exception>
    protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
    {
        if (chunkOverlap > chunkSize)
        {
            throw new ArgumentException($"Chunk overlap ({chunkOverlap}) is greater than chunk size ({chunkSize}).");
        }

        _chunkSize = chunkSize;
        _chunkOverlap = chunkOverlap;
        _lengthFunction = lengthFunction ?? (str => str.Length);
    }

    /// <summary>
    /// Splits the given text into chunks.
    /// </summary>
    public abstract List<string> SplitText(string text);

    /// <summary>
    /// Create documents from a list of texts.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// If the number of texts and metadata(when not null) are not equal, this method will throw an ArgumentException.
    /// </exception>
    public List<Document> CreateDocuments(List<string> texts, List<Dictionary<string, object>>? metadatas = null)
    {
        var documents = new List<Document>();

        // if no metadata is provided, create a list of empty dictionaries
        metadatas ??= Enumerable.Repeat(new Dictionary<string, object>(), texts.Count).ToList();

        if (texts.Count != metadatas.Count)
        {
            throw new ArgumentException("Number of texts and metadata must be equal.");
        }

        // each text is split into chunks, and each chunk is added to the list of documents
        for (int i = 0; i < texts.Count; i++)
        {
            var text = texts[i];
            var metadata = metadatas[i];

            foreach (var chunk in SplitText(text))
            {
                documents.Add(new Document(chunk, metadata));
            }
        }

        return documents;
    }

    /// <summary>
    /// Re-splits existing documents, preserving each document's metadata on its chunks.
    /// </summary>
    public List<Document> SplitDocuments(List<Document> documents)
    {
        var texts = documents.Select(doc => doc.PageContent).ToList();
        var metadatas = documents.Select(doc => doc.Metadata).ToList();

        return CreateDocuments(texts, metadatas);
    }

    /// <summary>
    /// Joins a list of strings with a separator and returns null if the resulting string is empty.
    /// A null separator is accepted (string.Join treats it as empty) so nullable callers compile cleanly.
    /// </summary>
    protected string? JoinDocs(List<string> docs, string? separator)
    {
        var text = string.Join(separator, docs).Trim();
        return string.IsNullOrEmpty(text) ? null : text;
    }

    /// <summary>
    /// Merges a list of texts into chunks of size chunk_size with overlap.
    /// </summary>
    protected List<string> MergeSplits(IEnumerable<string> splits, string? separator)
    {
        var docs = new List<string>(); // result of chunks
        var currentDoc = new List<string>(); // documents of current chunk
        int total = 0;

        foreach (var split in splits)
        {
            int len = _lengthFunction(split);

            // if we can't fit the next split into current chunk
            if (total + len >= _chunkSize)
            {
                // if the chunk already got too big
                if (total > _chunkSize)
                {
                    // todo: Implement a logger
                    // todo: Log a warning about a split that is larger than the chunk size
                }

                if (currentDoc.Count > 0)
                {
                    // join all the docs in current chunk and add to the result
                    var doc = JoinDocs(currentDoc, separator);
                    if (doc != null)
                    {
                        docs.Add(doc);
                    }

                    // start erasing docs from the beginning of the chunk until we can fit the next split
                    while (total > _chunkOverlap || (total + len > _chunkSize && total > 0))
                    {
                        total -= _lengthFunction(currentDoc[0]);
                        currentDoc.RemoveAt(0);
                    }
                }
            }

            // add the next split to the current chunk
            currentDoc.Add(split);
            total += len; // recalculate the total length of the current chunk
        }

        // add the last chunk
        var lastDoc = JoinDocs(currentDoc, separator);
        if (lastDoc != null)
        {
            docs.Add(lastDoc);
        }

        return docs;
    }

    // todo: Implement from_huggingface_tokenizer
    // todo: Implement from_tiktoken_encoder
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
namespace LangChain.Docstore; | ||
|
||
/// <summary> | ||
/// Class for storing document | ||
/// <remarks> | ||
/// - no BaseModel implementation from pydantic | ||
/// - ported from langchain/docstore/document.py | ||
/// </remarks> | ||
/// </summary> | ||
/// <summary>
/// Class for storing a document: a piece of text plus arbitrary metadata.
/// <remarks>
/// - no BaseModel implementation from pydantic
/// - ported from langchain/docstore/document.py
/// </remarks>
/// </summary>
public class Document
{
    public Document(string content, Dictionary<string, object> metadata)
    {
        PageContent = content;
        Metadata = metadata;
    }

    public string PageContent { get; set; }

    // Cursor into the results of the last Lookup() call.
    public int LookupIndex { get; set; }

    // Last searched term (lower-cased). Initialized to empty so the
    // non-nullable property is never null (fixes CS8618 warning).
    public string LookupStr { get; set; } = string.Empty;

    public Dictionary<string, object> Metadata { get; set; }

    /// <summary>
    /// Paragraphs of the page (split on blank lines).
    /// </summary>
    public List<string> Paragraphs()
    {
        return PageContent.Split(new[] { "\n\n" }, StringSplitOptions.None).ToList();
    }

    /// <summary>
    /// Summary of the page (the first paragraph).
    /// </summary>
    public string Summary()
    {
        return Paragraphs()[0];
    }

    /// <summary>
    /// Lookup a term in the page, imitating cmd-F functionality.
    /// Repeated calls with the same term step through successive matching paragraphs.
    /// </summary>
    public string Lookup(string searchString)
    {
        // if there is a new search string, reset the index
        // (ToLowerInvariant: culture-independent, avoids CA1304/CA1311 surprises)
        if (searchString.ToLowerInvariant() != LookupStr)
        {
            LookupStr = searchString.ToLowerInvariant();
            LookupIndex = 0;
        }
        else
        {
            LookupIndex++;
        }

        // get all the paragraphs that contain the search string
        var lookups = Paragraphs().Where(p => p.ToLowerInvariant().Contains(LookupStr)).ToList();

        if (lookups.Count == 0)
        {
            return "No Results";
        }
        else if (LookupIndex >= lookups.Count)
        {
            return "No More Results";
        }
        else
        {
            string resultPrefix = $"(Result {LookupIndex + 1}/{lookups.Count})";
            return $"{resultPrefix} {lookups[LookupIndex]}";
        }
    }

    public override string ToString()
    {
        var serializedMetadata = string.Join(", ", Metadata.Select(x => $"{{{x.Key}:{x.Value}}}"));
        return $"(PageContent='{PageContent}', LookupStr='{LookupStr}', Metadata={serializedMetadata}), LookupIndex={LookupIndex}";
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
using LangChain.Base; | ||
|
||
namespace LangChain.TextSplitters; | ||
|
||
/// <summary> | ||
/// Implementation of splitting text that looks at characters | ||
/// </summary> | ||
/// <summary>
/// Implementation of splitting text that looks at characters.
/// </summary>
public class CharacterTextSplitter : TextSplitter
{
    private readonly string? _separator;

    /// <summary>
    /// Creates a splitter that breaks text on <paramref name="separator"/>
    /// (pass null to treat the whole text as a single split).
    /// </summary>
    public CharacterTextSplitter(string? separator = "\n\n", int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
        : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separator = separator;
    }

    /// <summary>
    /// Splits the text on the separator, then merges the pieces back into
    /// chunks of the configured size/overlap.
    /// </summary>
    public override List<string> SplitText(string text)
    {
        List<string> splits;
        if (_separator != null)
        {
            splits = text.Split(new[] { _separator }, StringSplitOptions.None).ToList();
        }
        else
        {
            splits = new List<string> { text };
        }

        // string.Join treats a null separator as empty, so coalescing to "" is
        // behavior-preserving while satisfying MergeSplits' non-nullable
        // parameter (fixes CS8604 nullable-argument warning).
        return MergeSplits(splits, _separator ?? string.Empty);
    }
}