Skip to content

Commit

Permalink
feat: Implemented CharacterTextSplitter
Browse files Browse the repository at this point in the history
* Fixed LLMChain OutputKey handling. The result of CallAsync always contained only the ["text"] key, ignoring OutputKey, which caused an error when the chain was used with SequentialChain.

* implemented verbosity at LLMChain. Implemented ToString for the Message class.

* ported CharacterTextSplitter
- had to create and use the TextSplitterBase class and the Document class to match the structure of the original Python implementation
- I have not implemented the methods that rely on external sources of information (from_huggingface_tokenizer, from_tiktoken_encoder)
- added comments explaining the logic behind the implementation; they should help newcomers understand what it does (the original Python implementation is hard to follow in places)

* changed LengthFunctionDelegate to Func<string, int>

* Revert "changed LengthFunctionDelegate to Func<string, int>"

This reverts commit 9c340d4.

* changed LengthFunctionDelegate to Func<string, int>

---------

Co-authored-by: Konstantin S <[email protected]>
  • Loading branch information
TesAnti and HavenDV authored Oct 18, 2023
1 parent 738ee71 commit 680c9f4
Show file tree
Hide file tree
Showing 9 changed files with 1,060 additions and 0 deletions.
146 changes: 146 additions & 0 deletions src/libs/LangChain.Core/Base/TextSplitter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
using LangChain.Docstore;

namespace LangChain.Base;

/// <summary>
/// Base functionality for splitting text into chunks.
/// </summary>
/// <remarks>
/// Ported from langchain/text_splitter.py.
/// </remarks>
public abstract class TextSplitter
{
    private readonly int _chunkSize;
    private readonly int _chunkOverlap;
    private readonly Func<string, int> _lengthFunction;

    /// <summary>
    /// Stores the chunking settings used by <see cref="MergeSplits"/>.
    /// </summary>
    /// <param name="chunkSize">Length threshold at which the current chunk is closed, as measured by <paramref name="lengthFunction"/>.</param>
    /// <param name="chunkOverlap">Length of trailing splits that may be carried over from a closed chunk into the next one.</param>
    /// <param name="lengthFunction">Measures a piece of text; when null, the string's <c>Length</c> is used.</param>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="chunkOverlap"/> is greater than <paramref name="chunkSize"/>.
    /// </exception>
    protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
    {
        if (chunkOverlap > chunkSize)
        {
            throw new ArgumentException($"Chunk overlap ({chunkOverlap}) is greater than chunk size ({chunkSize}).");
        }

        _chunkSize = chunkSize;
        _chunkOverlap = chunkOverlap;
        _lengthFunction = lengthFunction ?? (str => str.Length);
    }

    /// <summary>
    /// Splits a single text into chunks. Implemented by concrete splitters.
    /// </summary>
    public abstract List<string> SplitText(string text);

    /// <summary>
    /// Create documents from a list of texts.
    /// </summary>
    /// <param name="texts">Texts to split; each one is passed to <see cref="SplitText"/>.</param>
    /// <param name="metadatas">
    /// Optional metadata, one entry per text. When null, every document gets an empty dictionary.
    /// </param>
    /// <returns>One document per produced chunk, each with its own copy of the source text's metadata.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="texts"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// If the number of texts and metadata(when not null) are not equal, this method will throw an ArgumentException.
    /// </exception>
    public List<Document> CreateDocuments(List<string> texts, List<Dictionary<string, object>>? metadatas = null)
    {
        if (texts == null)
        {
            throw new ArgumentNullException(nameof(texts));
        }

        if (metadatas != null && texts.Count != metadatas.Count)
        {
            throw new ArgumentException("Number of texts and metadata must be equal.");
        }

        var documents = new List<Document>();

        // each text is split into chunks, and each chunk is added to the list of documents
        for (int i = 0; i < texts.Count; i++)
        {
            var metadata = metadatas?[i];

            foreach (var chunk in SplitText(texts[i]))
            {
                // Every document receives its own dictionary. Sharing one instance between
                // documents would let a metadata change on one document leak into the others.
                var ownMetadata = metadata == null
                    ? new Dictionary<string, object>()
                    : new Dictionary<string, object>(metadata, metadata.Comparer);

                documents.Add(new Document(chunk, ownMetadata));
            }
        }

        return documents;
    }

    /// <summary>
    /// Splits the content of each document and returns the resulting chunk documents.
    /// </summary>
    /// <param name="documents">Documents whose <c>PageContent</c> is split and whose <c>Metadata</c> is carried over.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="documents"/> is null.</exception>
    public List<Document> SplitDocuments(List<Document> documents)
    {
        if (documents == null)
        {
            throw new ArgumentNullException(nameof(documents));
        }

        var texts = documents.Select(doc => doc.PageContent).ToList();
        var metadatas = documents.Select(doc => doc.Metadata).ToList();

        return CreateDocuments(texts, metadatas);
    }

    /// <summary>
    /// Joins a list of strings with a separator and returns null if the resulting string is empty
    /// </summary>
    /// <remarks>The joined text is trimmed before the emptiness check.</remarks>
    protected string? JoinDocs(List<string> docs, string separator)
    {
        var text = string.Join(separator, docs).Trim();
        return string.IsNullOrEmpty(text) ? null : text;
    }

    /// <summary>
    /// Merges a list of texts into chunks of size chunk_size with overlap
    /// </summary>
    /// <param name="splits">Pieces of text to merge, in order.</param>
    /// <param name="separator">String placed between the pieces of one chunk when they are joined.</param>
    /// <returns>The merged chunks; chunks that are empty after trimming are omitted.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="splits"/> is null.</exception>
    /// <remarks>
    /// Only the lengths of the splits are counted; the length of <paramref name="separator"/>
    /// is not added to the running total.
    /// </remarks>
    protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
    {
        if (splits == null)
        {
            throw new ArgumentNullException(nameof(splits));
        }

        var docs = new List<string>(); // resulting chunks
        var currentDoc = new List<string>(); // splits of the chunk being built
        int total = 0; // summed length of the splits in currentDoc

        foreach (var split in splits)
        {
            int len = _lengthFunction(split);

            // the next split does not fit into the current chunk
            // todo: Implement a logger and warn when total already exceeds the chunk size
            if (total + len >= _chunkSize && currentDoc.Count > 0)
            {
                // close the current chunk and add it to the result
                var doc = JoinDocs(currentDoc, separator);
                if (doc != null)
                {
                    docs.Add(doc);
                }

                // drop splits from the front until only the overlap remains and the next split fits;
                // the count check keeps the loop from indexing into an empty list
                while (currentDoc.Count > 0 &&
                       (total > _chunkOverlap || (total + len > _chunkSize && total > 0)))
                {
                    total -= _lengthFunction(currentDoc[0]);
                    currentDoc.RemoveAt(0);
                }
            }

            // add the next split to the current chunk and update its total length
            currentDoc.Add(split);
            total += len;
        }

        // add the last chunk
        var lastDoc = JoinDocs(currentDoc, separator);
        if (lastDoc != null)
        {
            docs.Add(lastDoc);
        }

        return docs;
    }

    // todo: Implement from_huggingface_tokenizer
    // todo: Implement from_tiktoken_encoder
}
15 changes: 15 additions & 0 deletions src/libs/LangChain.Core/Chains/LLM/LLMChain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,24 @@ public override async Task<IChainValues> CallAsync(IChainValues values)
}

BasePromptValue promptValue = await Prompt.FormatPromptValue(new InputValues(values.Value));
var chatMessages = promptValue.ToChatMessages();
if (Verbose == true)
{

Console.WriteLine(string.Join("\n\n", chatMessages));
Console.WriteLine("\n".PadLeft(Console.WindowWidth, '>'));
}
var response = await Llm.GenerateAsync(new ChatRequest(promptValue.ToChatMessages(), stop));
if (Verbose == true)
{

Console.WriteLine(string.Join("\n\n", response.Messages.Except(chatMessages)));
Console.WriteLine("\n".PadLeft(Console.WindowWidth, '<'));
}

if(string.IsNullOrEmpty(OutputKey))
return new ChainValues(response.Messages.Last().Content);

return new ChainValues(OutputKey,response.Messages.Last().Content);
}

Expand Down
78 changes: 78 additions & 0 deletions src/libs/LangChain.Core/Docstore/Document.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
namespace LangChain.Docstore;

/// <summary>
/// Class for storing a document: its text content, metadata and lookup state.
/// </summary>
/// <remarks>
/// - no BaseModel implementation from pydantic
/// - ported from langchain/docstore/document.py
/// </remarks>
public class Document
{
    /// <summary>
    /// Creates a document from its content and metadata.
    /// </summary>
    /// <param name="content">Text stored in <see cref="PageContent"/>.</param>
    /// <param name="metadata">Dictionary stored in <see cref="Metadata"/> (kept by reference).</param>
    public Document(string content, Dictionary<string, object> metadata)
    {
        PageContent = content;
        Metadata = metadata;
    }

    /// <summary>Text of the document.</summary>
    public string PageContent { get; set; }

    /// <summary>Zero-based index of the match returned by the most recent <see cref="Lookup"/> call.</summary>
    public int LookupIndex { get; set; }

    /// <summary>
    /// Lower-cased search term of the most recent <see cref="Lookup"/> call.
    /// It is not assigned by the constructor, so it stays unset until the first lookup.
    /// </summary>
    public string LookupStr { get; set; }

    /// <summary>Metadata attached to the document.</summary>
    public Dictionary<string, object> Metadata { get; set; }

    /// <summary>
    /// Paragraphs of the page.
    /// </summary>
    /// <returns><see cref="PageContent"/> split on blank lines ("\n\n"); empty entries are kept.</returns>
    public List<string> Paragraphs()
    {
        return PageContent.Split(new[] { "\n\n" }, StringSplitOptions.None).ToList();
    }

    /// <summary>
    /// Summary of the page (the first paragraph)
    /// </summary>
    public string Summary()
    {
        return Paragraphs()[0];
    }

    /// <summary>
    /// Lookup a term in the page, imitating cmd-F functionality.
    /// </summary>
    /// <param name="searchString">Term to search for; matching is case-insensitive.</param>
    /// <returns>
    /// "(Result i/n) paragraph" for the current match, "No Results" when no paragraph contains
    /// the term, or "No More Results" when repeated calls have moved past the last match.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="searchString"/> is null.</exception>
    public string Lookup(string searchString)
    {
        if (searchString == null)
        {
            throw new ArgumentNullException(nameof(searchString));
        }

        // Invariant lower-casing keeps matching independent of the current culture (CA1304/CA1311).
        var normalized = searchString.ToLowerInvariant();

        if (normalized != LookupStr)
        {
            // a new search term restarts from the first match
            LookupStr = normalized;
            LookupIndex = 0;
        }
        else
        {
            // the same term again advances to the next match
            LookupIndex++;
        }

        // get all the paragraphs that contain the search string
        var lookups = Paragraphs().Where(p => p.ToLowerInvariant().Contains(normalized)).ToList();

        if (lookups.Count == 0)
        {
            return "No Results";
        }

        if (LookupIndex >= lookups.Count)
        {
            return "No More Results";
        }

        string resultPrefix = $"(Result {LookupIndex + 1}/{lookups.Count})";
        return $"{resultPrefix} {lookups[LookupIndex]}";
    }

    /// <summary>
    /// Returns a text representation containing the content, lookup state and metadata entries.
    /// </summary>
    public override string ToString()
    {
        var serializedMetadata = string.Join(", ", Metadata.Select(x => $"{{{x.Key}:{x.Value}}}"));
        return $"(PageContent='{PageContent}', LookupStr='{LookupStr}', Metadata={serializedMetadata}), LookupIndex={LookupIndex}";
    }
}
5 changes: 5 additions & 0 deletions src/libs/LangChain.Core/LangChain.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,9 @@
<ProjectReference Include="..\Providers\LangChain.Providers.Abstractions\LangChain.Providers.Abstractions.csproj" />
</ItemGroup>

<ItemGroup>
<Folder Include="Docstore\" />
<Folder Include="TextSplitters\" />
</ItemGroup>

</Project>
30 changes: 30 additions & 0 deletions src/libs/LangChain.Core/TextSplitters/CharacterTextSplitter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using LangChain.Base;

namespace LangChain.TextSplitters;

/// <summary>
/// Implementation of splitting text that looks at characters
/// </summary>
/// <remarks>
/// The text is cut on a fixed separator string and the pieces are merged back into chunks
/// by <c>MergeSplits</c> of the base class.
/// </remarks>
public class CharacterTextSplitter : TextSplitter
{
    private readonly string? _separator;

    /// <summary>
    /// Creates a splitter that cuts text on <paramref name="separator"/>.
    /// </summary>
    /// <param name="separator">String to split on; when null the text is not cut before merging.</param>
    /// <param name="chunkSize">Forwarded to the base class.</param>
    /// <param name="chunkOverlap">Forwarded to the base class.</param>
    /// <param name="lengthFunction">Forwarded to the base class.</param>
    public CharacterTextSplitter(string? separator = "\n\n", int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
        : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separator = separator;
    }

    /// <summary>
    /// Splits <paramref name="text"/> on the configured separator and merges the pieces into chunks.
    /// </summary>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> is null.</exception>
    public override List<string> SplitText(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        List<string> splits = _separator != null
            ? text.Split(new[] { _separator }, StringSplitOptions.None).ToList()
            : new List<string> { text };

        // MergeSplits takes a non-nullable separator; joining with an empty string
        // gives the same result as joining with null.
        return MergeSplits(splits, _separator ?? string.Empty);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,13 @@ public static Message Add(Message left, Message right)
{
return left + right;
}

/// <summary>
/// Formats the message as "Role: Content", or as "Role(FunctionName):" followed by the
/// content on a new line when a function name is present.
/// </summary>
public override string ToString()
{
    return FunctionName == null
        ? $"{Role}: {Content}"
        : $"{Role}({FunctionName}):\n{Content}";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
<Compile Remove="Resources\**\*.*" />
</ItemGroup>


<ItemGroup>
<PackageReference Include="H.Resources.Generator">
<PrivateAssets>all</PrivateAssets>
Expand All @@ -16,6 +17,7 @@
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\libs\LangChain.Core\LangChain.Core.csproj" />
<ProjectReference Include="..\..\libs\Splitters\LangChain.Splitters.CSharp\LangChain.Splitters.CSharp.csproj" />
</ItemGroup>

Expand Down
Loading

0 comments on commit 680c9f4

Please sign in to comment.