feat: Implemented CharacterTextSplitter

* Fix for LLMChain OutputKey: the result of CallAsync always contained only the ["text"] key, ignoring OutputKey. This caused an error when the chain was used with SequentialChain. * Implemented verbosity in LLMChain. Implemented ToString for the Message class. * Ported CharacterTextSplitter - had to create and use the TextSplitterBase class and the Document class to match the structure of the original Python implementation - I have not implemented the methods that rely on an external source of information (from_huggingface_tokenizer, from_tiktoken_encoder) - added comments that explain the logic behind the implementation; they should help newcomers understand what it does (the original Python implementation is hard to follow in places) * Changed LengthFunctionDelegate to Func<string, int> * Revert "changed LengthFunctionDelegate to Func<string, int>" This reverts commit 9c340d4. * Changed LengthFunctionDelegate to Func<string, int> --------- Co-authored-by: Konstantin S <[email protected]>
tryAGI · Oct 18, 2023 · 680c9f4 · 680c9f4
1 parent 738ee71
commit 680c9f4
Show file tree

Hide file tree

Showing 9 changed files with 1,060 additions and 0 deletions.
diff --git a/src/libs/LangChain.Core/Base/TextSplitter.cs b/src/libs/LangChain.Core/Base/TextSplitter.cs
@@ -0,0 +1,146 @@
+using LangChain.Docstore;
+
+namespace LangChain.Base;
+
+/// <summary>
+/// Base functionality for splitting text into chunks that fit a size budget.
+/// </summary>
+/// <remarks>
+/// Ported from langchain/text_splitter.py.
+/// </remarks>
+public abstract class TextSplitter
+{
+    private readonly int _chunkSize;
+    private readonly int _chunkOverlap;
+    private readonly Func<string, int> _lengthFunction;
+
+    /// <summary>
+    /// Initializes the splitter.
+    /// </summary>
+    /// <param name="chunkSize">Maximum length of a chunk, measured with <paramref name="lengthFunction"/>.</param>
+    /// <param name="chunkOverlap">Maximum length that may be carried over from the end of one chunk into the next.</param>
+    /// <param name="lengthFunction">Function used to measure a string; defaults to its character count.</param>
+    /// <exception cref="ArgumentOutOfRangeException">
+    /// <paramref name="chunkSize"/> is not positive or <paramref name="chunkOverlap"/> is negative.
+    /// </exception>
+    /// <exception cref="ArgumentException">
+    /// <paramref name="chunkOverlap"/> is greater than <paramref name="chunkSize"/>.
+    /// </exception>
+    protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string,int>? lengthFunction = null)
+    {
+        if (chunkSize <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be greater than zero.");
+        }
+
+        // a negative overlap would let the trimming loop in MergeSplits run past an empty chunk
+        if (chunkOverlap < 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(chunkOverlap), chunkOverlap, "Chunk overlap must not be negative.");
+        }
+
+        if (chunkOverlap > chunkSize)
+        {
+            throw new ArgumentException($"Chunk overlap ({chunkOverlap}) is greater than chunk size ({chunkSize}).");
+        }
+
+        _chunkSize = chunkSize;
+        _chunkOverlap = chunkOverlap;
+        _lengthFunction = lengthFunction ?? (str => str.Length);
+    }
+
+    /// <summary>
+    /// Splits a single text into chunks.
+    /// </summary>
+    /// <param name="text">The text to split.</param>
+    /// <returns>The chunks, in their original order.</returns>
+    public abstract List<string> SplitText(string text);
+
+    /// <summary>
+    /// Create documents from a list of texts.
+    /// </summary>
+    /// <param name="texts">Texts to split; every chunk of every text becomes one document.</param>
+    /// <param name="metadatas">
+    /// Optional metadata, one entry per text. Each created document receives its own copy of the
+    /// entry that belongs to its source text; when omitted, documents get empty metadata.
+    /// </param>
+    /// <returns>One document per chunk.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="texts"/> is null.</exception>
+    /// <exception cref="ArgumentException">
+    /// If the number of texts and metadata(when not null) are not equal, this method will throw an ArgumentException.
+    /// </exception>
+    public List<Document> CreateDocuments(List<string> texts, List<Dictionary<string, object>>? metadatas = null)
+    {
+        if (texts is null)
+        {
+            throw new ArgumentNullException(nameof(texts));
+        }
+
+        if (metadatas != null && texts.Count != metadatas.Count)
+        {
+            throw new ArgumentException("Number of texts and metadata must be equal.");
+        }
+
+        var documents = new List<Document>();
+
+        // each text is split into chunks, and each chunk is added to the list of documents
+        for (int i = 0; i < texts.Count; i++)
+        {
+            var sourceMetadata = metadatas?[i];
+
+            foreach (var chunk in SplitText(texts[i]))
+            {
+                // every document gets a private copy, so changing the metadata of one
+                // document never leaks into its siblings (or into the caller's dictionary)
+                var metadata = sourceMetadata is null
+                    ? new Dictionary<string, object>()
+                    : new Dictionary<string, object>(sourceMetadata, sourceMetadata.Comparer);
+
+                documents.Add(new Document(chunk, metadata));
+            }
+        }
+
+        return documents;
+    }
+
+    /// <summary>
+    /// Splits the content of every document and returns the resulting chunks as new documents
+    /// that carry the metadata of the document they came from.
+    /// </summary>
+    /// <param name="documents">Documents to split.</param>
+    /// <returns>One document per chunk.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
+    public List<Document> SplitDocuments(List<Document> documents)
+    {
+        if (documents is null)
+        {
+            throw new ArgumentNullException(nameof(documents));
+        }
+
+        var texts = documents.Select(doc => doc.PageContent).ToList();
+        var metadatas = documents.Select(doc => doc.Metadata).ToList();
+
+        return CreateDocuments(texts, metadatas);
+    }
+
+    /// <summary>
+    /// Joins a list of strings with a separator and returns null if the resulting string is empty
+    /// </summary>
+    /// <param name="docs">Pieces to join.</param>
+    /// <param name="separator">Text placed between two pieces.</param>
+    /// <returns>The joined and trimmed text, or null when nothing is left after trimming.</returns>
+    protected string? JoinDocs(List<string> docs, string separator)
+    {
+        var text = string.Join(separator, docs).Trim();
+        return string.IsNullOrEmpty(text) ? null : text;
+    }
+
+    /// <summary>
+    /// Merges a list of texts into chunks of size chunk_size with overlap
+    /// </summary>
+    /// <param name="splits">Small pieces of text, in order.</param>
+    /// <param name="separator">Text used to join the pieces of one chunk; its length counts towards the chunk size.</param>
+    /// <returns>The merged chunks. A single piece that is longer than the chunk size is kept as its own chunk.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="splits"/> is null.</exception>
+    protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
+    {
+        if (splits is null)
+        {
+            throw new ArgumentNullException(nameof(splits));
+        }
+
+        // a missing separator adds nothing to the length of the joined chunk
+        int separatorLength = string.IsNullOrEmpty(separator) ? 0 : _lengthFunction(separator);
+
+        var docs = new List<string>(); // result of chunks
+        var currentDoc = new List<string>(); // documents of current chunk
+        int total = 0; // length of the current chunk once joined, separators included
+
+        foreach (var split in splits)
+        {
+            int len = _lengthFunction(split);
+
+            // if we can't fit the next split (plus the separator in front of it) into current chunk
+            if (total + len + (currentDoc.Count > 0 ? separatorLength : 0) > _chunkSize)
+            {
+                // todo: Implement a logger
+                // todo: Log a warning when total > _chunkSize (a split that is larger than the chunk size)
+
+                if (currentDoc.Count > 0)
+                {
+                    // join all the docs in current chunk and add to the result
+                    var doc = JoinDocs(currentDoc, separator);
+                    if (doc != null)
+                    {
+                        docs.Add(doc);
+                    }
+
+                    // erase docs from the beginning of the chunk until what is left fits into the
+                    // overlap and leaves enough room for the next split
+                    while (currentDoc.Count > 0 &&
+                           (total > _chunkOverlap ||
+                            (total + len + separatorLength > _chunkSize && total > 0)))
+                    {
+                        total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLength : 0);
+                        currentDoc.RemoveAt(0);
+                    }
+                }
+            }
+
+            // add the next split to the current chunk
+            currentDoc.Add(split);
+            total += len + (currentDoc.Count > 1 ? separatorLength : 0); // recalculate the total length of the current chunk
+        }
+
+        // add the last chunk
+        var lastDoc = JoinDocs(currentDoc, separator);
+        if (lastDoc != null)
+        {
+            docs.Add(lastDoc);
+        }
+
+        return docs;
+    }
+
+    // todo: Implement from_huggingface_tokenizer
+    // todo: Implement from_tiktoken_encoder
+}
diff --git a/src/libs/LangChain.Core/Chains/LLM/LLMChain.cs b/src/libs/LangChain.Core/Chains/LLM/LLMChain.cs
@@ -57,9 +57,24 @@ public override async Task<IChainValues> CallAsync(IChainValues values)
         }
 
         BasePromptValue promptValue = await Prompt.FormatPromptValue(new InputValues(values.Value));
+        var chatMessages = promptValue.ToChatMessages();
+        if (Verbose == true)
+        {
+
+            Console.WriteLine(string.Join("\n\n", chatMessages));
+            Console.WriteLine("\n".PadLeft(Console.WindowWidth, '>'));
+        }
         var response = await Llm.GenerateAsync(new ChatRequest(promptValue.ToChatMessages(), stop));
+        if (Verbose == true)
+        {
+
+            Console.WriteLine(string.Join("\n\n", response.Messages.Except(chatMessages)));
+            Console.WriteLine("\n".PadLeft(Console.WindowWidth, '<'));
+        }
+
         if(string.IsNullOrEmpty(OutputKey))
             return new ChainValues(response.Messages.Last().Content);
+
         return new ChainValues(OutputKey,response.Messages.Last().Content);
     }
 

diff --git a/src/libs/LangChain.Core/Docstore/Document.cs b/src/libs/LangChain.Core/Docstore/Document.cs
@@ -0,0 +1,78 @@
+namespace LangChain.Docstore;
+
+/// <summary>
+/// Class for storing a document: its text, its metadata and the state of the last lookup.
+/// </summary>
+/// <remarks>
+/// - no BaseModel implementation from pydantic
+/// - ported from langchain/docstore/document.py
+/// </remarks>
+public class Document
+{
+    /// <summary>
+    /// Creates a document.
+    /// </summary>
+    /// <param name="content">Text of the document; null is stored as an empty string.</param>
+    /// <param name="metadata">Metadata of the document; null is stored as an empty dictionary.</param>
+    public Document(string content, Dictionary<string, object> metadata)
+    {
+        // normalizing null here keeps Paragraphs() and ToString() from throwing later
+        PageContent = content ?? string.Empty;
+        Metadata = metadata ?? new Dictionary<string, object>();
+    }
+
+    /// <summary>Text of the document.</summary>
+    public string PageContent { get; set; }
+
+    /// <summary>Zero-based index of the match returned by the last call to <see cref="Lookup"/>.</summary>
+    public int LookupIndex { get; set; }
+
+    /// <summary>Lower-cased search string of the last call to <see cref="Lookup"/>.</summary>
+    public string LookupStr { get; set; }
+
+    /// <summary>Arbitrary metadata attached to the document.</summary>
+    public Dictionary<string, object> Metadata { get; set; }
+
+    /// <summary>
+    /// Paragraphs of the page.
+    /// </summary>
+    /// <returns>The page content split on blank lines ("\n\n"); always contains at least one element.</returns>
+    public List<string> Paragraphs()
+    {
+        return PageContent.Split(new []{"\n\n"},StringSplitOptions.None).ToList();
+    }
+
+    /// <summary>
+    /// Summary of the page (the first paragraph)
+    /// </summary>
+    public string Summary()
+    {
+        return Paragraphs()[0];
+    }
+
+    /// <summary>
+    /// Lookup a term in the page, imitating cmd-F functionality.
+    /// </summary>
+    /// <param name="searchString">Term to look for; matching ignores case.</param>
+    /// <returns>
+    /// The next paragraph that contains the term, prefixed with "(Result i/n)";
+    /// "No Results" when no paragraph matches; "No More Results" when every match was already returned.
+    /// Repeating the same term advances to the next match, a different term starts over.
+    /// </returns>
+    /// <exception cref="ArgumentNullException"><paramref name="searchString"/> is null.</exception>
+    public string Lookup(string searchString)
+    {
+        if (searchString is null)
+        {
+            throw new ArgumentNullException(nameof(searchString));
+        }
+
+        // invariant casing: the match must not depend on the culture of the current thread
+        var needle = searchString.ToLowerInvariant();
+
+        // if there is a new search string, reset the index
+        if (!string.Equals(needle, LookupStr, StringComparison.Ordinal))
+        {
+            LookupStr = needle;
+            LookupIndex = 0;
+        }
+        else
+        {
+            LookupIndex++;
+        }
+
+        // get all the paragraphs that contain the search string
+        var lookups = Paragraphs().Where(p => p.ToLowerInvariant().Contains(needle)).ToList();
+
+        if (lookups.Count == 0)
+        {
+            return "No Results";
+        }
+
+        if (LookupIndex >= lookups.Count)
+        {
+            return "No More Results";
+        }
+
+        string resultPrefix = $"(Result {LookupIndex + 1}/{lookups.Count})";
+        return $"{resultPrefix} {lookups[LookupIndex]}";
+    }
+
+    /// <summary>
+    /// Returns a readable representation of the document with all of its fields.
+    /// </summary>
+    public override string ToString()
+    {
+        var serializedMetadata = string.Join(", ", Metadata.Select(x => $"{{{x.Key}:{x.Value}}}"));
+
+        // the closing parenthesis goes after the last field so that all fields are enclosed
+        return $"(PageContent='{PageContent}', LookupStr='{LookupStr}', Metadata={serializedMetadata}, LookupIndex={LookupIndex})";
+    }
+
+}
diff --git a/src/libs/LangChain.Core/LangChain.Core.csproj b/src/libs/LangChain.Core/LangChain.Core.csproj
@@ -35,4 +35,9 @@
     <ProjectReference Include="..\Providers\LangChain.Providers.Abstractions\LangChain.Providers.Abstractions.csproj" />
   </ItemGroup>
 
+  <ItemGroup>
+    <Folder Include="Docstore\" />
+    <Folder Include="TextSplitters\" />
+  </ItemGroup>
+
 </Project>
diff --git a/src/libs/LangChain.Core/TextSplitters/CharacterTextSplitter.cs b/src/libs/LangChain.Core/TextSplitters/CharacterTextSplitter.cs
@@ -0,0 +1,30 @@
+using LangChain.Base;
+
+namespace LangChain.TextSplitters;
+
+/// <summary>
+/// Implementation of splitting text that looks at characters:
+/// the text is cut at every occurrence of a separator and the pieces are merged back into chunks.
+/// </summary>
+public class CharacterTextSplitter:TextSplitter
+{
+    private readonly string? _separator;
+
+    /// <summary>
+    /// Creates the splitter.
+    /// </summary>
+    /// <param name="separator">
+    /// Text at which the input is cut. When null or empty, the input is cut into single text elements
+    /// (as the Python original does with <c>list(text)</c>).
+    /// </param>
+    /// <param name="chunkSize">Passed to the base class constructor.</param>
+    /// <param name="chunkOverlap">Passed to the base class constructor.</param>
+    /// <param name="lengthFunction">Passed to the base class constructor.</param>
+    public CharacterTextSplitter(string? separator = "\n\n", int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)
+    {
+        _separator = separator;
+    }
+
+    /// <summary>
+    /// Cuts the text at the separator and merges the pieces into chunks.
+    /// </summary>
+    /// <param name="text">The text to split.</param>
+    /// <returns>The chunks produced by merging the pieces.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
+    public override List<string> SplitText(string text)
+    {
+        if (text is null)
+        {
+            throw new ArgumentNullException(nameof(text));
+        }
+
+        var separator = _separator ?? string.Empty;
+        List<string> splits;
+
+        if (separator.Length > 0)
+        {
+            splits = text.Split(new[] { separator }, StringSplitOptions.None).ToList();
+        }
+        else
+        {
+            // without a separator the whole text would stay one oversized piece that can never be
+            // chunked, so fall back to single text elements (surrogate pairs are kept together)
+            splits = new List<string>();
+            var elements = System.Globalization.StringInfo.GetTextElementEnumerator(text);
+            while (elements.MoveNext())
+            {
+                splits.Add(elements.GetTextElement());
+            }
+        }
+
+        return MergeSplits(splits, separator);
+    }
+}
diff --git a/src/libs/Providers/LangChain.Providers.Abstractions/Models/Message.cs b/src/libs/Providers/LangChain.Providers.Abstractions/Models/Message.cs
@@ -40,4 +40,13 @@ public static Message Add(Message left, Message right)
     {
         return left + right;
     }
+
+    /// <summary>
+    /// Renders the message as text: the role, the function name in parentheses when one is set,
+    /// and then the content.
+    /// </summary>
+    public override string ToString()
+    {
+        return FunctionName != null
+            ? $"{Role}({FunctionName}):\n{Content}"
+            : $"{Role}: {Content}";
+    }
 }
diff --git a/src/tests/LangChain.Splitters.CSharp.UnitTests/LangChain.Splitters.CSharp.UnitTests.csproj b/src/tests/LangChain.Splitters.CSharp.UnitTests/LangChain.Splitters.CSharp.UnitTests.csproj
@@ -8,6 +8,7 @@
         <Compile Remove="Resources\**\*.*" />
     </ItemGroup>
 
+
     <ItemGroup>
         <PackageReference Include="H.Resources.Generator">
             <PrivateAssets>all</PrivateAssets>
@@ -16,6 +17,7 @@
     </ItemGroup>
 
     <ItemGroup>
+        <ProjectReference Include="..\..\libs\LangChain.Core\LangChain.Core.csproj" />
         <ProjectReference Include="..\..\libs\Splitters\LangChain.Splitters.CSharp\LangChain.Splitters.CSharp.csproj" />
     </ItemGroup>