From bc75bdbb4822ce70943c6b4e82a6391cd815859a Mon Sep 17 00:00:00 2001 From: abhik Date: Wed, 21 Feb 2024 19:44:22 +0530 Subject: [PATCH 1/3] use RecursiveCharacterTextSplitter to correctly split docs into chunks of specified size --- notebooks/en/rag_zephyr_langchain.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/en/rag_zephyr_langchain.ipynb b/notebooks/en/rag_zephyr_langchain.ipynb index 992d5820..36ef4935 100644 --- a/notebooks/en/rag_zephyr_langchain.ipynb +++ b/notebooks/en/rag_zephyr_langchain.ipynb @@ -155,9 +155,9 @@ }, "outputs": [], "source": [ - "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", - "splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=30)\n", + "splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)\n", "\n", "chunked_docs = splitter.split_documents(docs)" ] From 5a873294f47e5c82fe5742b2f57e17607300e8d2 Mon Sep 17 00:00:00 2001 From: abhik Date: Wed, 21 Feb 2024 23:44:51 +0530 Subject: [PATCH 2/3] update paragraph to explain use of RecursiveCharacterTextSplitter --- notebooks/en/rag_zephyr_langchain.ipynb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/notebooks/en/rag_zephyr_langchain.ipynb b/notebooks/en/rag_zephyr_langchain.ipynb index 36ef4935..62bd9e8e 100644 --- a/notebooks/en/rag_zephyr_langchain.ipynb +++ b/notebooks/en/rag_zephyr_langchain.ipynb @@ -140,11 +140,7 @@ "source": [ "The content of individual GitHub issues may be longer than what an embedding model can take as input. If we want to embed all of the available content, we need to chunk the documents into appropriately sized pieces.\n", "\n", - "The most common and straightforward approach to chunking is to define a fixed size of chunks and whether there should be any overlap between them. 
Keeping some overlap between chunks allows us to preserve some semantic context between the chunks.\n", - "\n", - "Other approaches are typically more involved and take into account the documents' structure and context. For example, one may want to split a document based on sentences or paragraphs, or create chunks based on the\n", - "\n", - "The fixed-size chunking, however, works well for most common cases, so that is what we'll do here." + "The most common and straightforward approach to chunking is to define a fixed size of chunks and whether there should be any overlap between them. Keeping some overlap between chunks allows us to preserve some semantic context between the chunks. The recommended splitter for generic text is the [RecursiveCharacterTextSplitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter) , and that's what we'll use here. " ] }, { From df59e3f9d13803687de087e398a132e9f0f9fa35 Mon Sep 17 00:00:00 2001 From: abhik Date: Wed, 21 Feb 2024 23:47:24 +0530 Subject: [PATCH 3/3] remove unnecessary space --- notebooks/en/rag_zephyr_langchain.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/en/rag_zephyr_langchain.ipynb b/notebooks/en/rag_zephyr_langchain.ipynb index 62bd9e8e..55738b98 100644 --- a/notebooks/en/rag_zephyr_langchain.ipynb +++ b/notebooks/en/rag_zephyr_langchain.ipynb @@ -140,7 +140,7 @@ "source": [ "The content of individual GitHub issues may be longer than what an embedding model can take as input. If we want to embed all of the available content, we need to chunk the documents into appropriately sized pieces.\n", "\n", - "The most common and straightforward approach to chunking is to define a fixed size of chunks and whether there should be any overlap between them. Keeping some overlap between chunks allows us to preserve some semantic context between the chunks. 
The recommended splitter for generic text is the [RecursiveCharacterTextSplitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter) , and that's what we'll use here. " + "The most common and straightforward approach to chunking is to define a fixed size of chunks and whether there should be any overlap between them. Keeping some overlap between chunks allows us to preserve some semantic context between the chunks. The recommended splitter for generic text is the [RecursiveCharacterTextSplitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter), and that's what we'll use here." ] }, {