-
Notifications
You must be signed in to change notification settings - Fork 2
/
notion_rag.py
62 lines (48 loc) · 2.35 KB
/
notion_rag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
This script demonstrates how to preprocess and index documents using the LangChain library.
It uses a Notion database as the source of documents, splits the documents into smaller
chunks, preprocesses the metadata, and then builds a vector index using the Chroma vector store.
Summary:
1. The script loads documents from a Notion database using the NotionDBLoader class.
2. It splits the documents into smaller chunks using the RecursiveCharacterTextSplitter class.
3. The metadata of each split is preprocessed to handle any null values or complex data types.
4. The split documents are then indexed and stored in the Chroma vector store using the OpenAIEmbeddings for encoding the text.
5. The index is built and persisted to the local ./chroma directory (the persist_directory passed to Chroma).
By following these steps, the script enables efficient similarity search and retrieval of documents based on their textual content.
"""
import dotenv
import os
# Populate the process environment (presumably from a local .env file) at
# import time, before import_pipeline() reads NOTION_API_KEY and
# NOTION_DATABASE_ID from os.environ.
dotenv.load_dotenv()
def preprocess_metadata(metadata):
    """Return a copy of *metadata* with values flattened to simple scalars.

    Conversions applied per value:

    * ``None``  -> ``''``
    * ``list``  -> its items joined with ``', '``; items are converted with
      ``str()`` and ``None`` items are skipped, so lists holding numbers or
      missing entries no longer raise ``TypeError``
    * ``dict``  -> its JSON encoding (``json.dumps``)
    * anything else is passed through unchanged

    The input mapping is not modified; a new dict is returned.
    """
    # Local import keeps this helper self-contained within the block.
    import json

    cleaned = {}
    for key, value in metadata.items():
        if value is None:
            cleaned[key] = ''
        elif isinstance(value, list):
            cleaned[key] = ', '.join(
                str(item) for item in value if item is not None
            )
        elif isinstance(value, dict):
            cleaned[key] = json.dumps(value)
        else:
            cleaned[key] = value
    return cleaned


def import_pipeline():
    """Load a Notion database, split it into chunks, and index it in Chroma.

    Steps:
        1. Load every document from the Notion database identified by the
           ``NOTION_API_KEY`` and ``NOTION_DATABASE_ID`` environment
           variables.
        2. Split the documents with ``RecursiveCharacterTextSplitter``
           (``chunk_size=500``, ``chunk_overlap=0``).
        3. Normalize each chunk's metadata with ``preprocess_metadata``.
        4. Embed the chunks with ``OpenAIEmbeddings`` and store them in a
           Chroma index persisted under ``./chroma``.

    Raises:
        KeyError: if ``NOTION_API_KEY`` or ``NOTION_DATABASE_ID`` is not set.
    """
    # These three lines swap the stdlib sqlite3 lib with the pysqlite3 package.
    # The swap has to happen before the langchain/Chroma imports below so that
    # anything importing sqlite3 afterwards receives the replacement module.
    __import__('pysqlite3')
    import sys
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

    from langchain.document_loaders import NotionDBLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    loader = NotionDBLoader(
        integration_token=os.environ['NOTION_API_KEY'],
        database_id=os.environ['NOTION_DATABASE_ID'],
        request_timeout_sec=30,  # Optional, defaults to 10
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    all_splits = text_splitter.split_documents(docs)

    if not all_splits:
        # Nothing was loaded, so there is nothing to embed or index; skip the
        # embedding/indexing call instead of sending it an empty batch.
        print('No documents to index; skipping the Chroma build.')
        return

    for split in all_splits:
        split.metadata = preprocess_metadata(split.metadata)

    print('Building the index on Chroma...')
    Chroma.from_documents(
        documents=all_splits,
        embedding=OpenAIEmbeddings(),
        persist_directory="./chroma",
    )
# Script entry point: build the index only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    import_pipeline()