-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_db.py
107 lines (85 loc) · 3.29 KB
/
vector_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.milvus import Milvus
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, connections, utility
from typing import Dict, List, Tuple
import textwrap
# Module-level handle to the Milvus vector store. It is bound at the bottom
# of this file when the "iCitation" collection already exists, or by
# create_collection(); until then it is only annotated, not assigned.
vector_store: Milvus
# Open a Milvus connection at import time using the client's default
# connection parameters (presumably localhost — confirm deployment config).
connections.connect()
def collection_exists() -> bool:
    """Report whether the "iCitation" collection is present in Milvus."""
    exists = utility.has_collection("iCitation")
    return exists
def create_collection():
    """(Re)create the "iCitation" Milvus collection and load it for search.

    Drops any pre-existing collection of the same name, defines the schema
    (auto-id primary key, chunk text, embedding vector, JSON metadata),
    builds an HNSW index on the vector field, loads the collection into
    memory, and rebinds the module-level ``vector_store`` to point at it.
    """
    # Start from a clean slate; reuse the shared drop logic instead of
    # duplicating it here (it was previously copy-pasted in both places).
    delete_collection()

    print("Creating iCitation collection")
    # 1. Define the fields. dim=768 must match the output dimension of the
    #    module-level HuggingFaceEmbeddings model — assumed 768; confirm
    #    against the actual embedding model in use.
    fields = [
        FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65_535),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=768),
        FieldSchema(name="metadata", dtype=DataType.JSON),
    ]
    # 2. Assemble the schema from the field definitions.
    schema = CollectionSchema(
        fields,
        "Sources to search through for similarity"
    )
    # 3. Create the collection from the schema.
    collection = Collection("iCitation", schema)
    # 4. Index the vector field (HNSW graph index, L2 distance) so
    #    similarity searches do not fall back to brute force.
    index_params = {
        "metric_type": "L2",
        "index_type": "HNSW",
        "params": {"M": 8, "efConstruction": 64},
    }
    collection.create_index(
        field_name="vector",
        index_params=index_params
    )
    # 5. Load the collection into memory so it can serve queries.
    collection.load()
    print("iCitation collection loaded")
    # Rebind the module-level store so add_sources()/search() use the
    # freshly created collection.
    global vector_store
    vector_store = Milvus(embeddings, "iCitation")
def add_sources(sources: list[str]):
    """Download the given URLs, chunk their text, and index them in Milvus."""
    print("Adding sources to iCitation collection")
    documents = WebBaseLoader(web_path=sources).load()
    # Nest each document's loader metadata under a single 'metadata' key so
    # it matches the collection's JSON 'metadata' field.
    for document in documents:
        document.metadata = {'metadata': document.metadata}
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". "],
        chunk_size=300,
        chunk_overlap=0,
    )
    chunks = splitter.split_documents(documents)
    vector_store.add_documents(chunks)
def search(sentence: str) -> Dict[str, List[Tuple[float, str]]]:
    """Run a similarity search and group the hits by their source URL.

    Args:
        sentence: Natural-language query to embed and search with.

    Returns:
        Mapping of source URL -> list of (score, text) tuples for the top-5
        matching chunks, where the text is wrapped to 60 columns. Scores come
        from Milvus; with the L2 metric configured at collection creation,
        lower values indicate closer matches.
    """
    print("searching with query: ", sentence)
    output = vector_store.similarity_search_with_score(sentence, 5)
    sources: Dict[str, List[Tuple[float, str]]] = {}
    for doc, score in output:
        source = doc.metadata['metadata']['source']
        wrapped_text = '\n'.join(textwrap.wrap(doc.page_content, width=60))
        # setdefault replaces the previous if/else key-existence branch.
        sources.setdefault(source, []).append((score, wrapped_text))
    print("searching done")
    return sources
def delete_collection():
    """Drop the "iCitation" collection from Milvus, if it exists."""
    # Guard clause: nothing to do when the collection is absent.
    if not utility.has_collection("iCitation"):
        return
    print("Dropping iCitation collection")
    Collection("iCitation").drop()
# create_collection()
# --- Module-level initialization: runs once at import time. ---
print("retrieving HuggingFace embeddings")
# Embedding model shared by create_collection() and the Milvus store below.
embeddings = HuggingFaceEmbeddings()
# If the collection already exists, bind vector_store now; otherwise it
# stays unassigned until create_collection() is called.
if utility.has_collection("iCitation"):
    print("retrieving iCitation collection")
    vector_store = Milvus(embeddings, "iCitation")