-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChatBot2.py
283 lines (227 loc) · 9.47 KB
/
ChatBot2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import os
import uuid
import hnswlib
from typing import List, Dict
from unstructured.partition.html import partition_html
from unstructured.chunking.title import chunk_by_title
import cohere

# SECURITY: an API key was previously hard-coded here. A key committed to
# source is leaked the moment the file is shared — read it from the
# environment instead (and rotate the exposed key).
co = cohere.Client(os.environ.get("COHERE_API_KEY", ""))
class Vectorstore:
    """
    Builds and queries a dense-retrieval index over a set of web documents.

    Pipeline (run eagerly on construction):
      1. load_and_chunk() - download each URL and split the HTML into chunks.
      2. embed()          - embed every chunk with the Cohere API.
      3. index()          - index the embeddings with hnswlib for ANN search.
    """

    def __init__(self, raw_documents: List[Dict[str, str]]):
        """
        Parameters:
            raw_documents: list of dicts, each with 'title' and 'url' keys.
        """
        self.raw_documents = raw_documents
        self.docs = []       # chunked documents: {'title', 'text', 'url'}
        self.docs_embs = []  # one embedding vector per entry in self.docs
        self.retrieve_top_k = 10  # candidates pulled by dense retrieval
        self.rerank_top_k = 3     # final results kept after reranking
        self.load_and_chunk()
        self.embed()
        self.index()

    def load_and_chunk(self) -> None:
        """
        Loads the text from the sources and chunks the HTML content.
        """
        print("Loading documents...")
        for raw_document in self.raw_documents:
            elements = partition_html(url=raw_document["url"])
            for chunk in chunk_by_title(elements):
                self.docs.append(
                    {
                        "title": raw_document["title"],
                        "text": str(chunk),
                        "url": raw_document["url"],
                    }
                )

    def embed(self) -> None:
        """
        Embeds the document chunks using the Cohere API, in batches.
        """
        print("Embedding document chunks...")
        batch_size = 90  # keep each request under the API's per-call limit
        self.docs_len = len(self.docs)
        for i in range(0, self.docs_len, batch_size):
            batch = self.docs[i : i + batch_size]
            texts = [item["text"] for item in batch]
            docs_embs_batch = co.embed(
                texts=texts,
                model="embed-english-v3.0",
                input_type="search_document",
            ).embeddings
            self.docs_embs.extend(docs_embs_batch)

    def index(self) -> None:
        """
        Indexes the documents for efficient retrieval using hnswlib.
        """
        print("Indexing documents...")
        # embed-english-v3.0 returns 1024-dim vectors; inner-product space.
        self.idx = hnswlib.Index(space="ip", dim=1024)
        self.idx.init_index(max_elements=self.docs_len, ef_construction=512, M=64)
        self.idx.add_items(self.docs_embs, list(range(len(self.docs_embs))))
        print(f"Indexing complete with {self.idx.get_current_count()} documents.")

    def retrieve(self, query: str) -> List[Dict[str, str]]:
        """
        Retrieves document chunks based on the given query.

        Dense retrieval pulls retrieve_top_k candidates from the hnswlib
        index, then the Cohere rerank endpoint keeps the rerank_top_k best.

        Parameters:
            query (str): The query to retrieve document chunks for.

        Returns:
            List[Dict[str, str]]: A list of dictionaries representing the
            retrieved document chunks, with 'title', 'text', and 'url' keys.
        """
        print(f"Retrieving documents for query: {query}")
        # Dense retrieval: embed the query and fetch nearest neighbours.
        query_emb = co.embed(
            texts=[query], model="embed-english-v3.0", input_type="search_query"
        ).embeddings
        doc_ids = self.idx.knn_query(query_emb, k=self.retrieve_top_k)[0][0].tolist()
        # Rerank the candidates on their title and text fields.
        docs_to_rerank = [self.docs[doc_id] for doc_id in doc_ids]
        rerank_results = co.rerank(
            query=query,
            documents=docs_to_rerank,
            top_n=self.rerank_top_k,
            model="rerank-english-v3.0",
            rank_fields=["title", "text"],
        )
        # BUGFIX: result.index is a position within docs_to_rerank, NOT an id
        # into self.docs. The original looked up self.docs[result.index] and
        # so returned the wrong chunks; map back through doc_ids instead.
        doc_ids_reranked = [doc_ids[result.index] for result in rerank_results.results]
        return [
            {
                "title": self.docs[doc_id]["title"],
                "text": self.docs[doc_id]["text"],
                "url": self.docs[doc_id]["url"],
            }
            for doc_id in doc_ids_reranked
        ]
# Example usage: Cohere documentation pages used to seed the vectorstore.
raw_documents = [
    {"title": title, "url": url}
    for title, url in [
        ("Text Embeddings",
         "https://docs.cohere.com/docs/text-embeddings"),
        ("Similarity Between Words and Sentences",
         "https://docs.cohere.com/docs/similarity-between-words-and-sentences"),
        ("The Attention Mechanism",
         "https://docs.cohere.com/docs/the-attention-mechanism"),
        ("Transformer Models",
         "https://docs.cohere.com/docs/transformer-models"),
    ]
]
# Create an instance of Vectorstore. Construction eagerly downloads, embeds
# and indexes the source pages (network + API calls happen here).
vectorstore = Vectorstore(raw_documents)
# NOTE(review): a throwaway vectorstore.retrieve("multi-head attention
# definition") smoke-test used to run here on every import, spending two
# extra API round-trips with the result discarded — removed.
class Chatbot:
    """Interactive RAG chatbot answering user questions over a Vectorstore."""

    def __init__(self, vectorstore: Vectorstore):
        """
        Initializes an instance of the Chatbot class.

        Parameters:
            vectorstore (Vectorstore): An instance of the Vectorstore class.
        """
        self.vectorstore = vectorstore
        # One conversation id per instance keeps the chat history coherent
        # across turns on the Cohere side.
        self.conversation_id = str(uuid.uuid4())

    def run(self):
        """
        Runs the chatbot application.
        """
        while True:
            user_message = input("User: ")
            # Typing "quit" ends the conversation.
            if user_message.lower() == "quit":
                print("Ending chat.")
                break
            print(f"User: {user_message}")
            # First ask the model whether this turn needs retrieval at all.
            planner = co.chat(message=user_message, search_queries_only=True)
            if planner.search_queries:
                print("Retrieving information...", end="")
                # Gather document chunks for every generated search query.
                retrieved_docs = []
                for search_query in planner.search_queries:
                    retrieved_docs.extend(self.vectorstore.retrieve(search_query.text))
                # Ground the streamed answer on the retrieved chunks.
                stream = co.chat_stream(
                    message=user_message,
                    model="command-r",
                    documents=retrieved_docs,
                    conversation_id=self.conversation_id,
                )
            else:
                # No retrieval needed — respond directly.
                stream = co.chat_stream(
                    message=user_message,
                    model="command-r",
                    conversation_id=self.conversation_id,
                )
            # Print the chatbot response, citations, and documents.
            print("\nChatbot:")
            collected_citations = []
            cited_docs = []
            for evt in stream:
                if evt.event_type == "text-generation":
                    print(evt.text, end="")
                elif evt.event_type == "citation-generation":
                    collected_citations.extend(evt.citations)
                elif evt.event_type == "search-results":
                    cited_docs = evt.documents
            # Display citations and source documents, when present.
            if collected_citations:
                print("\n\nCITATIONS:")
                for citation in collected_citations:
                    print(citation)
                print("\nDOCUMENTS:")
                for document in cited_docs:
                    print(document)
            print(f"\n{'-'*100}\n")
# Instantiate and run the chatbot only when executed as a script.
if __name__ == "__main__":
    Chatbot(vectorstore).run()