Skip to content

Commit bf5e356

Browse files
committed
added duplication tracking
1 parent bb15826 commit bf5e356

File tree

1 file changed

+16
-11
lines changed

1 file changed

+16
-11
lines changed

ai_ta_backend/vector_database.py

+16 -11
Original file line number | Diff line number | Diff line change
@@ -988,21 +988,25 @@ def context_padding(self, found_docs, search_query, course_name):
988988
print("inside context padding")
989989
print("found_docs", len(found_docs))
990990
documents_table = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
991-
for doc in found_docs:
991+
retrieved_contexts_identifiers = {}
992+
for doc in found_docs: # top N from QDRANT
992993
print(doc.metadata)
994+
995+
# check if this particular url/s3_path has a chunk index or page number or none and create a dictionary
996+
993997

994998
# if url present, query through that
995999
if doc.metadata['url']:
996-
url = doc.metadata['url']
997-
print("url: ", url)
998-
response = self.supabase_client.table(documents_table).select('*').eq('course_name', course_name).eq('url', url).execute()
999-
1000+
parent_doc_id = doc.metadata['url']
1001+
print("url: ", parent_doc_id)
1002+
response = self.supabase_client.table(documents_table).select('*').eq('course_name', course_name).eq('url', parent_doc_id).execute()
1003+
retrieved_contexts_identifiers[parent_doc_id] = []
10001004
# else use s3_path
10011005
else:
1002-
s3_path = doc.metadata['s3_path']
1003-
print("s3_path: ", s3_path)
1004-
response = self.supabase_client.table(documents_table).select('*').eq('course_name', course_name).eq('s3_path', s3_path).execute()
1005-
1006+
parent_doc_id = doc.metadata['s3_path']
1007+
print("s3_path: ", parent_doc_id)
1008+
response = self.supabase_client.table(documents_table).select('*').eq('course_name', course_name).eq('s3_path', parent_doc_id).execute()
1009+
retrieved_contexts_identifiers[parent_doc_id] = []
10061010
data = response.data
10071011
# at this point, we have the parent document
10081012
result_contexts = []
@@ -1011,14 +1015,15 @@ def context_padding(self, found_docs, search_query, course_name):
10111015
qdrant_chunk_index = doc.metadata['chunk_index']
10121016
print("chunk_index: ", qdrant_chunk_index)
10131017
print(len(data))
1014-
1018+
retrieved_indices = []
10151019
contexts = data[0]['contexts']
10161020
print("contexts: ", len(contexts))
10171021

10181022
for context in contexts:
10191023
chunk_index = context['chunk_index']
1020-
if (qdrant_chunk_index - 3 <= chunk_index <= qdrant_chunk_index + 3):
1024+
if (qdrant_chunk_index - 3 <= chunk_index <= qdrant_chunk_index + 3) and chunk_index not in retrieved_indices:
10211025
result_contexts.append(context)
1026+
retrieved_indices.append(chunk_index)
10221027

10231028
print(result_contexts)
10241029

0 commit comments

Comments
 (0)