@@ -988,21 +988,25 @@ def context_padding(self, found_docs, search_query, course_name):
988
988
print ("inside context padding" )
989
989
print ("found_docs" , len (found_docs ))
990
990
documents_table = os .environ ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE' ]
991
- for doc in found_docs :
991
+ retrieved_contexts_identifiers = {}
992
+ for doc in found_docs : # top N from QDRANT
992
993
print (doc .metadata )
994
+
995
+ # check if this particular url/s3_path has a chunk index or page number or none and create a dictionary
996
+
993
997
994
998
# if url present, query through that
995
999
if doc .metadata ['url' ]:
996
- url = doc .metadata ['url' ]
997
- print ("url: " , url )
998
- response = self .supabase_client .table (documents_table ).select ('*' ).eq ('course_name' , course_name ).eq ('url' , url ).execute ()
999
-
1000
+ parent_doc_id = doc .metadata ['url' ]
1001
+ print ("url: " , parent_doc_id )
1002
+ response = self .supabase_client .table (documents_table ).select ('*' ).eq ('course_name' , course_name ).eq ('url' , parent_doc_id ).execute ()
1003
+ retrieved_contexts_identifiers [ parent_doc_id ] = []
1000
1004
# else use s3_path
1001
1005
else :
1002
- s3_path = doc .metadata ['s3_path' ]
1003
- print ("s3_path: " , s3_path )
1004
- response = self .supabase_client .table (documents_table ).select ('*' ).eq ('course_name' , course_name ).eq ('s3_path' , s3_path ).execute ()
1005
-
1006
+ parent_doc_id = doc .metadata ['s3_path' ]
1007
+ print ("s3_path: " , parent_doc_id )
1008
+ response = self .supabase_client .table (documents_table ).select ('*' ).eq ('course_name' , course_name ).eq ('s3_path' , parent_doc_id ).execute ()
1009
+ retrieved_contexts_identifiers [ parent_doc_id ] = []
1006
1010
data = response .data
1007
1011
# at this point, we have the parent document
1008
1012
result_contexts = []
@@ -1011,14 +1015,15 @@ def context_padding(self, found_docs, search_query, course_name):
1011
1015
qdrant_chunk_index = doc .metadata ['chunk_index' ]
1012
1016
print ("chunk_index: " , qdrant_chunk_index )
1013
1017
print (len (data ))
1014
-
1018
+ retrieved_indices = []
1015
1019
contexts = data [0 ]['contexts' ]
1016
1020
print ("contexts: " , len (contexts ))
1017
1021
1018
1022
for context in contexts :
1019
1023
chunk_index = context ['chunk_index' ]
1020
- if (qdrant_chunk_index - 3 <= chunk_index <= qdrant_chunk_index + 3 ):
1024
+ if (qdrant_chunk_index - 3 <= chunk_index <= qdrant_chunk_index + 3 ) and chunk_index not in retrieved_indices :
1021
1025
result_contexts .append (context )
1026
+ retrieved_indices .append (chunk_index )
1022
1027
1023
1028
print (result_contexts )
1024
1029
0 commit comments