initial commit for parent-doc

UIUC-Chatbot · Mar 13, 2024 · f715906 · f715906
1 parent c249036
commit f715906
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py
@@ -17,6 +17,7 @@
 from ai_ta_backend.service.posthog_service import PosthogService
 from ai_ta_backend.service.sentry_service import SentryService
 from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost
+from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding
 
 
 class RetrievalService:
@@ -69,14 +70,18 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =
 
       found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name)
 
+      # Pad the vector-search hits with parent-document context (via context_parent_doc_padding) before token counting.
+      parent_docs = context_parent_doc_padding(found_docs, search_query, course_name)
+      print(f"Number of final docs after context padding: {len(parent_docs)}")
+
       pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
       # count tokens at start and end, then also count each context.
       token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " +  # type: ignore
                                                search_query)
 
       valid_docs = []
       num_tokens = 0
-      for doc in found_docs:
+      for doc in parent_docs:
         doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n"
         num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore