diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index af425218..266f4a01 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -17,6 +17,7 @@ from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost +from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding class RetrievalService: @@ -69,6 +70,10 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) + # add parent doc retrieval here + parent_docs = context_parent_doc_padding(found_docs, search_query, course_name) + print(f"Number of final docs after context padding: {len(parent_docs)}") + pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + # type: ignore @@ -76,7 +81,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = valid_docs = [] num_tokens = 0 - for doc in found_docs: + for doc in parent_docs: doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore