Skip to content

Commit

Permalink
initial commit for parent-doc
Browse files (browse the repository at this point in the history)
  • Loading branch information
star-nox committed Mar 13, 2024
1 parent c249036 commit f715906
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion ai_ta_backend/service/retrieval_service.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,6 +17,7 @@
from ai_ta_backend.service.posthog_service import PosthogService
from ai_ta_backend.service.sentry_service import SentryService
from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost
from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding


class RetrievalService:
Expand Down Expand Up @@ -69,14 +70,18 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =

found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name)

# add parent doc retrieval here
parent_docs = context_parent_doc_padding(found_docs, search_query, course_name)
print(f"Number of final docs after context padding: {len(parent_docs)}")

pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
# count tokens at start and end, then also count each context.
token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + # type: ignore
search_query)

valid_docs = []
num_tokens = 0
for doc in found_docs:
for doc in parent_docs:
doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n"
num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore

Expand Down

0 comments on commit f715906

Please sign in to comment.