[Do not merge] When chunk_size=0, skip vector db #649

Status: Closed · wants to merge 4 commits
1 change: 1 addition & 0 deletions backend/prompt_studio/prompt_studio_core/constants.py
@@ -29,6 +29,7 @@ class ToolStudioPromptKeys:
     UNIQUE_FILE_ID = "unique_file_id"
     ID = "id"
     FILE_NAME = "file_name"
+    FILE_PATH = "file_path"
     FILE_HASH = "file_hash"
     TOOL_ID = "tool_id"
     NAME = "name"
@@ -797,6 +797,7 @@ def _fetch_response(
             TSPKeys.TOOL_ID: tool_id,
             TSPKeys.RUN_ID: run_id,
             TSPKeys.FILE_NAME: doc_name,
+            TSPKeys.FILE_PATH: os.path.split(doc_path)[0],
             TSPKeys.FILE_HASH: file_hash,
             Common.LOG_EVENTS_ID: StateStore.get(Common.LOG_EVENTS_ID),
         }
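
A side note on the added line (not part of the diff): os.path.split splits a path into a (head, tail) pair, so index [0] is the directory containing the document. A quick illustration with a hypothetical path:

import os

# Hypothetical document path; os.path.split returns (directory, filename).
head, tail = os.path.split("/data/execution/run_1/invoice.pdf")
print(head)  # "/data/execution/run_1" -> the value sent as FILE_PATH
print(tail)  # "invoice.pdf"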
1 change: 1 addition & 0 deletions prompt-service/src/unstract/prompt_service/constants.py
@@ -8,6 +8,7 @@ class PromptServiceContants:
     TOOL_ID = "tool_id"
     RUN_ID = "run_id"
     FILE_NAME = "file_name"
+    FILE_PATH = "file_path"
     FILE_HASH = "file_hash"
     NAME = "name"
     ACTIVE = "active"
72 changes: 39 additions & 33 deletions prompt-service/src/unstract/prompt_service/main.py
@@ -1,8 +1,9 @@
+import os
 import time
 import traceback
 from enum import Enum
 from json import JSONDecodeError
-from typing import Any, Optional
+from typing import Any

 from flask import json, jsonify, request
 from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
@@ -91,6 +92,9 @@ def prompt_processor() -> Any:
     file_hash = payload.get(PSKeys.FILE_HASH)
     doc_name = str(payload.get(PSKeys.FILE_NAME, ""))
     log_events_id: str = payload.get(PSKeys.LOG_EVENTS_ID, "")
+    file_path = str(payload.get(PSKeys.FILE_PATH, ""))
+    extracted_doc_name = doc_name.split(".")[0] + ".txt"
+    extract_file_path = os.path.join(file_path, "extract", extracted_doc_name)
     structured_output: dict[str, Any] = {}
     metadata: dict[str, Any] = {
         PSKeys.RUN_ID: run_id,
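
For orientation, the three added lines resolve the extract path as follows (sample values are hypothetical). Note that doc_name.split(".")[0] truncates at the first dot, so a name like report.v2.pdf maps to report.txt rather than report.v2.txt:

import os

doc_name = "invoice.pdf"             # hypothetical, from PSKeys.FILE_NAME
file_path = "/data/execution/run_1"  # hypothetical, from PSKeys.FILE_PATH
extracted_doc_name = doc_name.split(".")[0] + ".txt"  # "invoice.txt"
extract_file_path = os.path.join(file_path, "extract", extracted_doc_name)
print(extract_file_path)  # /data/execution/run_1/extract/invoice.txt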
@@ -241,7 +245,7 @@ def prompt_processor() -> Any:
                     RunLevel.RUN,
                     "Unable to obtain LLM / embedding / vectorDB",
                 )
-                return APIError(message=msg)
+                raise APIError(message=msg)

         if output[PSKeys.TYPE] == PSKeys.TABLE:
             try:
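
The return-to-raise change in the hunk above fixes a real bug: returning an exception object hands it back to the caller as an ordinary value, so the request never actually fails. A minimal sketch of the difference (this APIError shape is assumed for illustration, not the project's actual class):

class APIError(Exception):
    def __init__(self, message: str = "") -> None:
        super().__init__(message)
        self.message = message

def broken() -> APIError:
    # The caller receives an exception object; nothing is raised.
    return APIError("Unable to obtain LLM / embedding / vectorDB")

def fixed() -> None:
    # Propagates, so the framework's error handler can turn it into an error response.
    raise APIError("Unable to obtain LLM / embedding / vectorDB")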
@@ -278,34 +282,15 @@ def prompt_processor() -> Any:
         try:
             context = ""
             if output[PSKeys.CHUNK_SIZE] == 0:
-                # We can do this only for chunkless indexes
-                context: Optional[str] = index.query_index(
-                    embedding_instance_id=output[PSKeys.EMBEDDING],
-                    vector_db_instance_id=output[PSKeys.VECTOR_DB],
-                    doc_id=doc_id,
-                    usage_kwargs=usage_kwargs,
-                )
-                if not context:
-                    # UN-1288 For Pinecone, we are seeing an inconsistent case where
-                    # query with doc_id fails even though indexing just happened.
-                    # This causes the following retrieve to return no text.
-                    # To rule out any lag on the Pinecone vector DB write,
-                    # the following sleep is added.
-                    # Note: This will not fix the issue. Since this issue is
-                    # inconsistent, and not reproducible easily,
-                    # this is just a safety net.
-                    time.sleep(2)
-                    context: Optional[str] = index.query_index(
-                        embedding_instance_id=output[PSKeys.EMBEDDING],
-                        vector_db_instance_id=output[PSKeys.VECTOR_DB],
-                        doc_id=doc_id,
-                        usage_kwargs=usage_kwargs,
-                    )
-                if context is None:
-                    # TODO: Obtain user set name for vector DB
-                    msg = NO_CONTEXT_ERROR
-                    app.logger.error(
-                        f"{msg} {output[PSKeys.VECTOR_DB]} for doc_id {doc_id}"
+                try:
+                    # Read from extract_file_path and set that as context
+                    with open(extract_file_path) as file:
Review comment (Contributor):
@gaya3-zipstack does this work because the volumes are shared between backend and prompt-service? Was this change tested against a tool run in a workflow or a pipeline?
+                        context = file.read()
+                        file.close()
+                    app.logger.info(
+                        "Reading extracted file from %s for "
+                        "context as chunk size is 0",
+                        extract_file_path,
+                    )
                     _publish_log(
                         log_events_id,
@@ -314,11 +299,32 @@ def prompt_processor() -> Any:
                             "prompt_key": prompt_name,
                             "doc_name": doc_name,
                         },
-                        LogLevel.ERROR,
+                        LogLevel.INFO,
                         RunLevel.RUN,
-                        msg,
+                        "Reading extracted file for context as chunk size is 0",
                     )
-                    raise APIError(message=msg)
+                except FileNotFoundError:
+                    msg = (
+                        "Extracted file not present. "
+                        "Please re-index the document and try again"
+                    )
+                    app.logger.error(
+                        "Extracted file for document is missing at %s",
+                        extract_file_path,
+                    )
+                    _publish_log(
+                        log_events_id,
+                        {
+                            "tool_id": tool_id,
+                            "prompt_key": prompt_name,
+                            "doc_name": doc_name,
+                        },
+                        LogLevel.ERROR,
+                        RunLevel.RUN,
+                        "Extracted file not present.",
+                    )
+                    raise APIError(message=msg)
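
Putting the hunk together, the new chunk_size == 0 path reduces to the following sketch (the function and variable names are hypothetical; logging and _publish_log calls are trimmed):

import os

def load_context_without_vector_db(file_path: str, doc_name: str) -> str:
    """When chunk_size is 0 the document was never chunked, so instead of
    querying the vector DB, read the extracted text from the shared volume."""
    extracted_doc_name = doc_name.split(".")[0] + ".txt"
    extract_file_path = os.path.join(file_path, "extract", extracted_doc_name)
    try:
        with open(extract_file_path) as f:
            return f.read()
    except FileNotFoundError:
        # Mirrors the PR's error path with a user-facing message.
        raise RuntimeError(
            "Extracted file not present. Please re-index the document and try again"
        )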

                 # TODO: Use vectorDB name when available
                 _publish_log(
                     log_events_id,