[Do not merge] When chunk_size=0, skip vector db #649

Status: Closed · wants to merge 4 commits
1 change: 1 addition & 0 deletions backend/prompt_studio/prompt_studio_core/constants.py
@@ -29,6 +29,7 @@ class ToolStudioPromptKeys:
     UNIQUE_FILE_ID = "unique_file_id"
     ID = "id"
     FILE_NAME = "file_name"
+    FILE_PATH = "file_path"
     FILE_HASH = "file_hash"
     TOOL_ID = "tool_id"
     NAME = "name"
@@ -797,6 +797,7 @@ def _fetch_response(
             TSPKeys.TOOL_ID: tool_id,
             TSPKeys.RUN_ID: run_id,
             TSPKeys.FILE_NAME: doc_name,
+            TSPKeys.FILE_PATH: os.path.split(doc_path)[0],
             TSPKeys.FILE_HASH: file_hash,
             Common.LOG_EVENTS_ID: StateStore.get(Common.LOG_EVENTS_ID),
         }
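
A side note on the added line (not part of the diff): os.path.split splits a path into a (head, tail) pair, so index [0] is the directory containing the document. A quick illustration with a hypothetical path:

import os

# Hypothetical document path; os.path.split returns (directory, filename).
head, tail = os.path.split("/data/execution/run_1/invoice.pdf")
print(head)  # "/data/execution/run_1" -> the value sent as FILE_PATH
print(tail)  # "invoice.pdf"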
1 change: 1 addition & 0 deletions prompt-service/src/unstract/prompt_service/constants.py
@@ -8,6 +8,7 @@ class PromptServiceContants:
     TOOL_ID = "tool_id"
     RUN_ID = "run_id"
     FILE_NAME = "file_name"
+    FILE_PATH = "file_path"
     FILE_HASH = "file_hash"
     NAME = "name"
     ACTIVE = "active"
72 changes: 39 additions & 33 deletions prompt-service/src/unstract/prompt_service/main.py
@@ -1,8 +1,9 @@
+import os
 import time
 import traceback
 from enum import Enum
 from json import JSONDecodeError
-from typing import Any, Optional
+from typing import Any

 from flask import json, jsonify, request
 from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
@@ -91,6 +92,9 @@ def prompt_processor() -> Any:
     file_hash = payload.get(PSKeys.FILE_HASH)
     doc_name = str(payload.get(PSKeys.FILE_NAME, ""))
     log_events_id: str = payload.get(PSKeys.LOG_EVENTS_ID, "")
+    file_path = str(payload.get(PSKeys.FILE_PATH, ""))
+    extracted_doc_name = doc_name.split(".")[0] + ".txt"
+    extract_file_path = os.path.join(file_path, "extract", extracted_doc_name)
     structured_output: dict[str, Any] = {}
     metadata: dict[str, Any] = {
         PSKeys.RUN_ID: run_id,
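
For orientation, the three added lines resolve the extract path as follows (sample values are hypothetical). Note that doc_name.split(".")[0] truncates at the first dot, so a name like report.v2.pdf maps to report.txt rather than report.v2.txt:

import os

doc_name = "invoice.pdf"             # hypothetical, from PSKeys.FILE_NAME
file_path = "/data/execution/run_1"  # hypothetical, from PSKeys.FILE_PATH
extracted_doc_name = doc_name.split(".")[0] + ".txt"  # "invoice.txt"
extract_file_path = os.path.join(file_path, "extract", extracted_doc_name)
print(extract_file_path)  # /data/execution/run_1/extract/invoice.txt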
@@ -241,7 +245,7 @@ def prompt_processor() -> Any:
                     RunLevel.RUN,
                     "Unable to obtain LLM / embedding / vectorDB",
                 )
-                return APIError(message=msg)
+                raise APIError(message=msg)

         if output[PSKeys.TYPE] == PSKeys.TABLE:
             try:
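
The return-to-raise change in the hunk above fixes a real bug: returning an exception object hands it back to the caller as an ordinary value, so the request never actually fails. A minimal sketch of the difference (this APIError shape is assumed for illustration, not the project's actual class):

class APIError(Exception):
    def __init__(self, message: str = "") -> None:
        super().__init__(message)
        self.message = message

def broken() -> APIError:
    # The caller receives an exception object; nothing is raised.
    return APIError("Unable to obtain LLM / embedding / vectorDB")

def fixed() -> None:
    # Propagates, so the framework's error handler can turn it into an error response.
    raise APIError("Unable to obtain LLM / embedding / vectorDB")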
@@ -278,34 +282,15 @@ def prompt_processor() -> Any:
         try:
             context = ""
             if output[PSKeys.CHUNK_SIZE] == 0:
-                # We can do this only for chunkless indexes
-                context: Optional[str] = index.query_index(
-                    embedding_instance_id=output[PSKeys.EMBEDDING],
-                    vector_db_instance_id=output[PSKeys.VECTOR_DB],
-                    doc_id=doc_id,
-                    usage_kwargs=usage_kwargs,
-                )
-                if not context:
-                    # UN-1288 For Pinecone, we are seeing an inconsistent case where
-                    # query with doc_id fails even though indexing just happened.
-                    # This causes the following retrieve to return no text.
-                    # To rule out any lag on the Pinecone vector DB write,
-                    # the following sleep is added.
-                    # Note: This will not fix the issue. Since this issue is
-                    # inconsistent, and not reproducible easily,
-                    # this is just a safety net.
-                    time.sleep(2)
-                    context: Optional[str] = index.query_index(
-                        embedding_instance_id=output[PSKeys.EMBEDDING],
-                        vector_db_instance_id=output[PSKeys.VECTOR_DB],
-                        doc_id=doc_id,
-                        usage_kwargs=usage_kwargs,
-                    )
-                if context is None:
-                    # TODO: Obtain user set name for vector DB
-                    msg = NO_CONTEXT_ERROR
-                    app.logger.error(
-                        f"{msg} {output[PSKeys.VECTOR_DB]} for doc_id {doc_id}"
+                try:
+                    # Read from extract_file_path and set that as context
+                    with open(extract_file_path) as file:
Review comment (Contributor):
@gaya3-zipstack does this work because the volumes are shared between backend and prompt-service? Was this change tested against a tool run in a workflow or a pipeline?
+                        context = file.read()
+                        file.close()
+                    app.logger.info(
+                        "Reading extracted file from %s for "
+                        "context as chunk size is 0",
+                        extract_file_path,
+                    )
                     _publish_log(
                         log_events_id,
@@ -314,11 +299,32 @@ def prompt_processor() -> Any:
                             "prompt_key": prompt_name,
                             "doc_name": doc_name,
                         },
-                        LogLevel.ERROR,
+                        LogLevel.INFO,
                         RunLevel.RUN,
-                        msg,
+                        "Reading extracted file for context as chunk size is 0",
                     )
-                    raise APIError(message=msg)
+                except FileNotFoundError:
+                    msg = (
+                        "Extracted file not present. "
+                        "Please re-index the document and try again"
+                    )
+                    app.logger.error(
+                        "Extracted file for document is missing at %s",
+                        extract_file_path,
+                    )
+                    _publish_log(
+                        log_events_id,
+                        {
+                            "tool_id": tool_id,
+                            "prompt_key": prompt_name,
+                            "doc_name": doc_name,
+                        },
+                        LogLevel.ERROR,
+                        RunLevel.RUN,
+                        "Extracted file not present.",
+                    )
+                    raise APIError(message=msg)
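
Putting the hunk together, the new chunk_size == 0 path reduces to the following sketch (the function and variable names are hypothetical; logging and _publish_log calls are trimmed):

import os

def load_context_without_vector_db(file_path: str, doc_name: str) -> str:
    """When chunk_size is 0 the document was never chunked, so instead of
    querying the vector DB, read the extracted text from the shared volume."""
    extracted_doc_name = doc_name.split(".")[0] + ".txt"
    extract_file_path = os.path.join(file_path, "extract", extracted_doc_name)
    try:
        with open(extract_file_path) as f:
            return f.read()
    except FileNotFoundError:
        # Mirrors the PR's error path with a user-facing message.
        raise RuntimeError(
            "Extracted file not present. Please re-index the document and try again"
        )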

                 # TODO: Use vectorDB name when available
                 _publish_log(
                     log_events_id,