Skip to content

Commit

Permalink
refactor[SIN-169]: improve code for process queue
Browse files Browse the repository at this point in the history
  • Loading branch information
gventuri committed Oct 21, 2024
1 parent e07170e commit a94157c
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 9 deletions.
26 changes: 20 additions & 6 deletions backend/app/processing/process_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def extractive_summary_process(api_key, process, process_step, asset_content):
@handle_exceptions
def extract_process(api_key, process, process_step, asset_content):
pdf_content = ""
vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similary_threshold=3)
vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
if (
("multiple_fields" not in process.details or not process.details["multiple_fields"])
and asset_content.content
Expand All @@ -312,13 +312,19 @@ def extract_process(api_key, process, process_step, asset_content):
prev_sentence = vectorstore.get_relevant_docs_by_id(
ids=[metadata["previous_sentence_id"]]
)
segment_data = [prev_sentence["documents"][0]] + segment_data
if prev_sentence["documents"] and len(prev_sentence["documents"][0]) > 0:
segment_data = [prev_sentence["documents"][0]] + segment_data
else:
logger.warning("Previous sentence document is empty.")

if metadata.get("next_sentence_id", -1) != -1:
next_sentence = vectorstore.get_relevant_docs_by_id(
ids=[metadata["next_sentence_id"]]
)
segment_data.append(next_sentence["documents"][0])
if next_sentence["documents"] and len(next_sentence["documents"][0]) > 0:
segment_data.append(next_sentence["documents"][0])
else:
logger.warning("Next sentence document is empty.")

pdf_content += "\n" + " ".join(segment_data)

Expand All @@ -336,7 +342,7 @@ def extract_process(api_key, process, process_step, asset_content):
pdf_content=pdf_content if pdf_content else None,
)

vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similary_threshold=3)
vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
all_relevant_docs = []

for context in data["context"]:
Expand Down Expand Up @@ -371,13 +377,22 @@ def extract_process(api_key, process, process_step, asset_content):
clean_source = clean_text(source)
# search for exact match Index
for index, relevant_doc in enumerate(relevant_docs["documents"][0]):
if not relevant_docs["documents"][0]:
logger.warning("No relevant documents found.")
continue
if clean_source in clean_text(relevant_doc):
most_relevant_index = index
match = True
break

if not match and len(relevant_docs["documents"][0]) > 0:
sources["sources"][source_index] = relevant_docs["documents"][0][0]
if relevant_docs["documents"][0]:
page_numbers.append(
relevant_docs["metadatas"][0][most_relevant_index]["page_number"]
)
else:
logger.warning("No documents available to assign to source.")

if len(relevant_docs["metadatas"][0]) > 0:
page_numbers.append(
Expand All @@ -392,14 +407,13 @@ def extract_process(api_key, process, process_step, asset_content):
"context": data["context"],
}

def find_best_match_for_short_reference(source, all_relevant_docs, asset_id, project_id):
def find_best_match_for_short_reference(source, all_relevant_docs, asset_id, project_id, threshold=0.8):
source_words = set(re.findall(r'\w+', source.lower()))
if not source_words:
return None # Return None if the source is empty

best_match = None
best_match_score = 0
threshold = 0.8

for relevant_docs in all_relevant_docs:
for doc, metadata in zip(relevant_docs["documents"][0], relevant_docs["metadatas"][0]):
Expand Down
4 changes: 2 additions & 2 deletions backend/app/vectorstore/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ def __init__(
persist_path: Optional[str] = None,
client_settings: Optional[config.Settings] = None,
max_samples: int = 3,
similary_threshold: int = 1.5,
similarity_threshold: int = 1.5,
batch_size: Optional[int] = None,
settings: Optional[BaseSettings] = None,
) -> None:
self.settings = settings or default_settings
self._max_samples = max_samples
self._similarity_threshold = similary_threshold
self._similarity_threshold = similarity_threshold
self._batch_size = batch_size or self.settings.chroma_batch_size

# Initialize Chromadb Client
Expand Down
37 changes: 36 additions & 1 deletion backend/tests/processing/test_process_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,41 @@ def test_find_best_match_for_short_reference(mock_findall):

assert mock_findall.call_count == 6

@pytest.mark.parametrize("short_reference, expected_result", [
    ("AI and machine learning", True),
    ("Quantum computing", False),
])
@patch('app.processing.process_queue.re.findall')
def test_find_best_match_for_short_reference_parametrized(mock_findall, short_reference, expected_result):
    """Match a short reference against candidate docs; only the first corpus entry overlaps enough."""
    # Token streams handed back by the patched re.findall, in call order:
    # the reference itself first, then one word list per candidate document.
    token_streams = [
        short_reference.lower().split(),
        ['this', 'is', 'a', 'long', 'document', 'about', 'ai', 'and', 'machine', 'learning'],
        ['another', 'document', 'talking', 'about', 'natural', 'language', 'processing'],
    ]
    mock_findall.side_effect = token_streams

    corpus = [
        {
            "documents": [["This is a long document about AI and machine learning."]],
            "metadatas": [[{"asset_id": 1, "project_id": 1, "page_number": 1}]],
        },
        {
            "documents": [["Another document talking about natural language processing."]],
            "metadatas": [[{"asset_id": 1, "project_id": 1, "page_number": 2}]],
        },
    ]

    match = find_best_match_for_short_reference(short_reference, corpus, 1, 1)

    if not expected_result:
        # Insufficient word overlap: no match is returned.
        assert match is None
    else:
        assert match is not None
        assert "text" in match
        assert "page_number" in match
        assert short_reference.lower() in match["text"].lower()

    # One findall call for the reference plus one per document.
    assert mock_findall.call_count == 3

@patch('app.processing.process_queue.ChromaDB')
@patch('app.processing.process_queue.extract_data')
def test_chroma_db_initialization(mock_extract_data, mock_chroma):
Expand All @@ -158,5 +193,5 @@ def test_chroma_db_initialization(mock_extract_data, mock_chroma):

extract_process("api_key", process, process_step, asset_content)

mock_chroma.assert_called_with(f"panda-etl-{process.project_id}", similary_threshold=3)
mock_chroma.assert_called_with(f"panda-etl-{process.project_id}", similarity_threshold=3)
assert mock_chroma.call_count >= 1

0 comments on commit a94157c

Please sign in to comment.