diff --git a/backend/apps/ai/common/base/chunk_command.py b/backend/apps/ai/common/base/chunk_command.py
index 07098acc0e..dbf23210fe 100644
--- a/backend/apps/ai/common/base/chunk_command.py
+++ b/backend/apps/ai/common/base/chunk_command.py
@@ -57,13 +57,12 @@ def process_chunks_batch(self, entities: list[Model]) -> int:
                 self.stdout.write(f"No content to chunk for {self.entity_name} {entity_key}")
                 continue
 
-            chunk_texts = Chunk.split_text(full_content)
-            if not chunk_texts:
+            if not (unique_chunk_texts := set(Chunk.split_text(full_content))):
                 self.stdout.write(f"No chunks created for {self.entity_name} {entity_key}")
                 continue
 
             if chunks := create_chunks_and_embeddings(
-                chunk_texts=chunk_texts,
+                chunk_texts=list(unique_chunk_texts),
                 context=context,
                 openai_client=self.openai_client,
                 save=False,
diff --git a/backend/apps/ai/common/base/context_command.py b/backend/apps/ai/common/base/context_command.py
index 5a43370450..5b96e8fe97 100644
--- a/backend/apps/ai/common/base/context_command.py
+++ b/backend/apps/ai/common/base/context_command.py
@@ -35,10 +35,12 @@ def process_context_batch(self, entities: list[Model]) -> int:
             ):
                 processed += 1
                 entity_key = self.get_entity_key(entity)
-                self.stdout.write(f"Created context for {entity_key}")
+                self.stdout.write(f"Created/updated context for {entity_key}")
             else:
                 entity_key = self.get_entity_key(entity)
-                self.stdout.write(self.style.ERROR(f"Failed to create context for {entity_key}"))
+                self.stdout.write(
+                    self.style.ERROR(f"Failed to create/update context for {entity_key}")
+                )
 
         return processed
 
diff --git a/backend/apps/ai/management/commands/ai_update_slack_message_context.py b/backend/apps/ai/management/commands/ai_update_slack_message_context.py
index c89b253692..ffe2827afc 100644
--- a/backend/apps/ai/management/commands/ai_update_slack_message_context.py
+++ b/backend/apps/ai/management/commands/ai_update_slack_message_context.py
@@ -10,26 +10,6 @@ class Command(BaseContextCommand):
     key_field_name = "slack_message_id"
     model_class = Message
 
-    def add_arguments(self, parser):
-        """Override to use different default batch size for messages."""
-        super().add_arguments(parser)
-        parser.add_argument(
-            "--message-key",
-            type=str,
-            help="Process only the message with this key",
-        )
-        parser.add_argument(
-            "--all",
-            action="store_true",
-            help="Process all the messages",
-        )
-        parser.add_argument(
-            "--batch-size",
-            type=int,
-            default=100,
-            help="Number of messages to process in each batch",
-        )
-
     def extract_content(self, entity: Message) -> tuple[str, str]:
         """Extract content from the message."""
         return entity.cleaned_text or "", ""
diff --git a/backend/tests/apps/ai/common/base/chunk_command_test.py b/backend/tests/apps/ai/common/base/chunk_command_test.py
index 627acf36f0..e0e24d23cd 100644
--- a/backend/tests/apps/ai/common/base/chunk_command_test.py
+++ b/backend/tests/apps/ai/common/base/chunk_command_test.py
@@ -211,16 +211,21 @@ def test_process_chunks_batch_success(
         mock_create_chunks.return_value = mock_chunks
         command.openai_client = Mock()
 
-        with patch.object(command.stdout, "write") as mock_write:
+        with (
+            patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter,
+            patch.object(command.stdout, "write") as mock_write,
+        ):
+            mock_qs = Mock()
+            mock_qs.values_list.return_value = []
+            mock_chunk_filter.return_value = mock_qs
             result = command.process_chunks_batch([mock_entity])
 
         assert result == 1
-        mock_create_chunks.assert_called_once_with(
-            chunk_texts=["chunk1", "chunk2", "chunk3"],
-            context=mock_context,
-            openai_client=command.openai_client,
-            save=False,
-        )
+        _, kwargs = mock_create_chunks.call_args
+        assert set(kwargs["chunk_texts"]) == {"chunk1", "chunk2", "chunk3"}
+        assert kwargs["context"] == mock_context
+        assert kwargs["openai_client"] == command.openai_client
+        assert kwargs["save"] is False
         mock_bulk_save.assert_called_once_with(mock_chunks)
         mock_write.assert_has_calls(
             [
@@ -261,7 +266,13 @@ def test_process_chunks_batch_multiple_entities(
         mock_create_chunks.return_value = mock_chunks[:2]
         command.openai_client = Mock()
 
-        with patch.object(command.stdout, "write"):
+        with (
+            patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter,
+            patch.object(command.stdout, "write"),
+        ):
+            mock_qs = Mock()
+            mock_qs.values_list.return_value = []
+            mock_chunk_filter.return_value = mock_qs
             result = command.process_chunks_batch(entities)
 
         assert result == 3
@@ -325,14 +336,22 @@ def test_process_chunks_batch_content_combination(
             "extract_content",
             return_value=("prose", "metadata"),
         ):
-            command.process_chunks_batch([mock_entity])
+            with patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter:
+                mock_qs = Mock()
+                mock_qs.values_list.return_value = []
+                mock_chunk_filter.return_value = mock_qs
+                command.process_chunks_batch([mock_entity])
 
         expected_content = "metadata\n\nprose"
         mock_split_text.assert_called_once_with(expected_content)
 
         mock_split_text.reset_mock()
         with patch.object(command, "extract_content", return_value=("prose", "")):
-            command.process_chunks_batch([mock_entity])
+            with patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter:
+                mock_qs = Mock()
+                mock_qs.values_list.return_value = []
+                mock_chunk_filter.return_value = mock_qs
+                command.process_chunks_batch([mock_entity])
 
         mock_split_text.assert_called_with("prose")
 
@@ -402,11 +421,52 @@ def test_process_chunks_batch_metadata_only_content(
             "extract_content",
             return_value=("", "metadata"),
         ):
-            command.process_chunks_batch([mock_entity])
+            with patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter:
+                mock_qs = Mock()
+                mock_qs.values_list.return_value = []
+                mock_chunk_filter.return_value = mock_qs
+                command.process_chunks_batch([mock_entity])
 
         mock_split_text.assert_called_once_with("metadata\n\n")
         mock_bulk_save.assert_called_once()
 
+    @patch("apps.ai.common.base.chunk_command.ContentType.objects.get_for_model")
+    @patch("apps.ai.common.base.chunk_command.Context.objects.filter")
+    @patch("apps.ai.models.chunk.Chunk.split_text")
+    @patch("apps.ai.common.base.chunk_command.create_chunks_and_embeddings")
+    @patch("apps.ai.models.chunk.Chunk.bulk_save")
+    def test_process_chunks_batch_with_duplicates(
+        self,
+        mock_bulk_save,
+        mock_create_chunks,
+        mock_split_text,
+        mock_context_filter,
+        mock_get_content_type,
+        command,
+        mock_entity,
+        mock_context,
+        mock_content_type,
+        mock_chunks,
+    ):
+        """Test that duplicate chunk texts are filtered out before processing."""
+        mock_get_content_type.return_value = mock_content_type
+        mock_context_filter.return_value.first.return_value = mock_context
+        mock_split_text.return_value = ["chunk1", "chunk2", "chunk1", "chunk3", "chunk2"]
+        mock_create_chunks.return_value = mock_chunks
+        command.openai_client = Mock()
+
+        with patch.object(command.stdout, "write"):
+            result = command.process_chunks_batch([mock_entity])
+
+        assert result == 1
+        mock_split_text.assert_called_once()
+        _, kwargs = mock_create_chunks.call_args
+        assert set(kwargs["chunk_texts"]) == {"chunk1", "chunk2", "chunk3"}
+        assert kwargs["context"] == mock_context
+        assert kwargs["openai_client"] == command.openai_client
+        assert kwargs["save"] is False
+        mock_bulk_save.assert_called_once_with(mock_chunks)
+
     def test_process_chunks_batch_whitespace_only_content(
         self, command, mock_entity, mock_context, mock_content_type
     ):
diff --git a/backend/tests/apps/ai/common/base/context_command_test.py b/backend/tests/apps/ai/common/base/context_command_test.py
index ff1d6cf9c5..7d1c9f2caf 100644
--- a/backend/tests/apps/ai/common/base/context_command_test.py
+++ b/backend/tests/apps/ai/common/base/context_command_test.py
@@ -116,7 +116,7 @@ def test_process_context_batch_success(
             entity=mock_entity,
             source="owasp_test_entity",
         )
-        mock_write.assert_called_once_with("Created context for test-key-123")
+        mock_write.assert_called_once_with("Created/updated context for test-key-123")
 
     @patch("apps.ai.common.base.context_command.Context")
     def test_process_context_batch_creation_fails(self, mock_context_class, command, mock_entity):
@@ -130,7 +130,7 @@ def test_process_context_batch_creation_fails(self, mock_context_class, command,
         mock_context_class.update_data.assert_called_once()
         mock_write.assert_called_once()
         call_args = mock_write.call_args[0][0]
-        assert "Failed to create context for test-key-123" in str(call_args)
+        assert "Failed to create/update context for test-key-123" in str(call_args)
 
     @patch("apps.ai.common.base.context_command.Context")
     def test_process_context_batch_multiple_entities(
@@ -184,9 +184,9 @@ def test_process_context_batch_mixed_success_failure(
 
         assert mock_write.call_count == 3
         write_calls = mock_write.call_args_list
-        assert "Created context for test-key-1" in str(write_calls[0])
-        assert "Failed to create context for test-key-2" in str(write_calls[1])
-        assert "Created context for test-key-3" in str(write_calls[2])
+        assert "Created/updated context for test-key-1" in str(write_calls[0])
+        assert "Failed to create/update context for test-key-2" in str(write_calls[1])
+        assert "Created/updated context for test-key-3" in str(write_calls[2])
 
     def test_process_context_batch_content_combination(self, command, mock_entity, mock_context):
         """Test that metadata and prose content are properly combined."""
@@ -261,7 +261,7 @@ def test_get_entity_key_usage(self, command, mock_context):
         with patch.object(command.stdout, "write") as mock_write:
             command.process_context_batch([entity])
 
-        mock_write.assert_called_once_with("Created context for custom-entity-key")
+        mock_write.assert_called_once_with("Created/updated context for custom-entity-key")
 
     def test_process_context_batch_empty_list(self, command):
         """Test process_context_batch with empty entity list."""
diff --git a/backend/tests/apps/ai/management/commands/ai_update_committee_context_test.py b/backend/tests/apps/ai/management/commands/ai_update_committee_context_test.py
index 6c1940dca1..1010011473 100644
--- a/backend/tests/apps/ai/management/commands/ai_update_committee_context_test.py
+++ b/backend/tests/apps/ai/management/commands/ai_update_committee_context_test.py
@@ -169,7 +169,9 @@ def test_process_context_batch_success(self, command, mock_committee):
             entity=mock_committee,
             source="owasp_committee",
         )
-        mock_write.assert_called_once_with("Created context for test-committee")
+        mock_write.assert_called_once_with(
+            "Created/updated context for test-committee"
+        )
 
     def test_process_context_batch_empty_content(self, command, mock_committee):
         """Test context batch processing with empty content."""
@@ -206,7 +208,7 @@ def test_process_context_batch_create_failure(self, command, mock_committee):
 
         assert result == 0
         mock_error.assert_called_once_with(
-            "Failed to create context for test-committee"
+            "Failed to create/update context for test-committee"
         )
         mock_write.assert_called_once_with("ERROR: Failed")
 
diff --git a/backend/tests/apps/ai/management/commands/ai_update_slack_message_context_test.py b/backend/tests/apps/ai/management/commands/ai_update_slack_message_context_test.py
index 2d6ea5fec3..b0af18f568 100644
--- a/backend/tests/apps/ai/management/commands/ai_update_slack_message_context_test.py
+++ b/backend/tests/apps/ai/management/commands/ai_update_slack_message_context_test.py
@@ -72,10 +72,9 @@ def test_add_arguments(self, command):
         parser = Mock()
         command.add_arguments(parser)
 
-        assert parser.add_argument.call_count == 6
+        assert parser.add_argument.call_count == 3
         calls = parser.add_argument.call_args_list
 
-        # First 3 calls are from parent class (BaseAICommand)
         assert calls[0][0] == ("--message-key",)
         assert calls[0][1]["type"] is str
         assert "Process only the message with this key" in calls[0][1]["help"]
@@ -86,19 +85,5 @@
 
         assert calls[2][0] == ("--batch-size",)
         assert calls[2][1]["type"] is int
-        assert calls[2][1]["default"] == 50  # Default from parent class
+        assert calls[2][1]["default"] == 50
         assert "Number of messages to process in each batch" in calls[2][1]["help"]
-
-        # Next 3 calls are from the command itself (duplicates with different defaults)
-        assert calls[3][0] == ("--message-key",)
-        assert calls[3][1]["type"] is str
-        assert "Process only the message with this key" in calls[3][1]["help"]
-
-        assert calls[4][0] == ("--all",)
-        assert calls[4][1]["action"] == "store_true"
-        assert "Process all the messages" in calls[4][1]["help"]
-
-        assert calls[5][0] == ("--batch-size",)
-        assert calls[5][1]["type"] is int
-        assert calls[5][1]["default"] == 100  # Overridden default from command
-        assert "Number of messages to process in each batch" in calls[5][1]["help"]
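For reference, a minimal standalone sketch of the deduplication behavior these tests exercise: texts returned by splitting are reduced to a unique set, and texts assumed to be already stored (the tests stub Chunk.objects.filter(...).values_list(...) to return an empty list) would be skipped before embeddings are created. The dedupe_chunk_texts helper and its arguments are hypothetical names for illustration, not part of the apps.ai codebase.

# Illustrative sketch only: dedupe_chunk_texts is a hypothetical helper,
# not the project's actual API.


def dedupe_chunk_texts(split_texts: list[str], existing_texts: set[str]) -> list[str]:
    """Return unique chunk texts that are not already persisted."""
    # set() drops in-batch duplicates; the set difference drops already-stored texts.
    return sorted(set(split_texts) - existing_texts)


if __name__ == "__main__":
    split = ["chunk1", "chunk2", "chunk1", "chunk3", "chunk2"]
    already_stored = {"chunk3"}
    print(dedupe_chunk_texts(split, already_stored))  # ['chunk1', 'chunk2']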