OWASP · arkid15r · Oct 8, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 6, 2025
@@ -57,13 +57,12 @@ def process_chunks_batch(self, entities: list[Model]) -> int:
                     self.stdout.write(f"No content to chunk for {self.entity_name} {entity_key}")
                     continue
 
-                chunk_texts = Chunk.split_text(full_content)
-                if not chunk_texts:
+                if not (unique_chunk_texts := set(Chunk.split_text(full_content))):
                     self.stdout.write(f"No chunks created for {self.entity_name} {entity_key}")
                     continue
 
                 if chunks := create_chunks_and_embeddings(
-                    chunk_texts=chunk_texts,
+                    chunk_texts=list(unique_chunk_texts),
                     context=context,
                     openai_client=self.openai_client,
                     save=False,

@@ -35,10 +35,12 @@ def process_context_batch(self, entities: list[Model]) -> int:
             ):
                 processed += 1
                 entity_key = self.get_entity_key(entity)
-                self.stdout.write(f"Created context for {entity_key}")
+                self.stdout.write(f"Created/updated context for {entity_key}")
             else:
                 entity_key = self.get_entity_key(entity)
-                self.stdout.write(self.style.ERROR(f"Failed to create context for {entity_key}"))
+                self.stdout.write(
+                    self.style.ERROR(f"Failed to create/update context for {entity_key}")
+                )
 
         return processed
 

@@ -10,26 +10,6 @@ class Command(BaseContextCommand):
     key_field_name = "slack_message_id"
     model_class = Message
 
-    def add_arguments(self, parser):
-        """Override to use different default batch size for messages."""
-        super().add_arguments(parser)
-        parser.add_argument(
-            "--message-key",
-            type=str,
-            help="Process only the message with this key",
-        )
-        parser.add_argument(
-            "--all",
-            action="store_true",
-            help="Process all the messages",
-        )
-        parser.add_argument(
-            "--batch-size",
-            type=int,
-            default=100,
-            help="Number of messages to process in each batch",
-        )
-
     def extract_content(self, entity: Message) -> tuple[str, str]:
         """Extract content from the message."""
         return entity.cleaned_text or "", ""

@@ -211,16 +211,21 @@ def test_process_chunks_batch_success(
         mock_create_chunks.return_value = mock_chunks
         command.openai_client = Mock()
 
-        with patch.object(command.stdout, "write") as mock_write:
+        with (
+            patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter,
+            patch.object(command.stdout, "write") as mock_write,
+        ):
+            mock_qs = Mock()
+            mock_qs.values_list.return_value = []
+            mock_chunk_filter.return_value = mock_qs
             result = command.process_chunks_batch([mock_entity])
 
             assert result == 1
-            mock_create_chunks.assert_called_once_with(
-                chunk_texts=["chunk1", "chunk2", "chunk3"],
-                context=mock_context,
-                openai_client=command.openai_client,
-                save=False,
-            )
+            _, kwargs = mock_create_chunks.call_args
+            assert set(kwargs["chunk_texts"]) == {"chunk1", "chunk2", "chunk3"}
+            assert kwargs["context"] == mock_context
+            assert kwargs["openai_client"] == command.openai_client
+            assert kwargs["save"] is False
             mock_bulk_save.assert_called_once_with(mock_chunks)
             mock_write.assert_has_calls(
                 [
@@ -261,7 +266,13 @@ def test_process_chunks_batch_multiple_entities(
         mock_create_chunks.return_value = mock_chunks[:2]
         command.openai_client = Mock()
 
-        with patch.object(command.stdout, "write"):
+        with (
+            patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter,
+            patch.object(command.stdout, "write"),
+        ):
+            mock_qs = Mock()
+            mock_qs.values_list.return_value = []
+            mock_chunk_filter.return_value = mock_qs
             result = command.process_chunks_batch(entities)
 
             assert result == 3
@@ -325,14 +336,22 @@ def test_process_chunks_batch_content_combination(
                 "extract_content",
                 return_value=("prose", "metadata"),
             ):
-                command.process_chunks_batch([mock_entity])
+                with patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter:
+                    mock_qs = Mock()
+                    mock_qs.values_list.return_value = []
+                    mock_chunk_filter.return_value = mock_qs
+                    command.process_chunks_batch([mock_entity])
 
                 expected_content = "metadata\n\nprose"
                 mock_split_text.assert_called_once_with(expected_content)
 
             mock_split_text.reset_mock()
             with patch.object(command, "extract_content", return_value=("prose", "")):
-                command.process_chunks_batch([mock_entity])
+                with patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter:
+                    mock_qs = Mock()
+                    mock_qs.values_list.return_value = []
+                    mock_chunk_filter.return_value = mock_qs
+                    command.process_chunks_batch([mock_entity])
 
                 mock_split_text.assert_called_with("prose")
 
@@ -402,11 +421,52 @@ def test_process_chunks_batch_metadata_only_content(
                 "extract_content",
                 return_value=("", "metadata"),
             ):
-                command.process_chunks_batch([mock_entity])
+                with patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter:
+                    mock_qs = Mock()
+                    mock_qs.values_list.return_value = []
+                    mock_chunk_filter.return_value = mock_qs
+                    command.process_chunks_batch([mock_entity])
 
                 mock_split_text.assert_called_once_with("metadata\n\n")
                 mock_bulk_save.assert_called_once()
 
+    @patch("apps.ai.common.base.chunk_command.ContentType.objects.get_for_model")
+    @patch("apps.ai.common.base.chunk_command.Context.objects.filter")
+    @patch("apps.ai.models.chunk.Chunk.split_text")
+    @patch("apps.ai.common.base.chunk_command.create_chunks_and_embeddings")
+    @patch("apps.ai.models.chunk.Chunk.bulk_save")
+    def test_process_chunks_batch_with_duplicates(
+        self,
+        mock_bulk_save,
+        mock_create_chunks,
+        mock_split_text,
+        mock_context_filter,
+        mock_get_content_type,
+        command,
+        mock_entity,
+        mock_context,
+        mock_content_type,
+        mock_chunks,
+    ):
+        """Test that duplicate chunk texts are filtered out before processing."""
+        mock_get_content_type.return_value = mock_content_type
+        mock_context_filter.return_value.first.return_value = mock_context
+        mock_split_text.return_value = ["chunk1", "chunk2", "chunk1", "chunk3", "chunk2"]
+        mock_create_chunks.return_value = mock_chunks
+        command.openai_client = Mock()
+
+        with patch.object(command.stdout, "write"):
+            result = command.process_chunks_batch([mock_entity])
+
+            assert result == 1
+            mock_split_text.assert_called_once()
+            _, kwargs = mock_create_chunks.call_args
+            assert set(kwargs["chunk_texts"]) == {"chunk1", "chunk2", "chunk3"}
+            assert kwargs["context"] == mock_context
+            assert kwargs["openai_client"] == command.openai_client
+            assert kwargs["save"] is False
+            mock_bulk_save.assert_called_once_with(mock_chunks)
+
     def test_process_chunks_batch_whitespace_only_content(
         self, command, mock_entity, mock_context, mock_content_type
     ):

@@ -116,7 +116,7 @@ def test_process_context_batch_success(
                 entity=mock_entity,
                 source="owasp_test_entity",
             )
-            mock_write.assert_called_once_with("Created context for test-key-123")
+            mock_write.assert_called_once_with("Created/updated context for test-key-123")
 
     @patch("apps.ai.common.base.context_command.Context")
     def test_process_context_batch_creation_fails(self, mock_context_class, command, mock_entity):
@@ -130,7 +130,7 @@ def test_process_context_batch_creation_fails(self, mock_context_class, command,
             mock_context_class.update_data.assert_called_once()
             mock_write.assert_called_once()
             call_args = mock_write.call_args[0][0]
-            assert "Failed to create context for test-key-123" in str(call_args)
+            assert "Failed to create/update context for test-key-123" in str(call_args)
 
     @patch("apps.ai.common.base.context_command.Context")
     def test_process_context_batch_multiple_entities(
@@ -184,9 +184,9 @@ def test_process_context_batch_mixed_success_failure(
             assert mock_write.call_count == 3
 
             write_calls = mock_write.call_args_list
-            assert "Created context for test-key-1" in str(write_calls[0])
-            assert "Failed to create context for test-key-2" in str(write_calls[1])
-            assert "Created context for test-key-3" in str(write_calls[2])
+            assert "Created/updated context for test-key-1" in str(write_calls[0])
+            assert "Failed to create/update context for test-key-2" in str(write_calls[1])
+            assert "Created/updated context for test-key-3" in str(write_calls[2])
 
     def test_process_context_batch_content_combination(self, command, mock_entity, mock_context):
         """Test that metadata and prose content are properly combined."""
@@ -261,7 +261,7 @@ def test_get_entity_key_usage(self, command, mock_context):
             with patch.object(command.stdout, "write") as mock_write:
                 command.process_context_batch([entity])
 
-                mock_write.assert_called_once_with("Created context for custom-entity-key")
+                mock_write.assert_called_once_with("Created/updated context for custom-entity-key")
 
     def test_process_context_batch_empty_list(self, command):
         """Test process_context_batch with empty entity list."""

@@ -169,7 +169,9 @@ def test_process_context_batch_success(self, command, mock_committee):
                             entity=mock_committee,
                             source="owasp_committee",
                         )
-                        mock_write.assert_called_once_with("Created context for test-committee")
+                        mock_write.assert_called_once_with(
+                            "Created/updated context for test-committee"
+                        )
 
     def test_process_context_batch_empty_content(self, command, mock_committee):
         """Test context batch processing with empty content."""
@@ -206,7 +208,7 @@ def test_process_context_batch_create_failure(self, command, mock_committee):
 
                         assert result == 0
                         mock_error.assert_called_once_with(
-                            "Failed to create context for test-committee"
+                            "Failed to create/update context for test-committee"
                         )
                         mock_write.assert_called_once_with("ERROR: Failed")
 

@@ -72,10 +72,9 @@ def test_add_arguments(self, command):
         parser = Mock()
         command.add_arguments(parser)
 
-        assert parser.add_argument.call_count == 6
+        assert parser.add_argument.call_count == 3
         calls = parser.add_argument.call_args_list
 
-        # First 3 calls are from parent class (BaseAICommand)
         assert calls[0][0] == ("--message-key",)
         assert calls[0][1]["type"] is str
         assert "Process only the message with this key" in calls[0][1]["help"]
@@ -86,19 +85,5 @@ def test_add_arguments(self, command):
 
         assert calls[2][0] == ("--batch-size",)
         assert calls[2][1]["type"] is int
-        assert calls[2][1]["default"] == 50  # Default from parent class
+        assert calls[2][1]["default"] == 50
         assert "Number of messages to process in each batch" in calls[2][1]["help"]
-
-        # Next 3 calls are from the command itself (duplicates with different defaults)
-        assert calls[3][0] == ("--message-key",)
-        assert calls[3][1]["type"] is str
-        assert "Process only the message with this key" in calls[3][1]["help"]
-
-        assert calls[4][0] == ("--all",)
-        assert calls[4][1]["action"] == "store_true"
-        assert "Process all the messages" in calls[4][1]["help"]
-
-        assert calls[5][0] == ("--batch-size",)
-        assert calls[5][1]["type"] is int
-        assert calls[5][1]["default"] == 100  # Overridden default from command
-        assert "Number of messages to process in each batch" in calls[5][1]["help"]