@@ -211,16 +211,21 @@ def test_process_chunks_batch_success(
211211 mock_create_chunks .return_value = mock_chunks
212212 command .openai_client = Mock ()
213213
214- with patch .object (command .stdout , "write" ) as mock_write :
214+ with (
215+ patch ("apps.ai.models.chunk.Chunk.objects.filter" ) as mock_chunk_filter ,
216+ patch .object (command .stdout , "write" ) as mock_write ,
217+ ):
218+ mock_qs = Mock ()
219+ mock_qs .values_list .return_value = []
220+ mock_chunk_filter .return_value = mock_qs
215221 result = command .process_chunks_batch ([mock_entity ])
216222
217223 assert result == 1
218- mock_create_chunks .assert_called_once_with (
219- chunk_texts = ["chunk1" , "chunk2" , "chunk3" ],
220- context = mock_context ,
221- openai_client = command .openai_client ,
222- save = False ,
223- )
224+ _ , kwargs = mock_create_chunks .call_args
225+ assert set (kwargs ["chunk_texts" ]) == {"chunk1" , "chunk2" , "chunk3" }
226+ assert kwargs ["context" ] == mock_context
227+ assert kwargs ["openai_client" ] == command .openai_client
228+ assert kwargs ["save" ] is False
224229 mock_bulk_save .assert_called_once_with (mock_chunks )
225230 mock_write .assert_has_calls (
226231 [
@@ -261,7 +266,13 @@ def test_process_chunks_batch_multiple_entities(
261266 mock_create_chunks .return_value = mock_chunks [:2 ]
262267 command .openai_client = Mock ()
263268
264- with patch .object (command .stdout , "write" ):
269+ with (
270+ patch ("apps.ai.models.chunk.Chunk.objects.filter" ) as mock_chunk_filter ,
271+ patch .object (command .stdout , "write" ),
272+ ):
273+ mock_qs = Mock ()
274+ mock_qs .values_list .return_value = []
275+ mock_chunk_filter .return_value = mock_qs
265276 result = command .process_chunks_batch (entities )
266277
267278 assert result == 3
@@ -325,14 +336,22 @@ def test_process_chunks_batch_content_combination(
325336 "extract_content" ,
326337 return_value = ("prose" , "metadata" ),
327338 ):
328- command .process_chunks_batch ([mock_entity ])
339+ with patch ("apps.ai.models.chunk.Chunk.objects.filter" ) as mock_chunk_filter :
340+ mock_qs = Mock ()
341+ mock_qs .values_list .return_value = []
342+ mock_chunk_filter .return_value = mock_qs
343+ command .process_chunks_batch ([mock_entity ])
329344
330345 expected_content = "metadata\n \n prose"
331346 mock_split_text .assert_called_once_with (expected_content )
332347
333348 mock_split_text .reset_mock ()
334349 with patch .object (command , "extract_content" , return_value = ("prose" , "" )):
335- command .process_chunks_batch ([mock_entity ])
350+ with patch ("apps.ai.models.chunk.Chunk.objects.filter" ) as mock_chunk_filter :
351+ mock_qs = Mock ()
352+ mock_qs .values_list .return_value = []
353+ mock_chunk_filter .return_value = mock_qs
354+ command .process_chunks_batch ([mock_entity ])
336355
337356 mock_split_text .assert_called_with ("prose" )
338357
@@ -402,11 +421,52 @@ def test_process_chunks_batch_metadata_only_content(
402421 "extract_content" ,
403422 return_value = ("" , "metadata" ),
404423 ):
405- command .process_chunks_batch ([mock_entity ])
424+ with patch ("apps.ai.models.chunk.Chunk.objects.filter" ) as mock_chunk_filter :
425+ mock_qs = Mock ()
426+ mock_qs .values_list .return_value = []
427+ mock_chunk_filter .return_value = mock_qs
428+ command .process_chunks_batch ([mock_entity ])
406429
407430 mock_split_text .assert_called_once_with ("metadata\n \n " )
408431 mock_bulk_save .assert_called_once ()
409432
@patch("apps.ai.common.base.chunk_command.ContentType.objects.get_for_model")
@patch("apps.ai.common.base.chunk_command.Context.objects.filter")
@patch("apps.ai.models.chunk.Chunk.split_text")
@patch("apps.ai.common.base.chunk_command.create_chunks_and_embeddings")
@patch("apps.ai.models.chunk.Chunk.bulk_save")
def test_process_chunks_batch_with_duplicates(
    self,
    mock_bulk_save,
    mock_create_chunks,
    mock_split_text,
    mock_context_filter,
    mock_get_content_type,
    command,
    mock_entity,
    mock_context,
    mock_content_type,
    mock_chunks,
):
    """Test that duplicate chunk texts are filtered out before processing.

    ``Chunk.split_text`` is mocked to return five texts of which only three
    are distinct. The test then checks that ``create_chunks_and_embeddings``
    is called once and receives each distinct text exactly once, and that
    the chunks it returns are handed to ``Chunk.bulk_save``.
    """
    mock_get_content_type.return_value = mock_content_type
    mock_context_filter.return_value.first.return_value = mock_context
    mock_split_text.return_value = ["chunk1", "chunk2", "chunk1", "chunk3", "chunk2"]
    mock_create_chunks.return_value = mock_chunks
    command.openai_client = Mock()

    with (
        patch("apps.ai.models.chunk.Chunk.objects.filter") as mock_chunk_filter,
        patch.object(command.stdout, "write"),
    ):
        # Stub the Chunk lookup the same way the sibling tests do, so the test
        # never reaches the real manager. NOTE(review): presumably
        # process_chunks_batch reads already-stored chunk texts through this
        # queryset -- confirm against the command implementation.
        existing_chunks_qs = Mock()
        existing_chunks_qs.values_list.return_value = []
        mock_chunk_filter.return_value = existing_chunks_qs

        result = command.process_chunks_batch([mock_entity])

    assert result == 1
    mock_split_text.assert_called_once()
    mock_create_chunks.assert_called_once()

    kwargs = mock_create_chunks.call_args.kwargs
    # Compare a sorted list rather than a set: a set comparison would also
    # pass if the duplicates had NOT been removed. Sorting keeps the check
    # independent of the order in which the texts are passed.
    assert sorted(kwargs["chunk_texts"]) == ["chunk1", "chunk2", "chunk3"]
    assert kwargs["context"] == mock_context
    assert kwargs["openai_client"] == command.openai_client
    assert kwargs["save"] is False
    mock_bulk_save.assert_called_once_with(mock_chunks)
469+
410470 def test_process_chunks_batch_whitespace_only_content (
411471 self , command , mock_entity , mock_context , mock_content_type
412472 ):
0 commit comments