Fix break_chunk() appending wrong object (BUG-007)

The function was appending the entire DataChunk object instead of the individual item when processing non-string data (e.g., images). Bug: `result.append(chunk)` should have been `result.append(c)` This caused: - Type mismatches (returning DataChunk instead of MulitmodalChunk) - Potential circular references - Embedding failures for mixed content Fixed by appending the individual item `c` instead of the parent `chunk`. Updated existing test and added new test to verify behavior. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 17:22:58 +01:00 · 2025-12-19 18:27:13 +01:00 · 2025-12-19 18:27:13 +01:00 · 28bc10df92
commit 28bc10df92
parent 21dedbeb61
2 changed files with 24 additions and 5 deletions
--- a/src/memory/common/embedding.py
+++ b/src/memory/common/embedding.py
@ -49,12 +49,13 @@ def embed_chunks(
 def break_chunk(
    chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS
 ) -> list[extract.MulitmodalChunk]:
-    result = []
+    result: list[extract.MulitmodalChunk] = []
    for c in chunk.data:
        if isinstance(c, str):
            result += chunk_text(c, chunk_size, OVERLAP_TOKENS)
        else:
-            result.append(chunk)
+            # Non-string items (e.g., images) are passed through directly
            result.append(c)
    return result
--- a/tests/memory/common/test_embedding.py
+++ b/tests/memory/common/test_embedding.py
@ -153,11 +153,29 @@ def test_break_chunk_with_mixed_data_types():
    chunk = DataChunk(data=["text content", mock_image])
    result = break_chunk(chunk, chunk_size=100)
-    # Should have text chunks plus the original chunk (since it's not a string)
+    # Should have text chunks plus the image (non-string items are passed through)
    assert len(result) >= 2
    assert any(isinstance(item, str) for item in result)
-    # The original chunk should be preserved when it contains mixed data
+    # The individual non-string item (image) should be in result, not the DataChunk
-    assert chunk in result
+    assert mock_image in result
    # The DataChunk itself should NOT be in the result
    assert chunk not in result
 def test_break_chunk_preserves_non_string_items():
    """Non-string items (like images) should be preserved individually."""
    mock_image1 = Mock(spec=Image.Image)
    mock_image2 = Mock(spec=Image.Image)
    chunk = DataChunk(data=[mock_image1, "some text", mock_image2])
    result = break_chunk(chunk, chunk_size=100)
    # Both images should be in result
    assert mock_image1 in result
    assert mock_image2 in result
    # Text should be chunked
    assert "some text" in result
    # Total should be 3 items (2 images + 1 short text)
    assert len(result) == 3
 def test_embed_by_model_with_matching_chunks(mock_embed):