Fix break_chunk() appending wrong object (BUG-007)

The function was appending the entire DataChunk object instead of
the individual item when processing non-string data (e.g., images).

Bug: `result.append(chunk)` should have been `result.append(c)`

This caused:
- Type mismatches (returning DataChunk instead of MulitmodalChunk)
- Potential circular references
- Embedding failures for mixed content

Fixed by appending the individual item `c` instead of the parent `chunk`.
Updated existing test and added new test to verify behavior.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Daniel O'Connell 2025-12-19 18:27:13 +01:00
parent 21dedbeb61
commit 28bc10df92
2 changed files with 24 additions and 5 deletions

View File

@ -49,12 +49,13 @@ def embed_chunks(
def break_chunk( def break_chunk(
chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS
) -> list[extract.MulitmodalChunk]: ) -> list[extract.MulitmodalChunk]:
result = [] result: list[extract.MulitmodalChunk] = []
for c in chunk.data: for c in chunk.data:
if isinstance(c, str): if isinstance(c, str):
result += chunk_text(c, chunk_size, OVERLAP_TOKENS) result += chunk_text(c, chunk_size, OVERLAP_TOKENS)
else: else:
result.append(chunk) # Non-string items (e.g., images) are passed through directly
result.append(c)
return result return result

View File

@ -153,11 +153,29 @@ def test_break_chunk_with_mixed_data_types():
chunk = DataChunk(data=["text content", mock_image]) chunk = DataChunk(data=["text content", mock_image])
result = break_chunk(chunk, chunk_size=100) result = break_chunk(chunk, chunk_size=100)
# Should have text chunks plus the original chunk (since it's not a string) # Should have text chunks plus the image (non-string items are passed through)
assert len(result) >= 2 assert len(result) >= 2
assert any(isinstance(item, str) for item in result) assert any(isinstance(item, str) for item in result)
# The original chunk should be preserved when it contains mixed data # The individual non-string item (image) should be in result, not the DataChunk
assert chunk in result assert mock_image in result
# The DataChunk itself should NOT be in the result
assert chunk not in result
def test_break_chunk_preserves_non_string_items():
"""Non-string items (like images) should be preserved individually."""
mock_image1 = Mock(spec=Image.Image)
mock_image2 = Mock(spec=Image.Image)
chunk = DataChunk(data=[mock_image1, "some text", mock_image2])
result = break_chunk(chunk, chunk_size=100)
# Both images should be in result
assert mock_image1 in result
assert mock_image2 in result
# Text should be chunked
assert "some text" in result
# Total should be 3 items (2 images + 1 short text)
assert len(result) == 3
def test_embed_by_model_with_matching_chunks(mock_embed): def test_embed_by_model_with_matching_chunks(mock_embed):