mirror of
https://github.com/mruwnik/memory.git
synced 2026-01-02 09:12:58 +01:00
Fix break_chunk() appending wrong object (BUG-007)
The function was appending the entire DataChunk object instead of the individual item when processing non-string data (e.g., images).

Bug: `result.append(chunk)` should have been `result.append(c)`.

This caused:
- Type mismatches (returning DataChunk instead of MulitmodalChunk)
- Potential circular references
- Embedding failures for mixed content

Fixed by appending the individual item `c` instead of the parent `chunk`. Updated the existing test and added a new test to verify the behavior.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
21dedbeb61
commit
28bc10df92
@ -49,12 +49,13 @@ def embed_chunks(
|
|||||||
def break_chunk(
|
def break_chunk(
|
||||||
chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS
|
chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS
|
||||||
) -> list[extract.MulitmodalChunk]:
|
) -> list[extract.MulitmodalChunk]:
|
||||||
result = []
|
result: list[extract.MulitmodalChunk] = []
|
||||||
for c in chunk.data:
|
for c in chunk.data:
|
||||||
if isinstance(c, str):
|
if isinstance(c, str):
|
||||||
result += chunk_text(c, chunk_size, OVERLAP_TOKENS)
|
result += chunk_text(c, chunk_size, OVERLAP_TOKENS)
|
||||||
else:
|
else:
|
||||||
result.append(chunk)
|
# Non-string items (e.g., images) are passed through directly
|
||||||
|
result.append(c)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -153,11 +153,29 @@ def test_break_chunk_with_mixed_data_types():
|
|||||||
chunk = DataChunk(data=["text content", mock_image])
|
chunk = DataChunk(data=["text content", mock_image])
|
||||||
result = break_chunk(chunk, chunk_size=100)
|
result = break_chunk(chunk, chunk_size=100)
|
||||||
|
|
||||||
# Should have text chunks plus the original chunk (since it's not a string)
|
# Should have text chunks plus the image (non-string items are passed through)
|
||||||
assert len(result) >= 2
|
assert len(result) >= 2
|
||||||
assert any(isinstance(item, str) for item in result)
|
assert any(isinstance(item, str) for item in result)
|
||||||
# The original chunk should be preserved when it contains mixed data
|
# The individual non-string item (image) should be in result, not the DataChunk
|
||||||
assert chunk in result
|
assert mock_image in result
|
||||||
|
# The DataChunk itself should NOT be in the result
|
||||||
|
assert chunk not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_break_chunk_preserves_non_string_items():
|
||||||
|
"""Non-string items (like images) should be preserved individually."""
|
||||||
|
mock_image1 = Mock(spec=Image.Image)
|
||||||
|
mock_image2 = Mock(spec=Image.Image)
|
||||||
|
chunk = DataChunk(data=[mock_image1, "some text", mock_image2])
|
||||||
|
result = break_chunk(chunk, chunk_size=100)
|
||||||
|
|
||||||
|
# Both images should be in result
|
||||||
|
assert mock_image1 in result
|
||||||
|
assert mock_image2 in result
|
||||||
|
# Text should be chunked
|
||||||
|
assert "some text" in result
|
||||||
|
# Total should be 3 items (2 images + 1 short text)
|
||||||
|
assert len(result) == 3
|
||||||
|
|
||||||
|
|
||||||
def test_embed_by_model_with_matching_chunks(mock_embed):
|
def test_embed_by_model_with_matching_chunks(mock_embed):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user