better handling of errors

This commit is contained in:
Daniel O'Connell 2025-05-27 22:39:24 +02:00
parent f5c3e458d7
commit 1291ca9d08
9 changed files with 39 additions and 32 deletions

View File

@ -212,12 +212,16 @@ def process_content_item(
- Commits database transaction - Commits database transaction
- Stores vectors in Qdrant - Stores vectors in Qdrant
""" """
status = "failed"
session.add(item) session.add(item)
session.flush() session.flush()
chunks_count = embed_source_item(item) chunks_count = embed_source_item(item)
session.flush() session.flush()
if not chunks_count:
return create_task_result(item, status, content_length=getattr(item, "size", 0))
try: try:
push_to_qdrant([item], collection_name) push_to_qdrant([item], collection_name)
status = "processed" status = "processed"
@ -228,7 +232,6 @@ def process_content_item(
except Exception as e: except Exception as e:
logger.error(f"Failed to push embeddings to Qdrant: {e}") logger.error(f"Failed to push embeddings to Qdrant: {e}")
item.embed_status = "FAILED" # type: ignore item.embed_status = "FAILED" # type: ignore
status = "failed"
session.commit() session.commit()
return create_task_result(item, status, content_length=getattr(item, "size", 0)) return create_task_result(item, status, content_length=getattr(item, "size", 0))

View File

@ -1,9 +1,9 @@
from datetime import datetime
import os import os
import subprocess import subprocess
from unittest.mock import patch
import uuid import uuid
from datetime import datetime
from pathlib import Path from pathlib import Path
from unittest.mock import Mock, patch
import pytest import pytest
import qdrant_client import qdrant_client
@ -207,10 +207,13 @@ def mock_file_storage(tmp_path: Path):
chunk_storage_dir.mkdir(parents=True, exist_ok=True) chunk_storage_dir.mkdir(parents=True, exist_ok=True)
image_storage_dir = tmp_path / "images" image_storage_dir = tmp_path / "images"
image_storage_dir.mkdir(parents=True, exist_ok=True) image_storage_dir.mkdir(parents=True, exist_ok=True)
email_storage_dir = tmp_path / "emails"
email_storage_dir.mkdir(parents=True, exist_ok=True)
with ( with (
patch.object(settings, "FILE_STORAGE_DIR", tmp_path), patch.object(settings, "FILE_STORAGE_DIR", tmp_path),
patch.object(settings, "CHUNK_STORAGE_DIR", chunk_storage_dir), patch.object(settings, "CHUNK_STORAGE_DIR", chunk_storage_dir),
patch.object(settings, "WEBPAGE_STORAGE_DIR", image_storage_dir), patch.object(settings, "WEBPAGE_STORAGE_DIR", image_storage_dir),
patch.object(settings, "EMAIL_STORAGE_DIR", email_storage_dir),
): ):
yield yield
@ -226,8 +229,11 @@ def qdrant():
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def mock_voyage_client(): def mock_voyage_client():
def embeder(chunks, *args, **kwargs):
return Mock(embeddings=[[0.1] * 1024] * len(chunks))
with patch.object(voyageai, "Client", autospec=True) as mock_client: with patch.object(voyageai, "Client", autospec=True) as mock_client:
client = mock_client() client = mock_client()
client.embed.return_value.embeddings = [[0.1] * 1024] client.embed = embeder
client.multimodal_embed.return_value.embeddings = [[0.1] * 1024] client.multimodal_embed = embeder
yield client yield client

View File

@ -16,6 +16,7 @@ from memory.common.db.models import (
EmailAttachment, EmailAttachment,
BookSection, BookSection,
BlogPost, BlogPost,
Book,
) )
@ -499,9 +500,8 @@ def test_mail_message_attachments_path(sender, folder, expected_path):
sha256=b"test", content="test", sender=sender, folder=folder sha256=b"test", content="test", sender=sender, folder=folder
) )
with patch.object(settings, "FILE_STORAGE_DIR", "/tmp/storage"): result = mail_message.attachments_path
result = mail_message.attachments_path assert str(result) == f"{settings.FILE_STORAGE_DIR}/emails/{expected_path}"
assert str(result) == f"/tmp/storage/{expected_path}"
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -520,15 +520,8 @@ def test_mail_message_safe_filename(tmp_path, filename, expected):
sha256=b"test", content="test", sender="user@example.com", folder="INBOX" sha256=b"test", content="test", sender="user@example.com", folder="INBOX"
) )
with patch.object(settings, "FILE_STORAGE_DIR", tmp_path): expected = settings.FILE_STORAGE_DIR / f"emails/user_example_com/INBOX/{expected}"
result = mail_message.safe_filename(filename) assert mail_message.safe_filename(filename) == expected
# Check that the path is correct
expected_path = tmp_path / "user_example_com" / "INBOX" / expected
assert result == expected_path
# Check that the directory was created
assert result.parent.exists()
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -847,6 +840,7 @@ def test_book_section_data_chunks(pages, expected_chunks):
start_page=10, start_page=10,
end_page=10 + len(pages), end_page=10 + len(pages),
pages=pages, pages=pages,
book=Book(id=1, title="Test Book", author="Test Author"),
) )
chunks = book_section.data_chunks() chunks = book_section.data_chunks()

View File

@ -15,8 +15,8 @@ def mock_embed(mock_voyage_client):
def embed(texts, model, input_type): def embed(texts, model, input_type):
return Mock(embeddings=[next(vectors) for _ in texts]) return Mock(embeddings=[next(vectors) for _ in texts])
mock_voyage_client.embed.side_effect = embed mock_voyage_client.embed = embed
mock_voyage_client.multimodal_embed.side_effect = embed mock_voyage_client.multimodal_embed = embed
return mock_voyage_client return mock_voyage_client

View File

@ -342,7 +342,7 @@ def test_sync_comic_embedding_failure(
assert result == { assert result == {
"comic_id": 1, "comic_id": 1,
"title": "Test Comic", "title": "Test Comic",
"status": "processed", "status": "failed",
"chunks_count": 0, "chunks_count": 0,
"embed_status": "FAILED", "embed_status": "FAILED",
"content_length": 90, "content_length": 90,

View File

@ -423,7 +423,7 @@ def test_create_task_result_no_title():
[ [
("success", False, "processed", "STORED"), ("success", False, "processed", "STORED"),
("success", True, "failed", "FAILED"), ("success", True, "failed", "FAILED"),
("empty", False, "processed", "FAILED"), ("empty", False, "failed", "FAILED"),
], ],
) )
def test_process_content_item( def test_process_content_item(

View File

@ -170,7 +170,7 @@ def test_embed_sections(db_session):
@patch("memory.workers.tasks.ebook.parse_ebook") @patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_success(mock_parse, mock_ebook, db_session, tmp_path): def test_sync_book_success(mock_parse, mock_ebook, db_session, tmp_path, qdrant):
"""Test successful book synchronization.""" """Test successful book synchronization."""
book_file = tmp_path / "test.epub" book_file = tmp_path / "test.epub"
book_file.write_text("dummy content") book_file.write_text("dummy content")
@ -315,17 +315,18 @@ def test_embed_sections_uses_correct_chunk_size(db_session, mock_voyage_client):
db_session.add_all(sections) db_session.add_all(sections)
db_session.flush() db_session.flush()
return_val = Mock(embeddings=[[0.1] * 1024] * 3)
mock_voyage_client.embed = Mock(return_value=return_val)
ebook.embed_sections(sections) ebook.embed_sections(sections)
# Verify that the voyage client was called with the full large content # Verify that the voyage client was called with the full large content
# Should be called 3 times: once for section content, twice for pages # Should be called 3 times: once for section content, twice for pages
assert mock_voyage_client.embed.call_count == 3 assert mock_voyage_client.embed.call_count == 1
# Check that the full content was passed to the embedding function # Check that the full content was passed to the embedding function
calls = mock_voyage_client.embed.call_args_list texts = mock_voyage_client.embed.call_args[0][0]
texts = [c[0][0] for c in calls]
assert texts == [ assert texts == [
[large_page_1.strip()], large_page_1.strip(),
[large_page_2.strip()], large_page_2.strip(),
[large_section_content.strip()], large_section_content.strip(),
] ]

View File

@ -283,9 +283,7 @@ def test_reingest_missing_chunks(db_session, qdrant, batch_size):
qd.upsert_vectors(qdrant, chunk.source.modality, [str(chunk.id)], [[1] * 1024]) qd.upsert_vectors(qdrant, chunk.source.modality, [str(chunk.id)], [[1] * 1024])
with patch.object(reingest_chunk, "delay", reingest_chunk): with patch.object(reingest_chunk, "delay", reingest_chunk):
with patch.object(settings, "CHUNK_REINGEST_SINCE_MINUTES", 60): result = reingest_missing_chunks(batch_size=batch_size, minutes_ago=60)
with patch.object(embedding, "embed_chunks", return_value=[[1] * 1024]):
result = reingest_missing_chunks(batch_size=batch_size)
assert result == { assert result == {
"photo": {"correct": 10, "missing": 10, "total": 20}, "photo": {"correct": 10, "missing": 10, "total": 20},

View File

@ -102,6 +102,7 @@ def test_process_attachment_disk(attachment_size, max_inline_size, message_id):
assert not cast(str, result.content) assert not cast(str, result.content)
assert cast(str, result.filename) == str( assert cast(str, result.filename) == str(
settings.FILE_STORAGE_DIR settings.FILE_STORAGE_DIR
/ "emails"
/ "sender_example_com" / "sender_example_com"
/ "INBOX" / "INBOX"
/ "test_with_special_chars.txt" / "test_with_special_chars.txt"
@ -183,7 +184,11 @@ def test_process_attachments_mixed():
# Verify large attachment has a path # Verify large attachment has a path
assert cast(str, results[1].filename) == str( assert cast(str, results[1].filename) == str(
settings.FILE_STORAGE_DIR / "sender_example_com" / "INBOX" / "large.txt" settings.FILE_STORAGE_DIR
/ "emails"
/ "sender_example_com"
/ "INBOX"
/ "large.txt"
) )