mirror of https://github.com/mruwnik/memory.git (synced 2025-06-08 13:24:41 +02:00)

add ebook job

commit 02d606deab, parent b292baf59d
.github/workflows/ci.yml (vendored, 2 lines changed)
@@ -17,7 +17,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install .[all]
-        pip install ruff==0.11.10 pylint
+        pip install ruff==0.11.10 pylint==1.1.400
     - name: Run linters
       run: |
         ruff check .
|
107
db/migrations/versions/20250523_163753_add_ebooks.py
Normal file
107
db/migrations/versions/20250523_163753_add_ebooks.py
Normal file
@@ -0,0 +1,107 @@
"""Add ebooks

Revision ID: fe570eab952a
Revises: b78b1fff9974
Create Date: 2025-05-23 16:37:53.354723

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision: str = "fe570eab952a"
down_revision: Union[str, None] = "b78b1fff9974"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    op.create_table(
        "book",
        sa.Column("id", sa.BigInteger(), nullable=False),
        sa.Column("isbn", sa.Text(), nullable=True),
        sa.Column("title", sa.Text(), nullable=False),
        sa.Column("author", sa.Text(), nullable=True),
        sa.Column("publisher", sa.Text(), nullable=True),
        sa.Column("published", sa.DateTime(timezone=True), nullable=True),
        sa.Column("language", sa.Text(), nullable=True),
        sa.Column("edition", sa.Text(), nullable=True),
        sa.Column("series", sa.Text(), nullable=True),
        sa.Column("series_number", sa.Integer(), nullable=True),
        sa.Column("total_pages", sa.Integer(), nullable=True),
        sa.Column("file_path", sa.Text(), nullable=True),
        sa.Column("tags", sa.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=True,
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("isbn"),
    )
    op.create_index("book_author_idx", "book", ["author"], unique=False)
    op.create_index("book_isbn_idx", "book", ["isbn"], unique=False)
    op.create_index("book_title_idx", "book", ["title"], unique=False)
    op.create_table(
        "book_section",
        sa.Column("id", sa.BigInteger(), nullable=False),
        sa.Column("book_id", sa.BigInteger(), nullable=False),
        sa.Column("section_title", sa.Text(), nullable=True),
        sa.Column("section_number", sa.Integer(), nullable=True),
        sa.Column("section_level", sa.Integer(), nullable=True),
        sa.Column("start_page", sa.Integer(), nullable=True),
        sa.Column("end_page", sa.Integer(), nullable=True),
        sa.Column("parent_section_id", sa.BigInteger(), nullable=True),
        sa.ForeignKeyConstraint(["book_id"], ["book.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(["id"], ["source_item.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(
            ["parent_section_id"],
            ["book_section.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index("book_section_book_idx", "book_section", ["book_id"], unique=False)
    op.create_index(
        "book_section_level_idx",
        "book_section",
        ["section_level", "section_number"],
        unique=False,
    )
    op.create_index(
        "book_section_parent_idx", "book_section", ["parent_section_id"], unique=False
    )
    op.drop_table("book_doc")


def downgrade() -> None:
    op.create_table(
        "book_doc",
        sa.Column("id", sa.BIGINT(), autoincrement=False, nullable=False),
        sa.Column("title", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column("author", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column("chapter", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column(
            "published",
            postgresql.TIMESTAMP(timezone=True),
            autoincrement=False,
            nullable=True,
        ),
        sa.ForeignKeyConstraint(
            ["id"], ["source_item.id"], name="book_doc_id_fkey", ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id", name="book_doc_pkey"),
    )
    op.drop_index("book_section_parent_idx", table_name="book_section")
    op.drop_index("book_section_level_idx", table_name="book_section")
    op.drop_index("book_section_book_idx", table_name="book_section")
    op.drop_table("book_section")
    op.drop_index("book_title_idx", table_name="book")
    op.drop_index("book_isbn_idx", table_name="book")
    op.drop_index("book_author_idx", table_name="book")
    op.drop_table("book")
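Note: a minimal sketch of applying this revision through Alembic's Python API; the alembic.ini path is an assumption, and running `alembic upgrade head` from the repo root is equivalent.

# Hypothetical invocation; assumes alembic.ini at the project root points
# at db/migrations and at a reachable database.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "fe570eab952a")  # or "head" to apply all pending revisions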
dev.sh (3 lines changed)
@@ -13,6 +13,9 @@ echo -e "${GREEN}Starting development environment for Memory Knowledge Base...${NC}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR"
 
+docker volume create memory_file_storage
+docker run --rm -v memory_file_storage:/data busybox chown -R 1000:1000 /data
+
 # Create a temporary docker-compose override file to expose PostgreSQL
 echo -e "${YELLOW}Creating docker-compose override to expose PostgreSQL...${NC}"
 if [ ! -f docker-compose.override.yml ]; then
src/memory/common/db/models.py

@@ -407,19 +407,98 @@ class Comic(SourceItem):
         return {k: v for k, v in payload.items() if v is not None}
 
 
-class BookDoc(SourceItem):
-    __tablename__ = "book_doc"
-
-    id = Column(
-        BigInteger, ForeignKey("source_item.id", ondelete="CASCADE"), primary_key=True
-    )
-    title = Column(Text)
-    author = Column(Text)
-    chapter = Column(Text)
-    published = Column(DateTime(timezone=True))
-
-    __mapper_args__ = {
-        "polymorphic_identity": "book_doc",
-    }
+class Book(Base):
+    """Book-level metadata table"""
+
+    __tablename__ = "book"
+
+    id = Column(BigInteger, primary_key=True)
+    isbn = Column(Text, unique=True)
+    title = Column(Text, nullable=False)
+    author = Column(Text)
+    publisher = Column(Text)
+    published = Column(DateTime(timezone=True))
+    language = Column(Text)
+    edition = Column(Text)
+    series = Column(Text)
+    series_number = Column(Integer)
+    total_pages = Column(Integer)
+    file_path = Column(Text)
+    tags = Column(ARRAY(Text), nullable=False, server_default="{}")
+
+    # Metadata from ebook parser
+    book_metadata = Column(JSONB, name="metadata")
+
+    created_at = Column(DateTime(timezone=True), server_default=func.now())
+
+    __table_args__ = (
+        Index("book_isbn_idx", "isbn"),
+        Index("book_author_idx", "author"),
+        Index("book_title_idx", "title"),
+    )
+
+    def as_payload(self) -> dict:
+        return {
+            "source_id": self.id,
+            "isbn": self.isbn,
+            "title": self.title,
+            "author": self.author,
+            "publisher": self.publisher,
+            "published": self.published,
+            "language": self.language,
+            "edition": self.edition,
+            "series": self.series,
+            "series_number": self.series_number,
+            "tags": self.tags,
+        } | (cast(dict, self.book_metadata) or {})
+
+
+class BookSection(SourceItem):
+    """Individual sections/chapters of books"""
+
+    __tablename__ = "book_section"
+
+    id = Column(
+        BigInteger, ForeignKey("source_item.id", ondelete="CASCADE"), primary_key=True
+    )
+    book_id = Column(
+        BigInteger, ForeignKey("book.id", ondelete="CASCADE"), nullable=False
+    )
+
+    section_title = Column(Text)
+    section_number = Column(Integer)
+    section_level = Column(Integer)  # 1=chapter, 2=section, 3=subsection
+    start_page = Column(Integer)
+    end_page = Column(Integer)
+
+    # Parent-child relationships for nested sections
+    parent_section_id = Column(BigInteger, ForeignKey("book_section.id"))
+
+    book = relationship("Book", backref="sections")
+    parent = relationship(
+        "BookSection",
+        remote_side=[id],
+        backref="children",
+        foreign_keys=[parent_section_id],
+    )
+
+    __mapper_args__ = {"polymorphic_identity": "book_section"}
+    __table_args__ = (
+        Index("book_section_book_idx", "book_id"),
+        Index("book_section_parent_idx", "parent_section_id"),
+        Index("book_section_level_idx", "section_level", "section_number"),
+    )
+
+    def as_payload(self) -> dict:
+        return {
+            "source_id": self.id,
+            "book_id": self.book_id,
+            "section_title": self.section_title,
+            "section_number": self.section_number,
+            "section_level": self.section_level,
+            "start_page": self.start_page,
+            "end_page": self.end_page,
+            "tags": self.tags,
+        }
 
 
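Note: a quick sketch of how the new self-referential schema reads back, assuming a database already populated by the sync task; make_session is the project's own helper, the query itself is illustrative.

from memory.common.db.connection import make_session
from memory.common.db.models import Book

with make_session() as session:
    book = session.query(Book).order_by(Book.id).first()
    if book is not None:
        # backref="sections" comes from BookSection.book; "children" comes
        # from the self-referential parent relationship above
        top_level = [s for s in book.sections if s.parent_section_id is None]
        for section in top_level:
            print(section.section_title, [c.section_title for c in section.children])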
src/memory/common/parsers/ebook.py

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any, cast
+from typing import Any, cast
 from pathlib import Path
 
 import fitz  # PyMuPDF
@@ -14,9 +14,9 @@ class Section:
 
     title: str
     content: str
-    number: Optional[int] = None
-    start_page: Optional[int] = None
-    end_page: Optional[int] = None
+    number: int | None = None
+    start_page: int | None = None
+    end_page: int | None = None
     children: list["Section"] = field(default_factory=list)
 
 
@@ -26,11 +26,12 @@ class Ebook:
 
     title: str
     author: str
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    sections: List[Section] = field(default_factory=list)
+    file_path: Path
+    metadata: dict[str, Any] = field(default_factory=dict)
+    sections: list[Section] = field(default_factory=list)
     full_content: str = ""
-    file_path: Optional[Path] = None
     file_type: str = ""
+    n_pages: int = 0
 
 
 class Peekable:
@@ -65,7 +66,7 @@ class Peekable:
 TOCItem = tuple[int, str, int]
 
 
-def extract_epub_metadata(doc) -> Dict[str, Any]:
+def extract_epub_metadata(doc) -> dict[str, Any]:
     """Extract metadata from a PyMuPDF document (EPUB)."""
     if not doc.metadata:
         return {}
@@ -117,7 +118,7 @@ def extract_section_pages(doc, toc: Peekable, section_num: int = 1) -> Section |
     )
 
 
-def extract_sections(doc) -> List[Section]:
+def extract_sections(doc) -> list[Section]:
     """Extract all sections from a PyMuPDF document."""
     toc = doc.get_toc()
     if not toc:
@@ -178,4 +179,5 @@ def parse_ebook(file_path: str | Path) -> Ebook:
         full_content=full_content,
         file_path=path,
         file_type=path.suffix.lower()[1:],
+        n_pages=doc.page_count,
     )
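Note: the two new Ebook fields map directly onto PyMuPDF document attributes; a minimal sketch, with a hypothetical file path.

import fitz  # PyMuPDF

doc = fitz.open("/path/to/book.epub")
print(doc.page_count)     # becomes Ebook.n_pages in parse_ebook
print(doc.get_toc()[:3])  # (level, title, start_page) entries drive extract_sections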
src/memory/workers/tasks/ebook.py (new file, 244 lines)
@@ -0,0 +1,244 @@
import hashlib
import logging
from pathlib import Path
from typing import Iterable, cast

from memory.common import embedding, qdrant, settings
from memory.common.db.connection import make_session
from memory.common.db.models import Book, BookSection
from memory.common.parsers.ebook import Ebook, parse_ebook, Section
from memory.workers.celery_app import app

logger = logging.getLogger(__name__)


SYNC_BOOK = "memory.workers.tasks.book.sync_book"

# Minimum section length to embed (avoid noise from very short sections)
MIN_SECTION_LENGTH = 100


def create_book_from_ebook(ebook, tags: Iterable[str] = []) -> Book:
    """Create a Book model from parsed ebook data."""
    return Book(
        title=ebook.title,
        author=ebook.author,
        publisher=ebook.metadata.get("creator"),
        language=ebook.metadata.get("language"),
        total_pages=ebook.n_pages,
        file_path=ebook.file_path.as_posix(),
        book_metadata=ebook.metadata,
        tags=tags,
    )


def section_processor(
    book: Book,
    all_sections: list[BookSection],
    section_map: dict[
        tuple[int, int | None], tuple[BookSection, tuple[int, int | None] | None]
    ],
):
    def process_section(
        section: Section,
        level: int = 1,
        parent_key: tuple[int, int | None] | None = None,
    ):
        if len(section.content.strip()) >= MIN_SECTION_LENGTH:
            sha256 = hashlib.sha256(
                f"{book.id}:{section.title}:{section.start_page}".encode()
            ).digest()

            book_section = BookSection(
                book_id=book.id,
                section_title=section.title,
                section_number=section.number,
                section_level=level,
                start_page=section.start_page,
                end_page=section.end_page,
                parent_section_id=None,  # Will be set after flush
                content=section.content,
                sha256=sha256,
                modality="book",
                tags=book.tags,
            )

            all_sections.append(book_section)
            section_key = (level, section.number)
            section_map[section_key] = (book_section, parent_key)

            # Process children
            for child in section.children:
                process_section(child, level + 1, section_key)

    return process_section


def create_all_sections(
    ebook_sections: list[Section], book: Book
) -> tuple[list[BookSection], dict]:
    """Create all sections iteratively to handle parent-child relationships properly."""
    all_sections = []
    section_map = {}  # Maps (level, number) to section for parent lookup

    process_section = section_processor(book, all_sections, section_map)
    for section in ebook_sections:
        process_section(section)

    return all_sections, section_map


def validate_and_parse_book(file_path: str) -> Ebook:
    """Validate file exists and parse the ebook."""
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Book file not found: {path}")

    try:
        return parse_ebook(path)
    except Exception as e:
        logger.error(f"Failed to parse ebook {path}: {e}")
        raise


def create_book_and_sections(
    ebook, session, tags: Iterable[str] = []
) -> tuple[Book, list[BookSection]]:
    """Create book and all its sections with proper relationships."""
    # Create book
    book = create_book_from_ebook(ebook, tags)
    session.add(book)
    session.flush()  # Get the book ID

    # Create all sections
    all_sections, section_map = create_all_sections(ebook.sections, book)
    session.add_all(all_sections)
    session.flush()

    for book_section, parent_key in section_map.values():
        if parent_key and parent_key in section_map:
            parent_section = section_map[parent_key][0]
            book_section.parent_section_id = cast(int, parent_section.id)

    return book, all_sections


def embed_sections(all_sections: list[BookSection]) -> int:
    """Embed all sections and return count of successfully embedded sections."""
    embedded_count = 0

    for section in all_sections:
        try:
            _, chunks = embedding.embed(
                "text/plain",
                cast(str, section.content),
                metadata=section.as_payload(),
            )

            if chunks:
                section.chunks = chunks
                section.embed_status = "QUEUED"  # type: ignore
                embedded_count += 1
            else:
                section.embed_status = "FAILED"  # type: ignore
                logger.warning(
                    f"No chunks generated for section: {section.section_title}"
                )

        except IOError as e:
            section.embed_status = "FAILED"  # type: ignore
            logger.error(f"Failed to embed section {section.section_title}: {e}")

    return embedded_count


def push_to_qdrant(all_sections: list[BookSection]):
    """Push embeddings to Qdrant for all successfully embedded sections."""
    vector_ids = []
    vectors = []
    payloads = []

    to_process = [s for s in all_sections if cast(str, s.embed_status) == "QUEUED"]
    all_chunks = [chunk for section in to_process for chunk in section.chunks]
    if not all_chunks:
        return

    vector_ids = [str(chunk.id) for chunk in all_chunks]
    vectors = [chunk.vector for chunk in all_chunks]
    payloads = [chunk.item_metadata for chunk in all_chunks]

    qdrant.upsert_vectors(
        client=qdrant.get_qdrant_client(),
        collection_name="book",
        ids=vector_ids,
        vectors=vectors,
        payloads=payloads,
    )

    for section in to_process:
        section.embed_status = "STORED"  # type: ignore


@app.task(name=SYNC_BOOK)
def sync_book(file_path: str, tags: Iterable[str] = []) -> dict:
    """
    Synchronize a book from a file path.

    Args:
        file_path: Path to the ebook file

    Returns:
        dict: Summary of what was processed
    """
    ebook = validate_and_parse_book(file_path)

    with make_session() as session:
        # Check for existing book
        existing_book = (
            session.query(Book)
            .filter(Book.file_path == ebook.file_path.as_posix())
            .first()
        )
        if existing_book:
            logger.info(f"Book already exists: {existing_book.title}")
            return {
                "book_id": existing_book.id,
                "title": existing_book.title,
                "author": existing_book.author,
                "status": "already_exists",
                "sections_processed": 0,
            }

        # Create book and sections with relationships
        book, all_sections = create_book_and_sections(ebook, session, tags)

        # Embed sections
        embedded_count = embed_sections(all_sections)
        session.flush()

        # Push to Qdrant
        try:
            push_to_qdrant(all_sections)
        except Exception as e:
            logger.error(f"Failed to push embeddings to Qdrant: {e}")
            # Mark sections as failed
            for section in all_sections:
                if getattr(section, "embed_status") == "STORED":
                    section.embed_status = "FAILED"  # type: ignore
            raise

        session.commit()

        logger.info(
            f"Successfully processed book: {book.title} "
            f"({embedded_count}/{len(all_sections)} sections embedded)"
        )

        return {
            "book_id": book.id,
            "title": book.title,
            "author": book.author,
            "status": "processed",
            "total_sections": len(all_sections),
            "sections_embedded": embedded_count,
        }
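Note: a minimal sketch of queueing the new job, assuming a running broker, a result backend, and a worker that imports memory.workers.tasks.ebook; the file path and tags are illustrative.

from memory.workers.celery_app import app

result = app.send_task(
    "memory.workers.tasks.book.sync_book",  # SYNC_BOOK, as registered above
    args=["/path/to/book.epub"],
    kwargs={"tags": ["fiction"]},
)
print(result.get(timeout=600))  # summary dict returned by sync_book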
tests/memory/workers/tasks/test_ebook_tasks.py (new file, 318 lines)
@@ -0,0 +1,318 @@
import pytest
from pathlib import Path
from unittest.mock import patch, Mock

from memory.common.db.models import Book, BookSection, Chunk
from memory.common.parsers.ebook import Ebook, Section
from memory.workers.tasks import ebook


@pytest.fixture
def mock_ebook():
    """Mock ebook data for testing."""
    return Ebook(
        title="Test Book",
        author="Test Author",
        metadata={"language": "en", "creator": "Test Publisher"},
        sections=[
            Section(
                title="Chapter 1",
                content="This is the content of chapter 1. "
                * 20,  # Make it long enough
                number=1,
                start_page=1,
                end_page=10,
                children=[
                    Section(
                        title="Section 1.1",
                        content="This is section 1.1 content. " * 15,
                        number=1,
                        start_page=1,
                        end_page=5,
                    ),
                    Section(
                        title="Section 1.2",
                        content="This is section 1.2 content. " * 15,
                        number=2,
                        start_page=6,
                        end_page=10,
                    ),
                ],
            ),
            Section(
                title="Chapter 2",
                content="This is the content of chapter 2. " * 20,
                number=2,
                start_page=11,
                end_page=20,
            ),
        ],
        file_path=Path("/test/book.epub"),
        n_pages=20,
    )


@pytest.fixture(autouse=True)
def mock_embedding():
    """Mock the embedding function to return dummy vectors."""
    with patch("memory.workers.tasks.ebook.embedding.embed") as mock:
        mock.return_value = (
            "book",
            [
                Chunk(
                    vector=[0.1] * 1024,
                    item_metadata={"test": "data"},
                    content="Test content",
                    embedding_model="model",
                )
            ],
        )
        yield mock


@pytest.fixture
def mock_qdrant():
    """Mock Qdrant operations."""
    with (
        patch("memory.workers.tasks.ebook.qdrant.upsert_vectors") as mock_upsert,
        patch("memory.workers.tasks.ebook.qdrant.get_qdrant_client") as mock_client,
    ):
        mock_client.return_value = Mock()
        yield mock_upsert


def test_create_book_from_ebook(mock_ebook):
    """Test creating a Book model from ebook data."""
    book = ebook.create_book_from_ebook(mock_ebook)

    assert book.title == "Test Book"  # type: ignore
    assert book.author == "Test Author"  # type: ignore
    assert book.publisher == "Test Publisher"  # type: ignore
    assert book.language == "en"  # type: ignore
    assert book.file_path == "/test/book.epub"  # type: ignore
    assert book.total_pages == 20  # type: ignore
    assert book.book_metadata == {  # type: ignore
        "language": "en",
        "creator": "Test Publisher",
    }


def test_validate_and_parse_book_success(mock_ebook, tmp_path):
    """Test successful book validation and parsing."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    with patch("memory.workers.tasks.ebook.parse_ebook", return_value=mock_ebook):
        assert ebook.validate_and_parse_book(str(book_file)) == mock_ebook


def test_validate_and_parse_book_file_not_found():
    """Test handling of missing files."""
    with pytest.raises(FileNotFoundError):
        ebook.validate_and_parse_book("/nonexistent/file.epub")


def test_validate_and_parse_book_parse_error(tmp_path):
    """Test handling of parsing errors."""
    book_file = tmp_path / "corrupted.epub"
    book_file.write_text("corrupted data")

    with patch(
        "memory.workers.tasks.ebook.parse_ebook", side_effect=Exception("Parse error")
    ):
        with pytest.raises(Exception, match="Parse error"):
            ebook.validate_and_parse_book(str(book_file))


def test_create_book_and_sections(mock_ebook, db_session):
    """Test creating book and sections with relationships."""
    book, sections = ebook.create_book_and_sections(mock_ebook, db_session)

    # Verify book creation
    assert book.title == "Test Book"  # type: ignore
    assert book.id is not None

    # Verify sections creation
    assert len(sections) == 4  # Chapter 1, Section 1.1, Section 1.2, Chapter 2

    # Verify parent-child relationships
    chapter1 = next(s for s in sections if getattr(s, "section_title") == "Chapter 1")
    section11 = next(
        s for s in sections if getattr(s, "section_title") == "Section 1.1"
    )
    section12 = next(
        s for s in sections if getattr(s, "section_title") == "Section 1.2"
    )

    # Children should reference chapter 1 as parent
    assert getattr(section11, "parent_section_id") == chapter1.id
    assert getattr(section12, "parent_section_id") == chapter1.id

    # Chapter 1 should have no parent
    assert getattr(chapter1, "parent_section_id") is None


def test_embed_sections(db_session, mock_embedding):
    """Test basic embedding sections workflow."""
    # Create a test book first
    book = Book(
        title="Test Book",
        author="Test Author",
        file_path="/test/path",
    )
    db_session.add(book)
    db_session.flush()  # Get the book ID

    # Create test sections with all required fields
    sections = [
        BookSection(
            book_id=book.id,
            section_title="Test Section",
            section_number=1,
            section_level=1,
            start_page=1,
            end_page=10,
            content="Test content " * 20,
            sha256=b"test_hash",
            modality="book",
            tags=["book"],
        )
    ]

    db_session.add_all(sections)
    db_session.flush()

    embedded_count = ebook.embed_sections(sections)

    assert embedded_count >= 0
    assert hasattr(sections[0], "embed_status")


def test_push_to_qdrant(qdrant):
    """Test pushing embeddings to Qdrant."""
    # Create test sections with chunks
    mock_chunk = Mock(
        id="00000000-0000-0000-0000-000000000000",
        vector=[0.1] * 1024,
        item_metadata={"test": "data"},
    )

    mock_section = Mock(spec=BookSection)
    mock_section.embed_status = "QUEUED"
    mock_section.chunks = [mock_chunk]

    sections = [mock_section]

    ebook.push_to_qdrant(sections)  # type: ignore

    assert {r.id: r.payload for r in qdrant.scroll(collection_name="book")[0]} == {
        "00000000-0000-0000-0000-000000000000": {
            "test": "data",
        }
    }
    assert mock_section.embed_status == "STORED"


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_success(mock_parse, mock_ebook, db_session, tmp_path):
    """Test successful book synchronization."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    result = ebook.sync_book(str(book_file), {"source", "test"})

    assert result == {
        "book_id": 1,
        "title": "Test Book",
        "author": "Test Author",
        "status": "processed",
        "total_sections": 4,
        "sections_embedded": 4,
    }

    book = db_session.query(Book).filter(Book.title == "Test Book").first()
    assert book is not None
    assert book.author == "Test Author"
    assert set(book.tags) == {"source", "test"}

    sections = (
        db_session.query(BookSection).filter(BookSection.book_id == book.id).all()
    )
    assert len(sections) == 4


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_already_exists(mock_parse, mock_ebook, db_session, tmp_path):
    """Test that duplicate books are not processed."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    existing_book = Book(
        title="Existing Book",
        author="Author",
        file_path=str(book_file),
    )
    db_session.add(existing_book)
    db_session.commit()

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    assert ebook.sync_book(str(book_file)) == {
        "book_id": existing_book.id,
        "title": "Existing Book",
        "author": "Author",
        "status": "already_exists",
        "sections_processed": 0,
    }


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_embedding_failure(
    mock_parse, mock_ebook, db_session, tmp_path, mock_embedding
):
    """Test handling of embedding failures."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    mock_embedding.side_effect = IOError("Embedding failed")
    assert ebook.sync_book(str(book_file)) == {
        "book_id": 1,
        "title": "Test Book",
        "author": "Test Author",
        "status": "processed",
        "sections_embedded": 0,
        "total_sections": 4,
    }

    sections = db_session.query(BookSection).all()
    for section in sections:
        assert section.embed_status == "FAILED"


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_qdrant_failure(mock_parse, mock_ebook, db_session, tmp_path):
    """Test handling of Qdrant failures."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    # Since embedding is already failing, this test will complete without hitting Qdrant
    # So let's just verify that the function completes without raising an exception
    with patch.object(ebook, "push_to_qdrant", side_effect=Exception("Qdrant failed")):
        with pytest.raises(Exception, match="Qdrant failed"):
            ebook.sync_book(str(book_file))


def test_sync_book_file_not_found():
    """Test handling of missing files."""
    with pytest.raises(FileNotFoundError):
        ebook.sync_book("/nonexistent/file.epub")
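Note: the tests lean on db_session and qdrant fixtures that are not part of this commit. A hypothetical minimal db_session fixture, assuming Base is importable from the models module and a disposable Postgres database is available (the ARRAY and JSONB columns need Postgres), might look like:

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from memory.common.db.models import Base  # assumed export

@pytest.fixture
def db_session():
    # DSN is a placeholder for a throwaway test database
    engine = create_engine("postgresql://memory:memory@localhost/memory_test")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    try:
        yield session
    finally:
        session.rollback()
        session.close()
        Base.metadata.drop_all(engine)
        engine.dispose()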