add ebook job

Daniel O'Connell 2025-05-24 20:21:41 +02:00
parent b292baf59d
commit 02d606deab
7 changed files with 772 additions and 19 deletions

View File

@@ -17,7 +17,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install .[all]
pip install ruff==0.11.10 pylint
pip install ruff==0.11.10 pylint==1.1.400
- name: Run linters
run: |
ruff check .

View File

@@ -0,0 +1,107 @@
"""Add ebooks
Revision ID: fe570eab952a
Revises: b78b1fff9974
Create Date: 2025-05-23 16:37:53.354723
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision: str = "fe570eab952a"
down_revision: Union[str, None] = "b78b1fff9974"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
op.create_table(
"book",
sa.Column("id", sa.BigInteger(), nullable=False),
sa.Column("isbn", sa.Text(), nullable=True),
sa.Column("title", sa.Text(), nullable=False),
sa.Column("author", sa.Text(), nullable=True),
sa.Column("publisher", sa.Text(), nullable=True),
sa.Column("published", sa.DateTime(timezone=True), nullable=True),
sa.Column("language", sa.Text(), nullable=True),
sa.Column("edition", sa.Text(), nullable=True),
sa.Column("series", sa.Text(), nullable=True),
sa.Column("series_number", sa.Integer(), nullable=True),
sa.Column("total_pages", sa.Integer(), nullable=True),
sa.Column("file_path", sa.Text(), nullable=True),
sa.Column("tags", sa.ARRAY(sa.Text()), nullable=False, server_default="{}"),
sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.text("now()"),
nullable=True,
),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint("isbn"),
)
op.create_index("book_author_idx", "book", ["author"], unique=False)
op.create_index("book_isbn_idx", "book", ["isbn"], unique=False)
op.create_index("book_title_idx", "book", ["title"], unique=False)
op.create_table(
"book_section",
sa.Column("id", sa.BigInteger(), nullable=False),
sa.Column("book_id", sa.BigInteger(), nullable=False),
sa.Column("section_title", sa.Text(), nullable=True),
sa.Column("section_number", sa.Integer(), nullable=True),
sa.Column("section_level", sa.Integer(), nullable=True),
sa.Column("start_page", sa.Integer(), nullable=True),
sa.Column("end_page", sa.Integer(), nullable=True),
sa.Column("parent_section_id", sa.BigInteger(), nullable=True),
sa.ForeignKeyConstraint(["book_id"], ["book.id"], ondelete="CASCADE"),
sa.ForeignKeyConstraint(["id"], ["source_item.id"], ondelete="CASCADE"),
sa.ForeignKeyConstraint(
["parent_section_id"],
["book_section.id"],
),
sa.PrimaryKeyConstraint("id"),
)
op.create_index("book_section_book_idx", "book_section", ["book_id"], unique=False)
op.create_index(
"book_section_level_idx",
"book_section",
["section_level", "section_number"],
unique=False,
)
op.create_index(
"book_section_parent_idx", "book_section", ["parent_section_id"], unique=False
)
op.drop_table("book_doc")
def downgrade() -> None:
op.create_table(
"book_doc",
sa.Column("id", sa.BIGINT(), autoincrement=False, nullable=False),
sa.Column("title", sa.TEXT(), autoincrement=False, nullable=True),
sa.Column("author", sa.TEXT(), autoincrement=False, nullable=True),
sa.Column("chapter", sa.TEXT(), autoincrement=False, nullable=True),
sa.Column(
"published",
postgresql.TIMESTAMP(timezone=True),
autoincrement=False,
nullable=True,
),
sa.ForeignKeyConstraint(
["id"], ["source_item.id"], name="book_doc_id_fkey", ondelete="CASCADE"
),
sa.PrimaryKeyConstraint("id", name="book_doc_pkey"),
)
op.drop_index("book_section_parent_idx", table_name="book_section")
op.drop_index("book_section_level_idx", table_name="book_section")
op.drop_index("book_section_book_idx", table_name="book_section")
op.drop_table("book_section")
op.drop_index("book_title_idx", table_name="book")
op.drop_index("book_isbn_idx", table_name="book")
op.drop_index("book_author_idx", table_name="book")
op.drop_table("book")

dev.sh
View File

@@ -13,6 +13,9 @@ echo -e "${GREEN}Starting development environment for Memory Knowledge Base...${NC}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
docker volume create memory_file_storage
docker run --rm -v memory_file_storage:/data busybox chown -R 1000:1000 /data
# Create a temporary docker-compose override file to expose PostgreSQL
echo -e "${YELLOW}Creating docker-compose override to expose PostgreSQL...${NC}"
if [ ! -f docker-compose.override.yml ]; then

View File

@@ -407,19 +407,98 @@ class Comic(SourceItem):
return {k: v for k, v in payload.items() if v is not None}
class BookDoc(SourceItem):
__tablename__ = "book_doc"
class Book(Base):
"""Book-level metadata table"""
__tablename__ = "book"
id = Column(BigInteger, primary_key=True)
isbn = Column(Text, unique=True)
title = Column(Text, nullable=False)
author = Column(Text)
publisher = Column(Text)
published = Column(DateTime(timezone=True))
language = Column(Text)
edition = Column(Text)
series = Column(Text)
series_number = Column(Integer)
total_pages = Column(Integer)
file_path = Column(Text)
tags = Column(ARRAY(Text), nullable=False, server_default="{}")
# Metadata from ebook parser
book_metadata = Column(JSONB, name="metadata")
created_at = Column(DateTime(timezone=True), server_default=func.now())
__table_args__ = (
Index("book_isbn_idx", "isbn"),
Index("book_author_idx", "author"),
Index("book_title_idx", "title"),
)
def as_payload(self) -> dict:
return {
"source_id": self.id,
"isbn": self.isbn,
"title": self.title,
"author": self.author,
"publisher": self.publisher,
"published": self.published,
"language": self.language,
"edition": self.edition,
"series": self.series,
"series_number": self.series_number,
"tags": self.tags,
} | (cast(dict, self.book_metadata) or {})
class BookSection(SourceItem):
"""Individual sections/chapters of books"""
__tablename__ = "book_section"
id = Column(
BigInteger, ForeignKey("source_item.id", ondelete="CASCADE"), primary_key=True
)
title = Column(Text)
author = Column(Text)
chapter = Column(Text)
published = Column(DateTime(timezone=True))
book_id = Column(
BigInteger, ForeignKey("book.id", ondelete="CASCADE"), nullable=False
)
__mapper_args__ = {
"polymorphic_identity": "book_doc",
section_title = Column(Text)
section_number = Column(Integer)
section_level = Column(Integer) # 1=chapter, 2=section, 3=subsection
start_page = Column(Integer)
end_page = Column(Integer)
# Parent-child relationships for nested sections
parent_section_id = Column(BigInteger, ForeignKey("book_section.id"))
book = relationship("Book", backref="sections")
parent = relationship(
"BookSection",
remote_side=[id],
backref="children",
foreign_keys=[parent_section_id],
)
__mapper_args__ = {"polymorphic_identity": "book_section"}
__table_args__ = (
Index("book_section_book_idx", "book_id"),
Index("book_section_parent_idx", "parent_section_id"),
Index("book_section_level_idx", "section_level", "section_number"),
)
def as_payload(self) -> dict:
return {
"source_id": self.id,
"book_id": self.book_id,
"section_title": self.section_title,
"section_number": self.section_number,
"section_level": self.section_level,
"start_page": self.start_page,
"end_page": self.end_page,
"tags": self.tags,
}
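
For orientation, Book.as_payload merges the explicit columns with whatever the parser stored in book_metadata, so parser-specific keys ride along into the vector payload next to the fixed fields. A hedged sketch (all values made up):

book = Book(
    id=1,
    title="Example Title",
    author="A. Author",
    tags=["fiction"],
    book_metadata={"creator": "Example Press"},
)
payload = book.as_payload()
# {'source_id': 1, 'title': 'Example Title', ..., 'creator': 'Example Press'}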

View File

@@ -1,6 +1,6 @@
import logging
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any, cast
from typing import Any, cast
from pathlib import Path
import fitz # PyMuPDF
@@ -14,9 +14,9 @@ class Section:
title: str
content: str
number: Optional[int] = None
start_page: Optional[int] = None
end_page: Optional[int] = None
number: int | None = None
start_page: int | None = None
end_page: int | None = None
children: list["Section"] = field(default_factory=list)
@@ -26,11 +26,12 @@ class Ebook:
title: str
author: str
metadata: Dict[str, Any] = field(default_factory=dict)
sections: List[Section] = field(default_factory=list)
file_path: Path
metadata: dict[str, Any] = field(default_factory=dict)
sections: list[Section] = field(default_factory=list)
full_content: str = ""
file_path: Optional[Path] = None
file_type: str = ""
n_pages: int = 0
class Peekable:
@@ -65,7 +66,7 @@ class Peekable:
TOCItem = tuple[int, str, int]
def extract_epub_metadata(doc) -> Dict[str, Any]:
def extract_epub_metadata(doc) -> dict[str, Any]:
"""Extract metadata from a PyMuPDF document (EPUB)."""
if not doc.metadata:
return {}
@@ -117,7 +118,7 @@ def extract_section_pages(doc, toc: Peekable, section_num: int = 1) -> Section | None:
)
def extract_sections(doc) -> List[Section]:
def extract_sections(doc) -> list[Section]:
"""Extract all sections from a PyMuPDF document."""
toc = doc.get_toc()
if not toc:
@@ -178,4 +179,5 @@ def parse_ebook(file_path: str | Path) -> Ebook:
full_content=full_content,
file_path=path,
file_type=path.suffix.lower()[1:],
n_pages=doc.page_count,
)
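
parse_ebook returns an Ebook whose sections form a tree via Section.children; a small sketch of walking it (the file path is hypothetical):

from memory.common.parsers.ebook import Section, parse_ebook

def print_tree(section: Section, depth: int = 0) -> None:
    # Indent by nesting level; pages may be None when the TOC lacks them
    print("  " * depth + f"{section.title} (pp. {section.start_page}-{section.end_page})")
    for child in section.children:
        print_tree(child, depth + 1)

book = parse_ebook("/path/to/book.epub")  # hypothetical path
print(f"{book.title} by {book.author}, {book.n_pages} pages")
for section in book.sections:
    print_tree(section)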

View File

@@ -0,0 +1,244 @@
import hashlib
import logging
from pathlib import Path
from typing import Iterable, cast
from memory.common import embedding, qdrant, settings
from memory.common.db.connection import make_session
from memory.common.db.models import Book, BookSection
from memory.common.parsers.ebook import Ebook, parse_ebook, Section
from memory.workers.celery_app import app
logger = logging.getLogger(__name__)
SYNC_BOOK = "memory.workers.tasks.book.sync_book"
# Minimum section length to embed (avoid noise from very short sections)
MIN_SECTION_LENGTH = 100
def create_book_from_ebook(ebook, tags: Iterable[str] = []) -> Book:
"""Create a Book model from parsed ebook data."""
return Book(
title=ebook.title,
author=ebook.author,
publisher=ebook.metadata.get("creator"),
language=ebook.metadata.get("language"),
total_pages=ebook.n_pages,
file_path=ebook.file_path.as_posix(),
book_metadata=ebook.metadata,
tags=tags,
)
def section_processor(
book: Book,
all_sections: list[BookSection],
section_map: dict[
tuple[int, int | None], tuple[BookSection, tuple[int, int | None] | None]
],
):
def process_section(
section: Section,
level: int = 1,
parent_key: tuple[int, int | None] | None = None,
):
if len(section.content.strip()) >= MIN_SECTION_LENGTH:
sha256 = hashlib.sha256(
f"{book.id}:{section.title}:{section.start_page}".encode()
).digest()
book_section = BookSection(
book_id=book.id,
section_title=section.title,
section_number=section.number,
section_level=level,
start_page=section.start_page,
end_page=section.end_page,
parent_section_id=None, # Will be set after flush
content=section.content,
sha256=sha256,
modality="book",
tags=book.tags,
)
all_sections.append(book_section)
section_key = (level, section.number)
section_map[section_key] = (book_section, parent_key)
# Process children
for child in section.children:
process_section(child, level + 1, section_key)
return process_section
def create_all_sections(
ebook_sections: list[Section], book: Book
) -> tuple[list[BookSection], dict]:
"""Create all sections iteratively to handle parent-child relationships properly."""
all_sections = []
section_map = {} # Maps (level, number) to section for parent lookup
process_section = section_processor(book, all_sections, section_map)
for section in ebook_sections:
process_section(section)
return all_sections, section_map
def validate_and_parse_book(file_path: str) -> Ebook:
"""Validate file exists and parse the ebook."""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"Book file not found: {path}")
try:
return parse_ebook(path)
except Exception as e:
logger.error(f"Failed to parse ebook {path}: {e}")
raise
def create_book_and_sections(
ebook, session, tags: Iterable[str] = []
) -> tuple[Book, list[BookSection]]:
"""Create book and all its sections with proper relationships."""
# Create book
book = create_book_from_ebook(ebook, tags)
session.add(book)
session.flush() # Get the book ID
# Create all sections
all_sections, section_map = create_all_sections(ebook.sections, book)
session.add_all(all_sections)
session.flush()
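# Second pass: child sections were created before their parents had primary
# keys, so resolve parent_section_id now that flush() has assigned IDs.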
for book_section, parent_key in section_map.values():
if parent_key and parent_key in section_map:
parent_section = section_map[parent_key][0]
book_section.parent_section_id = cast(int, parent_section.id)
return book, all_sections
def embed_sections(all_sections: list[BookSection]) -> int:
"""Embed all sections and return count of successfully embedded sections."""
embedded_count = 0
for section in all_sections:
try:
_, chunks = embedding.embed(
"text/plain",
cast(str, section.content),
metadata=section.as_payload(),
)
if chunks:
section.chunks = chunks
section.embed_status = "QUEUED" # type: ignore
embedded_count += 1
else:
section.embed_status = "FAILED" # type: ignore
logger.warning(
f"No chunks generated for section: {section.section_title}"
)
except IOError as e:
section.embed_status = "FAILED" # type: ignore
logger.error(f"Failed to embed section {section.section_title}: {e}")
return embedded_count
def push_to_qdrant(all_sections: list[BookSection]):
"""Push embeddings to Qdrant for all successfully embedded sections."""
vector_ids = []
vectors = []
payloads = []
to_process = [s for s in all_sections if cast(str, s.embed_status) == "QUEUED"]
all_chunks = [chunk for section in to_process for chunk in section.chunks]
if not all_chunks:
return
vector_ids = [str(chunk.id) for chunk in all_chunks]
vectors = [chunk.vector for chunk in all_chunks]
payloads = [chunk.item_metadata for chunk in all_chunks]
qdrant.upsert_vectors(
client=qdrant.get_qdrant_client(),
collection_name="book",
ids=vector_ids,
vectors=vectors,
payloads=payloads,
)
for section in to_process:
section.embed_status = "STORED" # type: ignore
@app.task(name=SYNC_BOOK)
def sync_book(file_path: str, tags: Iterable[str] = []) -> dict:
"""
Synchronize a book from a file path.
Args:
file_path: Path to the ebook file
tags: Optional tags to attach to the book and its sections
Returns:
dict: Summary of what was processed
"""
ebook = validate_and_parse_book(file_path)
with make_session() as session:
# Check for existing book
existing_book = (
session.query(Book)
.filter(Book.file_path == ebook.file_path.as_posix())
.first()
)
if existing_book:
logger.info(f"Book already exists: {existing_book.title}")
return {
"book_id": existing_book.id,
"title": existing_book.title,
"author": existing_book.author,
"status": "already_exists",
"sections_processed": 0,
}
# Create book and sections with relationships
book, all_sections = create_book_and_sections(ebook, session, tags)
# Embed sections
embedded_count = embed_sections(all_sections)
session.flush()
# Push to Qdrant
try:
push_to_qdrant(all_sections)
except Exception as e:
logger.error(f"Failed to push embeddings to Qdrant: {e}")
# Mark sections as failed
for section in all_sections:
if getattr(section, "embed_status") == "STORED":
section.embed_status = "FAILED" # type: ignore
raise
session.commit()
logger.info(
f"Successfully processed book: {book.title} "
f"({embedded_count}/{len(all_sections)} sections embedded)"
)
return {
"book_id": book.id,
"title": book.title,
"author": book.author,
"status": "processed",
"total_sections": len(all_sections),
"sections_embedded": embedded_count,
}
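
The task can be queued by its registered name or run inline for debugging; a sketch assuming a configured broker and running worker (path and tags are placeholders):

from memory.workers.celery_app import app
from memory.workers.tasks import ebook

# Asynchronously, through the broker, addressing the task by name:
app.send_task(ebook.SYNC_BOOK, args=["/path/to/book.epub"], kwargs={"tags": ["fiction"]})

# Or synchronously, in-process:
summary = ebook.sync_book("/path/to/book.epub", tags=["fiction"])
print(summary["status"], summary.get("sections_embedded", 0))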

View File

@@ -0,0 +1,318 @@
import pytest
from pathlib import Path
from unittest.mock import patch, Mock
from memory.common.db.models import Book, BookSection, Chunk
from memory.common.parsers.ebook import Ebook, Section
from memory.workers.tasks import ebook
@pytest.fixture
def mock_ebook():
"""Mock ebook data for testing."""
return Ebook(
title="Test Book",
author="Test Author",
metadata={"language": "en", "creator": "Test Publisher"},
sections=[
Section(
title="Chapter 1",
content="This is the content of chapter 1. "
* 20, # Make it long enough
number=1,
start_page=1,
end_page=10,
children=[
Section(
title="Section 1.1",
content="This is section 1.1 content. " * 15,
number=1,
start_page=1,
end_page=5,
),
Section(
title="Section 1.2",
content="This is section 1.2 content. " * 15,
number=2,
start_page=6,
end_page=10,
),
],
),
Section(
title="Chapter 2",
content="This is the content of chapter 2. " * 20,
number=2,
start_page=11,
end_page=20,
),
],
file_path=Path("/test/book.epub"),
n_pages=20,
)
@pytest.fixture(autouse=True)
def mock_embedding():
"""Mock the embedding function to return dummy vectors."""
with patch("memory.workers.tasks.ebook.embedding.embed") as mock:
mock.return_value = (
"book",
[
Chunk(
vector=[0.1] * 1024,
item_metadata={"test": "data"},
content="Test content",
embedding_model="model",
)
],
)
yield mock
@pytest.fixture
def mock_qdrant():
"""Mock Qdrant operations."""
with (
patch("memory.workers.tasks.ebook.qdrant.upsert_vectors") as mock_upsert,
patch("memory.workers.tasks.ebook.qdrant.get_qdrant_client") as mock_client,
):
mock_client.return_value = Mock()
yield mock_upsert
def test_create_book_from_ebook(mock_ebook):
"""Test creating a Book model from ebook data."""
book = ebook.create_book_from_ebook(mock_ebook)
assert book.title == "Test Book" # type: ignore
assert book.author == "Test Author" # type: ignore
assert book.publisher == "Test Publisher" # type: ignore
assert book.language == "en" # type: ignore
assert book.file_path == "/test/book.epub" # type: ignore
assert book.total_pages == 20 # type: ignore
assert book.book_metadata == { # type: ignore
"language": "en",
"creator": "Test Publisher",
}
def test_validate_and_parse_book_success(mock_ebook, tmp_path):
"""Test successful book validation and parsing."""
book_file = tmp_path / "test.epub"
book_file.write_text("dummy content")
with patch("memory.workers.tasks.ebook.parse_ebook", return_value=mock_ebook):
assert ebook.validate_and_parse_book(str(book_file)) == mock_ebook
def test_validate_and_parse_book_file_not_found():
"""Test handling of missing files."""
with pytest.raises(FileNotFoundError):
ebook.validate_and_parse_book("/nonexistent/file.epub")
def test_validate_and_parse_book_parse_error(tmp_path):
"""Test handling of parsing errors."""
book_file = tmp_path / "corrupted.epub"
book_file.write_text("corrupted data")
with patch(
"memory.workers.tasks.ebook.parse_ebook", side_effect=Exception("Parse error")
):
with pytest.raises(Exception, match="Parse error"):
ebook.validate_and_parse_book(str(book_file))
def test_create_book_and_sections(mock_ebook, db_session):
"""Test creating book and sections with relationships."""
book, sections = ebook.create_book_and_sections(mock_ebook, db_session)
# Verify book creation
assert book.title == "Test Book" # type: ignore
assert book.id is not None
# Verify sections creation
assert len(sections) == 4 # Chapter 1, Section 1.1, Section 1.2, Chapter 2
# Verify parent-child relationships
chapter1 = next(s for s in sections if getattr(s, "section_title") == "Chapter 1")
section11 = next(
s for s in sections if getattr(s, "section_title") == "Section 1.1"
)
section12 = next(
s for s in sections if getattr(s, "section_title") == "Section 1.2"
)
# Children should reference chapter 1 as parent
assert getattr(section11, "parent_section_id") == chapter1.id
assert getattr(section12, "parent_section_id") == chapter1.id
# Chapter 1 should have no parent
assert getattr(chapter1, "parent_section_id") is None
def test_embed_sections(db_session, mock_embedding):
"""Test basic embedding sections workflow."""
# Create a test book first
book = Book(
title="Test Book",
author="Test Author",
file_path="/test/path",
)
db_session.add(book)
db_session.flush() # Get the book ID
# Create test sections with all required fields
sections = [
BookSection(
book_id=book.id,
section_title="Test Section",
section_number=1,
section_level=1,
start_page=1,
end_page=10,
content="Test content " * 20,
sha256=b"test_hash",
modality="book",
tags=["book"],
)
]
db_session.add_all(sections)
db_session.flush()
embedded_count = ebook.embed_sections(sections)
assert embedded_count >= 0
assert hasattr(sections[0], "embed_status")
def test_push_to_qdrant(qdrant):
"""Test pushing embeddings to Qdrant."""
# Create test sections with chunks
mock_chunk = Mock(
id="00000000-0000-0000-0000-000000000000",
vector=[0.1] * 1024,
item_metadata={"test": "data"},
)
mock_section = Mock(spec=BookSection)
mock_section.embed_status = "QUEUED"
mock_section.chunks = [mock_chunk]
sections = [mock_section]
ebook.push_to_qdrant(sections) # type: ignore
assert {r.id: r.payload for r in qdrant.scroll(collection_name="book")[0]} == {
"00000000-0000-0000-0000-000000000000": {
"test": "data",
}
}
assert mock_section.embed_status == "STORED"
@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_success(mock_parse, mock_ebook, db_session, tmp_path):
"""Test successful book synchronization."""
book_file = tmp_path / "test.epub"
book_file.write_text("dummy content")
mock_ebook.file_path = book_file
mock_parse.return_value = mock_ebook
result = ebook.sync_book(str(book_file), {"source", "test"})
assert result == {
"book_id": 1,
"title": "Test Book",
"author": "Test Author",
"status": "processed",
"total_sections": 4,
"sections_embedded": 4,
}
book = db_session.query(Book).filter(Book.title == "Test Book").first()
assert book is not None
assert book.author == "Test Author"
assert set(book.tags) == {"source", "test"}
sections = (
db_session.query(BookSection).filter(BookSection.book_id == book.id).all()
)
assert len(sections) == 4
@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_already_exists(mock_parse, mock_ebook, db_session, tmp_path):
"""Test that duplicate books are not processed."""
book_file = tmp_path / "test.epub"
book_file.write_text("dummy content")
existing_book = Book(
title="Existing Book",
author="Author",
file_path=str(book_file),
)
db_session.add(existing_book)
db_session.commit()
mock_ebook.file_path = book_file
mock_parse.return_value = mock_ebook
assert ebook.sync_book(str(book_file)) == {
"book_id": existing_book.id,
"title": "Existing Book",
"author": "Author",
"status": "already_exists",
"sections_processed": 0,
}
@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_embedding_failure(
mock_parse, mock_ebook, db_session, tmp_path, mock_embedding
):
"""Test handling of embedding failures."""
book_file = tmp_path / "test.epub"
book_file.write_text("dummy content")
mock_ebook.file_path = book_file
mock_parse.return_value = mock_ebook
mock_embedding.side_effect = IOError("Embedding failed")
assert ebook.sync_book(str(book_file)) == {
"book_id": 1,
"title": "Test Book",
"author": "Test Author",
"status": "processed",
"sections_embedded": 0,
"total_sections": 4,
}
sections = db_session.query(BookSection).all()
for section in sections:
assert section.embed_status == "FAILED"
@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_qdrant_failure(mock_parse, mock_ebook, db_session, tmp_path):
"""Test handling of Qdrant failures."""
book_file = tmp_path / "test.epub"
book_file.write_text("dummy content")
mock_ebook.file_path = book_file
mock_parse.return_value = mock_ebook
# Patch push_to_qdrant to fail and verify the error propagates out of sync_book
with patch.object(ebook, "push_to_qdrant", side_effect=Exception("Qdrant failed")):
with pytest.raises(Exception, match="Qdrant failed"):
ebook.sync_book(str(book_file))
def test_sync_book_file_not_found():
"""Test handling of missing files."""
with pytest.raises(FileNotFoundError):
ebook.sync_book("/nonexistent/file.epub")