mirror of https://github.com/mruwnik/memory.git (synced 2025-06-08 13:24:41 +02:00)

add ebook job

commit 02d606deab, parent b292baf59d
.github/workflows/ci.yml (vendored, 2 lines changed)
@@ -17,7 +17,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install .[all]
-        pip install ruff==0.11.10 pylint
+        pip install ruff==0.11.10 pylint==1.1.400
     - name: Run linters
       run: |
         ruff check .
|
107
db/migrations/versions/20250523_163753_add_ebooks.py
Normal file
107
db/migrations/versions/20250523_163753_add_ebooks.py
Normal file
@@ -0,0 +1,107 @@
"""Add ebooks

Revision ID: fe570eab952a
Revises: b78b1fff9974
Create Date: 2025-05-23 16:37:53.354723

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision: str = "fe570eab952a"
down_revision: Union[str, None] = "b78b1fff9974"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    op.create_table(
        "book",
        sa.Column("id", sa.BigInteger(), nullable=False),
        sa.Column("isbn", sa.Text(), nullable=True),
        sa.Column("title", sa.Text(), nullable=False),
        sa.Column("author", sa.Text(), nullable=True),
        sa.Column("publisher", sa.Text(), nullable=True),
        sa.Column("published", sa.DateTime(timezone=True), nullable=True),
        sa.Column("language", sa.Text(), nullable=True),
        sa.Column("edition", sa.Text(), nullable=True),
        sa.Column("series", sa.Text(), nullable=True),
        sa.Column("series_number", sa.Integer(), nullable=True),
        sa.Column("total_pages", sa.Integer(), nullable=True),
        sa.Column("file_path", sa.Text(), nullable=True),
        sa.Column("tags", sa.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=True,
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("isbn"),
    )
    op.create_index("book_author_idx", "book", ["author"], unique=False)
    op.create_index("book_isbn_idx", "book", ["isbn"], unique=False)
    op.create_index("book_title_idx", "book", ["title"], unique=False)
    op.create_table(
        "book_section",
        sa.Column("id", sa.BigInteger(), nullable=False),
        sa.Column("book_id", sa.BigInteger(), nullable=False),
        sa.Column("section_title", sa.Text(), nullable=True),
        sa.Column("section_number", sa.Integer(), nullable=True),
        sa.Column("section_level", sa.Integer(), nullable=True),
        sa.Column("start_page", sa.Integer(), nullable=True),
        sa.Column("end_page", sa.Integer(), nullable=True),
        sa.Column("parent_section_id", sa.BigInteger(), nullable=True),
        sa.ForeignKeyConstraint(["book_id"], ["book.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(["id"], ["source_item.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(
            ["parent_section_id"],
            ["book_section.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index("book_section_book_idx", "book_section", ["book_id"], unique=False)
    op.create_index(
        "book_section_level_idx",
        "book_section",
        ["section_level", "section_number"],
        unique=False,
    )
    op.create_index(
        "book_section_parent_idx", "book_section", ["parent_section_id"], unique=False
    )
    op.drop_table("book_doc")


def downgrade() -> None:
    op.create_table(
        "book_doc",
        sa.Column("id", sa.BIGINT(), autoincrement=False, nullable=False),
        sa.Column("title", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column("author", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column("chapter", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column(
            "published",
            postgresql.TIMESTAMP(timezone=True),
            autoincrement=False,
            nullable=True,
        ),
        sa.ForeignKeyConstraint(
            ["id"], ["source_item.id"], name="book_doc_id_fkey", ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id", name="book_doc_pkey"),
    )
    op.drop_index("book_section_parent_idx", table_name="book_section")
    op.drop_index("book_section_level_idx", table_name="book_section")
    op.drop_index("book_section_book_idx", table_name="book_section")
    op.drop_table("book_section")
    op.drop_index("book_title_idx", table_name="book")
    op.drop_index("book_isbn_idx", table_name="book")
    op.drop_index("book_author_idx", table_name="book")
    op.drop_table("book")
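Note: a minimal sketch of applying this revision through Alembic's Python API; the alembic.ini path is an assumption, and running `alembic upgrade head` from the repo root is equivalent.

# Hypothetical invocation; assumes alembic.ini at the project root points
# at db/migrations and at a reachable database.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "fe570eab952a")  # or "head" to apply all pending revisions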
dev.sh (3 lines changed)
@@ -13,6 +13,9 @@ echo -e "${GREEN}Starting development environment for Memory Knowledge Base...${NC}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR"
 
+docker volume create memory_file_storage
+docker run --rm -v memory_file_storage:/data busybox chown -R 1000:1000 /data
+
 # Create a temporary docker-compose override file to expose PostgreSQL
 echo -e "${YELLOW}Creating docker-compose override to expose PostgreSQL...${NC}"
 if [ ! -f docker-compose.override.yml ]; then
src/memory/common/db/models.py

@@ -407,19 +407,98 @@ class Comic(SourceItem):
         return {k: v for k, v in payload.items() if v is not None}
 
 
-class BookDoc(SourceItem):
-    __tablename__ = "book_doc"
-
-    id = Column(
-        BigInteger, ForeignKey("source_item.id", ondelete="CASCADE"), primary_key=True
-    )
-    title = Column(Text)
-    author = Column(Text)
-    chapter = Column(Text)
-    published = Column(DateTime(timezone=True))
-
-    __mapper_args__ = {
-        "polymorphic_identity": "book_doc",
-    }
+class Book(Base):
+    """Book-level metadata table"""
+
+    __tablename__ = "book"
+
+    id = Column(BigInteger, primary_key=True)
+    isbn = Column(Text, unique=True)
+    title = Column(Text, nullable=False)
+    author = Column(Text)
+    publisher = Column(Text)
+    published = Column(DateTime(timezone=True))
+    language = Column(Text)
+    edition = Column(Text)
+    series = Column(Text)
+    series_number = Column(Integer)
+    total_pages = Column(Integer)
+    file_path = Column(Text)
+    tags = Column(ARRAY(Text), nullable=False, server_default="{}")
+
+    # Metadata from ebook parser
+    book_metadata = Column(JSONB, name="metadata")
+
+    created_at = Column(DateTime(timezone=True), server_default=func.now())
+
+    __table_args__ = (
+        Index("book_isbn_idx", "isbn"),
+        Index("book_author_idx", "author"),
+        Index("book_title_idx", "title"),
+    )
+
+    def as_payload(self) -> dict:
+        return {
+            "source_id": self.id,
+            "isbn": self.isbn,
+            "title": self.title,
+            "author": self.author,
+            "publisher": self.publisher,
+            "published": self.published,
+            "language": self.language,
+            "edition": self.edition,
+            "series": self.series,
+            "series_number": self.series_number,
+            "tags": self.tags,
+        } | (cast(dict, self.book_metadata) or {})
+
+
+class BookSection(SourceItem):
+    """Individual sections/chapters of books"""
+
+    __tablename__ = "book_section"
+
+    id = Column(
+        BigInteger, ForeignKey("source_item.id", ondelete="CASCADE"), primary_key=True
+    )
+    book_id = Column(
+        BigInteger, ForeignKey("book.id", ondelete="CASCADE"), nullable=False
+    )
+
+    section_title = Column(Text)
+    section_number = Column(Integer)
+    section_level = Column(Integer)  # 1=chapter, 2=section, 3=subsection
+    start_page = Column(Integer)
+    end_page = Column(Integer)
+
+    # Parent-child relationships for nested sections
+    parent_section_id = Column(BigInteger, ForeignKey("book_section.id"))
+
+    book = relationship("Book", backref="sections")
+    parent = relationship(
+        "BookSection",
+        remote_side=[id],
+        backref="children",
+        foreign_keys=[parent_section_id],
+    )
+
+    __mapper_args__ = {"polymorphic_identity": "book_section"}
+    __table_args__ = (
+        Index("book_section_book_idx", "book_id"),
+        Index("book_section_parent_idx", "parent_section_id"),
+        Index("book_section_level_idx", "section_level", "section_number"),
+    )
+
+    def as_payload(self) -> dict:
+        return {
+            "source_id": self.id,
+            "book_id": self.book_id,
+            "section_title": self.section_title,
+            "section_number": self.section_number,
+            "section_level": self.section_level,
+            "start_page": self.start_page,
+            "end_page": self.end_page,
+            "tags": self.tags,
+        }
 
 
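Note: a quick sketch of how the new self-referential schema reads back, assuming a database already populated by the sync task; make_session is the project's own helper, the query itself is illustrative.

from memory.common.db.connection import make_session
from memory.common.db.models import Book

with make_session() as session:
    book = session.query(Book).order_by(Book.id).first()
    if book is not None:
        # backref="sections" comes from BookSection.book; "children" comes
        # from the self-referential parent relationship above
        top_level = [s for s in book.sections if s.parent_section_id is None]
        for section in top_level:
            print(section.section_title, [c.section_title for c in section.children])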
src/memory/common/parsers/ebook.py

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any, cast
+from typing import Any, cast
 from pathlib import Path
 
 import fitz  # PyMuPDF
@@ -14,9 +14,9 @@ class Section:
 
     title: str
     content: str
-    number: Optional[int] = None
-    start_page: Optional[int] = None
-    end_page: Optional[int] = None
+    number: int | None = None
+    start_page: int | None = None
+    end_page: int | None = None
     children: list["Section"] = field(default_factory=list)
 
 
@@ -26,11 +26,12 @@ class Ebook:
 
     title: str
     author: str
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    sections: List[Section] = field(default_factory=list)
+    file_path: Path
+    metadata: dict[str, Any] = field(default_factory=dict)
+    sections: list[Section] = field(default_factory=list)
     full_content: str = ""
-    file_path: Optional[Path] = None
     file_type: str = ""
+    n_pages: int = 0
 
 
 class Peekable:
@@ -65,7 +66,7 @@ class Peekable:
 TOCItem = tuple[int, str, int]
 
 
-def extract_epub_metadata(doc) -> Dict[str, Any]:
+def extract_epub_metadata(doc) -> dict[str, Any]:
     """Extract metadata from a PyMuPDF document (EPUB)."""
     if not doc.metadata:
         return {}
@@ -117,7 +118,7 @@ def extract_section_pages(doc, toc: Peekable, section_num: int = 1) -> Section |
     )
 
 
-def extract_sections(doc) -> List[Section]:
+def extract_sections(doc) -> list[Section]:
     """Extract all sections from a PyMuPDF document."""
     toc = doc.get_toc()
     if not toc:
@@ -178,4 +179,5 @@ def parse_ebook(file_path: str | Path) -> Ebook:
         full_content=full_content,
         file_path=path,
         file_type=path.suffix.lower()[1:],
+        n_pages=doc.page_count,
     )
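Note: the two new Ebook fields map directly onto PyMuPDF document attributes; a minimal sketch, with a hypothetical file path.

import fitz  # PyMuPDF

doc = fitz.open("/path/to/book.epub")
print(doc.page_count)     # becomes Ebook.n_pages in parse_ebook
print(doc.get_toc()[:3])  # (level, title, start_page) entries drive extract_sections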
src/memory/workers/tasks/ebook.py (new file, 244 lines)
@@ -0,0 +1,244 @@
import hashlib
import logging
from pathlib import Path
from typing import Iterable, cast

from memory.common import embedding, qdrant, settings
from memory.common.db.connection import make_session
from memory.common.db.models import Book, BookSection
from memory.common.parsers.ebook import Ebook, parse_ebook, Section
from memory.workers.celery_app import app

logger = logging.getLogger(__name__)


SYNC_BOOK = "memory.workers.tasks.book.sync_book"

# Minimum section length to embed (avoid noise from very short sections)
MIN_SECTION_LENGTH = 100


def create_book_from_ebook(ebook, tags: Iterable[str] = []) -> Book:
    """Create a Book model from parsed ebook data."""
    return Book(
        title=ebook.title,
        author=ebook.author,
        publisher=ebook.metadata.get("creator"),
        language=ebook.metadata.get("language"),
        total_pages=ebook.n_pages,
        file_path=ebook.file_path.as_posix(),
        book_metadata=ebook.metadata,
        tags=tags,
    )


def section_processor(
    book: Book,
    all_sections: list[BookSection],
    section_map: dict[
        tuple[int, int | None], tuple[BookSection, tuple[int, int | None] | None]
    ],
):
    def process_section(
        section: Section,
        level: int = 1,
        parent_key: tuple[int, int | None] | None = None,
    ):
        if len(section.content.strip()) >= MIN_SECTION_LENGTH:
            sha256 = hashlib.sha256(
                f"{book.id}:{section.title}:{section.start_page}".encode()
            ).digest()

            book_section = BookSection(
                book_id=book.id,
                section_title=section.title,
                section_number=section.number,
                section_level=level,
                start_page=section.start_page,
                end_page=section.end_page,
                parent_section_id=None,  # Will be set after flush
                content=section.content,
                sha256=sha256,
                modality="book",
                tags=book.tags,
            )

            all_sections.append(book_section)
            section_key = (level, section.number)
            section_map[section_key] = (book_section, parent_key)

            # Process children
            for child in section.children:
                process_section(child, level + 1, section_key)

    return process_section


def create_all_sections(
    ebook_sections: list[Section], book: Book
) -> tuple[list[BookSection], dict]:
    """Create all sections iteratively to handle parent-child relationships properly."""
    all_sections = []
    section_map = {}  # Maps (level, number) to section for parent lookup

    process_section = section_processor(book, all_sections, section_map)
    for section in ebook_sections:
        process_section(section)

    return all_sections, section_map


def validate_and_parse_book(file_path: str) -> Ebook:
    """Validate file exists and parse the ebook."""
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Book file not found: {path}")

    try:
        return parse_ebook(path)
    except Exception as e:
        logger.error(f"Failed to parse ebook {path}: {e}")
        raise


def create_book_and_sections(
    ebook, session, tags: Iterable[str] = []
) -> tuple[Book, list[BookSection]]:
    """Create book and all its sections with proper relationships."""
    # Create book
    book = create_book_from_ebook(ebook, tags)
    session.add(book)
    session.flush()  # Get the book ID

    # Create all sections
    all_sections, section_map = create_all_sections(ebook.sections, book)
    session.add_all(all_sections)
    session.flush()

    for book_section, parent_key in section_map.values():
        if parent_key and parent_key in section_map:
            parent_section = section_map[parent_key][0]
            book_section.parent_section_id = cast(int, parent_section.id)

    return book, all_sections


def embed_sections(all_sections: list[BookSection]) -> int:
    """Embed all sections and return count of successfully embedded sections."""
    embedded_count = 0

    for section in all_sections:
        try:
            _, chunks = embedding.embed(
                "text/plain",
                cast(str, section.content),
                metadata=section.as_payload(),
            )

            if chunks:
                section.chunks = chunks
                section.embed_status = "QUEUED"  # type: ignore
                embedded_count += 1
            else:
                section.embed_status = "FAILED"  # type: ignore
                logger.warning(
                    f"No chunks generated for section: {section.section_title}"
                )

        except IOError as e:
            section.embed_status = "FAILED"  # type: ignore
            logger.error(f"Failed to embed section {section.section_title}: {e}")

    return embedded_count


def push_to_qdrant(all_sections: list[BookSection]):
    """Push embeddings to Qdrant for all successfully embedded sections."""
    vector_ids = []
    vectors = []
    payloads = []

    to_process = [s for s in all_sections if cast(str, s.embed_status) == "QUEUED"]
    all_chunks = [chunk for section in to_process for chunk in section.chunks]
    if not all_chunks:
        return

    vector_ids = [str(chunk.id) for chunk in all_chunks]
    vectors = [chunk.vector for chunk in all_chunks]
    payloads = [chunk.item_metadata for chunk in all_chunks]

    qdrant.upsert_vectors(
        client=qdrant.get_qdrant_client(),
        collection_name="book",
        ids=vector_ids,
        vectors=vectors,
        payloads=payloads,
    )

    for section in to_process:
        section.embed_status = "STORED"  # type: ignore


@app.task(name=SYNC_BOOK)
def sync_book(file_path: str, tags: Iterable[str] = []) -> dict:
    """
    Synchronize a book from a file path.

    Args:
        file_path: Path to the ebook file

    Returns:
        dict: Summary of what was processed
    """
    ebook = validate_and_parse_book(file_path)

    with make_session() as session:
        # Check for existing book
        existing_book = (
            session.query(Book)
            .filter(Book.file_path == ebook.file_path.as_posix())
            .first()
        )
        if existing_book:
            logger.info(f"Book already exists: {existing_book.title}")
            return {
                "book_id": existing_book.id,
                "title": existing_book.title,
                "author": existing_book.author,
                "status": "already_exists",
                "sections_processed": 0,
            }

        # Create book and sections with relationships
        book, all_sections = create_book_and_sections(ebook, session, tags)

        # Embed sections
        embedded_count = embed_sections(all_sections)
        session.flush()

        # Push to Qdrant
        try:
            push_to_qdrant(all_sections)
        except Exception as e:
            logger.error(f"Failed to push embeddings to Qdrant: {e}")
            # Mark sections as failed
            for section in all_sections:
                if getattr(section, "embed_status") == "STORED":
                    section.embed_status = "FAILED"  # type: ignore
            raise

        session.commit()

        logger.info(
            f"Successfully processed book: {book.title} "
            f"({embedded_count}/{len(all_sections)} sections embedded)"
        )

        return {
            "book_id": book.id,
            "title": book.title,
            "author": book.author,
            "status": "processed",
            "total_sections": len(all_sections),
            "sections_embedded": embedded_count,
        }
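Note: a minimal sketch of queueing the new job, assuming a running broker, a result backend, and a worker that imports memory.workers.tasks.ebook; the file path and tags are illustrative.

from memory.workers.celery_app import app

result = app.send_task(
    "memory.workers.tasks.book.sync_book",  # SYNC_BOOK, as registered above
    args=["/path/to/book.epub"],
    kwargs={"tags": ["fiction"]},
)
print(result.get(timeout=600))  # summary dict returned by sync_book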
tests/memory/workers/tasks/test_ebook_tasks.py (new file, 318 lines)
@@ -0,0 +1,318 @@
import pytest
from pathlib import Path
from unittest.mock import patch, Mock

from memory.common.db.models import Book, BookSection, Chunk
from memory.common.parsers.ebook import Ebook, Section
from memory.workers.tasks import ebook


@pytest.fixture
def mock_ebook():
    """Mock ebook data for testing."""
    return Ebook(
        title="Test Book",
        author="Test Author",
        metadata={"language": "en", "creator": "Test Publisher"},
        sections=[
            Section(
                title="Chapter 1",
                content="This is the content of chapter 1. "
                * 20,  # Make it long enough
                number=1,
                start_page=1,
                end_page=10,
                children=[
                    Section(
                        title="Section 1.1",
                        content="This is section 1.1 content. " * 15,
                        number=1,
                        start_page=1,
                        end_page=5,
                    ),
                    Section(
                        title="Section 1.2",
                        content="This is section 1.2 content. " * 15,
                        number=2,
                        start_page=6,
                        end_page=10,
                    ),
                ],
            ),
            Section(
                title="Chapter 2",
                content="This is the content of chapter 2. " * 20,
                number=2,
                start_page=11,
                end_page=20,
            ),
        ],
        file_path=Path("/test/book.epub"),
        n_pages=20,
    )


@pytest.fixture(autouse=True)
def mock_embedding():
    """Mock the embedding function to return dummy vectors."""
    with patch("memory.workers.tasks.ebook.embedding.embed") as mock:
        mock.return_value = (
            "book",
            [
                Chunk(
                    vector=[0.1] * 1024,
                    item_metadata={"test": "data"},
                    content="Test content",
                    embedding_model="model",
                )
            ],
        )
        yield mock


@pytest.fixture
def mock_qdrant():
    """Mock Qdrant operations."""
    with (
        patch("memory.workers.tasks.ebook.qdrant.upsert_vectors") as mock_upsert,
        patch("memory.workers.tasks.ebook.qdrant.get_qdrant_client") as mock_client,
    ):
        mock_client.return_value = Mock()
        yield mock_upsert


def test_create_book_from_ebook(mock_ebook):
    """Test creating a Book model from ebook data."""
    book = ebook.create_book_from_ebook(mock_ebook)

    assert book.title == "Test Book"  # type: ignore
    assert book.author == "Test Author"  # type: ignore
    assert book.publisher == "Test Publisher"  # type: ignore
    assert book.language == "en"  # type: ignore
    assert book.file_path == "/test/book.epub"  # type: ignore
    assert book.total_pages == 20  # type: ignore
    assert book.book_metadata == {  # type: ignore
        "language": "en",
        "creator": "Test Publisher",
    }


def test_validate_and_parse_book_success(mock_ebook, tmp_path):
    """Test successful book validation and parsing."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    with patch("memory.workers.tasks.ebook.parse_ebook", return_value=mock_ebook):
        assert ebook.validate_and_parse_book(str(book_file)) == mock_ebook


def test_validate_and_parse_book_file_not_found():
    """Test handling of missing files."""
    with pytest.raises(FileNotFoundError):
        ebook.validate_and_parse_book("/nonexistent/file.epub")


def test_validate_and_parse_book_parse_error(tmp_path):
    """Test handling of parsing errors."""
    book_file = tmp_path / "corrupted.epub"
    book_file.write_text("corrupted data")

    with patch(
        "memory.workers.tasks.ebook.parse_ebook", side_effect=Exception("Parse error")
    ):
        with pytest.raises(Exception, match="Parse error"):
            ebook.validate_and_parse_book(str(book_file))


def test_create_book_and_sections(mock_ebook, db_session):
    """Test creating book and sections with relationships."""
    book, sections = ebook.create_book_and_sections(mock_ebook, db_session)

    # Verify book creation
    assert book.title == "Test Book"  # type: ignore
    assert book.id is not None

    # Verify sections creation
    assert len(sections) == 4  # Chapter 1, Section 1.1, Section 1.2, Chapter 2

    # Verify parent-child relationships
    chapter1 = next(s for s in sections if getattr(s, "section_title") == "Chapter 1")
    section11 = next(
        s for s in sections if getattr(s, "section_title") == "Section 1.1"
    )
    section12 = next(
        s for s in sections if getattr(s, "section_title") == "Section 1.2"
    )

    # Children should reference chapter 1 as parent
    assert getattr(section11, "parent_section_id") == chapter1.id
    assert getattr(section12, "parent_section_id") == chapter1.id

    # Chapter 1 should have no parent
    assert getattr(chapter1, "parent_section_id") is None


def test_embed_sections(db_session, mock_embedding):
    """Test basic embedding sections workflow."""
    # Create a test book first
    book = Book(
        title="Test Book",
        author="Test Author",
        file_path="/test/path",
    )
    db_session.add(book)
    db_session.flush()  # Get the book ID

    # Create test sections with all required fields
    sections = [
        BookSection(
            book_id=book.id,
            section_title="Test Section",
            section_number=1,
            section_level=1,
            start_page=1,
            end_page=10,
            content="Test content " * 20,
            sha256=b"test_hash",
            modality="book",
            tags=["book"],
        )
    ]

    db_session.add_all(sections)
    db_session.flush()

    embedded_count = ebook.embed_sections(sections)

    assert embedded_count >= 0
    assert hasattr(sections[0], "embed_status")


def test_push_to_qdrant(qdrant):
    """Test pushing embeddings to Qdrant."""
    # Create test sections with chunks
    mock_chunk = Mock(
        id="00000000-0000-0000-0000-000000000000",
        vector=[0.1] * 1024,
        item_metadata={"test": "data"},
    )

    mock_section = Mock(spec=BookSection)
    mock_section.embed_status = "QUEUED"
    mock_section.chunks = [mock_chunk]

    sections = [mock_section]

    ebook.push_to_qdrant(sections)  # type: ignore

    assert {r.id: r.payload for r in qdrant.scroll(collection_name="book")[0]} == {
        "00000000-0000-0000-0000-000000000000": {
            "test": "data",
        }
    }
    assert mock_section.embed_status == "STORED"


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_success(mock_parse, mock_ebook, db_session, tmp_path):
    """Test successful book synchronization."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    result = ebook.sync_book(str(book_file), {"source", "test"})

    assert result == {
        "book_id": 1,
        "title": "Test Book",
        "author": "Test Author",
        "status": "processed",
        "total_sections": 4,
        "sections_embedded": 4,
    }

    book = db_session.query(Book).filter(Book.title == "Test Book").first()
    assert book is not None
    assert book.author == "Test Author"
    assert set(book.tags) == {"source", "test"}

    sections = (
        db_session.query(BookSection).filter(BookSection.book_id == book.id).all()
    )
    assert len(sections) == 4


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_already_exists(mock_parse, mock_ebook, db_session, tmp_path):
    """Test that duplicate books are not processed."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    existing_book = Book(
        title="Existing Book",
        author="Author",
        file_path=str(book_file),
    )
    db_session.add(existing_book)
    db_session.commit()

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    assert ebook.sync_book(str(book_file)) == {
        "book_id": existing_book.id,
        "title": "Existing Book",
        "author": "Author",
        "status": "already_exists",
        "sections_processed": 0,
    }


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_embedding_failure(
    mock_parse, mock_ebook, db_session, tmp_path, mock_embedding
):
    """Test handling of embedding failures."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    mock_embedding.side_effect = IOError("Embedding failed")
    assert ebook.sync_book(str(book_file)) == {
        "book_id": 1,
        "title": "Test Book",
        "author": "Test Author",
        "status": "processed",
        "sections_embedded": 0,
        "total_sections": 4,
    }

    sections = db_session.query(BookSection).all()
    for section in sections:
        assert section.embed_status == "FAILED"


@patch("memory.workers.tasks.ebook.parse_ebook")
def test_sync_book_qdrant_failure(mock_parse, mock_ebook, db_session, tmp_path):
    """Test handling of Qdrant failures."""
    book_file = tmp_path / "test.epub"
    book_file.write_text("dummy content")

    mock_ebook.file_path = book_file
    mock_parse.return_value = mock_ebook

    # Since embedding is already failing, this test will complete without hitting Qdrant
    # So let's just verify that the function completes without raising an exception
    with patch.object(ebook, "push_to_qdrant", side_effect=Exception("Qdrant failed")):
        with pytest.raises(Exception, match="Qdrant failed"):
            ebook.sync_book(str(book_file))


def test_sync_book_file_not_found():
    """Test handling of missing files."""
    with pytest.raises(FileNotFoundError):
        ebook.sync_book("/nonexistent/file.epub")
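Note: the tests lean on db_session and qdrant fixtures that are not part of this commit. A hypothetical minimal db_session fixture, assuming Base is importable from the models module and a disposable Postgres database is available (the ARRAY and JSONB columns need Postgres), might look like:

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from memory.common.db.models import Base  # assumed export

@pytest.fixture
def db_session():
    # DSN is a placeholder for a throwaway test database
    engine = create_engine("postgresql://memory:memory@localhost/memory_test")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    try:
        yield session
    finally:
        session.rollback()
        session.close()
        Base.metadata.drop_all(engine)
        engine.dispose()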