mirror of
https://github.com/mruwnik/memory.git
synced 2025-06-08 13:24:41 +02:00
add ebook parser
This commit is contained in:
parent
4f1ca777e9
commit
b292baf59d
@ -6,3 +6,4 @@ dotenv==0.9.9
|
||||
voyageai==0.3.2
|
||||
qdrant-client==1.9.0
|
||||
PyMuPDF==1.25.5
|
||||
ebooklib==0.18.0
|
181
src/memory/common/parsers/ebook.py
Normal file
181
src/memory/common/parsers/ebook.py
Normal file
@ -0,0 +1,181 @@
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, List, Dict, Any, cast
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Section:
|
||||
"""Represents a chapter or section in an ebook."""
|
||||
|
||||
title: str
|
||||
content: str
|
||||
number: Optional[int] = None
|
||||
start_page: Optional[int] = None
|
||||
end_page: Optional[int] = None
|
||||
children: list["Section"] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class Ebook:
    """Parsed ebook: descriptive fields plus the extracted sections and text."""

    title: str
    author: str
    # Metadata entries read from the source document
    metadata: dict[str, Any] = field(default_factory=dict)
    # Top-level sections; deeper ones hang off each Section's children
    sections: list[Section] = field(default_factory=list)
    # Text of the top-level sections joined together
    full_content: str = ""
    # Location the ebook was loaded from, when known
    file_path: Path | None = None
    # File extension without the leading dot, e.g. "epub"
    file_type: str = ""
|
||||
|
||||
|
||||
class Peekable:
    """Iterator wrapper that exposes the upcoming item before it is consumed."""

    # Marker handed back by next() once the wrapped iterator has run dry
    _EXHAUSTED = object()

    def __init__(self, items):
        self.items = items
        self.done = False
        # Read one item ahead so peek() has something to show immediately
        self._get_next()

    def _get_next(self):
        """Pull the next item into ``cached``, or flag the wrapper as done."""
        upcoming = next(self.items, self._EXHAUSTED)
        if upcoming is self._EXHAUSTED:
            self.done = True
        else:
            self.cached = upcoming

    def peek(self):
        """Return the upcoming item without advancing; None once exhausted."""
        return None if self.done else self.cached

    def __iter__(self):
        return self

    def __next__(self):
        if self.done:
            raise StopIteration

        current = self.cached
        self._get_next()
        return current
|
||||
|
||||
|
||||
# One table-of-contents entry; this module unpacks it as (level, name, page).
TOCItem = tuple[int, str, int]
|
||||
|
||||
|
||||
def extract_epub_metadata(doc) -> Dict[str, Any]:
    """Return the document's metadata with empty and None entries dropped.

    An empty dict is returned when the document carries no metadata at all.
    """
    return {key: value for key, value in (doc.metadata or {}).items() if value}
|
||||
|
||||
|
||||
def get_pages(doc, start_page: int, end_page: int) -> str:
    """Join the text of pages start_page..end_page (inclusive) with newlines.

    Indices outside ``0 <= index < doc.page_count`` are skipped, so a range
    that lies partly or wholly outside the document simply yields less text.
    """
    first = max(start_page, 0)
    stop = min(end_page + 1, doc.page_count)
    return "\n".join(doc[index].get_text() for index in range(first, stop))
|
||||
|
||||
|
||||
def extract_section_pages(doc, toc: Peekable, section_num: int = 1) -> Section | None:
    """Consume one TOC entry, plus every deeper entry that follows it, into a Section.

    Args:
        doc: Document whose pages supply the section text (read via get_pages).
        toc: Peekable stream of (level, name, page) entries; advanced in place.
        section_num: Number recorded on the returned section.

    Returns:
        The section with its nested children, or None when no entries remain.
    """
    if not toc.peek():
        return None

    level, name, page = next(toc)

    # Entries with a larger level value that directly follow this one are
    # nested under it; each recursive call also consumes that child's own
    # descendants, so the loop only ever sees direct children.
    children = []
    upcoming: TOCItem | None = toc.peek()
    while upcoming and upcoming[0] > level:
        children.append(extract_section_pages(doc, toc, len(children) + 1))
        upcoming = toc.peek()

    # The section runs up to the page before the next remaining entry, or to
    # doc.page_count when nothing follows it.
    # NOTE(review): TOC page numbers are used directly as indices into doc -
    # confirm the numbering base of the TOC source matches.
    last_page = upcoming[2] - 1 if upcoming else doc.page_count

    return Section(
        title=name,
        content=get_pages(doc, page, last_page),
        number=section_num,
        start_page=page,
        end_page=last_page,
        children=children,
    )
|
||||
|
||||
|
||||
def extract_sections(doc) -> List[Section]:
    """Build the top-level sections of a document from its table of contents.

    Args:
        doc: Document providing ``get_toc()``, ``page_count`` and page text.

    Returns:
        One Section per top-level TOC entry, each carrying its nested
        children. When the document has no TOC, a single "Content" section
        holding the whole document text is returned instead.
    """
    # Fetch the TOC once and reuse it for both the emptiness check and the walk
    entries = doc.get_toc()
    if not entries:
        # NOTE(review): this fallback relies on the document object exposing
        # get_text() itself - confirm against the PyMuPDF version in use.
        return [
            Section(
                title="Content",
                content=doc.get_text(),
                number=1,
                start_page=0,
                end_page=doc.page_count,
            )
        ]

    sections = []
    toc = Peekable(iter(entries))
    while toc.peek():
        # Each call consumes one top-level entry and all entries nested in it
        section = extract_section_pages(doc, toc, len(sections) + 1)
        if section:
            sections.append(section)
    return sections
|
||||
|
||||
|
||||
def parse_ebook(file_path: str | Path) -> Ebook:
    """
    Parse an ebook file and extract its content and metadata.

    Args:
        file_path: Path to the ebook file

    Returns:
        Structured ebook data. The title falls back to the file stem and the
        author to "Unknown" when the document metadata does not provide them.

    Raises:
        FileNotFoundError: If no file exists at ``file_path``.
        fitz.FileNotFoundError: If PyMuPDF cannot find the file; logged and
            re-raised.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    try:
        doc = fitz.open(str(path))
    except fitz.FileNotFoundError as e:
        logger.error(f"Error opening ebook {path}: {e}")
        raise

    # Everything needed is copied out as plain dicts/strings, so the document
    # can be released as soon as extraction finishes, even if it fails.
    try:
        metadata = extract_epub_metadata(doc)
        sections = extract_sections(doc)
    finally:
        doc.close()

    title = metadata.get("title", path.stem)
    author = metadata.get("author", "Unknown")

    full_content = ""
    if sections:
        full_content = "".join(section.content for section in sections)

    return Ebook(
        title=title,
        author=author,
        metadata=metadata,
        sections=sections,
        full_content=full_content,
        file_path=path,
        file_type=path.suffix.lower()[1:],
    )
|
@ -3,7 +3,7 @@ import hashlib
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import TypedDict, Literal
|
||||
from typing import TypedDict
|
||||
import pathlib
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
394
tests/memory/common/parsers/test_ebook.py
Normal file
394
tests/memory/common/parsers/test_ebook.py
Normal file
@ -0,0 +1,394 @@
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import fitz
|
||||
|
||||
from memory.common.parsers.ebook import (
|
||||
Peekable,
|
||||
extract_epub_metadata,
|
||||
get_pages,
|
||||
extract_section_pages,
|
||||
extract_sections,
|
||||
parse_ebook,
|
||||
Section,
|
||||
)
|
||||
|
||||
|
||||
def test_peekable_peek():
    """Peeking repeatedly keeps returning the first item."""
    peekable = Peekable(iter([1, 2, 3]))
    first_look = peekable.peek()
    second_look = peekable.peek()
    assert first_look == 1
    assert second_look == 1  # Multiple peeks don't advance
|
||||
|
||||
|
||||
def test_peekable_iteration():
    """Iterating a Peekable yields every wrapped item in order."""
    collected = list(Peekable(iter([1, 2, 3])))
    assert collected == [1, 2, 3]
|
||||
|
||||
|
||||
def test_peekable_empty():
    """An empty source peeks as None and iterates to nothing."""
    empty = Peekable(iter([]))
    assert empty.peek() is None
    assert list(empty) == []
|
||||
|
||||
|
||||
@pytest.fixture
def mock_doc():
    """Mock document with metadata, five pages and a two-chapter TOC."""

    def fake_page(i):
        # Every page reports text that names its own index
        return MagicMock(get_text=lambda: f"Content of page {i}")

    doc = MagicMock()
    doc.metadata = {
        "title": "Test Book",
        "author": "Test Author",
        "creator": "Test Creator",
        "producer": "Test Producer",
    }
    doc.page_count = 5

    # Mock pages
    doc.__getitem__.side_effect = fake_page

    # Mock TOC
    doc.get_toc.return_value = [
        [1, "Chapter 1", 0],
        [2, "Section 1.1", 1],
        [2, "Section 1.2", 2],
        [1, "Chapter 2", 3],
        [2, "Section 2.1", 4],
    ]

    return doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "metadata_input,expected",
    [
        ({"title": "Book", "author": "Author"}, {"title": "Book", "author": "Author"}),
        # Empty strings should be filtered
        ({"title": "", "author": "Author"}, {"author": "Author"}),
        # None values should be filtered
        ({"title": None, "author": "Author"}, {"author": "Author"}),
        # Empty dict
        ({}, {}),
    ],
)
def test_extract_epub_metadata(metadata_input, expected):
    """Only metadata entries with truthy values survive extraction."""
    doc = MagicMock()
    doc.metadata = metadata_input
    extracted = extract_epub_metadata(doc)
    assert extracted == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start_page,end_page,expected_content",
    [
        (0, 2, "Content of page 0\nContent of page 1\nContent of page 2"),
        (3, 4, "Content of page 3\nContent of page 4"),
        (4, 4, "Content of page 4"),
        # Out of range
        (
            0,
            10,
            "Content of page 0\nContent of page 1\nContent of page 2\nContent of page 3\nContent of page 4",
        ),
        # Completely out of range
        (5, 10, ""),
        # Invalid range (start > end)
        (3, 2, ""),
        # Negative start
        (-1, 2, "Content of page 0\nContent of page 1\nContent of page 2"),
    ],
)
def test_get_pages(mock_doc, start_page, end_page, expected_content):
    """Page text is joined with newlines and out-of-range indices are skipped."""
    text = get_pages(mock_doc, start_page, end_page)
    assert text == expected_content
|
||||
|
||||
|
||||
@pytest.fixture
def mock_toc_items():
    """Peekable TOC: one chapter with two subsections, then a second chapter."""
    # Each entry is (level, title, start page)
    entries = (
        (1, "Chapter 1", 0),
        (2, "Section 1.1", 1),
        (2, "Section 1.2", 2),
        (1, "Chapter 2", 3),
    )
    return Peekable(iter(entries))
|
||||
|
||||
|
||||
def test_extract_section_pages(mock_doc, mock_toc_items):
    """The first chapter comes back with both of its subsections nested inside."""
    section_1_1 = Section(
        title="Section 1.1",
        number=1,
        start_page=1,
        end_page=1,
        content="Content of page 1",
    )
    section_1_2 = Section(
        title="Section 1.2",
        number=2,
        start_page=2,
        end_page=2,
        content="Content of page 2",
    )
    expected = Section(
        title="Chapter 1",
        number=1,
        start_page=0,
        end_page=2,
        content="Content of page 0\nContent of page 1\nContent of page 2",
        children=[section_1_1, section_1_2],
    )

    assert extract_section_pages(mock_doc, mock_toc_items) == expected
|
||||
|
||||
|
||||
def test_extract_sections(mock_doc):
    """Both chapters of the mock TOC are extracted with their subsections."""
    chapter_1 = Section(
        title="Chapter 1",
        number=1,
        start_page=0,
        end_page=2,
        content="Content of page 0\nContent of page 1\nContent of page 2",
        children=[
            Section(
                title="Section 1.1",
                number=1,
                start_page=1,
                end_page=1,
                content="Content of page 1",
            ),
            Section(
                title="Section 1.2",
                number=2,
                start_page=2,
                end_page=2,
                content="Content of page 2",
            ),
        ],
    )
    chapter_2 = Section(
        title="Chapter 2",
        number=2,
        start_page=3,
        end_page=5,
        content="Content of page 3\nContent of page 4",
        children=[
            Section(
                title="Section 2.1",
                number=1,
                start_page=4,
                end_page=5,
                content="Content of page 4",
            ),
        ],
    )

    assert extract_sections(mock_doc) == [chapter_1, chapter_2]
|
||||
|
||||
|
||||
def test_extract_sections_no_toc(mock_doc):
    """Without a TOC the whole document becomes a single "Content" section."""
    mock_doc.get_toc.return_value = []
    mock_doc.get_text.return_value = "Full document content"

    whole_document = Section(
        title="Content",
        number=1,
        start_page=0,
        end_page=5,
        content="Full document content",
        children=[],
    )

    assert extract_sections(mock_doc) == [whole_document]
|
||||
|
||||
|
||||
@patch("fitz.open")
def test_parse_ebook(mock_open, mock_doc, tmp_path):
    """Parsing fills in title, author, sections, path and type."""
    # Create a test file
    test_file = tmp_path / "test.epub"
    test_file.touch()
    mock_open.return_value = mock_doc

    ebook = parse_ebook(test_file)

    assert ebook.title == "Test Book"
    assert ebook.author == "Test Author"
    assert len(ebook.sections) == 2
    assert ebook.file_path == test_file
    assert ebook.file_type == "epub"
|
||||
|
||||
|
||||
@patch("fitz.open")
def test_parse_ebook_file_not_found(mock_open, tmp_path):
    """A path that does not exist raises FileNotFoundError."""
    missing_file = tmp_path / "does_not_exist.epub"

    with pytest.raises(FileNotFoundError):
        parse_ebook(missing_file)
|
||||
|
||||
|
||||
@patch("fitz.open")
def test_parse_ebook_fitz_error(mock_open, tmp_path):
    """A fitz.FileNotFoundError raised while opening reaches the caller."""
    # The file must exist so the parser's own existence check passes
    test_file = tmp_path / "test.epub"
    test_file.touch()

    # Make fitz.open itself fail
    mock_open.side_effect = fitz.FileNotFoundError("File not found by PyMuPDF")

    with pytest.raises(fitz.FileNotFoundError):
        parse_ebook(test_file)
|
||||
|
||||
|
||||
@patch("fitz.open")
def test_parse_ebook_no_metadata(mock_open, mock_doc, tmp_path):
    """With empty metadata the title and author fall back to defaults."""
    test_file = tmp_path / "test.epub"
    test_file.touch()

    mock_doc.metadata = {}
    mock_open.return_value = mock_doc

    ebook = parse_ebook(test_file)

    assert ebook.title == "test"  # Should use file stem
    assert ebook.author == "Unknown"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "file_suffix,expected_type",
    [
        (".epub", "epub"),
        (".pdf", "pdf"),
        (".mobi", "mobi"),
        # Test case insensitivity
        (".EPUB", "epub"),
        # No extension
        ("", ""),
    ],
)
@patch("fitz.open")
def test_parse_ebook_file_types(
    mock_open, mock_doc, tmp_path, file_suffix, expected_type
):
    """The reported file type is the lower-cased suffix without its dot."""
    # Create a test file with the given suffix
    test_file = tmp_path / f"test{file_suffix}"
    test_file.touch()
    mock_open.return_value = mock_doc

    parsed = parse_ebook(test_file)

    assert parsed.file_type == expected_type
|
||||
|
||||
|
||||
def test_extract_section_pages_empty():
    """Test with empty TOC."""
    doc = MagicMock()
    doc.page_count = 5

    result = extract_section_pages(doc, Peekable(iter([])))

    assert result is None
|
||||
|
||||
|
||||
def test_extract_section_pages_deeply_nested():
    """Test with deeply nested TOC structure."""

    def fake_page(i):
        return MagicMock(get_text=lambda: f"Content of page {i}")

    doc = MagicMock()
    doc.page_count = 10
    doc.__getitem__.side_effect = fake_page

    # Four levels of nesting under the first chapter
    toc = Peekable(
        iter(
            [
                (1, "Chapter 1", 0),
                (2, "Section 1.1", 1),
                (3, "Subsection 1.1.1", 2),
                (4, "Sub-subsection 1.1.1.1", 3),
                (3, "Subsection 1.1.2", 4),
                (2, "Section 1.2", 5),
                (1, "Chapter 2", 6),
            ]
        )
    )

    # Extract the top-level section
    chapter = extract_section_pages(doc, toc)
    assert chapter is not None
    assert chapter.title == "Chapter 1"
    assert chapter.start_page == 0
    assert chapter.end_page == 5  # Before Chapter 2
    assert len(chapter.children) == 2  # Two level-2 sections

    # Check first level-2 section
    level_2 = chapter.children[0]
    assert level_2.title == "Section 1.1"
    assert level_2.start_page == 1
    assert level_2.end_page == 4  # Before Section 1.2
    assert len(level_2.children) == 2  # Two level-3 sections

    # Check first level-3 section
    level_3 = level_2.children[0]
    assert level_3.title == "Subsection 1.1.1"
    assert level_3.start_page == 2
    assert level_3.end_page == 3  # Before Subsection 1.1.2
    assert len(level_3.children) == 1  # One level-4 section

    # Check level-4 section
    level_4 = level_3.children[0]
    assert level_4.title == "Sub-subsection 1.1.1.1"
    assert level_4.start_page == 3
    assert level_4.end_page == 3  # Just one page
    assert len(level_4.children) == 0  # No children
|
||||
|
||||
|
||||
def test_extract_sections_with_different_toc_formats():
    """Test ability to handle different TOC formats."""

    def fake_page(i):
        return MagicMock(get_text=lambda: f"Content of page {i}")

    doc = MagicMock()
    doc.page_count = 5
    doc.__getitem__.side_effect = fake_page

    # The same two chapters, first as tuples and then as lists
    tuple_toc = [
        (1, "Chapter 1", 0),
        (1, "Chapter 2", 3),
    ]
    list_toc = [
        [1, "Chapter 1", 0],
        [1, "Chapter 2", 3],
    ]

    for toc_entries in (tuple_toc, list_toc):
        doc.get_toc.return_value = toc_entries

        sections = extract_sections(doc)

        assert len(sections) == 2
        assert sections[0].title == "Chapter 1"
        assert sections[1].title == "Chapter 2"
|
||||
|
||||
|
||||
@patch("fitz.open")
def test_parse_ebook_full_content_generation(mock_open, mock_doc, tmp_path):
    """Test full content is correctly concatenated from sections."""
    # Setup mock document
    mock_doc.metadata = {"title": "Test Book", "author": "Test Author"}
    mock_doc.page_count = 3

    # Stand-in sections that only carry the content being joined
    first = MagicMock()
    first.content = "Content of section 1"
    second = MagicMock()
    second.content = "Content of section 2"

    # Replace extract_sections so the parser sees exactly these sections
    with patch("memory.common.parsers.ebook.extract_sections") as mock_extract:
        mock_extract.return_value = [first, second]
        mock_open.return_value = mock_doc

        test_file = tmp_path / "test.epub"
        test_file.touch()

        ebook = parse_ebook(test_file)

        # Check the full content is concatenated correctly
        assert ebook.full_content == "Content of section 1Content of section 2"
|
Loading…
x
Reference in New Issue
Block a user