memory/tests/memory/common/test_extract.py
2025-05-20 21:52:47 +02:00

188 lines
5.3 KiB
Python

import pathlib
import pytest
import pymupdf
from PIL import Image
import io
import shutil
from memory.common.extract import (
as_file,
extract_text,
extract_content,
doc_to_images,
extract_image,
docx_to_pdf,
extract_docx,
)
# Fixture documents live in the "data" directory three levels above this file.
_DATA_DIR = pathlib.Path(__file__).parent.parent.parent / "data"
REGULAMIN = _DATA_DIR / "regulamin.pdf"
SAMPLE_DOCX = _DATA_DIR / "sample.docx"
def is_pdflatex_available():
    """Return True when a ``pdflatex`` executable can be located on the PATH."""
    pdflatex_path = shutil.which("pdflatex")
    return pdflatex_path is not None
def test_as_file_with_path(tmp_path):
    """as_file given a path should yield that same path with its content readable."""
    source = tmp_path / "test.txt"
    source.write_text("test content")

    with as_file(source) as yielded:
        assert yielded == source
        assert yielded.read_text() == "test content"
def test_as_file_with_bytes():
    """as_file given bytes should yield a path whose file holds exactly those bytes."""
    payload = b"test content"

    with as_file(payload) as yielded:
        stored = pathlib.Path(yielded).read_bytes()
        assert stored == payload
def test_as_file_with_str():
    """as_file given a str should yield a path whose file holds that text."""
    payload = "test content"

    with as_file(payload) as yielded:
        stored = pathlib.Path(yielded).read_text()
        assert stored == payload
@pytest.mark.parametrize(
    "input_content,expected",
    [
        ("simple text", [{"contents": ["simple text"], "metadata": {}}]),
        (b"bytes text", [{"contents": ["bytes text"], "metadata": {}}]),
    ],
)
def test_extract_text(input_content, expected):
    """extract_text should return one page with the text for both str and bytes input."""
    extracted = extract_text(input_content)
    assert extracted == expected
def test_extract_text_with_path(tmp_path):
    """extract_text given a file path should return that file's text as a single page."""
    text_file = tmp_path / "test.txt"
    text_file.write_text("file text content")

    expected_pages = [{"contents": ["file text content"], "metadata": {}}]
    assert extract_text(text_file) == expected_pages
def test_doc_to_images():
    """doc_to_images should render each page of the sample PDF with matching metadata."""
    rendered_pages = doc_to_images(REGULAMIN)
    assert len(rendered_pages) == 2

    with pymupdf.open(REGULAMIN) as document:
        for rendered, source_page in zip(rendered_pages, document.pages()):
            # Render the reference image straight from PyMuPDF for comparison.
            pixmap = source_page.get_pixmap()
            reference = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )
            assert rendered["contents"] == [reference]

            page_rect = source_page.rect
            expected_metadata = {
                "page": source_page.number,
                "width": page_rect.width,
                "height": page_rect.height,
            }
            assert rendered["metadata"] == expected_metadata
def test_extract_image_with_path(tmp_path):
    """extract_image given a PNG path should return one page with the same pixels."""
    original = Image.new("RGB", (100, 100), color="red")
    png_path = tmp_path / "test.png"
    original.save(png_path)

    [page] = extract_image(png_path)

    expected_pixels = original.convert("RGB").tobytes()
    assert page["contents"][0].tobytes() == expected_pixels
    assert page["metadata"] == {}
def test_extract_image_with_bytes():
    """extract_image given PNG bytes should return one page with the same pixels."""
    original = Image.new("RGB", (100, 100), color="blue")

    # Encode the image to PNG entirely in memory.
    png_stream = io.BytesIO()
    original.save(png_stream, format="PNG")
    png_bytes = png_stream.getvalue()

    [page] = extract_image(png_bytes)

    expected_pixels = original.convert("RGB").tobytes()
    assert page["contents"][0].tobytes() == expected_pixels
    assert page["metadata"] == {}
def test_extract_image_with_str():
    """extract_image should raise ValueError when handed a plain string."""
    not_an_image = "test"
    with pytest.raises(ValueError):
        extract_image(not_an_image)
@pytest.mark.parametrize(
    "mime_type,content",
    [
        ("text/plain", "Text content"),
        ("text/html", "<html>content</html>"),
        ("text/markdown", "# Heading"),
        ("text/csv", "a,b,c"),
    ],
)
def test_extract_content_different_text_types(mime_type, content):
    """extract_content should pass text-like MIME types through as a single text page."""
    expected_pages = [{"contents": [content], "metadata": {}}]
    extracted = extract_content(mime_type, content)
    assert extracted == expected_pages
def test_extract_content_pdf():
    """extract_content for a PDF should yield image pages carrying page geometry."""
    pages = extract_content("application/pdf", REGULAMIN)
    assert len(pages) == 2

    # Every page must hold a list made up solely of PIL images.
    for page in pages:
        contents = page["contents"]
        assert isinstance(contents, list)
        assert all(isinstance(item, Image.Image) for item in contents)

    # Every page must expose the page number and dimensions in its metadata.
    for key in ("page", "width", "height"):
        assert all(key in page["metadata"] for page in pages)
def test_extract_content_image(tmp_path):
    """extract_content for image/png should return one page holding the decoded image."""
    # Write a small solid-colour PNG to use as input.
    png_path = tmp_path / "test_img.png"
    Image.new("RGB", (100, 100), color="red").save(png_path)

    [page] = extract_content("image/png", png_path)

    first_item = page["contents"][0]
    assert isinstance(first_item, Image.Image)
    assert first_item.size == (100, 100)
    assert page["metadata"] == {}
def test_extract_content_unsupported_type():
    """extract_content should return an empty list for a MIME type it does not handle."""
    extracted = extract_content("unsupported/type", "content")
    assert extracted == []
@pytest.mark.skipif(not is_pdflatex_available(), reason="pdflatex not installed")
def test_docx_to_pdf(tmp_path):
    """docx_to_pdf should write a readable PDF to the explicitly requested path."""
    target = tmp_path / "output.pdf"

    produced = docx_to_pdf(SAMPLE_DOCX, target)

    assert produced == target
    assert produced.exists()
    assert produced.suffix == ".pdf"

    # Opening the result with PyMuPDF confirms it is a valid PDF with pages.
    with pymupdf.open(produced) as document:
        assert document.page_count > 0
@pytest.mark.skipif(not is_pdflatex_available(), reason="pdflatex not installed")
def test_docx_to_pdf_default_output(tmp_path):
    """docx_to_pdf without an output path should write the PDF next to the input.

    The conversion runs on a copy of the sample document inside ``tmp_path``.
    Converting ``SAMPLE_DOCX`` in place would leave a generated ``sample.pdf``
    in the repository's data directory, and a stale file from an earlier run
    could make the ``exists()`` assertion pass even if the conversion failed.
    """
    docx_copy = tmp_path / SAMPLE_DOCX.name
    shutil.copy(SAMPLE_DOCX, docx_copy)

    result_path = docx_to_pdf(docx_copy)

    # The default output is the input path with its suffix swapped to .pdf.
    assert result_path == docx_copy.with_suffix(".pdf")
    assert result_path.exists()
@pytest.mark.skipif(not is_pdflatex_available(), reason="pdflatex not installed")
def test_extract_docx():
    """extract_docx should return at least one dict page whose first item is an image."""
    extracted = extract_docx(SAMPLE_DOCX)
    assert len(extracted) > 0

    assert all(isinstance(entry, dict) for entry in extracted)
    # Each page must carry both of the expected top-level keys.
    for required_key in ("contents", "metadata"):
        assert all(required_key in entry for entry in extracted)
    assert all(isinstance(entry["contents"][0], Image.Image) for entry in extracted)