From 6b48a768b925665d69496e9595b926ff929a7d01 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Fri, 27 Jun 2025 03:38:55 +0200
Subject: [PATCH] better file retrieval

---
 requirements/requirements-api.txt |  2 +-
 src/memory/api/MCP/base.py        |  1 +
 src/memory/api/MCP/memory.py      | 73 ++++++++-----------------
 src/memory/common/extract.py      | 89 ++++++++++++++++++++++++++++++-
 4 files changed, 111 insertions(+), 54 deletions(-)

diff --git a/requirements/requirements-api.txt b/requirements/requirements-api.txt
index 30cebfe..9b01a9b 100644
--- a/requirements/requirements-api.txt
+++ b/requirements/requirements-api.txt
@@ -3,5 +3,5 @@ uvicorn==0.29.0
 python-jose==3.3.0
 python-multipart==0.0.9
 sqladmin==0.20.1
-mcp==1.9.2
+mcp==1.10.0
 bm25s[full]==0.2.13
\ No newline at end of file
diff --git a/src/memory/api/MCP/base.py b/src/memory/api/MCP/base.py
index 440d65f..04a4b61 100644
--- a/src/memory/api/MCP/base.py
+++ b/src/memory/api/MCP/base.py
@@ -61,6 +61,7 @@ templates = Jinja2Templates(directory=template_dir)
 oauth_provider = SimpleOAuthProvider()
 auth_settings = AuthSettings(
     issuer_url=cast(AnyHttpUrl, settings.SERVER_URL),
+    resource_server_url=cast(AnyHttpUrl, settings.SERVER_URL),
     client_registration_options=ClientRegistrationOptions(
         enabled=True,
         valid_scopes=["read", "write"],
diff --git a/src/memory/api/MCP/memory.py b/src/memory/api/MCP/memory.py
index 6d63519..ad84418 100644
--- a/src/memory/api/MCP/memory.py
+++ b/src/memory/api/MCP/memory.py
@@ -2,16 +2,18 @@
 MCP tools for the epistemic sparring partner system.
 """
 
+import base64
 import logging
-import mimetypes
 import pathlib
 from datetime import datetime, timezone
 from typing import Any
 
+from PIL import Image
 from pydantic import BaseModel
 from sqlalchemy import Text
 from sqlalchemy import cast as sql_cast
 from sqlalchemy.dialects.postgresql import ARRAY
+from mcp.server.fastmcp.resources.base import Resource
 
 from memory.api.MCP.tools import mcp
 from memory.api.search.search import SearchFilters, search
@@ -350,61 +352,30 @@ def fetch_file(filename: str) -> dict:
     Returns dict with content, mime_type, is_text, file_size.
     Text content as string, binary as base64.
""" - path = settings.FILE_STORAGE_DIR / filename.lstrip("/") + path = settings.FILE_STORAGE_DIR / filename.strip().lstrip("/") if not path.exists(): raise FileNotFoundError(f"File not found: {filename}") - mime_type, _ = mimetypes.guess_type(str(path)) - mime_type = mime_type or "application/octet-stream" + mime_type = extract.get_mime_type(path) + chunks = extract.extract_data_chunks(mime_type, path, skip_summary=True) - text_extensions = { - ".md", - ".txt", - ".py", - ".js", - ".html", - ".css", - ".json", - ".xml", - ".yaml", - ".yml", - ".toml", - ".ini", - ".cfg", - ".conf", - } - text_mimes = { - "application/json", - "application/xml", - "application/javascript", - "application/x-yaml", - "application/yaml", - } - is_text = ( - mime_type.startswith("text/") - or mime_type in text_mimes - or path.suffix.lower() in text_extensions - ) + def serialize_chunk( + chunk: extract.DataChunk, data: extract.MulitmodalChunk + ) -> dict: + contents = data + if isinstance(data, Image.Image): + contents = data.tobytes() + if isinstance(contents, bytes): + contents = base64.b64encode(contents).decode("ascii") - try: - content = ( - path.read_text(encoding="utf-8") - if is_text - else __import__("base64").b64encode(path.read_bytes()).decode("ascii") - ) - except UnicodeDecodeError: - import base64 - - content = base64.b64encode(path.read_bytes()).decode("ascii") - is_text = False - mime_type = ( - "application/octet-stream" if mime_type.startswith("text/") else mime_type - ) + return { + "type": "text" if isinstance(data, str) else "image", + "mime_type": chunk.mime_type, + "data": contents, + } return { - "content": content, - "mime_type": mime_type, - "is_text": is_text, - "file_size": path.stat().st_size, - "filename": filename, + "content": [ + serialize_chunk(chunk, data) for chunk in chunks for data in chunk.data + ] } diff --git a/src/memory/common/extract.py b/src/memory/common/extract.py index 2fbe50a..92be227 100644 --- a/src/memory/common/extract.py +++ b/src/memory/common/extract.py @@ -1,12 +1,14 @@ from dataclasses import dataclass, field import io import logging +import mimetypes import pathlib import tempfile from contextlib import contextmanager from typing import Any, Generator, Sequence, cast from memory.common import chunker, summarizer +from memory.parsers import ebook import pymupdf # PyMuPDF import pypandoc from PIL import Image @@ -15,6 +17,70 @@ logger = logging.getLogger(__name__) MulitmodalChunk = Image.Image | str +TEXT_EXTENSIONS = { + ".md", + ".txt", + ".py", + ".js", + ".html", + ".css", + ".json", + ".xml", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", +} +IMAGE_EXTENSIONS = { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", +} +CUSTOM_EXTENSIONS = { + ".epub": "application/epub+zip", + ".pdf": "application/pdf", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".doc": "application/msword", + ".xls": "application/vnd.ms-excel", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".ppt": "application/vnd.ms-powerpoint", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: pathlib.Path) -> str: + mime_type, _ = mimetypes.guess_type(str(path)) + if mime_type: + print(f"mime_type: {mime_type}") + return mime_type + ext = path.suffix.lower() + return CUSTOM_EXTENSIONS.get(ext, "application/octet-stream") + + +def is_text_file(path: pathlib.Path) -> bool: + mime_type = get_mime_type(path) + text_mimes = { + 
"application/json", + "application/xml", + "application/javascript", + "application/x-yaml", + "application/yaml", + } + return ( + mime_type.startswith("text/") + or mime_type in text_mimes + or path.suffix.lower() in TEXT_EXTENSIONS + ) + + +def is_image_file(path: pathlib.Path) -> bool: + mime_type = get_mime_type(path) + return mime_type.startswith("image/") or path.suffix.lower() in IMAGE_EXTENSIONS + def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]: final = {} @@ -126,6 +192,7 @@ def extract_text( chunk_size: int | None = None, metadata: dict[str, Any] = {}, modality: str = "text", + skip_summary: bool = False, ) -> list[DataChunk]: if isinstance(content, pathlib.Path): content = content.read_text() @@ -137,7 +204,7 @@ def extract_text( DataChunk(data=[c], modality=modality, metadata=metadata) for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS) ] - if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2: + if not skip_summary and content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2: summary, tags = summarizer.summarize(content) chunks.append( DataChunk( @@ -149,10 +216,26 @@ def extract_text( return chunks +def extract_ebook(file_path: str | pathlib.Path) -> list[DataChunk]: + book = ebook.parse_ebook(file_path) + return [ + DataChunk( + mime_type="text/plain", + data=[ + page.strip() + for section in book.sections + for page in section.pages + if page.strip() + ], + ) + ] + + def extract_data_chunks( mime_type: str, content: bytes | str | pathlib.Path, chunk_size: int | None = None, + skip_summary: bool = False, ) -> list[DataChunk]: chunks = [] logger.info(f"Extracting content from {mime_type}") @@ -164,7 +247,9 @@ def extract_data_chunks( ]: chunks = extract_docx(content) elif mime_type.startswith("text/"): - chunks = extract_text(content, chunk_size) + chunks = extract_text(content, chunk_size, skip_summary=skip_summary) elif mime_type.startswith("image/"): chunks = extract_image(content) + elif mime_type == "application/epub+zip": + chunks = extract_ebook(cast(pathlib.Path, content)) return chunks