better file retrieval

This commit is contained in:
Daniel O'Connell 2025-06-27 03:38:55 +02:00
parent a3daea883b
commit 6b48a768b9
4 changed files with 111 additions and 54 deletions

View File

@ -3,5 +3,5 @@ uvicorn==0.29.0
python-jose==3.3.0
python-multipart==0.0.9
sqladmin==0.20.1
mcp==1.9.2
mcp==1.10.0
bm25s[full]==0.2.13

View File

@ -61,6 +61,7 @@ templates = Jinja2Templates(directory=template_dir)
oauth_provider = SimpleOAuthProvider()
auth_settings = AuthSettings(
issuer_url=cast(AnyHttpUrl, settings.SERVER_URL),
resource_server_url=cast(AnyHttpUrl, settings.SERVER_URL),
client_registration_options=ClientRegistrationOptions(
enabled=True,
valid_scopes=["read", "write"],

View File

@ -2,16 +2,18 @@
MCP tools for the epistemic sparring partner system.
"""
import base64
import logging
import mimetypes
import pathlib
from datetime import datetime, timezone
from typing import Any
from PIL import Image
from pydantic import BaseModel
from sqlalchemy import Text
from sqlalchemy import cast as sql_cast
from sqlalchemy.dialects.postgresql import ARRAY
from mcp.server.fastmcp.resources.base import Resource
from memory.api.MCP.tools import mcp
from memory.api.search.search import SearchFilters, search
@ -350,61 +352,30 @@ def fetch_file(filename: str) -> dict:
Returns dict with content, mime_type, is_text, file_size.
Text content as string, binary as base64.
"""
path = settings.FILE_STORAGE_DIR / filename.lstrip("/")
path = settings.FILE_STORAGE_DIR / filename.strip().lstrip("/")
if not path.exists():
raise FileNotFoundError(f"File not found: {filename}")
mime_type, _ = mimetypes.guess_type(str(path))
mime_type = mime_type or "application/octet-stream"
mime_type = extract.get_mime_type(path)
chunks = extract.extract_data_chunks(mime_type, path, skip_summary=True)
text_extensions = {
".md",
".txt",
".py",
".js",
".html",
".css",
".json",
".xml",
".yaml",
".yml",
".toml",
".ini",
".cfg",
".conf",
}
text_mimes = {
"application/json",
"application/xml",
"application/javascript",
"application/x-yaml",
"application/yaml",
}
is_text = (
mime_type.startswith("text/")
or mime_type in text_mimes
or path.suffix.lower() in text_extensions
)
try:
content = (
path.read_text(encoding="utf-8")
if is_text
else __import__("base64").b64encode(path.read_bytes()).decode("ascii")
)
except UnicodeDecodeError:
import base64
content = base64.b64encode(path.read_bytes()).decode("ascii")
is_text = False
mime_type = (
"application/octet-stream" if mime_type.startswith("text/") else mime_type
)
def serialize_chunk(
chunk: extract.DataChunk, data: extract.MulitmodalChunk
) -> dict:
contents = data
if isinstance(data, Image.Image):
contents = data.tobytes()
if isinstance(contents, bytes):
contents = base64.b64encode(contents).decode("ascii")
return {
"content": content,
"mime_type": mime_type,
"is_text": is_text,
"file_size": path.stat().st_size,
"filename": filename,
"type": "text" if isinstance(data, str) else "image",
"mime_type": chunk.mime_type,
"data": contents,
}
return {
"content": [
serialize_chunk(chunk, data) for chunk in chunks for data in chunk.data
]
}

View File

@ -1,12 +1,14 @@
from dataclasses import dataclass, field
import io
import logging
import mimetypes
import pathlib
import tempfile
from contextlib import contextmanager
from typing import Any, Generator, Sequence, cast
from memory.common import chunker, summarizer
from memory.parsers import ebook
import pymupdf # PyMuPDF
import pypandoc
from PIL import Image
@ -15,6 +17,70 @@ logger = logging.getLogger(__name__)
# Type of a single item of chunk data: either a PIL image or a text string.
# NOTE(review): "Mulitmodal" looks like a typo for "Multimodal", but the name
# is referenced as-is by other code, so renaming needs a coordinated change.
MulitmodalChunk = Image.Image | str
# Lower-case file suffixes (leading dot included) that are treated as text.
TEXT_EXTENSIONS = {
    # prose / notes
    ".md", ".txt",
    # source code and markup
    ".py", ".js", ".html", ".css",
    # structured data
    ".json", ".xml", ".yaml", ".yml", ".toml",
    # configuration
    ".ini", ".cfg", ".conf",
}

# Lower-case file suffixes (leading dot included) that are treated as images.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp"}

# Suffix -> MIME type table for document formats, keyed by lower-case suffix.
CUSTOM_EXTENSIONS = {
    # ebooks and portable documents
    ".epub": "application/epub+zip",
    ".pdf": "application/pdf",
    # word processing
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    # spreadsheets
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    # presentations
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}
def get_mime_type(path: pathlib.Path) -> str:
    """Return the best-guess MIME type for ``path`` based on its name.

    The standard ``mimetypes`` registry is consulted first. When it has no
    answer, the lower-cased file suffix is looked up in ``CUSTOM_EXTENSIONS``;
    suffixes missing from that table yield ``"application/octet-stream"``.

    Only the file name is inspected; the file itself is never opened, so the
    path does not need to exist.

    Args:
        path: Path whose name (in particular its suffix) is examined.

    Returns:
        A MIME type string; never empty.
    """
    guessed, _ = mimetypes.guess_type(str(path))
    if guessed:
        # No stdout output here: this helper is called from library/server
        # code, where a stray debug print pollutes the process output.
        return guessed
    return CUSTOM_EXTENSIONS.get(path.suffix.lower(), "application/octet-stream")
def is_text_file(path: pathlib.Path) -> bool:
    """Report whether ``path`` should be handled as text.

    A path counts as text when any of the following holds:

    * its MIME type (from ``get_mime_type``) starts with ``text/``;
    * its MIME type is one of a few ``application/*`` types listed below;
    * its lower-cased suffix is in ``TEXT_EXTENSIONS``.
    """
    mime = get_mime_type(path)
    if mime.startswith("text/"):
        return True

    # application/* types that are accepted as text alongside text/*.
    textual_application_mimes = {
        "application/json",
        "application/xml",
        "application/javascript",
        "application/x-yaml",
        "application/yaml",
    }
    if mime in textual_application_mimes:
        return True

    # Last resort: decide by suffix alone.
    return path.suffix.lower() in TEXT_EXTENSIONS
def is_image_file(path: pathlib.Path) -> bool:
    """Report whether ``path`` should be handled as an image.

    True when the MIME type from ``get_mime_type`` starts with ``image/``,
    or when the lower-cased suffix is in ``IMAGE_EXTENSIONS``.
    """
    if get_mime_type(path).startswith("image/"):
        return True
    return path.suffix.lower() in IMAGE_EXTENSIONS
def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]:
final = {}
@ -126,6 +192,7 @@ def extract_text(
chunk_size: int | None = None,
metadata: dict[str, Any] = {},
modality: str = "text",
skip_summary: bool = False,
) -> list[DataChunk]:
if isinstance(content, pathlib.Path):
content = content.read_text()
@ -137,7 +204,7 @@ def extract_text(
DataChunk(data=[c], modality=modality, metadata=metadata)
for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS)
]
if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
if not skip_summary and content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
summary, tags = summarizer.summarize(content)
chunks.append(
DataChunk(
@ -149,10 +216,26 @@ def extract_text(
return chunks
def extract_ebook(file_path: str | pathlib.Path) -> list[DataChunk]:
    """Parse an ebook and return its pages as one plain-text chunk.

    The book is parsed with ``ebook.parse_ebook``; every page of every
    section is stripped of surrounding whitespace, and pages that are empty
    after stripping are dropped. Section and page order is preserved.

    Args:
        file_path: Location of the ebook, passed straight to the parser.

    Returns:
        A one-element list holding a ``text/plain`` ``DataChunk`` whose
        ``data`` is the list of stripped page texts (possibly empty).
    """
    book = ebook.parse_ebook(file_path)

    pages = []
    for section in book.sections:
        for page in section.pages:
            text = page.strip()
            if text:
                pages.append(text)

    return [DataChunk(mime_type="text/plain", data=pages)]
def extract_data_chunks(
mime_type: str,
content: bytes | str | pathlib.Path,
chunk_size: int | None = None,
skip_summary: bool = False,
) -> list[DataChunk]:
chunks = []
logger.info(f"Extracting content from {mime_type}")
@ -164,7 +247,9 @@ def extract_data_chunks(
]:
chunks = extract_docx(content)
elif mime_type.startswith("text/"):
chunks = extract_text(content, chunk_size)
chunks = extract_text(content, chunk_size, skip_summary=skip_summary)
elif mime_type.startswith("image/"):
chunks = extract_image(content)
elif mime_type == "application/epub+zip":
chunks = extract_ebook(cast(pathlib.Path, content))
return chunks