Compare commits

..

No commits in common. "50d0eb97db138ae4f61f59384c09ff8a46fb3408" and "a3daea883b774c3fe1b9329a4bc4332a12ed1ce9" have entirely different histories.

5 changed files with 55 additions and 112 deletions

View File

@ -1,5 +1,5 @@
# Backend base stage
FROM python:3.12-slim AS backend-base
FROM python:3.11-slim AS backend-base
WORKDIR /app

View File

@ -3,5 +3,5 @@ uvicorn==0.29.0
python-jose==3.3.0
python-multipart==0.0.9
sqladmin==0.20.1
mcp==1.10.0
mcp==1.9.2
bm25s[full]==0.2.13

View File

@ -61,7 +61,6 @@ templates = Jinja2Templates(directory=template_dir)
oauth_provider = SimpleOAuthProvider()
auth_settings = AuthSettings(
issuer_url=cast(AnyHttpUrl, settings.SERVER_URL),
resource_server_url=cast(AnyHttpUrl, settings.SERVER_URL),
client_registration_options=ClientRegistrationOptions(
enabled=True,
valid_scopes=["read", "write"],

View File

@ -2,18 +2,16 @@
MCP tools for the epistemic sparring partner system.
"""
import base64
import logging
import mimetypes
import pathlib
from datetime import datetime, timezone
from typing import Any
from PIL import Image
from pydantic import BaseModel
from sqlalchemy import Text
from sqlalchemy import cast as sql_cast
from sqlalchemy.dialects.postgresql import ARRAY
from mcp.server.fastmcp.resources.base import Resource
from memory.api.MCP.tools import mcp
from memory.api.search.search import SearchFilters, search
@ -352,30 +350,61 @@ def fetch_file(filename: str) -> dict:
Returns dict with content, mime_type, is_text, file_size.
Text content as string, binary as base64.
"""
path = settings.FILE_STORAGE_DIR / filename.strip().lstrip("/")
path = settings.FILE_STORAGE_DIR / filename.lstrip("/")
if not path.exists():
raise FileNotFoundError(f"File not found: {filename}")
mime_type = extract.get_mime_type(path)
chunks = extract.extract_data_chunks(mime_type, path, skip_summary=True)
mime_type, _ = mimetypes.guess_type(str(path))
mime_type = mime_type or "application/octet-stream"
def serialize_chunk(
chunk: extract.DataChunk, data: extract.MulitmodalChunk
) -> dict:
contents = data
if isinstance(data, Image.Image):
contents = data.tobytes()
if isinstance(contents, bytes):
contents = base64.b64encode(contents).decode("ascii")
text_extensions = {
".md",
".txt",
".py",
".js",
".html",
".css",
".json",
".xml",
".yaml",
".yml",
".toml",
".ini",
".cfg",
".conf",
}
text_mimes = {
"application/json",
"application/xml",
"application/javascript",
"application/x-yaml",
"application/yaml",
}
is_text = (
mime_type.startswith("text/")
or mime_type in text_mimes
or path.suffix.lower() in text_extensions
)
return {
"type": "text" if isinstance(data, str) else "image",
"mime_type": chunk.mime_type,
"data": contents,
}
try:
content = (
path.read_text(encoding="utf-8")
if is_text
else __import__("base64").b64encode(path.read_bytes()).decode("ascii")
)
except UnicodeDecodeError:
import base64
content = base64.b64encode(path.read_bytes()).decode("ascii")
is_text = False
mime_type = (
"application/octet-stream" if mime_type.startswith("text/") else mime_type
)
return {
"content": [
serialize_chunk(chunk, data) for chunk in chunks for data in chunk.data
]
"content": content,
"mime_type": mime_type,
"is_text": is_text,
"file_size": path.stat().st_size,
"filename": filename,
}

View File

@ -1,14 +1,12 @@
from dataclasses import dataclass, field
import io
import logging
import mimetypes
import pathlib
import tempfile
from contextlib import contextmanager
from typing import Any, Generator, Sequence, cast
from memory.common import chunker, summarizer
from memory.parsers import ebook
import pymupdf # PyMuPDF
import pypandoc
from PIL import Image
@ -17,70 +15,6 @@ logger = logging.getLogger(__name__)
MulitmodalChunk = Image.Image | str
# Lower-case file suffixes that is_text_file() accepts as text regardless of
# the detected MIME type.
TEXT_EXTENSIONS = {
    # prose / markup
    ".md", ".txt", ".html", ".css", ".xml",
    # source code
    ".py", ".js",
    # structured data and configuration
    ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
}

# Lower-case file suffixes that is_image_file() accepts as images regardless
# of the detected MIME type.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp"}

# Suffix -> MIME type table consulted by get_mime_type() when the standard
# `mimetypes` lookup returns nothing.
CUSTOM_EXTENSIONS = {
    ".epub": "application/epub+zip",
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}
def get_mime_type(path: pathlib.Path) -> str:
    """Return the MIME type for *path*, judged by its file name only.

    The standard-library ``mimetypes`` table is consulted first. If it has no
    answer, the lower-cased suffix is looked up in ``CUSTOM_EXTENSIONS``, and
    ``"application/octet-stream"`` is returned when that also fails.

    The file is never opened; only the name is inspected.
    """
    # NOTE: a stray debug print() used to live here; library code must not
    # write to stdout on every lookup, so it has been removed.
    guessed, _encoding = mimetypes.guess_type(str(path))
    if guessed:
        return guessed

    suffix = path.suffix.lower()
    return CUSTOM_EXTENSIONS.get(suffix, "application/octet-stream")
def is_text_file(path: pathlib.Path) -> bool:
    """Report whether *path* should be handled as text.

    A path counts as text when any of the following holds:

    * its MIME type (from ``get_mime_type``) starts with ``text/``;
    * its MIME type is one of a few textual ``application/*`` types;
    * its lower-cased suffix is listed in ``TEXT_EXTENSIONS``.
    """
    detected = get_mime_type(path)

    if detected.startswith("text/"):
        return True

    # application/* types whose payload is nevertheless human-readable text.
    textual_application_types = {
        "application/json",
        "application/xml",
        "application/javascript",
        "application/x-yaml",
        "application/yaml",
    }
    if detected in textual_application_types:
        return True

    return path.suffix.lower() in TEXT_EXTENSIONS
def is_image_file(path: pathlib.Path) -> bool:
    """Report whether *path* should be handled as an image.

    True when the MIME type from ``get_mime_type`` starts with ``image/`` or
    the lower-cased suffix is listed in ``IMAGE_EXTENSIONS``.
    """
    detected = get_mime_type(path)
    if detected.startswith("image/"):
        return True
    return path.suffix.lower() in IMAGE_EXTENSIONS
def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]:
final = {}
@ -192,7 +126,6 @@ def extract_text(
chunk_size: int | None = None,
metadata: dict[str, Any] = {},
modality: str = "text",
skip_summary: bool = False,
) -> list[DataChunk]:
if isinstance(content, pathlib.Path):
content = content.read_text()
@ -204,7 +137,7 @@ def extract_text(
DataChunk(data=[c], modality=modality, metadata=metadata)
for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS)
]
if not skip_summary and content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
summary, tags = summarizer.summarize(content)
chunks.append(
DataChunk(
@ -216,26 +149,10 @@ def extract_text(
return chunks
def extract_ebook(file_path: str | pathlib.Path) -> list[DataChunk]:
    """Parse an ebook and return its pages as one ``text/plain`` chunk.

    The book is parsed with ``ebook.parse_ebook``. Every page of every section
    is whitespace-stripped, and pages that end up empty are dropped. The result
    is always a single-element list, even when no page survives.
    """
    parsed = ebook.parse_ebook(file_path)

    pages = []
    for section in parsed.sections:
        for page in section.pages:
            # presumably pages are plain strings — verify against ebook parser
            stripped = page.strip()
            if stripped:
                pages.append(stripped)

    return [DataChunk(mime_type="text/plain", data=pages)]
def extract_data_chunks(
mime_type: str,
content: bytes | str | pathlib.Path,
chunk_size: int | None = None,
skip_summary: bool = False,
) -> list[DataChunk]:
chunks = []
logger.info(f"Extracting content from {mime_type}")
@ -247,9 +164,7 @@ def extract_data_chunks(
]:
chunks = extract_docx(content)
elif mime_type.startswith("text/"):
chunks = extract_text(content, chunk_size, skip_summary=skip_summary)
chunks = extract_text(content, chunk_size)
elif mime_type.startswith("image/"):
chunks = extract_image(content)
elif mime_type == "application/epub+zip":
chunks = extract_ebook(cast(pathlib.Path, content))
return chunks