mirror of
https://github.com/mruwnik/memory.git
synced 2025-06-28 15:14:45 +02:00
better file retrieval
This commit is contained in:
parent
a3daea883b
commit
6b48a768b9
@ -3,5 +3,5 @@ uvicorn==0.29.0
|
||||
python-jose==3.3.0
|
||||
python-multipart==0.0.9
|
||||
sqladmin==0.20.1
|
||||
mcp==1.9.2
|
||||
mcp==1.10.0
|
||||
bm25s[full]==0.2.13
|
@ -61,6 +61,7 @@ templates = Jinja2Templates(directory=template_dir)
|
||||
oauth_provider = SimpleOAuthProvider()
|
||||
auth_settings = AuthSettings(
|
||||
issuer_url=cast(AnyHttpUrl, settings.SERVER_URL),
|
||||
resource_server_url=cast(AnyHttpUrl, settings.SERVER_URL),
|
||||
client_registration_options=ClientRegistrationOptions(
|
||||
enabled=True,
|
||||
valid_scopes=["read", "write"],
|
||||
|
@ -2,16 +2,18 @@
|
||||
MCP tools for the epistemic sparring partner system.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import mimetypes
|
||||
import pathlib
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from PIL import Image
|
||||
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import Text
|
||||
from sqlalchemy import cast as sql_cast
|
||||
from sqlalchemy.dialects.postgresql import ARRAY
|
||||
from mcp.server.fastmcp.resources.base import Resource
|
||||
|
||||
from memory.api.MCP.tools import mcp
|
||||
from memory.api.search.search import SearchFilters, search
|
||||
@ -350,61 +352,30 @@ def fetch_file(filename: str) -> dict:
|
||||
Returns dict with content, mime_type, is_text, file_size.
|
||||
Text content as string, binary as base64.
|
||||
"""
|
||||
path = settings.FILE_STORAGE_DIR / filename.lstrip("/")
|
||||
path = settings.FILE_STORAGE_DIR / filename.strip().lstrip("/")
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"File not found: {filename}")
|
||||
|
||||
mime_type, _ = mimetypes.guess_type(str(path))
|
||||
mime_type = mime_type or "application/octet-stream"
|
||||
mime_type = extract.get_mime_type(path)
|
||||
chunks = extract.extract_data_chunks(mime_type, path, skip_summary=True)
|
||||
|
||||
text_extensions = {
|
||||
".md",
|
||||
".txt",
|
||||
".py",
|
||||
".js",
|
||||
".html",
|
||||
".css",
|
||||
".json",
|
||||
".xml",
|
||||
".yaml",
|
||||
".yml",
|
||||
".toml",
|
||||
".ini",
|
||||
".cfg",
|
||||
".conf",
|
||||
}
|
||||
text_mimes = {
|
||||
"application/json",
|
||||
"application/xml",
|
||||
"application/javascript",
|
||||
"application/x-yaml",
|
||||
"application/yaml",
|
||||
}
|
||||
is_text = (
|
||||
mime_type.startswith("text/")
|
||||
or mime_type in text_mimes
|
||||
or path.suffix.lower() in text_extensions
|
||||
)
|
||||
|
||||
try:
|
||||
content = (
|
||||
path.read_text(encoding="utf-8")
|
||||
if is_text
|
||||
else __import__("base64").b64encode(path.read_bytes()).decode("ascii")
|
||||
)
|
||||
except UnicodeDecodeError:
|
||||
import base64
|
||||
|
||||
content = base64.b64encode(path.read_bytes()).decode("ascii")
|
||||
is_text = False
|
||||
mime_type = (
|
||||
"application/octet-stream" if mime_type.startswith("text/") else mime_type
|
||||
)
|
||||
def serialize_chunk(
|
||||
chunk: extract.DataChunk, data: extract.MulitmodalChunk
|
||||
) -> dict:
|
||||
contents = data
|
||||
if isinstance(data, Image.Image):
|
||||
contents = data.tobytes()
|
||||
if isinstance(contents, bytes):
|
||||
contents = base64.b64encode(contents).decode("ascii")
|
||||
|
||||
return {
|
||||
"content": content,
|
||||
"mime_type": mime_type,
|
||||
"is_text": is_text,
|
||||
"file_size": path.stat().st_size,
|
||||
"filename": filename,
|
||||
"type": "text" if isinstance(data, str) else "image",
|
||||
"mime_type": chunk.mime_type,
|
||||
"data": contents,
|
||||
}
|
||||
|
||||
return {
|
||||
"content": [
|
||||
serialize_chunk(chunk, data) for chunk in chunks for data in chunk.data
|
||||
]
|
||||
}
|
||||
|
@ -1,12 +1,14 @@
|
||||
from dataclasses import dataclass, field
|
||||
import io
|
||||
import logging
|
||||
import mimetypes
|
||||
import pathlib
|
||||
import tempfile
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Generator, Sequence, cast
|
||||
|
||||
from memory.common import chunker, summarizer
|
||||
from memory.parsers import ebook
|
||||
import pymupdf # PyMuPDF
|
||||
import pypandoc
|
||||
from PIL import Image
|
||||
@ -15,6 +17,70 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
MulitmodalChunk = Image.Image | str
|
||||
|
||||
# Suffixes that is_text_file() accepts as text even when the guessed MIME
# type alone would not qualify.
TEXT_EXTENSIONS = {
    # prose / plain text
    ".md", ".txt",
    # source code and markup
    ".py", ".js", ".html", ".css", ".xml",
    # structured data and configuration
    ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
}

# Suffixes that is_image_file() accepts as images even when the guessed MIME
# type is not image/*.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp"}

# Suffix -> MIME type, used by get_mime_type() as a fallback when the
# standard mimetypes registry returns no guess for a path.
CUSTOM_EXTENSIONS = {
    # e-books and portable documents
    ".epub": "application/epub+zip",
    ".pdf": "application/pdf",
    # word processing
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    # spreadsheets
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    # presentations
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}
|
||||
|
||||
|
||||
def get_mime_type(path: pathlib.Path) -> str:
    """Return the MIME type for *path*, judged from its file name only.

    The standard ``mimetypes`` registry is consulted first. When it has no
    guess, the lower-cased suffix is looked up in ``CUSTOM_EXTENSIONS``;
    if that also misses, ``"application/octet-stream"`` is returned.
    The file itself is never opened.

    Args:
        path: Path whose name/suffix is used for the lookup.

    Returns:
        A MIME type string; never empty or ``None``.
    """
    guessed, _ = mimetypes.guess_type(str(path))
    if guessed:
        # Deliberately no print() here: this helper runs on every file
        # lookup and must not write debugging output to stdout.
        return guessed
    return CUSTOM_EXTENSIONS.get(path.suffix.lower(), "application/octet-stream")
|
||||
|
||||
|
||||
def is_text_file(path: pathlib.Path) -> bool:
    """Tell whether *path* should be handled as text.

    A path qualifies when its MIME type (from ``get_mime_type``) starts with
    ``text/``, when that MIME type is one of a few textual ``application/*``
    types, or when its lower-cased suffix is listed in ``TEXT_EXTENSIONS``.
    """
    detected = get_mime_type(path)
    if detected.startswith("text/"):
        return True

    # application/* types that nevertheless carry plain text.
    textual_application_types = {
        "application/json",
        "application/xml",
        "application/javascript",
        "application/x-yaml",
        "application/yaml",
    }
    if detected in textual_application_types:
        return True

    return path.suffix.lower() in TEXT_EXTENSIONS
|
||||
|
||||
|
||||
def is_image_file(path: pathlib.Path) -> bool:
    """Tell whether *path* should be handled as an image.

    True when the MIME type from ``get_mime_type`` starts with ``image/``,
    or when the lower-cased suffix is listed in ``IMAGE_EXTENSIONS``.
    """
    if get_mime_type(path).startswith("image/"):
        return True
    return path.suffix.lower() in IMAGE_EXTENSIONS
|
||||
|
||||
|
||||
def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]:
|
||||
final = {}
|
||||
@ -126,6 +192,7 @@ def extract_text(
|
||||
chunk_size: int | None = None,
|
||||
metadata: dict[str, Any] = {},
|
||||
modality: str = "text",
|
||||
skip_summary: bool = False,
|
||||
) -> list[DataChunk]:
|
||||
if isinstance(content, pathlib.Path):
|
||||
content = content.read_text()
|
||||
@ -137,7 +204,7 @@ def extract_text(
|
||||
DataChunk(data=[c], modality=modality, metadata=metadata)
|
||||
for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS)
|
||||
]
|
||||
if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
|
||||
if not skip_summary and content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
|
||||
summary, tags = summarizer.summarize(content)
|
||||
chunks.append(
|
||||
DataChunk(
|
||||
@ -149,10 +216,26 @@ def extract_text(
|
||||
return chunks
|
||||
|
||||
|
||||
def extract_ebook(file_path: str | pathlib.Path) -> list[DataChunk]:
    """Parse an e-book and return its pages as one plain-text chunk.

    The book is parsed with ``ebook.parse_ebook``. Every page of every
    section is stripped of surrounding whitespace, and pages that are empty
    after stripping are dropped. The remaining pages, in section/page order,
    become the ``data`` of a single ``DataChunk`` with mime type
    ``text/plain``.
    """
    parsed = ebook.parse_ebook(file_path)

    pages = []
    for section in parsed.sections:
        for raw_page in section.pages:
            cleaned = raw_page.strip()
            if cleaned:
                pages.append(cleaned)

    return [DataChunk(mime_type="text/plain", data=pages)]
|
||||
|
||||
|
||||
def extract_data_chunks(
|
||||
mime_type: str,
|
||||
content: bytes | str | pathlib.Path,
|
||||
chunk_size: int | None = None,
|
||||
skip_summary: bool = False,
|
||||
) -> list[DataChunk]:
|
||||
chunks = []
|
||||
logger.info(f"Extracting content from {mime_type}")
|
||||
@ -164,7 +247,9 @@ def extract_data_chunks(
|
||||
]:
|
||||
chunks = extract_docx(content)
|
||||
elif mime_type.startswith("text/"):
|
||||
chunks = extract_text(content, chunk_size)
|
||||
chunks = extract_text(content, chunk_size, skip_summary=skip_summary)
|
||||
elif mime_type.startswith("image/"):
|
||||
chunks = extract_image(content)
|
||||
elif mime_type == "application/epub+zip":
|
||||
chunks = extract_ebook(cast(pathlib.Path, content))
|
||||
return chunks
|
||||
|
Loading…
x
Reference in New Issue
Block a user