Mirror of https://github.com/mruwnik/memory.git (synced 2025-06-28 23:24:43 +02:00)
Commit 6b48a768b9 — better file retrieval
Parent: a3daea883b
@@ -3,5 +3,5 @@ uvicorn==0.29.0
 python-jose==3.3.0
 python-multipart==0.0.9
 sqladmin==0.20.1
-mcp==1.9.2
+mcp==1.10.0
 bm25s[full]==0.2.13
@@ -61,6 +61,7 @@ templates = Jinja2Templates(directory=template_dir)
 oauth_provider = SimpleOAuthProvider()
 auth_settings = AuthSettings(
     issuer_url=cast(AnyHttpUrl, settings.SERVER_URL),
+    resource_server_url=cast(AnyHttpUrl, settings.SERVER_URL),
     client_registration_options=ClientRegistrationOptions(
         enabled=True,
         valid_scopes=["read", "write"],
@@ -2,16 +2,18 @@
 MCP tools for the epistemic sparring partner system.
 """

+import base64
 import logging
-import mimetypes
 import pathlib
 from datetime import datetime, timezone
 from typing import Any
+from PIL import Image

 from pydantic import BaseModel
 from sqlalchemy import Text
 from sqlalchemy import cast as sql_cast
 from sqlalchemy.dialects.postgresql import ARRAY
+from mcp.server.fastmcp.resources.base import Resource

 from memory.api.MCP.tools import mcp
 from memory.api.search.search import SearchFilters, search
@@ -350,61 +352,30 @@ def fetch_file(filename: str) -> dict:
     Returns dict with content, mime_type, is_text, file_size.
     Text content as string, binary as base64.
     """
-    path = settings.FILE_STORAGE_DIR / filename.lstrip("/")
+    path = settings.FILE_STORAGE_DIR / filename.strip().lstrip("/")
     if not path.exists():
         raise FileNotFoundError(f"File not found: {filename}")

-    mime_type, _ = mimetypes.guess_type(str(path))
-    mime_type = mime_type or "application/octet-stream"
+    mime_type = extract.get_mime_type(path)
+    chunks = extract.extract_data_chunks(mime_type, path, skip_summary=True)

-    text_extensions = {
-        ".md",
-        ".txt",
-        ".py",
-        ".js",
-        ".html",
-        ".css",
-        ".json",
-        ".xml",
-        ".yaml",
-        ".yml",
-        ".toml",
-        ".ini",
-        ".cfg",
-        ".conf",
-    }
-    text_mimes = {
-        "application/json",
-        "application/xml",
-        "application/javascript",
-        "application/x-yaml",
-        "application/yaml",
-    }
-    is_text = (
-        mime_type.startswith("text/")
-        or mime_type in text_mimes
-        or path.suffix.lower() in text_extensions
-    )
-
-    try:
-        content = (
-            path.read_text(encoding="utf-8")
-            if is_text
-            else __import__("base64").b64encode(path.read_bytes()).decode("ascii")
-        )
-    except UnicodeDecodeError:
-        import base64
-
-        content = base64.b64encode(path.read_bytes()).decode("ascii")
-        is_text = False
-        mime_type = (
-            "application/octet-stream" if mime_type.startswith("text/") else mime_type
-        )
-
-    return {
-        "content": content,
-        "mime_type": mime_type,
-        "is_text": is_text,
-        "file_size": path.stat().st_size,
-        "filename": filename,
-    }
+    def serialize_chunk(
+        chunk: extract.DataChunk, data: extract.MulitmodalChunk
+    ) -> dict:
+        contents = data
+        if isinstance(data, Image.Image):
+            contents = data.tobytes()
+        if isinstance(contents, bytes):
+            contents = base64.b64encode(contents).decode("ascii")
+
+        return {
+            "type": "text" if isinstance(data, str) else "image",
+            "mime_type": chunk.mime_type,
+            "data": contents,
+        }
+
+    return {
+        "content": [
+            serialize_chunk(chunk, data) for chunk in chunks for data in chunk.data
+        ]
+    }
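For illustration, here is the shape change this rewrite makes to fetch_file's payload (a sketch inferred from the hunk above; the literal values are made up). The old version returned one flat dict describing the whole file; the new version returns a list of per-chunk entries produced by serialize_chunk:

    # Old shape: one flat dict per file
    {
        "content": "...",                  # str for text, base64 str for binary
        "mime_type": "text/markdown",
        "is_text": True,
        "file_size": 1234,
        "filename": "notes/example.md",    # hypothetical filename
    }

    # New shape: one entry per extracted chunk of data
    {
        "content": [
            {"type": "text", "mime_type": "text/plain", "data": "chunk text..."},
            {"type": "image", "mime_type": "image/png", "data": "<base64 bytes>"},
        ]
    }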
@@ -1,12 +1,14 @@
 from dataclasses import dataclass, field
 import io
 import logging
+import mimetypes
 import pathlib
 import tempfile
 from contextlib import contextmanager
 from typing import Any, Generator, Sequence, cast

 from memory.common import chunker, summarizer
+from memory.parsers import ebook
 import pymupdf  # PyMuPDF
 import pypandoc
 from PIL import Image
@@ -15,6 +17,70 @@ logger = logging.getLogger(__name__)

 MulitmodalChunk = Image.Image | str

+TEXT_EXTENSIONS = {
+    ".md",
+    ".txt",
+    ".py",
+    ".js",
+    ".html",
+    ".css",
+    ".json",
+    ".xml",
+    ".yaml",
+    ".yml",
+    ".toml",
+    ".ini",
+    ".cfg",
+    ".conf",
+}
+IMAGE_EXTENSIONS = {
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".bmp",
+}
+CUSTOM_EXTENSIONS = {
+    ".epub": "application/epub+zip",
+    ".pdf": "application/pdf",
+    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    ".doc": "application/msword",
+    ".xls": "application/vnd.ms-excel",
+    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    ".ppt": "application/vnd.ms-powerpoint",
+    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+}
+
+
+def get_mime_type(path: pathlib.Path) -> str:
+    mime_type, _ = mimetypes.guess_type(str(path))
+    if mime_type:
+        print(f"mime_type: {mime_type}")
+        return mime_type
+    ext = path.suffix.lower()
+    return CUSTOM_EXTENSIONS.get(ext, "application/octet-stream")
+
+
+def is_text_file(path: pathlib.Path) -> bool:
+    mime_type = get_mime_type(path)
+    text_mimes = {
+        "application/json",
+        "application/xml",
+        "application/javascript",
+        "application/x-yaml",
+        "application/yaml",
+    }
+    return (
+        mime_type.startswith("text/")
+        or mime_type in text_mimes
+        or path.suffix.lower() in TEXT_EXTENSIONS
+    )
+
+
+def is_image_file(path: pathlib.Path) -> bool:
+    mime_type = get_mime_type(path)
+    return mime_type.startswith("image/") or path.suffix.lower() in IMAGE_EXTENSIONS
+

 def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]:
     final = {}
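A quick sketch of the new helpers in use (the file names are hypothetical, and the module path memory.common.extract is assumed from the fetch_file hunk above):

    import pathlib
    from memory.common import extract  # assumed import path

    # Known to mimetypes.guess_type, so returned directly
    extract.get_mime_type(pathlib.Path("notes.txt"))     # "text/plain"
    # Unknown to guess_type on many systems, so CUSTOM_EXTENSIONS kicks in
    extract.get_mime_type(pathlib.Path("book.epub"))     # "application/epub+zip"
    extract.is_text_file(pathlib.Path("config.toml"))    # True, via TEXT_EXTENSIONS
    extract.is_image_file(pathlib.Path("scan.bmp"))      # True, via IMAGE_EXTENSIONS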
@@ -126,6 +192,7 @@ def extract_text(
     chunk_size: int | None = None,
     metadata: dict[str, Any] = {},
     modality: str = "text",
+    skip_summary: bool = False,
 ) -> list[DataChunk]:
     if isinstance(content, pathlib.Path):
         content = content.read_text()
@@ -137,7 +204,7 @@ def extract_text(
         DataChunk(data=[c], modality=modality, metadata=metadata)
         for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS)
     ]
-    if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
+    if not skip_summary and content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
        summary, tags = summarizer.summarize(content)
        chunks.append(
            DataChunk(
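Taken together, the two edits above make the summary pass switchable per call. A sketch of the effect (long_text stands for any string longer than twice chunker.DEFAULT_CHUNK_TOKENS; the calls are illustrative):

    chunks = extract_text(long_text)                     # text chunks + one summary DataChunk
    chunks = extract_text(long_text, skip_summary=True)  # text chunks only; summarizer never runs

Since fetch_file passes skip_summary=True, file retrieval no longer pays for the summarization step.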
@@ -149,10 +216,26 @@ def extract_text(
     return chunks


+def extract_ebook(file_path: str | pathlib.Path) -> list[DataChunk]:
+    book = ebook.parse_ebook(file_path)
+    return [
+        DataChunk(
+            mime_type="text/plain",
+            data=[
+                page.strip()
+                for section in book.sections
+                for page in section.pages
+                if page.strip()
+            ],
+        )
+    ]
+
+
 def extract_data_chunks(
     mime_type: str,
     content: bytes | str | pathlib.Path,
     chunk_size: int | None = None,
+    skip_summary: bool = False,
 ) -> list[DataChunk]:
     chunks = []
     logger.info(f"Extracting content from {mime_type}")
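A sketch of what the new extractor yields, assuming ebook.parse_ebook returns a book whose sections expose their pages as strings (which is what the comprehension implies):

    chunks = extract_ebook(pathlib.Path("novel.epub"))  # hypothetical path
    # -> one DataChunk with mime_type="text/plain" whose data list holds every
    #    non-empty page, stripped, across all sections in reading order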
@@ -164,7 +247,9 @@ def extract_data_chunks(
     ]:
         chunks = extract_docx(content)
     elif mime_type.startswith("text/"):
-        chunks = extract_text(content, chunk_size)
+        chunks = extract_text(content, chunk_size, skip_summary=skip_summary)
     elif mime_type.startswith("image/"):
         chunks = extract_image(content)
+    elif mime_type == "application/epub+zip":
+        chunks = extract_ebook(cast(pathlib.Path, content))
     return chunks
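End to end, epub retrieval now flows through the same pipeline. A hedged sketch of the path fetch_file takes (names follow the hunks above; the file itself is hypothetical):

    path = settings.FILE_STORAGE_DIR / "books/novel.epub"
    mime_type = extract.get_mime_type(path)              # "application/epub+zip"
    chunks = extract.extract_data_chunks(mime_type, path, skip_summary=True)
    # -> [DataChunk(mime_type="text/plain", data=[...pages...])] via extract_ebook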