diff --git a/frontend/src/App.css b/frontend/src/App.css index ba4eae3..32ff733 100644 --- a/frontend/src/App.css +++ b/frontend/src/App.css @@ -433,6 +433,14 @@ body { background: #e2e8f0; } +.metadata { + margin-top: 1rem; + padding: 1rem 2rem; + background: #f9fafb; + border-radius: 8px; + border: 1px solid #e5e7eb; +} + /* Responsive design */ @media (max-width: 768px) { .app-header { diff --git a/frontend/src/components/Search.tsx b/frontend/src/components/Search.tsx index 3a9aaac..13589dc 100644 --- a/frontend/src/components/Search.tsx +++ b/frontend/src/components/Search.tsx @@ -2,7 +2,6 @@ import React, { useState, useEffect } from 'react' import { useNavigate } from 'react-router-dom' import ReactMarkdown from 'react-markdown' import { useMCP } from '../hooks/useMCP' -import { useAuth } from '../hooks/useAuth' import Loading from './Loading' type SearchItem = { @@ -24,7 +23,7 @@ const Tag = ({ tags }: { tags: string[] }) => { ) } -const formatText = ({ filename, content, chunks, tags }: SearchItem) => { +const TextResult = ({ filename, content, chunks, tags }: SearchItem) => { return (

{filename || 'Untitled'}

@@ -45,11 +44,12 @@ const formatText = ({ filename, content, chunks, tags }: SearchItem) => { ) } -const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchItem) => { +const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => { return (

{filename || 'Untitled'}

+
{content || 'No content available'}
@@ -70,7 +70,7 @@ const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchIte ) } -const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => { +const ImageResult = ({ filename, chunks, tags, metadata }: SearchItem) => { const title = metadata?.title || filename || 'Untitled' const { fetchFile } = useMCP() const [mime_type, setMimeType] = useState() @@ -95,17 +95,66 @@ const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => { ) } +const Metadata = ({ metadata }: { metadata: any }) => { + if (!metadata) return null + return ( +
+
    + {Object.entries(metadata).map(([key, value]) => ( +
  • {key}: {typeof value === 'string' ? value : JSON.stringify(value)}
  • + ))} +
+
+ ) +} + +const PDFResult = ({ filename, content, tags, metadata }: SearchItem) => { + return ( +
+

{filename || 'Untitled'}

+ + View PDF + + {content &&
+
+ View Source + {content} +
+
} +
+ ) +} + +const EmailResult = ({ content, tags, metadata }: SearchItem) => { + return ( +
+

{metadata?.title || metadata?.subject || 'Untitled'}

+ + + {content &&
+ {content} +
} +
+ ) +} + const SearchResult = ({ result }: { result: SearchItem }) => { if (result.mime_type.startsWith('image/')) { - return formatImage(result) + return } if (result.mime_type.startsWith('text/markdown')) { - console.log(result) - return formatMarkdown(result) + return } if (result.mime_type.startsWith('text/')) { - return formatText(result) + return } + if (result.mime_type.startsWith('application/pdf')) { + return + } + if (result.mime_type.startsWith('message/rfc822')) { + return + } + console.log(result) return null } diff --git a/frontend/src/hooks/useMCP.ts b/frontend/src/hooks/useMCP.ts index d1f44df..ca2c150 100644 --- a/frontend/src/hooks/useMCP.ts +++ b/frontend/src/hooks/useMCP.ts @@ -115,6 +115,9 @@ export const useMCP = () => { } const resp = await parseJsonRpcResponse(response) + if (resp?.result?.isError) { + throw new Error(resp?.result?.content[0].text) + } return resp?.result?.content.map((item: any) => JSON.parse(item.text)) }, [apiCall]) diff --git a/src/memory/api/app.py b/src/memory/api/app.py index c418bce..57ffbd6 100644 --- a/src/memory/api/app.py +++ b/src/memory/api/app.py @@ -6,7 +6,7 @@ import contextlib import os import logging -from fastapi import FastAPI, UploadFile, Request +from fastapi import FastAPI, UploadFile, Request, HTTPException from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware from sqladmin import Admin @@ -50,6 +50,24 @@ async def serve_react_app(full_path: str): return FileResponse(settings.STATIC_DIR / "index.html") +@app.get("/files/{path:path}") +async def serve_file(path: str): + file_path = settings.FILE_STORAGE_DIR / path + if not file_path.is_file(): + raise HTTPException(status_code=404, detail="File not found") + return FileResponse(file_path) + + +async def input_type(item: str | UploadFile) -> list[extract.DataChunk]: + if not item: + return [] + + if isinstance(item, str): + return extract.extract_text(item) + content_type = item.content_type or "application/octet-stream" + return extract.extract_data_chunks(content_type, await item.read()) + + # SQLAdmin setup with OAuth protection engine = get_engine() admin = Admin(app, engine) @@ -72,16 +90,6 @@ async def health_check(request: Request): app.mount("/", mcp.streamable_http_app()) -async def input_type(item: str | UploadFile) -> list[extract.DataChunk]: - if not item: - return [] - - if isinstance(item, str): - return extract.extract_text(item) - content_type = item.content_type or "application/octet-stream" - return extract.extract_data_chunks(content_type, await item.read()) - - def main(reload: bool = False): """Run the FastAPI server in debug mode with auto-reloading.""" import uvicorn diff --git a/src/memory/api/search/bm25.py b/src/memory/api/search/bm25.py index 5c68f1d..91e07f5 100644 --- a/src/memory/api/search/bm25.py +++ b/src/memory/api/search/bm25.py @@ -23,7 +23,8 @@ async def search_bm25( ) -> list[tuple[SourceData, AnnotatedChunk]]: with make_session() as db: items_query = db.query(Chunk.id, Chunk.content).filter( - Chunk.collection_name.in_(modalities) + Chunk.collection_name.in_(modalities), + Chunk.content.isnot(None), ) if source_ids := filters.get("source_ids"): @@ -46,6 +47,7 @@ async def search_bm25( item_ids = { sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id for item in items + if item.content } corpus = [item.content.lower().strip() for item in items] diff --git a/src/memory/api/search/utils.py b/src/memory/api/search/utils.py index 24a937d..4a9da22 100644 --- a/src/memory/api/search/utils.py +++ b/src/memory/api/search/utils.py @@ -1,4 +1,5 @@ import asyncio +import traceback from datetime import datetime import logging from collections import defaultdict @@ -28,7 +29,7 @@ class SourceData(BaseModel): mime_type: str | None filename: str | None content_length: int - contents: dict | None + contents: dict | str | None created_at: datetime | None @staticmethod @@ -87,6 +88,7 @@ async def with_timeout( logger.warning(f"Search timed out after {timeout}s") return [] except Exception as e: + traceback.print_exc() logger.error(f"Search failed: {e}") return [] @@ -109,8 +111,14 @@ def group_chunks( def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult: contents = source.contents or {} - tags = contents.pop("tags", []) - content = contents.pop("content", None) + tags = [] + if isinstance(contents, dict): + tags = contents.pop("tags", []) + content = contents.pop("content", None) + print(content) + else: + content = contents + contents = {} return SearchResult( id=source.id, diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py index cd77453..b391b6f 100644 --- a/src/memory/common/db/models/source_items.py +++ b/src/memory/common/db/models/source_items.py @@ -93,35 +93,76 @@ class MailMessage(SourceItem): } @property - def parsed_content(self): + def parsed_content(self) -> dict[str, Any]: from memory.parsers.email import parse_email_message - return parse_email_message(cast(str, self.content), cast(str, self.message_id)) + return cast( + dict[str, Any], + parse_email_message(cast(str, self.content), cast(str, self.message_id)), + ) @property def body(self) -> str: return self.parsed_content["body"] - @property - def display_contents(self) -> str | None: - content = self.parsed_content - return textwrap.dedent( - """ + def format_content(self, content: dict[str, Any]) -> str: + sender = ( + cast(str, self.sender) or content.get("from") or content.get("sender", "") + ) + recipients = ( + cast(list[str], self.recipients) + or content.get("to") + or content.get("recipients", []) + ) + date = ( + cast(datetime, self.sent_at) and self.sent_at.isoformat() + ) or content.get("date", "") + + return ( + textwrap.dedent( + """ Subject: {subject} From: {sender} To: {recipients} Date: {date} - Body: + Body: {body} """ - ).format( - subject=content.get("subject", ""), - sender=content.get("from", ""), - recipients=content.get("to", ""), - date=content.get("date", ""), - body=content.get("body", ""), + ) + .format( + subject=cast(str, self.subject) or content.get("subject", ""), + sender=sender, + recipients=", ".join(recipients), + date=date, + body=content.get("body", ""), + ) + .strip() ) + @property + def display_contents(self) -> dict | None: + return { + **cast(dict, super().display_contents), + "content": self.body, + "subject": self.subject, + "sender": self.sender, + "recipients": self.recipients, + "date": cast(datetime | None, self.sent_at) and self.sent_at.isoformat(), + } + + def _chunk_contents(self) -> Sequence[extract.DataChunk]: + content = self.parsed_content + chunks = extract.extract_text(cast(str, self.body)) + + def add_header(item: extract.MulitmodalChunk) -> extract.MulitmodalChunk: + if isinstance(item, str): + return self.format_content(content | {"body": item}).strip() + return item + + for chunk in chunks: + chunk.data = [add_header(item) for item in chunk.data] + return chunks + # Add indexes __table_args__ = ( Index("mail_sent_idx", "sent_at"), @@ -161,13 +202,22 @@ class EmailAttachment(SourceItem): def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]: if cast(str | None, self.filename): - contents = pathlib.Path(cast(str, self.filename)).read_bytes() + contents = ( + settings.FILE_STORAGE_DIR / cast(str, self.filename) + ).read_bytes() else: contents = cast(str, self.content) chunks = extract.extract_data_chunks(cast(str, self.mime_type), contents) return [self._make_chunk(c, metadata) for c in chunks] + @property + def display_contents(self) -> dict: + return { + **cast(dict, super().display_contents), + **self.mail_message.display_contents, + } + # Add indexes __table_args__ = (Index("email_attachment_message_idx", "mail_message_id"),) diff --git a/src/memory/common/summarizer.py b/src/memory/common/summarizer.py index fe0b51a..9843649 100644 --- a/src/memory/common/summarizer.py +++ b/src/memory/common/summarizer.py @@ -8,6 +8,7 @@ from memory.common import settings, chunker logger = logging.getLogger(__name__) +MAX_TOKENS = 200000 TAGS_PROMPT = """ The following text is already concise. Please identify 3-5 relevant tags that capture the main topics or themes. @@ -148,6 +149,12 @@ def summarize(content: str, target_tokens: int | None = None) -> tuple[str, list content=content, ) + if chunker.approx_token_count(prompt) > MAX_TOKENS: + logger.warning( + f"Prompt too long ({chunker.approx_token_count(prompt)} tokens), truncating" + ) + prompt = truncate(prompt, MAX_TOKENS - 20) + try: if settings.SUMMARIZER_MODEL.startswith("anthropic"): result = _call_anthropic(prompt) diff --git a/src/memory/parsers/email.py b/src/memory/parsers/email.py index b728bb6..bbe93ab 100644 --- a/src/memory/parsers/email.py +++ b/src/memory/parsers/email.py @@ -1,4 +1,5 @@ import email +import email.message import hashlib import logging import pathlib @@ -6,6 +7,8 @@ from datetime import datetime from email.utils import parsedate_to_datetime from typing import TypedDict +from markdownify import markdownify + logger = logging.getLogger(__name__) @@ -71,33 +74,60 @@ def extract_date(msg: email.message.Message) -> datetime | None: # type: ignore def extract_body(msg: email.message.Message) -> str: # type: ignore """ - Extract plain text body from email message. + Extract body from email message, preferring HTML converted to markdown. Args: msg: Email message object Returns: - Plain text body content + Body content as markdown (if HTML found) or plain text """ - body = "" + html_body = "" + plain_body = "" if not msg.is_multipart(): try: - return msg.get_payload(decode=True).decode(errors="replace") + payload = msg.get_payload(decode=True) + if isinstance(payload, bytes): + content = payload.decode(errors="replace") + else: + content = str(payload) + content_type = msg.get_content_type() + if content_type == "text/html": + return markdownify(content).strip() + else: + return content except Exception as e: logger.error(f"Error decoding message body: {str(e)}") return "" + # Extract both HTML and plain text parts for part in msg.walk(): content_type = part.get_content_type() content_disposition = str(part.get("Content-Disposition", "")) - if content_type == "text/plain" and "attachment" not in content_disposition: - try: - body += part.get_payload(decode=True).decode(errors="replace") + "\n" - except Exception as e: - logger.error(f"Error decoding message part: {str(e)}") - return body + if "attachment" in content_disposition: + continue + + try: + payload = part.get_payload(decode=True) + if isinstance(payload, bytes): + content = payload.decode(errors="replace") + else: + content = str(payload) + + if content_type == "text/html": + html_body += content + "\n" + elif content_type == "text/plain": + plain_body += content + "\n" + except Exception as e: + logger.error(f"Error decoding message part: {str(e)}") + + # Prefer HTML (converted to markdown) over plain text + if html_body.strip(): + return markdownify(html_body).strip() + else: + return plain_body.strip() def extract_attachments(msg: email.message.Message) -> list[Attachment]: # type: ignore diff --git a/src/memory/workers/email.py b/src/memory/workers/email.py index f1133e7..4c708a7 100644 --- a/src/memory/workers/email.py +++ b/src/memory/workers/email.py @@ -61,7 +61,7 @@ def process_attachment( mime_type=attachment["content_type"], mail_message=message, content=content, - filename=file_path and str(file_path), + filename=file_path and str(file_path.relative_to(settings.FILE_STORAGE_DIR)), ) @@ -149,7 +149,7 @@ def extract_email_uid( def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None: try: - status, msg_data = conn.fetch(uid, "(UID RFC822)") + status, msg_data = conn.fetch(uid, "(UID BODY.PEEK[])") if status != "OK" or not msg_data or not msg_data[0]: logger.error(f"Error fetching message {uid}") return None diff --git a/tests/data/contents.py b/tests/data/contents.py index e2970d4..e25ab8b 100644 --- a/tests/data/contents.py +++ b/tests/data/contents.py @@ -237,6 +237,40 @@ SAMPLE_TEXT = BeautifulSoup(SAMPLE_HTML, "html.parser").get_text() SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE) SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text() +SAMPLE_EMAIL = f"""From: john.doe@techcorp.com +To: research-team@techcorp.com, jane.smith@university.edu +CC: newsletter@programming-weekly.com +Subject: The Evolution of Programming Languages - Research Article +Date: Wed, 15 Jan 2025 14:30:00 +0000 +Message-ID: <20250115143000.12345@techcorp.com> +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="----=_NextPart_000_0001_01DA1234.56789ABC" + +This is a multi-part message in MIME format. + +------=_NextPart_000_0001_01DA1234.56789ABC +Content-Type: text/html; charset=utf-8 +Content-Transfer-Encoding: quoted-printable + +{SAMPLE_HTML} + +------=_NextPart_000_0001_01DA1234.56789ABC +Content-Type: image/png +Content-Disposition: attachment; filename="lang_timeline.png" +Content-Transfer-Encoding: base64 + +iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg== + +------=_NextPart_000_0001_01DA1234.56789ABC +Content-Type: image/jpeg +Content-Disposition: attachment; filename="code_complexity.jpg" +Content-Transfer-Encoding: base64 + +/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB + +------=_NextPart_000_0001_01DA1234.56789ABC-- +""" + def image_hash(image: Image.Image) -> str: return hashlib.sha256(image.tobytes()).hexdigest() diff --git a/tests/memory/common/db/models/test_source_item_embeddings.py b/tests/memory/common/db/models/test_source_item_embeddings.py index 579c7eb..4259bb8 100644 --- a/tests/memory/common/db/models/test_source_item_embeddings.py +++ b/tests/memory/common/db/models/test_source_item_embeddings.py @@ -1,4 +1,5 @@ import hashlib +import textwrap from datetime import datetime from typing import Sequence, cast from unittest.mock import ANY, Mock, call @@ -20,6 +21,7 @@ from memory.common.db.models.source_items import ( from memory.common.db.models.sources import Book from memory.common.embedding import embed_source_item from memory.common.extract import page_to_image +from memory.parsers.email import parse_email_message from tests.data.contents import ( CHUNKS, DATA_DIR, @@ -27,6 +29,7 @@ from tests.data.contents import ( LANG_TIMELINE_HASH, CODE_COMPLEXITY, CODE_COMPLEXITY_HASH, + SAMPLE_EMAIL, SAMPLE_MARKDOWN, SAMPLE_TEXT, SECOND_PAGE, @@ -127,31 +130,41 @@ def test_base_source_item_mixed_embeddings(mock_voyage_client): ] == [LANG_TIMELINE_HASH] -def test_mail_message_embeddings(mock_voyage_client): +def test_mail_message_with_attachments_embeddings(mock_voyage_client): + email = parse_email_message(SAMPLE_EMAIL, "123") item = MailMessage( id=1, - content=SAMPLE_MARKDOWN, + content=SAMPLE_EMAIL, mime_type="text/html", modality="text", - sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(), - size=len(SAMPLE_MARKDOWN), + sha256=hashlib.sha256(email["body"].encode("utf-8")).hexdigest(), + size=len(email["body"]), tags=["bla"], message_id="123", - subject="Test Subject", - sender="test@example.com", - recipients=["test@example.com"], + subject=email["subject"], + sender=email["sender"], + recipients=email["recipients"], folder="INBOX", sent_at=datetime(2025, 1, 1, 12, 0, 0), ) + email_header = textwrap.dedent( + f""" + Subject: {email["subject"]} + From: {email["sender"]} + To: {", ".join(email["recipients"])} + Date: 2025-01-01T12:00:00 + Body: + """ + ).lstrip() metadata = item.as_payload() - metadata["tags"] = {"bla", "test@example.com"} + metadata["tags"] = {"bla", "john.doe@techcorp.com"} | set(email["recipients"]) expected = [ - (CHUNKS[0].strip(), [], metadata), - (CHUNKS[1].strip(), [], metadata), + (email_header + CHUNKS[0].strip(), [], metadata), + (email_header + CHUNKS[1].strip().replace("—", "\\\\u2014"), [], metadata), ( - "test summary", + email_header + "test summary", [], - metadata | {"tags": {"tag1", "tag2", "bla", "test@example.com"}}, + metadata | {"tags": {"tag1", "tag2"} | metadata["tags"]}, ), ] @@ -166,7 +179,11 @@ def test_mail_message_embeddings(mock_voyage_client): assert not mock_voyage_client.multimodal_embed.call_count assert mock_voyage_client.embed.call_args == call( - [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"], + [ + email_header + CHUNKS[0].strip(), + email_header + CHUNKS[1].strip().replace("—", "\\\\u2014"), + email_header + "test summary", + ], model=settings.TEXT_EMBEDDING_MODEL, input_type="document", ) diff --git a/tests/memory/common/db/models/test_source_items.py b/tests/memory/common/db/models/test_source_items.py index 8d838e4..a2f6bc9 100644 --- a/tests/memory/common/db/models/test_source_items.py +++ b/tests/memory/common/db/models/test_source_items.py @@ -183,13 +183,27 @@ Subject: Test Subject Test Body Content""" mail_message = MailMessage( - sha256=b"test", content=email_content, message_id="" + sha256=b"test", + content=email_content, + message_id="", + sender="sender@example.com", + recipients=["recipient@example.com"], + subject="Test Subject", + size=1024, + sent_at=datetime(2023, 1, 1, 12, 0, 0), ) - expected = ( - "\nSubject: Test Subject\nFrom: \nTo: \nDate: \nBody: \nTest Body Content\n" - ) - assert mail_message.display_contents == expected + assert mail_message.display_contents == { + "content": "Test Body Content", + "date": "2023-01-01T12:00:00", + "filename": None, + "mime_type": None, + "size": 1024, + "subject": "Test Subject", + "sender": "sender@example.com", + "recipients": ["recipient@example.com"], + "tags": None, + } @pytest.mark.parametrize( diff --git a/tests/memory/parsers/test_email_parsers.py b/tests/memory/parsers/test_email_parsers.py index 012dd54..dc1887d 100644 --- a/tests/memory/parsers/test_email_parsers.py +++ b/tests/memory/parsers/test_email_parsers.py @@ -246,12 +246,11 @@ def test_parse_simple_email(): "subject": "Test Subject", "sender": "sender@example.com", "recipients": ["recipient@example.com"], - "body": "Test body content\n", + "body": "Test body content", "attachments": [], "sent_at": ANY, "raw_email": msg.as_string(), - "hash": b"\xed\xa0\x9b\xd4\t4\x06\xb9l\xa4\xb3*\xe4NpZ\x19\xc2\x9b\x87" - + b"\xa6\x12\r\x7fS\xb6\xf1\xbe\x95\x9c\x99\xf1", + "hash": b"\xa8\x8c\xa9\x16\xae\xe7\x99\xca\xc9\xd1q\x8e\xcb\xfc5+ \x03aZLz\xea\xd2\x05\xb9B\xf1i\xde\xa6\xe2", } assert abs(result["sent_at"].timestamp() - test_date.timestamp()) < 86400 # type: ignore diff --git a/tests/memory/workers/test_email.py b/tests/memory/workers/test_email.py index f97aecf..9935efd 100644 --- a/tests/memory/workers/test_email.py +++ b/tests/memory/workers/test_email.py @@ -1,5 +1,6 @@ import base64 import pathlib +import textwrap from datetime import datetime from typing import cast from unittest.mock import MagicMock, patch @@ -100,12 +101,9 @@ def test_process_attachment_disk(attachment_size, max_inline_size, message_id): assert result is not None assert not cast(str, result.content) - assert cast(str, result.filename) == str( - settings.FILE_STORAGE_DIR - / "emails" - / "sender_example_com" - / "INBOX" - / "test_with_special_chars.txt" + assert ( + cast(str, result.filename) + == "emails/sender_example_com/INBOX/test_with_special_chars.txt" ) @@ -183,13 +181,7 @@ def test_process_attachments_mixed(): assert cast(str, results[2].content) == "c" * 30 # Verify large attachment has a path - assert cast(str, results[1].filename) == str( - settings.FILE_STORAGE_DIR - / "emails" - / "sender_example_com" - / "INBOX" - / "large.txt" - ) + assert cast(str, results[1].filename) == "emails/sender_example_com/INBOX/large.txt" def test_extract_email_uid_valid(): @@ -256,8 +248,19 @@ def test_create_mail_message(db_session): assert cast(list[str], mail_message.recipients) == ["recipient@example.com"] assert mail_message.sent_at.isoformat()[:-6] == "2023-01-01T12:00:00" assert cast(str, mail_message.content) == raw_email - assert mail_message.body == "Test body content\n" + assert mail_message.body == "Test body content" assert mail_message.attachments == attachments + assert mail_message.display_contents == { + "content": "Test body content", + "subject": "Test Subject", + "sender": "sender@example.com", + "recipients": ["recipient@example.com"], + "date": "2023-01-01T12:00:00+00:00", + "mime_type": "message/rfc822", + "size": 412, + "tags": ["test"], + "filename": None, + } def test_fetch_email(email_provider):