better emails embedding + format search results

2025-06-28 15:14:45 +02:00 · 2025-06-09 13:51:58 +02:00 · 2025-06-09 13:51:58 +02:00 · 780e27ba04
commit 780e27ba04
parent d73c5bc928
15 changed files with 317 additions and 85 deletions
--- a/frontend/src/App.css
+++ b/frontend/src/App.css
@ -433,6 +433,14 @@ body {
  background: #e2e8f0;
 }
 /* Boxed panel used by the search results' Metadata component (className="metadata"). */
 .metadata {
  margin-top: 1rem;
  padding: 1rem 2rem;
  background: #f9fafb;
  border-radius: 8px;
  border: 1px solid #e5e7eb;
 }
 /* Responsive design */
@media (max-width: 768px) {
  .app-header {
--- a/frontend/src/components/Search.tsx
+++ b/frontend/src/components/Search.tsx
@ -2,7 +2,6 @@ import React, { useState, useEffect } from 'react'
 import { useNavigate } from 'react-router-dom'
 import ReactMarkdown from 'react-markdown'
 import { useMCP } from '../hooks/useMCP'
 import { useAuth } from '../hooks/useAuth'
 import Loading from './Loading'
 type SearchItem = {
@ -24,7 +23,7 @@ const Tag = ({ tags }: { tags: string[] }) => {
    )
 }
-const formatText = ({ filename, content, chunks, tags }: SearchItem) => {
+const TextResult = ({ filename, content, chunks, tags }: SearchItem) => {
    return (
        <div className="search-result-card">
            <h4>{filename || 'Untitled'}</h4>
@ -45,11 +44,12 @@ const formatText = ({ filename, content, chunks, tags }: SearchItem) => {
    )
 }
-const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
+const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
    return (
        <div className="search-result-card">
            <h4>{filename || 'Untitled'}</h4>
            <Tag tags={tags} />
            <Metadata metadata={metadata} />
            <div className="markdown-content">
                <ReactMarkdown>{content || 'No content available'}</ReactMarkdown>
            </div>
@ -70,7 +70,7 @@ const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchIte
    )
 }
-const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => {
+const ImageResult = ({ filename, chunks, tags, metadata }: SearchItem) => {
    const title = metadata?.title || filename || 'Untitled'
    const { fetchFile } = useMCP()
    const [mime_type, setMimeType] = useState<string>()
@ -95,17 +95,66 @@ const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => {
    )
 }
 /**
  * Lists every entry of a result's metadata object as `key: value`.
  * String values are shown as-is; any other value is JSON-stringified.
  * Renders nothing when `metadata` is falsy.
  */
 const Metadata = ({ metadata }: { metadata: any }) => {
    if (!metadata) {
        return null
    }
    const renderValue = (value: unknown) =>
        typeof value === 'string' ? value : JSON.stringify(value)
    const rows = Object.entries(metadata).map(([name, value]) => (
        <li key={name}>{name}: {renderValue(value)}</li>
    ))
    return (
        <div className="metadata">
            <ul>{rows}</ul>
        </div>
    )
 }
 /**
  * Search result card for PDF documents.
  *
  * Shows the filename, tags, a link to the stored file and the metadata list.
  * When extracted text is available it is placed behind a collapsible
  * "View Source" section.
  */
 const PDFResult = ({ filename, content, tags, metadata }: SearchItem) => {
    // Encode each path segment on its own: nested storage paths keep their
    // slashes, while spaces, '#', '?' and similar characters in a name can no
    // longer truncate or corrupt the URL.
    // NOTE(review): the origin is still the hard-coded development server;
    // confirm how the API base URL should be configured for deployments.
    const fileUrl = filename
        ? `http://localhost:8000/files/${filename
              .split('/')
              .map((segment) => encodeURIComponent(segment))
              .join('/')}`
        : null
    return (
        <div className="search-result-card">
            <h4>{filename || 'Untitled'}</h4>
            <Tag tags={tags} />
            {/* Without a filename the link would point at /files/undefined, so skip it. */}
            {fileUrl && <a href={fileUrl}>View PDF</a>}
            <Metadata metadata={metadata} />
            {content && <div className="markdown-content">
                <details>
                    <summary>View Source</summary>
                    <ReactMarkdown>{content}</ReactMarkdown>
                </details>
            </div>}
        </div>
    )
 }
 /**
  * Search result card for emails.
  *
  * The heading is the metadata title, falling back to the subject and then to
  * 'Untitled'. The body is rendered as markdown only when content is present.
  */
 const EmailResult = ({ content, tags, metadata }: SearchItem) => {
    const heading = metadata?.title || metadata?.subject || 'Untitled'
    const body = content && (
        <div className="markdown-content">
            <ReactMarkdown>{content}</ReactMarkdown>
        </div>
    )
    return (
        <div className="search-result-card">
            <h4>{heading}</h4>
            <Tag tags={tags} />
            <Metadata metadata={metadata} />
            {body}
        </div>
    )
 }
 const SearchResult = ({ result }: { result: SearchItem }) => {
    if (result.mime_type.startsWith('image/')) {
-        return formatImage(result)
+        return <ImageResult {...result} />
    }
    if (result.mime_type.startsWith('text/markdown')) {
-        console.log(result)
+        return <MarkdownResult {...result} /> 
        return formatMarkdown(result)
    }
    if (result.mime_type.startsWith('text/')) {
-        return formatText(result)
+        return <TextResult {...result} />
    }
    if (result.mime_type.startsWith('application/pdf')) {
        return <PDFResult {...result} />
    }
    if (result.mime_type.startsWith('message/rfc822')) {
        return <EmailResult {...result} />
    }
    console.log(result)
    return null
 }
--- a/frontend/src/hooks/useMCP.ts
+++ b/frontend/src/hooks/useMCP.ts
@ -115,6 +115,9 @@ export const useMCP = () => {
    }
    const resp = await parseJsonRpcResponse(response)
    if (resp?.result?.isError) {
      throw new Error(resp?.result?.content[0].text)
    }
    return resp?.result?.content.map((item: any) => JSON.parse(item.text))
  }, [apiCall])
--- a/src/memory/api/app.py
+++ b/src/memory/api/app.py
@ -6,7 +6,7 @@ import contextlib
 import os
 import logging
-from fastapi import FastAPI, UploadFile, Request
+from fastapi import FastAPI, UploadFile, Request, HTTPException
 from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 from sqladmin import Admin
@ -50,6 +50,24 @@ async def serve_react_app(full_path: str):
    return FileResponse(settings.STATIC_DIR / "index.html")
@app.get("/files/{path:path}")
 async def serve_file(path: str):
    file_path = settings.FILE_STORAGE_DIR / path
    if not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(file_path)
 async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
    """Turn a raw string or an uploaded file into data chunks.

    Falsy input yields an empty list. Strings go through
    ``extract.extract_text``; uploads are read fully and passed to
    ``extract.extract_data_chunks`` together with their content type
    ("application/octet-stream" when the upload does not declare one).
    """
    if not item:
        return []
    if not isinstance(item, str):
        mime = item.content_type or "application/octet-stream"
        return extract.extract_data_chunks(mime, await item.read())
    return extract.extract_text(item)
 # SQLAdmin setup with OAuth protection
 engine = get_engine()
 admin = Admin(app, engine)
@ -72,16 +90,6 @@ async def health_check(request: Request):
 app.mount("/", mcp.streamable_http_app())
 async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
    """Turn a raw string or an uploaded file into data chunks.

    Falsy input yields an empty list. Strings go through
    ``extract.extract_text``; uploads are read fully and passed to
    ``extract.extract_data_chunks`` together with their content type
    ("application/octet-stream" when the upload does not declare one).
    """
    if not item:
        return []
    if not isinstance(item, str):
        mime = item.content_type or "application/octet-stream"
        return extract.extract_data_chunks(mime, await item.read())
    return extract.extract_text(item)
 def main(reload: bool = False):
    """Run the FastAPI server in debug mode with auto-reloading."""
    import uvicorn
--- a/src/memory/api/search/bm25.py
+++ b/src/memory/api/search/bm25.py
@ -23,7 +23,8 @@ async def search_bm25(
 ) -> list[tuple[SourceData, AnnotatedChunk]]:
    with make_session() as db:
        items_query = db.query(Chunk.id, Chunk.content).filter(
-            Chunk.collection_name.in_(modalities)
+            Chunk.collection_name.in_(modalities),
            Chunk.content.isnot(None),
        )
        if source_ids := filters.get("source_ids"):
@ -46,6 +47,7 @@ async def search_bm25(
        item_ids = {
            sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
            for item in items
            if item.content
        }
        corpus = [item.content.lower().strip() for item in items]
--- a/src/memory/api/search/utils.py
+++ b/src/memory/api/search/utils.py
@ -1,4 +1,5 @@
 import asyncio
 import traceback
 from datetime import datetime
 import logging
 from collections import defaultdict
@ -28,7 +29,7 @@ class SourceData(BaseModel):
    mime_type: str | None
    filename: str | None
    content_length: int
-    contents: dict | None
+    contents: dict | str | None
    created_at: datetime | None
    @staticmethod
@ -87,6 +88,7 @@ async def with_timeout(
        logger.warning(f"Search timed out after {timeout}s")
        return []
    except Exception as e:
        traceback.print_exc()
        logger.error(f"Search failed: {e}")
        return []
@ -109,8 +111,14 @@ def group_chunks(
    def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
        contents = source.contents or {}
-        tags = contents.pop("tags", [])
+        tags = []
-        content = contents.pop("content", None)
+        if isinstance(contents, dict):
            tags = contents.pop("tags", [])
            content = contents.pop("content", None)
            print(content)
        else:
            content = contents
            contents = {}
        return SearchResult(
            id=source.id,
--- a/src/memory/common/db/models/source_items.py
+++ b/src/memory/common/db/models/source_items.py
@ -93,35 +93,76 @@ class MailMessage(SourceItem):
        }
    @property
-    def parsed_content(self):
+    def parsed_content(self) -> dict[str, Any]:
        from memory.parsers.email import parse_email_message
-        return parse_email_message(cast(str, self.content), cast(str, self.message_id))
+        return cast(
            dict[str, Any],
            parse_email_message(cast(str, self.content), cast(str, self.message_id)),
        )
    @property
    def body(self) -> str:
        """Body text of the message, read from ``parsed_content["body"]``."""
        parsed = self.parsed_content
        return parsed["body"]
-    @property
+    def format_content(self, content: dict[str, Any]) -> str:
-    def display_contents(self) -> str | None:
+        sender = (
-        content = self.parsed_content
+            cast(str, self.sender) or content.get("from") or content.get("sender", "")
-        return textwrap.dedent(
+        )
-            """
+        recipients = (
            cast(list[str], self.recipients)
            or content.get("to")
            or content.get("recipients", [])
        )
        date = (
            cast(datetime, self.sent_at) and self.sent_at.isoformat()
        ) or content.get("date", "")
        return (
            textwrap.dedent(
                """
            Subject: {subject}
            From: {sender}
            To: {recipients}
            Date: {date}
-            Body: 
+            Body:
            {body}
            """
-        ).format(
+            )
-            subject=content.get("subject", ""),
+            .format(
-            sender=content.get("from", ""),
+                subject=cast(str, self.subject) or content.get("subject", ""),
-            recipients=content.get("to", ""),
+                sender=sender,
-            date=content.get("date", ""),
+                recipients=", ".join(recipients),
-            body=content.get("body", ""),
+                date=date,
                body=content.get("body", ""),
            )
            .strip()
        )
    @property
    def display_contents(self) -> dict | None:
        """Display payload for the message.

        Starts from the parent class's ``display_contents`` and adds the body
        (as ``content``), subject, sender, recipients and the ISO-formatted
        send time (``date``; stays falsy when ``sent_at`` is unset).
        """
        payload = dict(cast(dict, super().display_contents))
        payload.update(
            content=self.body,
            subject=self.subject,
            sender=self.sender,
            recipients=self.recipients,
            date=cast(datetime | None, self.sent_at) and self.sent_at.isoformat(),
        )
        return payload
    def _chunk_contents(self) -> Sequence[extract.DataChunk]:
        """Chunk the email body and prepend the email headers to each text piece.

        The body is split with ``extract.extract_text``. Every string item in
        a chunk is then re-rendered through ``format_content`` with that item
        as the body, so each piece carries the header block. Non-string items
        are left unchanged.

        Returns:
            The chunks from ``extract.extract_text`` with their ``data``
            lists rewritten in place.
        """
        # ``parsed_content`` re-parses the raw email on every access and
        # ``self.body`` goes through it again, so parse once and reuse it.
        parsed = self.parsed_content
        chunks = extract.extract_text(cast(str, parsed["body"]))
        def add_header(item: extract.MulitmodalChunk) -> extract.MulitmodalChunk:
            if isinstance(item, str):
                return self.format_content(parsed | {"body": item}).strip()
            return item
        for chunk in chunks:
            chunk.data = [add_header(item) for item in chunk.data]
        return chunks
    # Add indexes
    __table_args__ = (
        Index("mail_sent_idx", "sent_at"),
@ -161,13 +202,22 @@ class EmailAttachment(SourceItem):
    def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]:
        if cast(str | None, self.filename):
-            contents = pathlib.Path(cast(str, self.filename)).read_bytes()
+            contents = (
                settings.FILE_STORAGE_DIR / cast(str, self.filename)
            ).read_bytes()
        else:
            contents = cast(str, self.content)
        chunks = extract.extract_data_chunks(cast(str, self.mime_type), contents)
        return [self._make_chunk(c, metadata) for c in chunks]
    @property
    def display_contents(self) -> dict:
        """Display payload: the parent class's fields overlaid with the parent email's.

        Keys present in both payloads take the value from
        ``mail_message.display_contents``.
        """
        # NOTE(review): the message payload is applied last, so any key it
        # shares with the attachment's own payload is overwritten by the
        # message's value - confirm that this is intended.
        merged = dict(cast(dict, super().display_contents))
        merged.update(self.mail_message.display_contents)
        return merged
    # Add indexes
    __table_args__ = (Index("email_attachment_message_idx", "mail_message_id"),)
--- a/src/memory/common/summarizer.py
+++ b/src/memory/common/summarizer.py
@ -8,6 +8,7 @@ from memory.common import settings, chunker
 logger = logging.getLogger(__name__)
 MAX_TOKENS = 200000
 TAGS_PROMPT = """
 The following text is already concise. Please identify 3-5 relevant tags that capture the main topics or themes.
@ -148,6 +149,12 @@ def summarize(content: str, target_tokens: int | None = None) -> tuple[str, list
            content=content,
        )
    if chunker.approx_token_count(prompt) > MAX_TOKENS:
        logger.warning(
            f"Prompt too long ({chunker.approx_token_count(prompt)} tokens), truncating"
        )
        prompt = truncate(prompt, MAX_TOKENS - 20)
    try:
        if settings.SUMMARIZER_MODEL.startswith("anthropic"):
            result = _call_anthropic(prompt)
--- a/src/memory/parsers/email.py
+++ b/src/memory/parsers/email.py
@ -1,4 +1,5 @@
 import email
 import email.message
 import hashlib
 import logging
 import pathlib
@ -6,6 +7,8 @@ from datetime import datetime
 from email.utils import parsedate_to_datetime
 from typing import TypedDict
 from markdownify import markdownify
 logger = logging.getLogger(__name__)
@ -71,33 +74,60 @@ def extract_date(msg: email.message.Message) -> datetime | None:  # type: ignore
 def extract_body(msg: email.message.Message) -> str:  # type: ignore
    """
-    Extract plain text body from email message.
+    Extract body from email message, preferring HTML converted to markdown.
    Args:
        msg: Email message object
    Returns:
-        Plain text body content
+        Body content as markdown (if HTML found) or plain text
    """
-    body = ""
+    html_body = ""
    plain_body = ""
    if not msg.is_multipart():
        try:
-            return msg.get_payload(decode=True).decode(errors="replace")
+            payload = msg.get_payload(decode=True)
            if isinstance(payload, bytes):
                content = payload.decode(errors="replace")
            else:
                content = str(payload)
            content_type = msg.get_content_type()
            if content_type == "text/html":
                return markdownify(content).strip()
            else:
                return content
        except Exception as e:
            logger.error(f"Error decoding message body: {str(e)}")
            return ""
    # Extract both HTML and plain text parts
    for part in msg.walk():
        content_type = part.get_content_type()
        content_disposition = str(part.get("Content-Disposition", ""))
-        if content_type == "text/plain" and "attachment" not in content_disposition:
+        if "attachment" in content_disposition:
-            try:
+            continue
-                body += part.get_payload(decode=True).decode(errors="replace") + "\n"
+
-            except Exception as e:
+        try:
-                logger.error(f"Error decoding message part: {str(e)}")
+            payload = part.get_payload(decode=True)
-    return body
+            if isinstance(payload, bytes):
                content = payload.decode(errors="replace")
            else:
                content = str(payload)
            if content_type == "text/html":
                html_body += content + "\n"
            elif content_type == "text/plain":
                plain_body += content + "\n"
        except Exception as e:
            logger.error(f"Error decoding message part: {str(e)}")
    # Prefer HTML (converted to markdown) over plain text
    if html_body.strip():
        return markdownify(html_body).strip()
    else:
        return plain_body.strip()
 def extract_attachments(msg: email.message.Message) -> list[Attachment]:  # type: ignore
--- a/src/memory/workers/email.py
+++ b/src/memory/workers/email.py
@ -61,7 +61,7 @@ def process_attachment(
        mime_type=attachment["content_type"],
        mail_message=message,
        content=content,
-        filename=file_path and str(file_path),
+        filename=file_path and str(file_path.relative_to(settings.FILE_STORAGE_DIR)),
    )
@ -149,7 +149,7 @@ def extract_email_uid(
 def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None:
    try:
-        status, msg_data = conn.fetch(uid, "(UID RFC822)")
+        status, msg_data = conn.fetch(uid, "(UID BODY.PEEK[])")
        if status != "OK" or not msg_data or not msg_data[0]:
            logger.error(f"Error fetching message {uid}")
            return None
--- a/tests/data/contents.py
+++ b/tests/data/contents.py
@ -237,6 +237,40 @@ SAMPLE_TEXT = BeautifulSoup(SAMPLE_HTML, "html.parser").get_text()
 SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE)
 SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text()
 SAMPLE_EMAIL = f"""From: john.doe@techcorp.com
 To: research-team@techcorp.com, jane.smith@university.edu
 CC: newsletter@programming-weekly.com
 Subject: The Evolution of Programming Languages - Research Article
 Date: Wed, 15 Jan 2025 14:30:00 +0000
 Message-ID: <20250115143000.12345@techcorp.com>
 MIME-Version: 1.0
 Content-Type: multipart/mixed; boundary="----=_NextPart_000_0001_01DA1234.56789ABC"
 This is a multi-part message in MIME format.
 ------=_NextPart_000_0001_01DA1234.56789ABC
 Content-Type: text/html; charset=utf-8
 Content-Transfer-Encoding: quoted-printable
 {SAMPLE_HTML}
 ------=_NextPart_000_0001_01DA1234.56789ABC
 Content-Type: image/png
 Content-Disposition: attachment; filename="lang_timeline.png"
 Content-Transfer-Encoding: base64
 iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==
 ------=_NextPart_000_0001_01DA1234.56789ABC
 Content-Type: image/jpeg
 Content-Disposition: attachment; filename="code_complexity.jpg"
 Content-Transfer-Encoding: base64
 /9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
 ------=_NextPart_000_0001_01DA1234.56789ABC--
 """
 def image_hash(image: Image.Image) -> str:
    """Return the hex SHA-256 digest of the bytes from ``image.tobytes()``."""
    digest = hashlib.sha256()
    digest.update(image.tobytes())
    return digest.hexdigest()
--- a/tests/memory/common/db/models/test_source_item_embeddings.py
+++ b/tests/memory/common/db/models/test_source_item_embeddings.py
@ -1,4 +1,5 @@
 import hashlib
 import textwrap
 from datetime import datetime
 from typing import Sequence, cast
 from unittest.mock import ANY, Mock, call
@ -20,6 +21,7 @@ from memory.common.db.models.source_items import (
 from memory.common.db.models.sources import Book
 from memory.common.embedding import embed_source_item
 from memory.common.extract import page_to_image
 from memory.parsers.email import parse_email_message
 from tests.data.contents import (
    CHUNKS,
    DATA_DIR,
@ -27,6 +29,7 @@ from tests.data.contents import (
    LANG_TIMELINE_HASH,
    CODE_COMPLEXITY,
    CODE_COMPLEXITY_HASH,
    SAMPLE_EMAIL,
    SAMPLE_MARKDOWN,
    SAMPLE_TEXT,
    SECOND_PAGE,
@ -127,31 +130,41 @@ def test_base_source_item_mixed_embeddings(mock_voyage_client):
    ] == [LANG_TIMELINE_HASH]
-def test_mail_message_embeddings(mock_voyage_client):
+def test_mail_message_with_attachments_embeddings(mock_voyage_client):
    email = parse_email_message(SAMPLE_EMAIL, "123")
    item = MailMessage(
        id=1,
-        content=SAMPLE_MARKDOWN,
+        content=SAMPLE_EMAIL,
        mime_type="text/html",
        modality="text",
-        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        sha256=hashlib.sha256(email["body"].encode("utf-8")).hexdigest(),
-        size=len(SAMPLE_MARKDOWN),
+        size=len(email["body"]),
        tags=["bla"],
        message_id="123",
-        subject="Test Subject",
+        subject=email["subject"],
-        sender="test@example.com",
+        sender=email["sender"],
-        recipients=["test@example.com"],
+        recipients=email["recipients"],
        folder="INBOX",
        sent_at=datetime(2025, 1, 1, 12, 0, 0),
    )
    email_header = textwrap.dedent(
        f"""
        Subject: {email["subject"]}
        From: {email["sender"]}
        To: {", ".join(email["recipients"])}
        Date: 2025-01-01T12:00:00
        Body:
        """
    ).lstrip()
    metadata = item.as_payload()
-    metadata["tags"] = {"bla", "test@example.com"}
+    metadata["tags"] = {"bla", "john.doe@techcorp.com"} | set(email["recipients"])
    expected = [
-        (CHUNKS[0].strip(), [], metadata),
+        (email_header + CHUNKS[0].strip(), [], metadata),
-        (CHUNKS[1].strip(), [], metadata),
+        (email_header + CHUNKS[1].strip().replace("—", "\\\\u2014"), [], metadata),
        (
-            "test summary",
+            email_header + "test summary",
            [],
-            metadata | {"tags": {"tag1", "tag2", "bla", "test@example.com"}},
+            metadata | {"tags": {"tag1", "tag2"} | metadata["tags"]},
        ),
    ]
@ -166,7 +179,11 @@ def test_mail_message_embeddings(mock_voyage_client):
    assert not mock_voyage_client.multimodal_embed.call_count
    assert mock_voyage_client.embed.call_args == call(
-        [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+        [
            email_header + CHUNKS[0].strip(),
            email_header + CHUNKS[1].strip().replace("—", "\\\\u2014"),
            email_header + "test summary",
        ],
        model=settings.TEXT_EMBEDDING_MODEL,
        input_type="document",
    )
--- a/tests/memory/common/db/models/test_source_items.py
+++ b/tests/memory/common/db/models/test_source_items.py
@ -183,13 +183,27 @@ Subject: Test Subject
 Test Body Content"""
    mail_message = MailMessage(
-        sha256=b"test", content=email_content, message_id="<test@example.com>"
+        sha256=b"test",
        content=email_content,
        message_id="<test@example.com>",
        sender="sender@example.com",
        recipients=["recipient@example.com"],
        subject="Test Subject",
        size=1024,
        sent_at=datetime(2023, 1, 1, 12, 0, 0),
    )
-    expected = (
+    assert mail_message.display_contents == {
-        "\nSubject: Test Subject\nFrom: \nTo: \nDate: \nBody: \nTest Body Content\n"
+        "content": "Test Body Content",
-    )
+        "date": "2023-01-01T12:00:00",
-    assert mail_message.display_contents == expected
+        "filename": None,
        "mime_type": None,
        "size": 1024,
        "subject": "Test Subject",
        "sender": "sender@example.com",
        "recipients": ["recipient@example.com"],
        "tags": None,
    }
@pytest.mark.parametrize(
--- a/tests/memory/parsers/test_email_parsers.py
+++ b/tests/memory/parsers/test_email_parsers.py
@ -246,12 +246,11 @@ def test_parse_simple_email():
        "subject": "Test Subject",
        "sender": "sender@example.com",
        "recipients": ["recipient@example.com"],
-        "body": "Test body content\n",
+        "body": "Test body content",
        "attachments": [],
        "sent_at": ANY,
        "raw_email": msg.as_string(),
-        "hash": b"\xed\xa0\x9b\xd4\t4\x06\xb9l\xa4\xb3*\xe4NpZ\x19\xc2\x9b\x87"
+        "hash": b"\xa8\x8c\xa9\x16\xae\xe7\x99\xca\xc9\xd1q\x8e\xcb\xfc5+ \x03aZLz\xea\xd2\x05\xb9B\xf1i\xde\xa6\xe2",
        + b"\xa6\x12\r\x7fS\xb6\xf1\xbe\x95\x9c\x99\xf1",
    }
    assert abs(result["sent_at"].timestamp() - test_date.timestamp()) < 86400  # type: ignore
--- a/tests/memory/workers/test_email.py
+++ b/tests/memory/workers/test_email.py
@ -1,5 +1,6 @@
 import base64
 import pathlib
 import textwrap
 from datetime import datetime
 from typing import cast
 from unittest.mock import MagicMock, patch
@ -100,12 +101,9 @@ def test_process_attachment_disk(attachment_size, max_inline_size, message_id):
    assert result is not None
    assert not cast(str, result.content)
-    assert cast(str, result.filename) == str(
+    assert (
-        settings.FILE_STORAGE_DIR
+        cast(str, result.filename)
-        / "emails"
+        == "emails/sender_example_com/INBOX/test_with_special_chars.txt"
        / "sender_example_com"
        / "INBOX"
        / "test_with_special_chars.txt"
    )
@ -183,13 +181,7 @@ def test_process_attachments_mixed():
    assert cast(str, results[2].content) == "c" * 30
    # Verify large attachment has a path
-    assert cast(str, results[1].filename) == str(
+    assert cast(str, results[1].filename) == "emails/sender_example_com/INBOX/large.txt"
        settings.FILE_STORAGE_DIR
        / "emails"
        / "sender_example_com"
        / "INBOX"
        / "large.txt"
    )
 def test_extract_email_uid_valid():
@ -256,8 +248,19 @@ def test_create_mail_message(db_session):
    assert cast(list[str], mail_message.recipients) == ["recipient@example.com"]
    assert mail_message.sent_at.isoformat()[:-6] == "2023-01-01T12:00:00"
    assert cast(str, mail_message.content) == raw_email
-    assert mail_message.body == "Test body content\n"
+    assert mail_message.body == "Test body content"
    assert mail_message.attachments == attachments
    assert mail_message.display_contents == {
        "content": "Test body content",
        "subject": "Test Subject",
        "sender": "sender@example.com",
        "recipients": ["recipient@example.com"],
        "date": "2023-01-01T12:00:00+00:00",
        "mime_type": "message/rfc822",
        "size": 412,
        "tags": ["test"],
        "filename": None,
    }
 def test_fetch_email(email_provider):