better emails embedding + format search results

This commit is contained in:
Daniel O'Connell 2025-06-09 13:51:58 +02:00
parent d73c5bc928
commit 780e27ba04
15 changed files with 317 additions and 85 deletions

View File

@ -433,6 +433,14 @@ body {
background: #e2e8f0; background: #e2e8f0;
} }
/* Box around the key/value metadata list rendered inside a search result card. */
.metadata {
  border: 1px solid #e5e7eb;
  border-radius: 8px;
  background: #f9fafb;
  padding: 1rem 2rem;
  margin-top: 1rem;
}
/* Responsive design */ /* Responsive design */
@media (max-width: 768px) { @media (max-width: 768px) {
.app-header { .app-header {

View File

@ -2,7 +2,6 @@ import React, { useState, useEffect } from 'react'
import { useNavigate } from 'react-router-dom' import { useNavigate } from 'react-router-dom'
import ReactMarkdown from 'react-markdown' import ReactMarkdown from 'react-markdown'
import { useMCP } from '../hooks/useMCP' import { useMCP } from '../hooks/useMCP'
import { useAuth } from '../hooks/useAuth'
import Loading from './Loading' import Loading from './Loading'
type SearchItem = { type SearchItem = {
@ -24,7 +23,7 @@ const Tag = ({ tags }: { tags: string[] }) => {
) )
} }
const formatText = ({ filename, content, chunks, tags }: SearchItem) => { const TextResult = ({ filename, content, chunks, tags }: SearchItem) => {
return ( return (
<div className="search-result-card"> <div className="search-result-card">
<h4>{filename || 'Untitled'}</h4> <h4>{filename || 'Untitled'}</h4>
@ -45,11 +44,12 @@ const formatText = ({ filename, content, chunks, tags }: SearchItem) => {
) )
} }
const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchItem) => { const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
return ( return (
<div className="search-result-card"> <div className="search-result-card">
<h4>{filename || 'Untitled'}</h4> <h4>{filename || 'Untitled'}</h4>
<Tag tags={tags} /> <Tag tags={tags} />
<Metadata metadata={metadata} />
<div className="markdown-content"> <div className="markdown-content">
<ReactMarkdown>{content || 'No content available'}</ReactMarkdown> <ReactMarkdown>{content || 'No content available'}</ReactMarkdown>
</div> </div>
@ -70,7 +70,7 @@ const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchIte
) )
} }
const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => { const ImageResult = ({ filename, chunks, tags, metadata }: SearchItem) => {
const title = metadata?.title || filename || 'Untitled' const title = metadata?.title || filename || 'Untitled'
const { fetchFile } = useMCP() const { fetchFile } = useMCP()
const [mime_type, setMimeType] = useState<string>() const [mime_type, setMimeType] = useState<string>()
@ -95,17 +95,66 @@ const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => {
) )
} }
/**
 * Renders a result's metadata as a `key: value` list.
 *
 * String values are shown verbatim; every other value is JSON-encoded.
 * Nothing is rendered when `metadata` is missing, is not an object, or has no
 * entries, so a result card never shows an empty metadata box.
 */
const Metadata = ({ metadata }: { metadata: any }) => {
    // A truthy non-object (e.g. a string) would otherwise be enumerated character by character.
    if (!metadata || typeof metadata !== 'object') return null

    const entries = Object.entries(metadata)
    if (entries.length === 0) return null

    return (
        <div className="metadata">
            <ul>
                {entries.map(([key, value]) => (
                    <li key={key}>{key}: {typeof value === 'string' ? value : JSON.stringify(value)}</li>
                ))}
            </ul>
        </div>
    )
}
/**
 * Result card for a PDF: title, tags, a link to the stored file, metadata and,
 * when content is present, that content inside a collapsible "View Source" section.
 *
 * The link is only rendered when a filename exists, and each path segment is
 * URL-encoded so names containing spaces, `#`, `?` or `%` still resolve.
 */
const PDFResult = ({ filename, content, tags, metadata }: SearchItem) => {
    // NOTE(review): the file server origin is hard-coded here; presumably it should
    // come from shared configuration — confirm before deploying beyond localhost.
    // Encode per segment so the `/` separators of nested paths are preserved.
    const fileUrl = filename
        ? `http://localhost:8000/files/${filename.split('/').map(encodeURIComponent).join('/')}`
        : null

    return (
        <div className="search-result-card">
            <h4>{filename || 'Untitled'}</h4>
            <Tag tags={tags} />
            {fileUrl && <a href={fileUrl}>View PDF</a>}
            <Metadata metadata={metadata} />
            {content && <div className="markdown-content">
                <details>
                    <summary>View Source</summary>
                    <ReactMarkdown>{content}</ReactMarkdown>
                </details>
            </div>}
        </div>
    )
}
/**
 * Result card for an email. The heading falls back from the metadata title to
 * the metadata subject to 'Untitled'; the body is rendered as markdown when present.
 */
const EmailResult = ({ content, tags, metadata }: SearchItem) => {
    const heading = metadata?.title || metadata?.subject || 'Untitled'
    const body = content && (
        <div className="markdown-content">
            <ReactMarkdown>{content}</ReactMarkdown>
        </div>
    )

    return (
        <div className="search-result-card">
            <h4>{heading}</h4>
            <Tag tags={tags} />
            <Metadata metadata={metadata} />
            {body}
        </div>
    )
}
const SearchResult = ({ result }: { result: SearchItem }) => { const SearchResult = ({ result }: { result: SearchItem }) => {
if (result.mime_type.startsWith('image/')) { if (result.mime_type.startsWith('image/')) {
return formatImage(result) return <ImageResult {...result} />
} }
if (result.mime_type.startsWith('text/markdown')) { if (result.mime_type.startsWith('text/markdown')) {
console.log(result) return <MarkdownResult {...result} />
return formatMarkdown(result)
} }
if (result.mime_type.startsWith('text/')) { if (result.mime_type.startsWith('text/')) {
return formatText(result) return <TextResult {...result} />
} }
if (result.mime_type.startsWith('application/pdf')) {
return <PDFResult {...result} />
}
if (result.mime_type.startsWith('message/rfc822')) {
return <EmailResult {...result} />
}
console.log(result)
return null return null
} }

View File

@ -115,6 +115,9 @@ export const useMCP = () => {
} }
const resp = await parseJsonRpcResponse(response) const resp = await parseJsonRpcResponse(response)
if (resp?.result?.isError) {
throw new Error(resp?.result?.content[0].text)
}
return resp?.result?.content.map((item: any) => JSON.parse(item.text)) return resp?.result?.content.map((item: any) => JSON.parse(item.text))
}, [apiCall]) }, [apiCall])

View File

@ -6,7 +6,7 @@ import contextlib
import os import os
import logging import logging
from fastapi import FastAPI, UploadFile, Request from fastapi import FastAPI, UploadFile, Request, HTTPException
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from sqladmin import Admin from sqladmin import Admin
@ -50,6 +50,24 @@ async def serve_react_app(full_path: str):
return FileResponse(settings.STATIC_DIR / "index.html") return FileResponse(settings.STATIC_DIR / "index.html")
@app.get("/files/{path:path}")
async def serve_file(path: str):
    """Serve a file stored under ``settings.FILE_STORAGE_DIR``.

    ``path`` is taken from the request URL, so it is resolved and checked to
    still lie inside the storage directory before it is served. Without that
    check an absolute path or ``..`` segments would let a request read files
    outside the storage directory.

    Raises:
        HTTPException: 404 when the resolved path escapes the storage
            directory or does not point at a regular file.
    """
    storage_root = settings.FILE_STORAGE_DIR.resolve()
    # Joining with an absolute ``path`` discards ``storage_root`` entirely, and
    # ``..`` segments climb out of it; resolving first makes both detectable.
    file_path = (storage_root / path).resolve()
    if not file_path.is_relative_to(storage_root) or not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(file_path)
async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
    """Convert an incoming item into data chunks.

    Falsy items yield an empty list. Strings are passed to
    ``extract.extract_text``; anything else is read and handed to
    ``extract.extract_data_chunks`` together with its content type, falling
    back to ``application/octet-stream`` when the upload declares none.
    """
    if isinstance(item, str):
        return extract.extract_text(item) if item else []
    if not item:
        return []

    mime = item.content_type or "application/octet-stream"
    payload = await item.read()
    return extract.extract_data_chunks(mime, payload)
# SQLAdmin setup with OAuth protection # SQLAdmin setup with OAuth protection
engine = get_engine() engine = get_engine()
admin = Admin(app, engine) admin = Admin(app, engine)
@ -72,16 +90,6 @@ async def health_check(request: Request):
app.mount("/", mcp.streamable_http_app()) app.mount("/", mcp.streamable_http_app())
async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
if not item:
return []
if isinstance(item, str):
return extract.extract_text(item)
content_type = item.content_type or "application/octet-stream"
return extract.extract_data_chunks(content_type, await item.read())
def main(reload: bool = False): def main(reload: bool = False):
"""Run the FastAPI server in debug mode with auto-reloading.""" """Run the FastAPI server in debug mode with auto-reloading."""
import uvicorn import uvicorn

View File

@ -23,7 +23,8 @@ async def search_bm25(
) -> list[tuple[SourceData, AnnotatedChunk]]: ) -> list[tuple[SourceData, AnnotatedChunk]]:
with make_session() as db: with make_session() as db:
items_query = db.query(Chunk.id, Chunk.content).filter( items_query = db.query(Chunk.id, Chunk.content).filter(
Chunk.collection_name.in_(modalities) Chunk.collection_name.in_(modalities),
Chunk.content.isnot(None),
) )
if source_ids := filters.get("source_ids"): if source_ids := filters.get("source_ids"):
@ -46,6 +47,7 @@ async def search_bm25(
item_ids = { item_ids = {
sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
for item in items for item in items
if item.content
} }
corpus = [item.content.lower().strip() for item in items] corpus = [item.content.lower().strip() for item in items]

View File

@ -1,4 +1,5 @@
import asyncio import asyncio
import traceback
from datetime import datetime from datetime import datetime
import logging import logging
from collections import defaultdict from collections import defaultdict
@ -28,7 +29,7 @@ class SourceData(BaseModel):
mime_type: str | None mime_type: str | None
filename: str | None filename: str | None
content_length: int content_length: int
contents: dict | None contents: dict | str | None
created_at: datetime | None created_at: datetime | None
@staticmethod @staticmethod
@ -87,6 +88,7 @@ async def with_timeout(
logger.warning(f"Search timed out after {timeout}s") logger.warning(f"Search timed out after {timeout}s")
return [] return []
except Exception as e: except Exception as e:
traceback.print_exc()
logger.error(f"Search failed: {e}") logger.error(f"Search failed: {e}")
return [] return []
@ -109,8 +111,14 @@ def group_chunks(
def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult: def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
contents = source.contents or {} contents = source.contents or {}
tags = contents.pop("tags", []) tags = []
content = contents.pop("content", None) if isinstance(contents, dict):
tags = contents.pop("tags", [])
content = contents.pop("content", None)
print(content)
else:
content = contents
contents = {}
return SearchResult( return SearchResult(
id=source.id, id=source.id,

View File

@ -93,35 +93,76 @@ class MailMessage(SourceItem):
} }
@property @property
def parsed_content(self): def parsed_content(self) -> dict[str, Any]:
from memory.parsers.email import parse_email_message from memory.parsers.email import parse_email_message
return parse_email_message(cast(str, self.content), cast(str, self.message_id)) return cast(
dict[str, Any],
parse_email_message(cast(str, self.content), cast(str, self.message_id)),
)
@property @property
def body(self) -> str: def body(self) -> str:
return self.parsed_content["body"] return self.parsed_content["body"]
@property def format_content(self, content: dict[str, Any]) -> str:
def display_contents(self) -> str | None: sender = (
content = self.parsed_content cast(str, self.sender) or content.get("from") or content.get("sender", "")
return textwrap.dedent( )
""" recipients = (
cast(list[str], self.recipients)
or content.get("to")
or content.get("recipients", [])
)
date = (
cast(datetime, self.sent_at) and self.sent_at.isoformat()
) or content.get("date", "")
return (
textwrap.dedent(
"""
Subject: {subject} Subject: {subject}
From: {sender} From: {sender}
To: {recipients} To: {recipients}
Date: {date} Date: {date}
Body: Body:
{body} {body}
""" """
).format( )
subject=content.get("subject", ""), .format(
sender=content.get("from", ""), subject=cast(str, self.subject) or content.get("subject", ""),
recipients=content.get("to", ""), sender=sender,
date=content.get("date", ""), recipients=", ".join(recipients),
body=content.get("body", ""), date=date,
body=content.get("body", ""),
)
.strip()
) )
@property
def display_contents(self) -> dict | None:
    """Parent display contents extended with the email body and header fields.

    The body is stored under ``content``; ``date`` is the ISO-formatted
    ``sent_at``, or ``sent_at`` itself when that is falsy.
    """
    details = dict(cast(dict, super().display_contents))
    details.update(
        {
            "content": self.body,
            "subject": self.subject,
            "sender": self.sender,
            "recipients": self.recipients,
        }
    )
    sent_at = cast(datetime | None, self.sent_at)
    details["date"] = sent_at.isoformat() if sent_at else sent_at
    return details
def _chunk_contents(self) -> Sequence[extract.DataChunk]:
    """Chunk the email body, prefixing every text item with the formatted headers.

    The body is split with ``extract.extract_text``; each string item in a
    chunk is then replaced by ``format_content`` output built from the parsed
    email with that item as its body. Non-string items are left untouched.
    """
    # ``parsed_content`` re-parses the raw email on every access and ``body``
    # goes through it again, so parse once and reuse the result.
    content = self.parsed_content
    chunks = extract.extract_text(cast(str, content["body"]))

    def add_header(item: extract.MulitmodalChunk) -> extract.MulitmodalChunk:
        if isinstance(item, str):
            return self.format_content(content | {"body": item}).strip()
        return item

    for chunk in chunks:
        chunk.data = [add_header(item) for item in chunk.data]
    return chunks
# Add indexes # Add indexes
__table_args__ = ( __table_args__ = (
Index("mail_sent_idx", "sent_at"), Index("mail_sent_idx", "sent_at"),
@ -161,13 +202,22 @@ class EmailAttachment(SourceItem):
def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]: def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]:
if cast(str | None, self.filename): if cast(str | None, self.filename):
contents = pathlib.Path(cast(str, self.filename)).read_bytes() contents = (
settings.FILE_STORAGE_DIR / cast(str, self.filename)
).read_bytes()
else: else:
contents = cast(str, self.content) contents = cast(str, self.content)
chunks = extract.extract_data_chunks(cast(str, self.mime_type), contents) chunks = extract.extract_data_chunks(cast(str, self.mime_type), contents)
return [self._make_chunk(c, metadata) for c in chunks] return [self._make_chunk(c, metadata) for c in chunks]
@property
def display_contents(self) -> dict:
    """Parent display contents merged with those of the owning mail message."""
    merged = dict(cast(dict, super().display_contents))
    # NOTE(review): the mail message's entries are applied last, so any key
    # present in both wins from the message rather than from this attachment —
    # confirm that precedence is intended.
    merged.update(self.mail_message.display_contents)
    return merged
# Add indexes # Add indexes
__table_args__ = (Index("email_attachment_message_idx", "mail_message_id"),) __table_args__ = (Index("email_attachment_message_idx", "mail_message_id"),)

View File

@ -8,6 +8,7 @@ from memory.common import settings, chunker
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
MAX_TOKENS = 200000
TAGS_PROMPT = """ TAGS_PROMPT = """
The following text is already concise. Please identify 3-5 relevant tags that capture the main topics or themes. The following text is already concise. Please identify 3-5 relevant tags that capture the main topics or themes.
@ -148,6 +149,12 @@ def summarize(content: str, target_tokens: int | None = None) -> tuple[str, list
content=content, content=content,
) )
if chunker.approx_token_count(prompt) > MAX_TOKENS:
logger.warning(
f"Prompt too long ({chunker.approx_token_count(prompt)} tokens), truncating"
)
prompt = truncate(prompt, MAX_TOKENS - 20)
try: try:
if settings.SUMMARIZER_MODEL.startswith("anthropic"): if settings.SUMMARIZER_MODEL.startswith("anthropic"):
result = _call_anthropic(prompt) result = _call_anthropic(prompt)

View File

@ -1,4 +1,5 @@
import email import email
import email.message
import hashlib import hashlib
import logging import logging
import pathlib import pathlib
@ -6,6 +7,8 @@ from datetime import datetime
from email.utils import parsedate_to_datetime from email.utils import parsedate_to_datetime
from typing import TypedDict from typing import TypedDict
from markdownify import markdownify
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -71,33 +74,60 @@ def extract_date(msg: email.message.Message) -> datetime | None: # type: ignore
def extract_body(msg: email.message.Message) -> str: # type: ignore def extract_body(msg: email.message.Message) -> str: # type: ignore
""" """
Extract plain text body from email message. Extract body from email message, preferring HTML converted to markdown.
Args: Args:
msg: Email message object msg: Email message object
Returns: Returns:
Plain text body content Body content as markdown (if HTML found) or plain text
""" """
body = "" html_body = ""
plain_body = ""
if not msg.is_multipart(): if not msg.is_multipart():
try: try:
return msg.get_payload(decode=True).decode(errors="replace") payload = msg.get_payload(decode=True)
if isinstance(payload, bytes):
content = payload.decode(errors="replace")
else:
content = str(payload)
content_type = msg.get_content_type()
if content_type == "text/html":
return markdownify(content).strip()
else:
return content
except Exception as e: except Exception as e:
logger.error(f"Error decoding message body: {str(e)}") logger.error(f"Error decoding message body: {str(e)}")
return "" return ""
# Extract both HTML and plain text parts
for part in msg.walk(): for part in msg.walk():
content_type = part.get_content_type() content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition", "")) content_disposition = str(part.get("Content-Disposition", ""))
if content_type == "text/plain" and "attachment" not in content_disposition: if "attachment" in content_disposition:
try: continue
body += part.get_payload(decode=True).decode(errors="replace") + "\n"
except Exception as e: try:
logger.error(f"Error decoding message part: {str(e)}") payload = part.get_payload(decode=True)
return body if isinstance(payload, bytes):
content = payload.decode(errors="replace")
else:
content = str(payload)
if content_type == "text/html":
html_body += content + "\n"
elif content_type == "text/plain":
plain_body += content + "\n"
except Exception as e:
logger.error(f"Error decoding message part: {str(e)}")
# Prefer HTML (converted to markdown) over plain text
if html_body.strip():
return markdownify(html_body).strip()
else:
return plain_body.strip()
def extract_attachments(msg: email.message.Message) -> list[Attachment]: # type: ignore def extract_attachments(msg: email.message.Message) -> list[Attachment]: # type: ignore

View File

@ -61,7 +61,7 @@ def process_attachment(
mime_type=attachment["content_type"], mime_type=attachment["content_type"],
mail_message=message, mail_message=message,
content=content, content=content,
filename=file_path and str(file_path), filename=file_path and str(file_path.relative_to(settings.FILE_STORAGE_DIR)),
) )
@ -149,7 +149,7 @@ def extract_email_uid(
def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None: def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None:
try: try:
status, msg_data = conn.fetch(uid, "(UID RFC822)") status, msg_data = conn.fetch(uid, "(UID BODY.PEEK[])")
if status != "OK" or not msg_data or not msg_data[0]: if status != "OK" or not msg_data or not msg_data[0]:
logger.error(f"Error fetching message {uid}") logger.error(f"Error fetching message {uid}")
return None return None

View File

@ -237,6 +237,40 @@ SAMPLE_TEXT = BeautifulSoup(SAMPLE_HTML, "html.parser").get_text()
SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE) SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE)
SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text() SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text()
SAMPLE_EMAIL = f"""From: john.doe@techcorp.com
To: research-team@techcorp.com, jane.smith@university.edu
CC: newsletter@programming-weekly.com
Subject: The Evolution of Programming Languages - Research Article
Date: Wed, 15 Jan 2025 14:30:00 +0000
Message-ID: <20250115143000.12345@techcorp.com>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="----=_NextPart_000_0001_01DA1234.56789ABC"
This is a multi-part message in MIME format.
------=_NextPart_000_0001_01DA1234.56789ABC
Content-Type: text/html; charset=utf-8
Content-Transfer-Encoding: quoted-printable
{SAMPLE_HTML}
------=_NextPart_000_0001_01DA1234.56789ABC
Content-Type: image/png
Content-Disposition: attachment; filename="lang_timeline.png"
Content-Transfer-Encoding: base64
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==
------=_NextPart_000_0001_01DA1234.56789ABC
Content-Type: image/jpeg
Content-Disposition: attachment; filename="code_complexity.jpg"
Content-Transfer-Encoding: base64
/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
------=_NextPart_000_0001_01DA1234.56789ABC--
"""
def image_hash(image: Image.Image) -> str: def image_hash(image: Image.Image) -> str:
return hashlib.sha256(image.tobytes()).hexdigest() return hashlib.sha256(image.tobytes()).hexdigest()

View File

@ -1,4 +1,5 @@
import hashlib import hashlib
import textwrap
from datetime import datetime from datetime import datetime
from typing import Sequence, cast from typing import Sequence, cast
from unittest.mock import ANY, Mock, call from unittest.mock import ANY, Mock, call
@ -20,6 +21,7 @@ from memory.common.db.models.source_items import (
from memory.common.db.models.sources import Book from memory.common.db.models.sources import Book
from memory.common.embedding import embed_source_item from memory.common.embedding import embed_source_item
from memory.common.extract import page_to_image from memory.common.extract import page_to_image
from memory.parsers.email import parse_email_message
from tests.data.contents import ( from tests.data.contents import (
CHUNKS, CHUNKS,
DATA_DIR, DATA_DIR,
@ -27,6 +29,7 @@ from tests.data.contents import (
LANG_TIMELINE_HASH, LANG_TIMELINE_HASH,
CODE_COMPLEXITY, CODE_COMPLEXITY,
CODE_COMPLEXITY_HASH, CODE_COMPLEXITY_HASH,
SAMPLE_EMAIL,
SAMPLE_MARKDOWN, SAMPLE_MARKDOWN,
SAMPLE_TEXT, SAMPLE_TEXT,
SECOND_PAGE, SECOND_PAGE,
@ -127,31 +130,41 @@ def test_base_source_item_mixed_embeddings(mock_voyage_client):
] == [LANG_TIMELINE_HASH] ] == [LANG_TIMELINE_HASH]
def test_mail_message_embeddings(mock_voyage_client): def test_mail_message_with_attachments_embeddings(mock_voyage_client):
email = parse_email_message(SAMPLE_EMAIL, "123")
item = MailMessage( item = MailMessage(
id=1, id=1,
content=SAMPLE_MARKDOWN, content=SAMPLE_EMAIL,
mime_type="text/html", mime_type="text/html",
modality="text", modality="text",
sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(), sha256=hashlib.sha256(email["body"].encode("utf-8")).hexdigest(),
size=len(SAMPLE_MARKDOWN), size=len(email["body"]),
tags=["bla"], tags=["bla"],
message_id="123", message_id="123",
subject="Test Subject", subject=email["subject"],
sender="test@example.com", sender=email["sender"],
recipients=["test@example.com"], recipients=email["recipients"],
folder="INBOX", folder="INBOX",
sent_at=datetime(2025, 1, 1, 12, 0, 0), sent_at=datetime(2025, 1, 1, 12, 0, 0),
) )
email_header = textwrap.dedent(
f"""
Subject: {email["subject"]}
From: {email["sender"]}
To: {", ".join(email["recipients"])}
Date: 2025-01-01T12:00:00
Body:
"""
).lstrip()
metadata = item.as_payload() metadata = item.as_payload()
metadata["tags"] = {"bla", "test@example.com"} metadata["tags"] = {"bla", "john.doe@techcorp.com"} | set(email["recipients"])
expected = [ expected = [
(CHUNKS[0].strip(), [], metadata), (email_header + CHUNKS[0].strip(), [], metadata),
(CHUNKS[1].strip(), [], metadata), (email_header + CHUNKS[1].strip().replace("", "\\\\u2014"), [], metadata),
( (
"test summary", email_header + "test summary",
[], [],
metadata | {"tags": {"tag1", "tag2", "bla", "test@example.com"}}, metadata | {"tags": {"tag1", "tag2"} | metadata["tags"]},
), ),
] ]
@ -166,7 +179,11 @@ def test_mail_message_embeddings(mock_voyage_client):
assert not mock_voyage_client.multimodal_embed.call_count assert not mock_voyage_client.multimodal_embed.call_count
assert mock_voyage_client.embed.call_args == call( assert mock_voyage_client.embed.call_args == call(
[CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"], [
email_header + CHUNKS[0].strip(),
email_header + CHUNKS[1].strip().replace("", "\\\\u2014"),
email_header + "test summary",
],
model=settings.TEXT_EMBEDDING_MODEL, model=settings.TEXT_EMBEDDING_MODEL,
input_type="document", input_type="document",
) )

View File

@ -183,13 +183,27 @@ Subject: Test Subject
Test Body Content""" Test Body Content"""
mail_message = MailMessage( mail_message = MailMessage(
sha256=b"test", content=email_content, message_id="<test@example.com>" sha256=b"test",
content=email_content,
message_id="<test@example.com>",
sender="sender@example.com",
recipients=["recipient@example.com"],
subject="Test Subject",
size=1024,
sent_at=datetime(2023, 1, 1, 12, 0, 0),
) )
expected = ( assert mail_message.display_contents == {
"\nSubject: Test Subject\nFrom: \nTo: \nDate: \nBody: \nTest Body Content\n" "content": "Test Body Content",
) "date": "2023-01-01T12:00:00",
assert mail_message.display_contents == expected "filename": None,
"mime_type": None,
"size": 1024,
"subject": "Test Subject",
"sender": "sender@example.com",
"recipients": ["recipient@example.com"],
"tags": None,
}
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -246,12 +246,11 @@ def test_parse_simple_email():
"subject": "Test Subject", "subject": "Test Subject",
"sender": "sender@example.com", "sender": "sender@example.com",
"recipients": ["recipient@example.com"], "recipients": ["recipient@example.com"],
"body": "Test body content\n", "body": "Test body content",
"attachments": [], "attachments": [],
"sent_at": ANY, "sent_at": ANY,
"raw_email": msg.as_string(), "raw_email": msg.as_string(),
"hash": b"\xed\xa0\x9b\xd4\t4\x06\xb9l\xa4\xb3*\xe4NpZ\x19\xc2\x9b\x87" "hash": b"\xa8\x8c\xa9\x16\xae\xe7\x99\xca\xc9\xd1q\x8e\xcb\xfc5+ \x03aZLz\xea\xd2\x05\xb9B\xf1i\xde\xa6\xe2",
+ b"\xa6\x12\r\x7fS\xb6\xf1\xbe\x95\x9c\x99\xf1",
} }
assert abs(result["sent_at"].timestamp() - test_date.timestamp()) < 86400 # type: ignore assert abs(result["sent_at"].timestamp() - test_date.timestamp()) < 86400 # type: ignore

View File

@ -1,5 +1,6 @@
import base64 import base64
import pathlib import pathlib
import textwrap
from datetime import datetime from datetime import datetime
from typing import cast from typing import cast
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
@ -100,12 +101,9 @@ def test_process_attachment_disk(attachment_size, max_inline_size, message_id):
assert result is not None assert result is not None
assert not cast(str, result.content) assert not cast(str, result.content)
assert cast(str, result.filename) == str( assert (
settings.FILE_STORAGE_DIR cast(str, result.filename)
/ "emails" == "emails/sender_example_com/INBOX/test_with_special_chars.txt"
/ "sender_example_com"
/ "INBOX"
/ "test_with_special_chars.txt"
) )
@ -183,13 +181,7 @@ def test_process_attachments_mixed():
assert cast(str, results[2].content) == "c" * 30 assert cast(str, results[2].content) == "c" * 30
# Verify large attachment has a path # Verify large attachment has a path
assert cast(str, results[1].filename) == str( assert cast(str, results[1].filename) == "emails/sender_example_com/INBOX/large.txt"
settings.FILE_STORAGE_DIR
/ "emails"
/ "sender_example_com"
/ "INBOX"
/ "large.txt"
)
def test_extract_email_uid_valid(): def test_extract_email_uid_valid():
@ -256,8 +248,19 @@ def test_create_mail_message(db_session):
assert cast(list[str], mail_message.recipients) == ["recipient@example.com"] assert cast(list[str], mail_message.recipients) == ["recipient@example.com"]
assert mail_message.sent_at.isoformat()[:-6] == "2023-01-01T12:00:00" assert mail_message.sent_at.isoformat()[:-6] == "2023-01-01T12:00:00"
assert cast(str, mail_message.content) == raw_email assert cast(str, mail_message.content) == raw_email
assert mail_message.body == "Test body content\n" assert mail_message.body == "Test body content"
assert mail_message.attachments == attachments assert mail_message.attachments == attachments
assert mail_message.display_contents == {
"content": "Test body content",
"subject": "Test Subject",
"sender": "sender@example.com",
"recipients": ["recipient@example.com"],
"date": "2023-01-01T12:00:00+00:00",
"mime_type": "message/rfc822",
"size": 412,
"tags": ["test"],
"filename": None,
}
def test_fetch_email(email_provider): def test_fetch_email(email_provider):