mirror of
https://github.com/mruwnik/memory.git
synced 2025-06-28 15:14:45 +02:00
better emails embedding + format search results
This commit is contained in:
parent
d73c5bc928
commit
780e27ba04
@ -433,6 +433,14 @@ body {
|
|||||||
background: #e2e8f0;
|
background: #e2e8f0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.metadata {
|
||||||
|
margin-top: 1rem;
|
||||||
|
padding: 1rem 2rem;
|
||||||
|
background: #f9fafb;
|
||||||
|
border-radius: 8px;
|
||||||
|
border: 1px solid #e5e7eb;
|
||||||
|
}
|
||||||
|
|
||||||
/* Responsive design */
|
/* Responsive design */
|
||||||
@media (max-width: 768px) {
|
@media (max-width: 768px) {
|
||||||
.app-header {
|
.app-header {
|
||||||
|
@ -2,7 +2,6 @@ import React, { useState, useEffect } from 'react'
|
|||||||
import { useNavigate } from 'react-router-dom'
|
import { useNavigate } from 'react-router-dom'
|
||||||
import ReactMarkdown from 'react-markdown'
|
import ReactMarkdown from 'react-markdown'
|
||||||
import { useMCP } from '../hooks/useMCP'
|
import { useMCP } from '../hooks/useMCP'
|
||||||
import { useAuth } from '../hooks/useAuth'
|
|
||||||
import Loading from './Loading'
|
import Loading from './Loading'
|
||||||
|
|
||||||
type SearchItem = {
|
type SearchItem = {
|
||||||
@ -24,7 +23,7 @@ const Tag = ({ tags }: { tags: string[] }) => {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatText = ({ filename, content, chunks, tags }: SearchItem) => {
|
const TextResult = ({ filename, content, chunks, tags }: SearchItem) => {
|
||||||
return (
|
return (
|
||||||
<div className="search-result-card">
|
<div className="search-result-card">
|
||||||
<h4>{filename || 'Untitled'}</h4>
|
<h4>{filename || 'Untitled'}</h4>
|
||||||
@ -45,11 +44,12 @@ const formatText = ({ filename, content, chunks, tags }: SearchItem) => {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
|
const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
|
||||||
return (
|
return (
|
||||||
<div className="search-result-card">
|
<div className="search-result-card">
|
||||||
<h4>{filename || 'Untitled'}</h4>
|
<h4>{filename || 'Untitled'}</h4>
|
||||||
<Tag tags={tags} />
|
<Tag tags={tags} />
|
||||||
|
<Metadata metadata={metadata} />
|
||||||
<div className="markdown-content">
|
<div className="markdown-content">
|
||||||
<ReactMarkdown>{content || 'No content available'}</ReactMarkdown>
|
<ReactMarkdown>{content || 'No content available'}</ReactMarkdown>
|
||||||
</div>
|
</div>
|
||||||
@ -70,7 +70,7 @@ const formatMarkdown = ({ filename, content, chunks, tags, metadata }: SearchIte
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => {
|
const ImageResult = ({ filename, chunks, tags, metadata }: SearchItem) => {
|
||||||
const title = metadata?.title || filename || 'Untitled'
|
const title = metadata?.title || filename || 'Untitled'
|
||||||
const { fetchFile } = useMCP()
|
const { fetchFile } = useMCP()
|
||||||
const [mime_type, setMimeType] = useState<string>()
|
const [mime_type, setMimeType] = useState<string>()
|
||||||
@ -95,17 +95,66 @@ const formatImage = ({ filename, chunks, tags, metadata }: SearchItem) => {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const Metadata = ({ metadata }: { metadata: any }) => {
|
||||||
|
if (!metadata) return null
|
||||||
|
return (
|
||||||
|
<div className="metadata">
|
||||||
|
<ul>
|
||||||
|
{Object.entries(metadata).map(([key, value]) => (
|
||||||
|
<li key={key}>{key}: {typeof value === 'string' ? value : JSON.stringify(value)}</li>
|
||||||
|
))}
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
const PDFResult = ({ filename, content, tags, metadata }: SearchItem) => {
|
||||||
|
return (
|
||||||
|
<div className="search-result-card">
|
||||||
|
<h4>{filename || 'Untitled'}</h4>
|
||||||
|
<Tag tags={tags} />
|
||||||
|
<a href={`http://localhost:8000/files/${filename}`}>View PDF</a>
|
||||||
|
<Metadata metadata={metadata} />
|
||||||
|
{content && <div className="markdown-content">
|
||||||
|
<details>
|
||||||
|
<summary>View Source</summary>
|
||||||
|
<ReactMarkdown>{content}</ReactMarkdown>
|
||||||
|
</details>
|
||||||
|
</div>}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
const EmailResult = ({ content, tags, metadata }: SearchItem) => {
|
||||||
|
return (
|
||||||
|
<div className="search-result-card">
|
||||||
|
<h4>{metadata?.title || metadata?.subject || 'Untitled'}</h4>
|
||||||
|
<Tag tags={tags} />
|
||||||
|
<Metadata metadata={metadata} />
|
||||||
|
{content && <div className="markdown-content">
|
||||||
|
<ReactMarkdown>{content}</ReactMarkdown>
|
||||||
|
</div>}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
const SearchResult = ({ result }: { result: SearchItem }) => {
|
const SearchResult = ({ result }: { result: SearchItem }) => {
|
||||||
if (result.mime_type.startsWith('image/')) {
|
if (result.mime_type.startsWith('image/')) {
|
||||||
return formatImage(result)
|
return <ImageResult {...result} />
|
||||||
}
|
}
|
||||||
if (result.mime_type.startsWith('text/markdown')) {
|
if (result.mime_type.startsWith('text/markdown')) {
|
||||||
console.log(result)
|
return <MarkdownResult {...result} />
|
||||||
return formatMarkdown(result)
|
|
||||||
}
|
}
|
||||||
if (result.mime_type.startsWith('text/')) {
|
if (result.mime_type.startsWith('text/')) {
|
||||||
return formatText(result)
|
return <TextResult {...result} />
|
||||||
}
|
}
|
||||||
|
if (result.mime_type.startsWith('application/pdf')) {
|
||||||
|
return <PDFResult {...result} />
|
||||||
|
}
|
||||||
|
if (result.mime_type.startsWith('message/rfc822')) {
|
||||||
|
return <EmailResult {...result} />
|
||||||
|
}
|
||||||
|
console.log(result)
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,6 +115,9 @@ export const useMCP = () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const resp = await parseJsonRpcResponse(response)
|
const resp = await parseJsonRpcResponse(response)
|
||||||
|
if (resp?.result?.isError) {
|
||||||
|
throw new Error(resp?.result?.content[0].text)
|
||||||
|
}
|
||||||
return resp?.result?.content.map((item: any) => JSON.parse(item.text))
|
return resp?.result?.content.map((item: any) => JSON.parse(item.text))
|
||||||
}, [apiCall])
|
}, [apiCall])
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import contextlib
|
|||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from fastapi import FastAPI, UploadFile, Request
|
from fastapi import FastAPI, UploadFile, Request, HTTPException
|
||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from sqladmin import Admin
|
from sqladmin import Admin
|
||||||
@ -50,6 +50,24 @@ async def serve_react_app(full_path: str):
|
|||||||
return FileResponse(settings.STATIC_DIR / "index.html")
|
return FileResponse(settings.STATIC_DIR / "index.html")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/files/{path:path}")
|
||||||
|
async def serve_file(path: str):
|
||||||
|
file_path = settings.FILE_STORAGE_DIR / path
|
||||||
|
if not file_path.is_file():
|
||||||
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
|
return FileResponse(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
|
||||||
|
if not item:
|
||||||
|
return []
|
||||||
|
|
||||||
|
if isinstance(item, str):
|
||||||
|
return extract.extract_text(item)
|
||||||
|
content_type = item.content_type or "application/octet-stream"
|
||||||
|
return extract.extract_data_chunks(content_type, await item.read())
|
||||||
|
|
||||||
|
|
||||||
# SQLAdmin setup with OAuth protection
|
# SQLAdmin setup with OAuth protection
|
||||||
engine = get_engine()
|
engine = get_engine()
|
||||||
admin = Admin(app, engine)
|
admin = Admin(app, engine)
|
||||||
@ -72,16 +90,6 @@ async def health_check(request: Request):
|
|||||||
app.mount("/", mcp.streamable_http_app())
|
app.mount("/", mcp.streamable_http_app())
|
||||||
|
|
||||||
|
|
||||||
async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
|
|
||||||
if not item:
|
|
||||||
return []
|
|
||||||
|
|
||||||
if isinstance(item, str):
|
|
||||||
return extract.extract_text(item)
|
|
||||||
content_type = item.content_type or "application/octet-stream"
|
|
||||||
return extract.extract_data_chunks(content_type, await item.read())
|
|
||||||
|
|
||||||
|
|
||||||
def main(reload: bool = False):
|
def main(reload: bool = False):
|
||||||
"""Run the FastAPI server in debug mode with auto-reloading."""
|
"""Run the FastAPI server in debug mode with auto-reloading."""
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
@ -23,7 +23,8 @@ async def search_bm25(
|
|||||||
) -> list[tuple[SourceData, AnnotatedChunk]]:
|
) -> list[tuple[SourceData, AnnotatedChunk]]:
|
||||||
with make_session() as db:
|
with make_session() as db:
|
||||||
items_query = db.query(Chunk.id, Chunk.content).filter(
|
items_query = db.query(Chunk.id, Chunk.content).filter(
|
||||||
Chunk.collection_name.in_(modalities)
|
Chunk.collection_name.in_(modalities),
|
||||||
|
Chunk.content.isnot(None),
|
||||||
)
|
)
|
||||||
|
|
||||||
if source_ids := filters.get("source_ids"):
|
if source_ids := filters.get("source_ids"):
|
||||||
@ -46,6 +47,7 @@ async def search_bm25(
|
|||||||
item_ids = {
|
item_ids = {
|
||||||
sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
|
sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
|
||||||
for item in items
|
for item in items
|
||||||
|
if item.content
|
||||||
}
|
}
|
||||||
corpus = [item.content.lower().strip() for item in items]
|
corpus = [item.content.lower().strip() for item in items]
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import traceback
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import logging
|
import logging
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@ -28,7 +29,7 @@ class SourceData(BaseModel):
|
|||||||
mime_type: str | None
|
mime_type: str | None
|
||||||
filename: str | None
|
filename: str | None
|
||||||
content_length: int
|
content_length: int
|
||||||
contents: dict | None
|
contents: dict | str | None
|
||||||
created_at: datetime | None
|
created_at: datetime | None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -87,6 +88,7 @@ async def with_timeout(
|
|||||||
logger.warning(f"Search timed out after {timeout}s")
|
logger.warning(f"Search timed out after {timeout}s")
|
||||||
return []
|
return []
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
traceback.print_exc()
|
||||||
logger.error(f"Search failed: {e}")
|
logger.error(f"Search failed: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@ -109,8 +111,14 @@ def group_chunks(
|
|||||||
|
|
||||||
def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
|
def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
|
||||||
contents = source.contents or {}
|
contents = source.contents or {}
|
||||||
tags = contents.pop("tags", [])
|
tags = []
|
||||||
content = contents.pop("content", None)
|
if isinstance(contents, dict):
|
||||||
|
tags = contents.pop("tags", [])
|
||||||
|
content = contents.pop("content", None)
|
||||||
|
print(content)
|
||||||
|
else:
|
||||||
|
content = contents
|
||||||
|
contents = {}
|
||||||
|
|
||||||
return SearchResult(
|
return SearchResult(
|
||||||
id=source.id,
|
id=source.id,
|
||||||
|
@ -93,35 +93,76 @@ class MailMessage(SourceItem):
|
|||||||
}
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parsed_content(self):
|
def parsed_content(self) -> dict[str, Any]:
|
||||||
from memory.parsers.email import parse_email_message
|
from memory.parsers.email import parse_email_message
|
||||||
|
|
||||||
return parse_email_message(cast(str, self.content), cast(str, self.message_id))
|
return cast(
|
||||||
|
dict[str, Any],
|
||||||
|
parse_email_message(cast(str, self.content), cast(str, self.message_id)),
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def body(self) -> str:
|
def body(self) -> str:
|
||||||
return self.parsed_content["body"]
|
return self.parsed_content["body"]
|
||||||
|
|
||||||
@property
|
def format_content(self, content: dict[str, Any]) -> str:
|
||||||
def display_contents(self) -> str | None:
|
sender = (
|
||||||
content = self.parsed_content
|
cast(str, self.sender) or content.get("from") or content.get("sender", "")
|
||||||
return textwrap.dedent(
|
)
|
||||||
"""
|
recipients = (
|
||||||
|
cast(list[str], self.recipients)
|
||||||
|
or content.get("to")
|
||||||
|
or content.get("recipients", [])
|
||||||
|
)
|
||||||
|
date = (
|
||||||
|
cast(datetime, self.sent_at) and self.sent_at.isoformat()
|
||||||
|
) or content.get("date", "")
|
||||||
|
|
||||||
|
return (
|
||||||
|
textwrap.dedent(
|
||||||
|
"""
|
||||||
Subject: {subject}
|
Subject: {subject}
|
||||||
From: {sender}
|
From: {sender}
|
||||||
To: {recipients}
|
To: {recipients}
|
||||||
Date: {date}
|
Date: {date}
|
||||||
Body:
|
Body:
|
||||||
{body}
|
{body}
|
||||||
"""
|
"""
|
||||||
).format(
|
)
|
||||||
subject=content.get("subject", ""),
|
.format(
|
||||||
sender=content.get("from", ""),
|
subject=cast(str, self.subject) or content.get("subject", ""),
|
||||||
recipients=content.get("to", ""),
|
sender=sender,
|
||||||
date=content.get("date", ""),
|
recipients=", ".join(recipients),
|
||||||
body=content.get("body", ""),
|
date=date,
|
||||||
|
body=content.get("body", ""),
|
||||||
|
)
|
||||||
|
.strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def display_contents(self) -> dict | None:
|
||||||
|
return {
|
||||||
|
**cast(dict, super().display_contents),
|
||||||
|
"content": self.body,
|
||||||
|
"subject": self.subject,
|
||||||
|
"sender": self.sender,
|
||||||
|
"recipients": self.recipients,
|
||||||
|
"date": cast(datetime | None, self.sent_at) and self.sent_at.isoformat(),
|
||||||
|
}
|
||||||
|
|
||||||
|
def _chunk_contents(self) -> Sequence[extract.DataChunk]:
|
||||||
|
content = self.parsed_content
|
||||||
|
chunks = extract.extract_text(cast(str, self.body))
|
||||||
|
|
||||||
|
def add_header(item: extract.MulitmodalChunk) -> extract.MulitmodalChunk:
|
||||||
|
if isinstance(item, str):
|
||||||
|
return self.format_content(content | {"body": item}).strip()
|
||||||
|
return item
|
||||||
|
|
||||||
|
for chunk in chunks:
|
||||||
|
chunk.data = [add_header(item) for item in chunk.data]
|
||||||
|
return chunks
|
||||||
|
|
||||||
# Add indexes
|
# Add indexes
|
||||||
__table_args__ = (
|
__table_args__ = (
|
||||||
Index("mail_sent_idx", "sent_at"),
|
Index("mail_sent_idx", "sent_at"),
|
||||||
@ -161,13 +202,22 @@ class EmailAttachment(SourceItem):
|
|||||||
|
|
||||||
def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]:
|
def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]:
|
||||||
if cast(str | None, self.filename):
|
if cast(str | None, self.filename):
|
||||||
contents = pathlib.Path(cast(str, self.filename)).read_bytes()
|
contents = (
|
||||||
|
settings.FILE_STORAGE_DIR / cast(str, self.filename)
|
||||||
|
).read_bytes()
|
||||||
else:
|
else:
|
||||||
contents = cast(str, self.content)
|
contents = cast(str, self.content)
|
||||||
|
|
||||||
chunks = extract.extract_data_chunks(cast(str, self.mime_type), contents)
|
chunks = extract.extract_data_chunks(cast(str, self.mime_type), contents)
|
||||||
return [self._make_chunk(c, metadata) for c in chunks]
|
return [self._make_chunk(c, metadata) for c in chunks]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def display_contents(self) -> dict:
|
||||||
|
return {
|
||||||
|
**cast(dict, super().display_contents),
|
||||||
|
**self.mail_message.display_contents,
|
||||||
|
}
|
||||||
|
|
||||||
# Add indexes
|
# Add indexes
|
||||||
__table_args__ = (Index("email_attachment_message_idx", "mail_message_id"),)
|
__table_args__ = (Index("email_attachment_message_idx", "mail_message_id"),)
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ from memory.common import settings, chunker
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_TOKENS = 200000
|
||||||
TAGS_PROMPT = """
|
TAGS_PROMPT = """
|
||||||
The following text is already concise. Please identify 3-5 relevant tags that capture the main topics or themes.
|
The following text is already concise. Please identify 3-5 relevant tags that capture the main topics or themes.
|
||||||
|
|
||||||
@ -148,6 +149,12 @@ def summarize(content: str, target_tokens: int | None = None) -> tuple[str, list
|
|||||||
content=content,
|
content=content,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if chunker.approx_token_count(prompt) > MAX_TOKENS:
|
||||||
|
logger.warning(
|
||||||
|
f"Prompt too long ({chunker.approx_token_count(prompt)} tokens), truncating"
|
||||||
|
)
|
||||||
|
prompt = truncate(prompt, MAX_TOKENS - 20)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if settings.SUMMARIZER_MODEL.startswith("anthropic"):
|
if settings.SUMMARIZER_MODEL.startswith("anthropic"):
|
||||||
result = _call_anthropic(prompt)
|
result = _call_anthropic(prompt)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import email
|
import email
|
||||||
|
import email.message
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import pathlib
|
import pathlib
|
||||||
@ -6,6 +7,8 @@ from datetime import datetime
|
|||||||
from email.utils import parsedate_to_datetime
|
from email.utils import parsedate_to_datetime
|
||||||
from typing import TypedDict
|
from typing import TypedDict
|
||||||
|
|
||||||
|
from markdownify import markdownify
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -71,33 +74,60 @@ def extract_date(msg: email.message.Message) -> datetime | None: # type: ignore
|
|||||||
|
|
||||||
def extract_body(msg: email.message.Message) -> str: # type: ignore
|
def extract_body(msg: email.message.Message) -> str: # type: ignore
|
||||||
"""
|
"""
|
||||||
Extract plain text body from email message.
|
Extract body from email message, preferring HTML converted to markdown.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
msg: Email message object
|
msg: Email message object
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Plain text body content
|
Body content as markdown (if HTML found) or plain text
|
||||||
"""
|
"""
|
||||||
body = ""
|
html_body = ""
|
||||||
|
plain_body = ""
|
||||||
|
|
||||||
if not msg.is_multipart():
|
if not msg.is_multipart():
|
||||||
try:
|
try:
|
||||||
return msg.get_payload(decode=True).decode(errors="replace")
|
payload = msg.get_payload(decode=True)
|
||||||
|
if isinstance(payload, bytes):
|
||||||
|
content = payload.decode(errors="replace")
|
||||||
|
else:
|
||||||
|
content = str(payload)
|
||||||
|
content_type = msg.get_content_type()
|
||||||
|
if content_type == "text/html":
|
||||||
|
return markdownify(content).strip()
|
||||||
|
else:
|
||||||
|
return content
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error decoding message body: {str(e)}")
|
logger.error(f"Error decoding message body: {str(e)}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
# Extract both HTML and plain text parts
|
||||||
for part in msg.walk():
|
for part in msg.walk():
|
||||||
content_type = part.get_content_type()
|
content_type = part.get_content_type()
|
||||||
content_disposition = str(part.get("Content-Disposition", ""))
|
content_disposition = str(part.get("Content-Disposition", ""))
|
||||||
|
|
||||||
if content_type == "text/plain" and "attachment" not in content_disposition:
|
if "attachment" in content_disposition:
|
||||||
try:
|
continue
|
||||||
body += part.get_payload(decode=True).decode(errors="replace") + "\n"
|
|
||||||
except Exception as e:
|
try:
|
||||||
logger.error(f"Error decoding message part: {str(e)}")
|
payload = part.get_payload(decode=True)
|
||||||
return body
|
if isinstance(payload, bytes):
|
||||||
|
content = payload.decode(errors="replace")
|
||||||
|
else:
|
||||||
|
content = str(payload)
|
||||||
|
|
||||||
|
if content_type == "text/html":
|
||||||
|
html_body += content + "\n"
|
||||||
|
elif content_type == "text/plain":
|
||||||
|
plain_body += content + "\n"
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error decoding message part: {str(e)}")
|
||||||
|
|
||||||
|
# Prefer HTML (converted to markdown) over plain text
|
||||||
|
if html_body.strip():
|
||||||
|
return markdownify(html_body).strip()
|
||||||
|
else:
|
||||||
|
return plain_body.strip()
|
||||||
|
|
||||||
|
|
||||||
def extract_attachments(msg: email.message.Message) -> list[Attachment]: # type: ignore
|
def extract_attachments(msg: email.message.Message) -> list[Attachment]: # type: ignore
|
||||||
|
@ -61,7 +61,7 @@ def process_attachment(
|
|||||||
mime_type=attachment["content_type"],
|
mime_type=attachment["content_type"],
|
||||||
mail_message=message,
|
mail_message=message,
|
||||||
content=content,
|
content=content,
|
||||||
filename=file_path and str(file_path),
|
filename=file_path and str(file_path.relative_to(settings.FILE_STORAGE_DIR)),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -149,7 +149,7 @@ def extract_email_uid(
|
|||||||
|
|
||||||
def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None:
|
def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None:
|
||||||
try:
|
try:
|
||||||
status, msg_data = conn.fetch(uid, "(UID RFC822)")
|
status, msg_data = conn.fetch(uid, "(UID BODY.PEEK[])")
|
||||||
if status != "OK" or not msg_data or not msg_data[0]:
|
if status != "OK" or not msg_data or not msg_data[0]:
|
||||||
logger.error(f"Error fetching message {uid}")
|
logger.error(f"Error fetching message {uid}")
|
||||||
return None
|
return None
|
||||||
|
@ -237,6 +237,40 @@ SAMPLE_TEXT = BeautifulSoup(SAMPLE_HTML, "html.parser").get_text()
|
|||||||
SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE)
|
SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE)
|
||||||
SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text()
|
SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text()
|
||||||
|
|
||||||
|
SAMPLE_EMAIL = f"""From: john.doe@techcorp.com
|
||||||
|
To: research-team@techcorp.com, jane.smith@university.edu
|
||||||
|
CC: newsletter@programming-weekly.com
|
||||||
|
Subject: The Evolution of Programming Languages - Research Article
|
||||||
|
Date: Wed, 15 Jan 2025 14:30:00 +0000
|
||||||
|
Message-ID: <20250115143000.12345@techcorp.com>
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: multipart/mixed; boundary="----=_NextPart_000_0001_01DA1234.56789ABC"
|
||||||
|
|
||||||
|
This is a multi-part message in MIME format.
|
||||||
|
|
||||||
|
------=_NextPart_000_0001_01DA1234.56789ABC
|
||||||
|
Content-Type: text/html; charset=utf-8
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
{SAMPLE_HTML}
|
||||||
|
|
||||||
|
------=_NextPart_000_0001_01DA1234.56789ABC
|
||||||
|
Content-Type: image/png
|
||||||
|
Content-Disposition: attachment; filename="lang_timeline.png"
|
||||||
|
Content-Transfer-Encoding: base64
|
||||||
|
|
||||||
|
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==
|
||||||
|
|
||||||
|
------=_NextPart_000_0001_01DA1234.56789ABC
|
||||||
|
Content-Type: image/jpeg
|
||||||
|
Content-Disposition: attachment; filename="code_complexity.jpg"
|
||||||
|
Content-Transfer-Encoding: base64
|
||||||
|
|
||||||
|
/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
|
||||||
|
|
||||||
|
------=_NextPart_000_0001_01DA1234.56789ABC--
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def image_hash(image: Image.Image) -> str:
|
def image_hash(image: Image.Image) -> str:
|
||||||
return hashlib.sha256(image.tobytes()).hexdigest()
|
return hashlib.sha256(image.tobytes()).hexdigest()
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
import textwrap
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Sequence, cast
|
from typing import Sequence, cast
|
||||||
from unittest.mock import ANY, Mock, call
|
from unittest.mock import ANY, Mock, call
|
||||||
@ -20,6 +21,7 @@ from memory.common.db.models.source_items import (
|
|||||||
from memory.common.db.models.sources import Book
|
from memory.common.db.models.sources import Book
|
||||||
from memory.common.embedding import embed_source_item
|
from memory.common.embedding import embed_source_item
|
||||||
from memory.common.extract import page_to_image
|
from memory.common.extract import page_to_image
|
||||||
|
from memory.parsers.email import parse_email_message
|
||||||
from tests.data.contents import (
|
from tests.data.contents import (
|
||||||
CHUNKS,
|
CHUNKS,
|
||||||
DATA_DIR,
|
DATA_DIR,
|
||||||
@ -27,6 +29,7 @@ from tests.data.contents import (
|
|||||||
LANG_TIMELINE_HASH,
|
LANG_TIMELINE_HASH,
|
||||||
CODE_COMPLEXITY,
|
CODE_COMPLEXITY,
|
||||||
CODE_COMPLEXITY_HASH,
|
CODE_COMPLEXITY_HASH,
|
||||||
|
SAMPLE_EMAIL,
|
||||||
SAMPLE_MARKDOWN,
|
SAMPLE_MARKDOWN,
|
||||||
SAMPLE_TEXT,
|
SAMPLE_TEXT,
|
||||||
SECOND_PAGE,
|
SECOND_PAGE,
|
||||||
@ -127,31 +130,41 @@ def test_base_source_item_mixed_embeddings(mock_voyage_client):
|
|||||||
] == [LANG_TIMELINE_HASH]
|
] == [LANG_TIMELINE_HASH]
|
||||||
|
|
||||||
|
|
||||||
def test_mail_message_embeddings(mock_voyage_client):
|
def test_mail_message_with_attachments_embeddings(mock_voyage_client):
|
||||||
|
email = parse_email_message(SAMPLE_EMAIL, "123")
|
||||||
item = MailMessage(
|
item = MailMessage(
|
||||||
id=1,
|
id=1,
|
||||||
content=SAMPLE_MARKDOWN,
|
content=SAMPLE_EMAIL,
|
||||||
mime_type="text/html",
|
mime_type="text/html",
|
||||||
modality="text",
|
modality="text",
|
||||||
sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
|
sha256=hashlib.sha256(email["body"].encode("utf-8")).hexdigest(),
|
||||||
size=len(SAMPLE_MARKDOWN),
|
size=len(email["body"]),
|
||||||
tags=["bla"],
|
tags=["bla"],
|
||||||
message_id="123",
|
message_id="123",
|
||||||
subject="Test Subject",
|
subject=email["subject"],
|
||||||
sender="test@example.com",
|
sender=email["sender"],
|
||||||
recipients=["test@example.com"],
|
recipients=email["recipients"],
|
||||||
folder="INBOX",
|
folder="INBOX",
|
||||||
sent_at=datetime(2025, 1, 1, 12, 0, 0),
|
sent_at=datetime(2025, 1, 1, 12, 0, 0),
|
||||||
)
|
)
|
||||||
|
email_header = textwrap.dedent(
|
||||||
|
f"""
|
||||||
|
Subject: {email["subject"]}
|
||||||
|
From: {email["sender"]}
|
||||||
|
To: {", ".join(email["recipients"])}
|
||||||
|
Date: 2025-01-01T12:00:00
|
||||||
|
Body:
|
||||||
|
"""
|
||||||
|
).lstrip()
|
||||||
metadata = item.as_payload()
|
metadata = item.as_payload()
|
||||||
metadata["tags"] = {"bla", "test@example.com"}
|
metadata["tags"] = {"bla", "john.doe@techcorp.com"} | set(email["recipients"])
|
||||||
expected = [
|
expected = [
|
||||||
(CHUNKS[0].strip(), [], metadata),
|
(email_header + CHUNKS[0].strip(), [], metadata),
|
||||||
(CHUNKS[1].strip(), [], metadata),
|
(email_header + CHUNKS[1].strip().replace("—", "\\\\u2014"), [], metadata),
|
||||||
(
|
(
|
||||||
"test summary",
|
email_header + "test summary",
|
||||||
[],
|
[],
|
||||||
metadata | {"tags": {"tag1", "tag2", "bla", "test@example.com"}},
|
metadata | {"tags": {"tag1", "tag2"} | metadata["tags"]},
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -166,7 +179,11 @@ def test_mail_message_embeddings(mock_voyage_client):
|
|||||||
assert not mock_voyage_client.multimodal_embed.call_count
|
assert not mock_voyage_client.multimodal_embed.call_count
|
||||||
|
|
||||||
assert mock_voyage_client.embed.call_args == call(
|
assert mock_voyage_client.embed.call_args == call(
|
||||||
[CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
|
[
|
||||||
|
email_header + CHUNKS[0].strip(),
|
||||||
|
email_header + CHUNKS[1].strip().replace("—", "\\\\u2014"),
|
||||||
|
email_header + "test summary",
|
||||||
|
],
|
||||||
model=settings.TEXT_EMBEDDING_MODEL,
|
model=settings.TEXT_EMBEDDING_MODEL,
|
||||||
input_type="document",
|
input_type="document",
|
||||||
)
|
)
|
||||||
|
@ -183,13 +183,27 @@ Subject: Test Subject
|
|||||||
Test Body Content"""
|
Test Body Content"""
|
||||||
|
|
||||||
mail_message = MailMessage(
|
mail_message = MailMessage(
|
||||||
sha256=b"test", content=email_content, message_id="<test@example.com>"
|
sha256=b"test",
|
||||||
|
content=email_content,
|
||||||
|
message_id="<test@example.com>",
|
||||||
|
sender="sender@example.com",
|
||||||
|
recipients=["recipient@example.com"],
|
||||||
|
subject="Test Subject",
|
||||||
|
size=1024,
|
||||||
|
sent_at=datetime(2023, 1, 1, 12, 0, 0),
|
||||||
)
|
)
|
||||||
|
|
||||||
expected = (
|
assert mail_message.display_contents == {
|
||||||
"\nSubject: Test Subject\nFrom: \nTo: \nDate: \nBody: \nTest Body Content\n"
|
"content": "Test Body Content",
|
||||||
)
|
"date": "2023-01-01T12:00:00",
|
||||||
assert mail_message.display_contents == expected
|
"filename": None,
|
||||||
|
"mime_type": None,
|
||||||
|
"size": 1024,
|
||||||
|
"subject": "Test Subject",
|
||||||
|
"sender": "sender@example.com",
|
||||||
|
"recipients": ["recipient@example.com"],
|
||||||
|
"tags": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
@ -246,12 +246,11 @@ def test_parse_simple_email():
|
|||||||
"subject": "Test Subject",
|
"subject": "Test Subject",
|
||||||
"sender": "sender@example.com",
|
"sender": "sender@example.com",
|
||||||
"recipients": ["recipient@example.com"],
|
"recipients": ["recipient@example.com"],
|
||||||
"body": "Test body content\n",
|
"body": "Test body content",
|
||||||
"attachments": [],
|
"attachments": [],
|
||||||
"sent_at": ANY,
|
"sent_at": ANY,
|
||||||
"raw_email": msg.as_string(),
|
"raw_email": msg.as_string(),
|
||||||
"hash": b"\xed\xa0\x9b\xd4\t4\x06\xb9l\xa4\xb3*\xe4NpZ\x19\xc2\x9b\x87"
|
"hash": b"\xa8\x8c\xa9\x16\xae\xe7\x99\xca\xc9\xd1q\x8e\xcb\xfc5+ \x03aZLz\xea\xd2\x05\xb9B\xf1i\xde\xa6\xe2",
|
||||||
+ b"\xa6\x12\r\x7fS\xb6\xf1\xbe\x95\x9c\x99\xf1",
|
|
||||||
}
|
}
|
||||||
assert abs(result["sent_at"].timestamp() - test_date.timestamp()) < 86400 # type: ignore
|
assert abs(result["sent_at"].timestamp() - test_date.timestamp()) < 86400 # type: ignore
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import base64
|
import base64
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import textwrap
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import cast
|
from typing import cast
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
@ -100,12 +101,9 @@ def test_process_attachment_disk(attachment_size, max_inline_size, message_id):
|
|||||||
|
|
||||||
assert result is not None
|
assert result is not None
|
||||||
assert not cast(str, result.content)
|
assert not cast(str, result.content)
|
||||||
assert cast(str, result.filename) == str(
|
assert (
|
||||||
settings.FILE_STORAGE_DIR
|
cast(str, result.filename)
|
||||||
/ "emails"
|
== "emails/sender_example_com/INBOX/test_with_special_chars.txt"
|
||||||
/ "sender_example_com"
|
|
||||||
/ "INBOX"
|
|
||||||
/ "test_with_special_chars.txt"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -183,13 +181,7 @@ def test_process_attachments_mixed():
|
|||||||
assert cast(str, results[2].content) == "c" * 30
|
assert cast(str, results[2].content) == "c" * 30
|
||||||
|
|
||||||
# Verify large attachment has a path
|
# Verify large attachment has a path
|
||||||
assert cast(str, results[1].filename) == str(
|
assert cast(str, results[1].filename) == "emails/sender_example_com/INBOX/large.txt"
|
||||||
settings.FILE_STORAGE_DIR
|
|
||||||
/ "emails"
|
|
||||||
/ "sender_example_com"
|
|
||||||
/ "INBOX"
|
|
||||||
/ "large.txt"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_extract_email_uid_valid():
|
def test_extract_email_uid_valid():
|
||||||
@ -256,8 +248,19 @@ def test_create_mail_message(db_session):
|
|||||||
assert cast(list[str], mail_message.recipients) == ["recipient@example.com"]
|
assert cast(list[str], mail_message.recipients) == ["recipient@example.com"]
|
||||||
assert mail_message.sent_at.isoformat()[:-6] == "2023-01-01T12:00:00"
|
assert mail_message.sent_at.isoformat()[:-6] == "2023-01-01T12:00:00"
|
||||||
assert cast(str, mail_message.content) == raw_email
|
assert cast(str, mail_message.content) == raw_email
|
||||||
assert mail_message.body == "Test body content\n"
|
assert mail_message.body == "Test body content"
|
||||||
assert mail_message.attachments == attachments
|
assert mail_message.attachments == attachments
|
||||||
|
assert mail_message.display_contents == {
|
||||||
|
"content": "Test body content",
|
||||||
|
"subject": "Test Subject",
|
||||||
|
"sender": "sender@example.com",
|
||||||
|
"recipients": ["recipient@example.com"],
|
||||||
|
"date": "2023-01-01T12:00:00+00:00",
|
||||||
|
"mime_type": "message/rfc822",
|
||||||
|
"size": 412,
|
||||||
|
"tags": ["test"],
|
||||||
|
"filename": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_fetch_email(email_provider):
|
def test_fetch_email(email_provider):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user