mruwnik f2161e09f3 Fix 11 high-priority bugs from third deep dive
- Add IMAP connection cleanup on logout failure (email.py)
- Handle IntegrityError for concurrent email processing (tasks/email.py)
- Recover stale scheduled calls stuck in "executing" state (scheduled_calls.py)
- Move git operations outside DB transaction in notes sync (notes.py)
- Add null checks for recipient_user/from_user in Discord (discord.py)
- Add OAuth state and session cleanup tasks (maintenance.py)
- Add distributed lock for backup tasks (backup.py)
- Add /tmp storage warning in settings (settings.py)
- Fix health check error exposure (app.py)
- Remove sensitive data from logs (auth.py)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-19 22:15:25 +00:00

315 lines
9.3 KiB
Python

import hashlib
import imaplib
import logging
import re
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime
from typing import Callable, Generator, Sequence, cast
from sqlalchemy.orm import Session, scoped_session
from memory.common import embedding, qdrant, settings, collections
from memory.common.db.models import (
EmailAccount,
EmailAttachment,
MailMessage,
)
from memory.parsers.email import (
Attachment,
EmailMessage,
RawEmailResponse,
)
logger = logging.getLogger(__name__)
def process_attachment(
    attachment: Attachment, message: MailMessage
) -> EmailAttachment | None:
    """Build an EmailAttachment record, persisting large payloads to disk.

    Small text payloads (within MAX_INLINE_ATTACHMENT_SIZE and a text/*
    content type) are stored inline; any other payload is written to a file
    path derived from the owning message. Payload-less attachments are
    recorded as metadata only.

    Args:
        attachment: Attachment dictionary with metadata and content
        message: MailMessage used for tags and file path generation

    Returns:
        EmailAttachment with inline content or an on-disk filename, or
        None when writing the payload to disk failed.
    """
    inline_text: str | None = None
    stored_path = None
    payload = attachment.get("content")

    if payload:
        fits_inline = attachment["size"] <= settings.MAX_INLINE_ATTACHMENT_SIZE
        if fits_inline and attachment["content_type"].startswith("text/"):
            inline_text = payload.decode("utf-8", errors="replace")
        else:
            stored_path = message.safe_filename(attachment["filename"])
            try:
                stored_path.write_bytes(payload)
            except Exception as e:
                logger.error(f"Failed to save attachment {stored_path} to disk: {str(e)}")
                return None
    # No payload at all: fall through and persist just the metadata.

    return EmailAttachment(
        modality=collections.get_modality(attachment["content_type"]),
        # Hash the payload when present; otherwise hash the metadata repr
        # so the record still gets a stable digest.
        sha256=hashlib.sha256(
            payload if payload else str(attachment).encode()
        ).digest(),
        tags=message.tags,
        size=attachment["size"],
        mime_type=attachment["content_type"],
        mail_message=message,
        content=inline_text,
        filename=stored_path and str(stored_path.relative_to(settings.FILE_STORAGE_DIR)),
    )
def process_attachments(
    attachments: list[Attachment], message: MailMessage
) -> list[EmailAttachment]:
    """Process email attachments, storing large files on disk.

    Args:
        attachments: List of attachment dictionaries with metadata and content
        message: MailMessage the attachments belong to (used for tags and
            file path generation)

    Returns:
        List of successfully processed EmailAttachment records; attachments
        that failed to persist are silently dropped.
    """
    processed: list[EmailAttachment] = []
    for raw in attachments:
        result = process_attachment(raw, message)
        if result:
            processed.append(result)
    return processed
def create_mail_message(
    db_session: Session | scoped_session,
    tags: list[str],
    folder: str,
    parsed_email: EmailMessage,
) -> MailMessage:
    """Create a new mail message record and its associated attachments.

    Args:
        db_session: Database session the new rows are added to
        tags: Tags to attach to the message and its attachments
        folder: IMAP folder name the message came from
        parsed_email: Parsed email data

    Returns:
        Newly created MailMessage (added to the session, not committed)
    """
    raw = parsed_email["raw_email"]
    message = MailMessage(
        modality="mail",
        sha256=parsed_email["hash"],
        tags=tags,
        size=len(raw),
        mime_type="message/rfc822",
        embed_status="RAW",
        message_id=parsed_email["message_id"],
        subject=parsed_email["subject"],
        sender=parsed_email["sender"],
        recipients=parsed_email["recipients"],
        sent_at=parsed_email["sent_at"],
        content=raw,
        folder=folder,
    )
    db_session.add(message)

    raw_attachments = parsed_email["attachments"]
    if raw_attachments:
        stored = process_attachments(raw_attachments, message)
        db_session.add_all(stored)
        message.attachments = stored
    return message
def extract_email_uid(
msg_data: Sequence[tuple[bytes, bytes]],
) -> tuple[str | None, bytes]:
"""
Extract the UID and raw email data from the message data.
"""
uid_pattern = re.compile(r"UID (\d+)")
uid_match = uid_pattern.search(msg_data[0][0].decode("utf-8", errors="replace"))
uid = uid_match.group(1) if uid_match else None
raw_email = msg_data[0][1]
return uid, raw_email
def fetch_email(conn: imaplib.IMAP4_SSL, uid: str) -> RawEmailResponse | None:
    """Fetch one message by UID without marking it read (BODY.PEEK).

    Returns:
        (uid, raw_email) on success, or None on any fetch/parse failure.
    """
    try:
        status, msg_data = conn.fetch(uid, "(UID BODY.PEEK[])")
        if status == "OK" and msg_data and msg_data[0]:
            return extract_email_uid(msg_data)  # type: ignore
        logger.error(f"Error fetching message {uid}")
        return None
    except Exception as e:
        logger.error(f"Error processing message {uid}: {str(e)}")
        return None
def fetch_email_since(
    conn: imaplib.IMAP4_SSL,
    folder: str,
    since_date: datetime = datetime(1970, 1, 1),
) -> list[RawEmailResponse]:
    """Fetch emails from a folder received since a given date.

    Args:
        conn: IMAP connection
        folder: Folder name to select
        since_date: Only fetch emails at or after this date

    Returns:
        List of (uid, raw_email) tuples; empty on any IMAP failure.
    """
    try:
        select_status, select_info = conn.select(folder)
        if select_status != "OK":
            logger.error(f"Error selecting folder {folder}: {select_info}")
            return []
        date_str = since_date.strftime("%d-%b-%Y")
        search_status, data = conn.search(None, f'(SINCE "{date_str}")')
        if search_status != "OK":
            logger.error(f"Error searching folder {folder}: {data}")
            return []
    except Exception as e:
        logger.error(f"Error in fetch_email_since for folder {folder}: {str(e)}")
        return []

    if not data or not data[0]:
        return []

    results = []
    for uid in data[0].split():
        fetched = fetch_email(conn, uid)
        if fetched:
            results.append(fetched)
    return results
def process_folder(
    conn: imaplib.IMAP4_SSL,
    folder: str,
    account: EmailAccount,
    since_date: datetime,
    processor: Callable[[int, str, str, str], int | None],
) -> dict:
    """Process a single folder from an email account.

    Args:
        conn: Active IMAP connection
        folder: Folder name to process
        account: Email account configuration
        since_date: Only fetch messages newer than this date
        processor: Callable invoked per message with keyword arguments
            (account_id, message_id, folder, raw_email); truthy return
            counts as a queued message

    Returns:
        Stats dict with messages_found, new_messages, and errors counts.
    """
    queued, failed = 0, 0
    fetched: list = []
    try:
        fetched = fetch_email_since(conn, folder, since_date)
        for uid, raw_email in fetched:
            try:
                task = processor(
                    account_id=account.id,  # type: ignore
                    message_id=uid,
                    folder=folder,
                    raw_email=raw_email.decode("utf-8", errors="replace"),
                )
                if task:
                    queued += 1
            except Exception as e:
                # Keep going: one bad message shouldn't abort the folder.
                logger.error(f"Error queuing message {uid}: {str(e)}")
                failed += 1
    except Exception as e:
        logger.error(f"Error processing folder {folder}: {str(e)}")
        failed += 1
    return {
        "messages_found": len(fetched),
        "new_messages": queued,
        "errors": failed,
    }
@contextmanager
def imap_connection(account: EmailAccount) -> Generator[imaplib.IMAP4_SSL, None, None]:
    """Yield a logged-in IMAP4_SSL connection, guaranteeing teardown.

    Logout is always attempted on exit; when it fails, the underlying
    socket is shut down explicitly so the file descriptor isn't leaked.
    """
    imap = imaplib.IMAP4_SSL(
        host=cast(str, account.imap_server), port=cast(int, account.imap_port)
    )
    try:
        imap.login(cast(str, account.username), cast(str, account.password))
        yield imap
    finally:
        try:
            imap.logout()
        except Exception as e:
            logger.error(f"Error logging out from {account.imap_server}: {str(e)}")
            # Logout failed — force-close the socket so it isn't leaked.
            try:
                imap.shutdown()
            except Exception:
                pass  # Socket may already be closed
def vectorize_email(email: MailMessage):
    """Embed a mail message and its attachments, then upsert into Qdrant.

    The message body is upserted into the collection named by its modality;
    attachment chunks are grouped by attachment modality and upserted in one
    batch per collection. On completion the message and all attachments are
    marked with embed_status "STORED".
    """
    client = qdrant.get_qdrant_client()

    message_chunks = embedding.embed_source_item(email)
    email.chunks = message_chunks
    if message_chunks:
        qdrant.upsert_vectors(
            client=client,
            collection_name=cast(str, email.modality),
            ids=[cast(str, c.id) for c in message_chunks],
            vectors=[c.vector for c in message_chunks],  # type: ignore
            payloads=[c.item_metadata for c in message_chunks],  # type: ignore
        )

    # Group attachment chunks by modality so each collection gets one upsert.
    by_modality = defaultdict(list)
    for attachment in email.attachments:
        attachment_chunks = embedding.embed_source_item(attachment)
        if not attachment_chunks:
            continue
        attachment.chunks = attachment_chunks
        by_modality[attachment.modality].extend(attachment_chunks)

    for modality, chunk_group in by_modality.items():
        qdrant.upsert_vectors(
            client=client,
            collection_name=modality,
            ids=[c.id for c in chunk_group],
            vectors=[c.vector for c in chunk_group],
            payloads=[c.item_metadata for c in chunk_group],
        )

    email.embed_status = "STORED"  # type: ignore
    for attachment in email.attachments:
        attachment.embed_status = "STORED"
    logger.info(f"Stored embedding for message {email.message_id}")