partial blog

Daniel O'Connell 2025-05-25 15:36:46 +02:00
parent e8070a3557
commit ce6f4bf5c5
6 changed files with 244 additions and 29 deletions


@@ -4,9 +4,10 @@ Database models for the knowledge base system.
 import pathlib
 import re
-from pathlib import Path
 import textwrap
+from datetime import datetime
 from typing import Any, ClassVar, cast

 from PIL import Image
 from sqlalchemy import (
     ARRAY,
@@ -27,10 +28,10 @@ from sqlalchemy import (
 )
 from sqlalchemy.dialects.postgresql import BYTEA, JSONB, TSVECTOR
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import relationship, Session
+from sqlalchemy.orm import Session, relationship

 from memory.common import settings
-from memory.common.parsers.email import parse_email_message, EmailMessage
+from memory.common.parsers.email import EmailMessage, parse_email_message


 Base = declarative_base()
@@ -215,13 +216,13 @@ class MailMessage(SourceItem):
     }

     @property
-    def attachments_path(self) -> Path:
+    def attachments_path(self) -> pathlib.Path:
         clean_sender = clean_filename(cast(str, self.sender))
         clean_folder = clean_filename(cast(str | None, self.folder) or "INBOX")
-        return Path(settings.FILE_STORAGE_DIR) / clean_sender / clean_folder
+        return pathlib.Path(settings.FILE_STORAGE_DIR) / clean_sender / clean_folder

-    def safe_filename(self, filename: str) -> Path:
-        suffix = Path(filename).suffix
+    def safe_filename(self, filename: str) -> pathlib.Path:
+        suffix = pathlib.Path(filename).suffix
         name = clean_filename(filename.removesuffix(suffix)) + suffix
         path = self.attachments_path / name
         path.parent.mkdir(parents=True, exist_ok=True)
@@ -511,12 +512,46 @@ class BlogPost(SourceItem):
     )
     url = Column(Text, unique=True)
     title = Column(Text)
-    published = Column(DateTime(timezone=True))
+    author = Column(Text, nullable=True)
+    published = Column(DateTime(timezone=True), nullable=True)
+
+    # Additional metadata from webpage parsing
+    description = Column(Text, nullable=True)  # Meta description or excerpt
+    domain = Column(Text, nullable=True)  # Domain of the source website
+    word_count = Column(Integer, nullable=True)  # Approximate word count
+
+    # Store original metadata from parser
+    webpage_metadata = Column(JSONB, nullable=True)

     __mapper_args__ = {
         "polymorphic_identity": "blog_post",
     }

+    __table_args__ = (
+        Index("blog_post_author_idx", "author"),
+        Index("blog_post_domain_idx", "domain"),
+        Index("blog_post_published_idx", "published"),
+        Index("blog_post_word_count_idx", "word_count"),
+    )
+
+    def as_payload(self) -> dict:
+        published_date = cast(datetime | None, self.published)
+        metadata = cast(dict | None, self.webpage_metadata) or {}
+        payload = {
+            "source_id": self.id,
+            "url": self.url,
+            "title": self.title,
+            "author": self.author,
+            "published": published_date and published_date.isoformat(),
+            "description": self.description,
+            "domain": self.domain,
+            "word_count": self.word_count,
+            "tags": self.tags,
+            **metadata,
+        }
+        return {k: v for k, v in payload.items() if v}
+

 class MiscDoc(SourceItem):
     __tablename__ = "misc_doc"
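
As a rough illustration (all values invented), as_payload flattens the post's columns and any parser-supplied webpage_metadata into a single payload and drops empty fields, so a stored post might produce something like:

    {
        "source_id": 42,
        "url": "https://example.com/some-post",
        "title": "Some post",
        "author": "Jane Doe",
        "published": "2025-05-01T00:00:00+00:00",
        "domain": "example.com",
        "word_count": 1850,
        "tags": ["blog"],
    }
    # "description" was None and webpage_metadata was empty, so neither appears.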


@@ -1,5 +1,6 @@
 import logging
 import re
+from datetime import datetime
 from urllib.parse import urlparse

 import requests
@@ -277,7 +278,7 @@ class ExUrbeParser(BaseHTMLParser):
         ".tags",
     ]

-    def _extract_date(self, soup: BeautifulSoup) -> str | None:
+    def _extract_date(self, soup: BeautifulSoup) -> datetime | None:
         """Extract date, handling ordinal formats like 'Mar 5th, 2025'."""
         date = soup.select_one(".published")
         if date:
@@ -335,14 +336,14 @@ class RiftersParser(BaseHTMLParser):
         ".rss-links",
     ]

-    def _extract_date(self, soup: BeautifulSoup) -> str | None:
+    def _extract_date(self, soup: BeautifulSoup) -> datetime | None:
         """Extract date, handling ordinal formats like 'Mar 5th, 2025'."""
         date = soup.select_one(".entry-date")
         if not date:
             return None

         date_str = date.text.replace("\n", " ").strip()
         if date := parse_date(date_str, "%d %b %Y"):
-            return date.isoformat()
+            return date
         return None
@@ -379,7 +380,7 @@ class PaulGrahamParser(BaseHTMLParser):
         # Fallback to standard title extraction
         return extract_title(soup, self.title_selector)

-    def _extract_date(self, soup: BeautifulSoup) -> str | None:
+    def _extract_date(self, soup: BeautifulSoup) -> datetime | None:
         """Extract date from essay content."""
         # Look for date patterns in the text content (often at the beginning)
         text_content = soup.get_text()
@@ -389,7 +390,7 @@ class PaulGrahamParser(BaseHTMLParser):
         if date_match:
             date_str = date_match.group(1)
             if date := parse_date(date_str, self.date_format):
-                return date.isoformat()
+                return date

         return extract_date(soup, self.date_selector, self.date_format)
@@ -450,7 +451,7 @@ class TheRedHandFilesParser(BaseHTMLParser):
         ".privacy-policy",
     ]

-    def _extract_date(self, soup: BeautifulSoup) -> str | None:
+    def _extract_date(self, soup: BeautifulSoup) -> datetime | None:
         """Extract date from issue header."""
         # Look for issue date pattern like "Issue #325 / May 2025"
         text_content = soup.get_text()
@@ -460,7 +461,7 @@ class TheRedHandFilesParser(BaseHTMLParser):
         if date_match:
             date_str = date_match.group(1)
             if date := parse_date(date_str, self.date_format):
-                return date.isoformat()
+                return date

         # Fallback to parent method
         return extract_date(soup, self.date_selector, self.date_format)
@@ -485,7 +486,7 @@ class RachelByTheBayParser(BaseHTMLParser):
         ".comments",
     ]

-    def _extract_date(self, soup: BeautifulSoup) -> str | None:
+    def _extract_date(self, soup: BeautifulSoup) -> datetime | None:
         """Extract date from URL structure if available."""
         # Try to get current URL from canonical link or other sources
         canonical = soup.find("link", rel="canonical")
@@ -498,7 +499,7 @@ class RachelByTheBayParser(BaseHTMLParser):
             year, month, day = date_match.groups()
             date_str = f"{year}/{month}/{day}"
             if date := parse_date(date_str, self.date_format):
-                return date.isoformat()
+                return date

         # Fallback to parent method
         return extract_date(soup, self.date_selector, self.date_format)


@@ -24,9 +24,9 @@ class Article:
     title: str
     content: str  # Markdown content
     author: str | None = None
-    published_date: str | None = None
+    published_date: datetime | None = None
     url: str = ""
-    images: list[PILImage.Image] = field(default_factory=list)
+    images: dict[str, PILImage.Image] = field(default_factory=dict)
     metadata: dict[str, Any] = field(default_factory=dict)
@@ -87,7 +87,7 @@ def parse_date(text: str, date_format: str = "%Y-%m-%d") -> datetime | None:
 def extract_date(
     soup: BeautifulSoup, date_selector: str, date_format: str = "%Y-%m-%d"
-) -> str | None:
+) -> datetime | None:
     """Extract publication date."""
     for selector in date_selector.split(","):
         element = soup.select_one(selector.strip())
@@ -98,12 +98,11 @@ def extract_date(
             if datetime_attr:
                 date_str = str(datetime_attr)
                 if date := parse_date(date_str, date_format):
-                    return date.isoformat()
-                return date_str
+                    return date

             for text in element.find_all(string=True):
                 if text and (date := parse_date(str(text).strip(), date_format)):
-                    return date.isoformat()
+                    return date

     return None
@@ -149,7 +148,7 @@ def process_image(url: str, image_dir: pathlib.Path) -> PILImage.Image | None:
 def process_images(
     content: Tag | None, base_url: str, image_dir: pathlib.Path
-) -> tuple[Tag | None, list[PILImage.Image]]:
+) -> tuple[Tag | None, dict[str, PILImage.Image]]:
     """
     Process all images in content: download them, update URLs, and return PIL Images.
@@ -159,7 +158,7 @@ def process_images(
     if not content:
         return content, []

-    images = []
+    images = {}
     for img_tag in content.find_all("img"):
         if not isinstance(img_tag, Tag):
@@ -180,7 +179,7 @@ def process_images(
             path = pathlib.Path(image.filename)  # type: ignore
             img_tag["src"] = str(path.relative_to(FILE_STORAGE_DIR.resolve()))
-            images.append(image)
+            images[img_tag["src"]] = image
         except Exception as e:
             logger.warning(f"Failed to process image {src}: {e}")
             continue
@@ -337,7 +336,7 @@ class BaseHTMLParser:
         """Extract article author."""
         return extract_author(soup, self.author_selector)

-    def _extract_date(self, soup: BeautifulSoup) -> str | None:
+    def _extract_date(self, soup: BeautifulSoup) -> datetime | None:
         """Extract publication date."""
         return extract_date(soup, self.date_selector, self.date_format)
@@ -349,7 +348,7 @@ class BaseHTMLParser:
     def _process_images(
         self, content: Tag | None, base_url: str
-    ) -> tuple[Tag | None, list[PILImage.Image]]:
+    ) -> tuple[Tag | None, dict[str, PILImage.Image]]:
         """Process all images: download, update URLs, return PIL Images."""
         return process_images(content, base_url, self.image_dir)


@@ -25,6 +25,7 @@ app.conf.update(
         "memory.workers.tasks.email.*": {"queue": "email"},
         "memory.workers.tasks.photo.*": {"queue": "photo_embed"},
         "memory.workers.tasks.comic.*": {"queue": "comic"},
+        "memory.workers.tasks.blogs.*": {"queue": "blogs"},
         "memory.workers.tasks.docs.*": {"queue": "docs"},
         "memory.workers.tasks.maintenance.*": {"queue": "maintenance"},
     },


@@ -2,7 +2,8 @@
 Import sub-modules so Celery can register their @app.task decorators.
 """

-from memory.workers.tasks import docs, email, comic  # noqa
+from memory.workers.tasks import docs, email, comic, blogs  # noqa
+from memory.workers.tasks.blogs import SYNC_WEBPAGE
 from memory.workers.tasks.email import SYNC_ACCOUNT, SYNC_ALL_ACCOUNTS, PROCESS_EMAIL
 from memory.workers.tasks.maintenance import (
     CLEAN_ALL_COLLECTIONS,
@@ -15,6 +16,8 @@ __all__ = [
     "docs",
     "email",
     "comic",
+    "blogs",
+    "SYNC_WEBPAGE",
     "SYNC_ACCOUNT",
     "SYNC_ALL_ACCOUNTS",
     "PROCESS_EMAIL",


@@ -0,0 +1,176 @@
+import hashlib
+import logging
+from typing import Iterable, cast
+
+from memory.common import chunker, embedding, qdrant
+from memory.common.db.connection import make_session
+from memory.common.db.models import BlogPost
+from memory.common.parsers.blogs import parse_webpage
+from memory.workers.celery_app import app
+
+logger = logging.getLogger(__name__)
+
+SYNC_WEBPAGE = "memory.workers.tasks.blogs.sync_webpage"
+
+
+def create_blog_post_from_article(article, tags: Iterable[str] = []) -> BlogPost:
+    """Create a BlogPost model from parsed article data."""
+    return BlogPost(
+        url=article.url,
+        title=article.title,
+        published=article.published_date,
+        content=article.content,
+        sha256=hashlib.sha256(article.content.encode()).digest(),
+        modality="blog",
+        tags=tags,
+        mime_type="text/markdown",
+        size=len(article.content.encode("utf-8")),
+    )
+
+
+def embed_blog_post(blog_post: BlogPost) -> int:
+    """Embed blog post content and return count of successfully embedded chunks."""
+    try:
+        # Always embed the full content
+        _, chunks = embedding.embed(
+            "text/markdown",
+            cast(str, blog_post.content),
+            metadata=blog_post.as_payload(),
+            chunk_size=chunker.EMBEDDING_MAX_TOKENS,
+        )
+
+        # But also embed the content in chunks (unless it's really short)
+        if (
+            chunker.approx_token_count(cast(str, blog_post.content))
+            > chunker.DEFAULT_CHUNK_TOKENS * 2
+        ):
+            _, small_chunks = embedding.embed(
+                "text/markdown",
+                cast(str, blog_post.content),
+                metadata=blog_post.as_payload(),
+            )
+            chunks += small_chunks
+
+        if chunks:
+            blog_post.chunks = chunks
+            blog_post.embed_status = "QUEUED"  # type: ignore
+            return len(chunks)
+        else:
+            blog_post.embed_status = "FAILED"  # type: ignore
+            logger.warning(f"No chunks generated for blog post: {blog_post.title}")
+            return 0
+    except Exception as e:
+        blog_post.embed_status = "FAILED"  # type: ignore
+        logger.error(f"Failed to embed blog post {blog_post.title}: {e}")
+        return 0
+
+
+def push_to_qdrant(blog_post: BlogPost):
+    """Push embeddings to Qdrant for successfully embedded blog post."""
+    if cast(str, blog_post.embed_status) != "QUEUED" or not blog_post.chunks:
+        return
+
+    try:
+        vector_ids = [str(chunk.id) for chunk in blog_post.chunks]
+        vectors = [chunk.vector for chunk in blog_post.chunks]
+        payloads = [chunk.item_metadata for chunk in blog_post.chunks]
+
+        qdrant.upsert_vectors(
+            client=qdrant.get_qdrant_client(),
+            collection_name="blog",
+            ids=vector_ids,
+            vectors=vectors,
+            payloads=payloads,
+        )
+
+        blog_post.embed_status = "STORED"  # type: ignore
+        logger.info(f"Successfully stored embeddings for: {blog_post.title}")
+    except Exception as e:
+        blog_post.embed_status = "FAILED"  # type: ignore
+        logger.error(f"Failed to push embeddings to Qdrant: {e}")
+        raise
+
+
+@app.task(name=SYNC_WEBPAGE)
+def sync_webpage(url: str, tags: Iterable[str] = []) -> dict:
+    """
+    Synchronize a webpage from a URL.
+
+    Args:
+        url: URL of the webpage to parse and store
+        tags: Additional tags to apply to the content
+
+    Returns:
+        dict: Summary of what was processed
+    """
+    article = parse_webpage(url)
+
+    if not article.content:
+        logger.warning(f"Article content too short or empty: {url}")
+        return {
+            "url": url,
+            "title": article.title,
+            "status": "skipped_short_content",
+            "content_length": 0,
+        }
+
+    blog_post = create_blog_post_from_article(article, tags)
+
+    with make_session() as session:
+        existing_post = session.query(BlogPost).filter(BlogPost.url == url).first()
+        if existing_post:
+            logger.info(f"Blog post already exists: {existing_post.title}")
+            return {
+                "blog_post_id": existing_post.id,
+                "url": url,
+                "title": existing_post.title,
+                "status": "already_exists",
+                "chunks_count": len(existing_post.chunks),
+            }
+
+        existing_post = (
+            session.query(BlogPost).filter(BlogPost.sha256 == blog_post.sha256).first()
+        )
+        if existing_post:
+            logger.info(
+                f"Blog post with the same content already exists: {existing_post.title}"
+            )
+            return {
+                "blog_post_id": existing_post.id,
+                "url": url,
+                "title": existing_post.title,
+                "status": "already_exists",
+                "chunks_count": len(existing_post.chunks),
+            }
+
+        session.add(blog_post)
+        session.flush()
+
+        chunks_count = embed_blog_post(blog_post)
+        session.flush()
+
+        try:
+            push_to_qdrant(blog_post)
+            logger.info(
+                f"Successfully processed webpage: {blog_post.title} "
+                f"({chunks_count} chunks embedded)"
+            )
+        except Exception as e:
+            logger.error(f"Failed to push embeddings to Qdrant: {e}")
+            blog_post.embed_status = "FAILED"  # type: ignore
+
+        session.commit()
+
+        return {
+            "blog_post_id": blog_post.id,
+            "url": url,
+            "title": blog_post.title,
+            "author": article.author,
+            "published_date": article.published_date,
+            "status": "processed",
+            "chunks_count": chunks_count,
+            "content_length": len(article.content),
+            "embed_status": blog_post.embed_status,
+        }
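
A minimal sketch of how the new task might be driven, assuming a worker is consuming the "blogs" queue (the URL and tags are invented examples):

    from memory.workers.tasks.blogs import sync_webpage

    # Enqueue the page for ingestion; a worker on the "blogs" queue picks it up.
    sync_webpage.delay("https://example.com/some-post", tags=["blog"])

    # Or call it synchronously, e.g. when debugging locally.
    summary = sync_webpage("https://example.com/some-post", tags=["blog"])
    print(summary["status"], summary.get("chunks_count"))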