diff --git a/requirements-common.txt b/requirements-common.txt
index f7597b1..a275cd0 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,4 +6,7 @@ dotenv==0.9.9
 voyageai==0.3.2
 qdrant-client==1.9.0
 PyMuPDF==1.25.5
-ebooklib==0.18.0
\ No newline at end of file
+ebooklib==0.18.0
+beautifulsoup4==4.13.4
+markdownify==0.13.1
+pillow==10.4.0
\ No newline at end of file
diff --git a/src/memory/common/parsers/blogs.py b/src/memory/common/parsers/blogs.py
new file mode 100644
index 0000000..e2c4bf5
--- /dev/null
+++ b/src/memory/common/parsers/blogs.py
@@ -0,0 +1,664 @@
+import logging
+import re
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup, Tag
+from memory.common.parsers.html import (
+    BaseHTMLParser,
+    Article,
+    parse_date,
+    extract_title,
+    extract_date,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class SubstackParser(BaseHTMLParser):
+    """Parser specifically for Substack articles."""
+
+    article_selector = "article.post"
+    title_selector = "h1.post-title, h1"
+    author_selector = ".post-header .author-name, .byline-names"
+    date_selector = ".post-header"
+    date_format = "%b %d, %Y"
+    content_selector = ".available-content, .post-content"
+
+    remove_selectors = BaseHTMLParser.remove_selectors + [
+        ".subscribe-widget",
+        ".subscription-widget-wrap",
+        ".post-footer",
+        ".share-dialog",
+        ".comments-section",
+    ]
+
+
+class WordPressParser(BaseHTMLParser):
+    """Parser for WordPress blogs with common themes."""
+
+    article_selector = "article, .post, .hentry"
+    title_selector = ".entry-title, h1.post-title, h1"
+    author_selector = ".entry-meta .author, .by-author, .author-name, .by"
+    date_selector = ".entry-meta .entry-date, .post-date, time[datetime]"
+    date_format = "%b %d, %Y"
+    content_selector = ".entry-content, .post-content, .content"
+
+    remove_selectors = BaseHTMLParser.remove_selectors + [
+        ".sharedaddy",
+        ".jp-relatedposts",
+        ".post-navigation",
+        ".author-bio",
+    ]
+
+
+class MediumParser(BaseHTMLParser):
+    """Parser for Medium articles."""
+
+    article_selector = "article"
+    title_selector = "h1"
+    author_selector = "[data-testid='authorName']"
+    date_selector = "[data-testid='storyPublishDate']"
+    content_selector = "section"
+
+    remove_selectors = BaseHTMLParser.remove_selectors + [
+        "[data-testid='audioPlayButton']",
+        "[data-testid='headerClapButton']",
+        "[data-testid='responsesSection']",
+    ]
+
+
+class AcoupBlogParser(BaseHTMLParser):
+    """Parser for acoup.blog (A Collection of Unmitigated Pedantry)."""
+
+    article_selector = "article, .post, .entry"
+    title_selector = "h1.entry-title, h1"
+    author_selector = ".entry-meta .author, .byline"
+    date_selector = ".entry-meta .posted-on, .entry-date"
+    date_format = "%B %d, %Y"  # "May 23, 2025" format
+    content_selector = ".entry-content, .post-content"
+
+    remove_selectors = BaseHTMLParser.remove_selectors + [
+        ".entry-meta",
+        ".post-navigation",
+        ".related-posts",
+        ".social-sharing",
+        ".comments-area",
+    ]
+
+
+class GuzeyParser(BaseHTMLParser):
+    """Parser for guzey.com personal blog."""
+
+    article_selector = "main, .content, body"
+    title_selector = "h1.article-title"
+    author_selector = ".author, .byline"  # Fallback, likely will use metadata
+    date_selector = ".post-date time"
+    date_format = "%Y-%m-%d"  # Based on "2018-08-07" format seen
+    content_selector = "main, .post-content, .content"
+
+    remove_selectors = BaseHTMLParser.remove_selectors + [
+        ".header",
+        ".navigation",
+        ".sidebar",
+        ".footer",
+        ".date-info",  # Remove the
"created:/modified:" lines + "hr", # Remove horizontal rules that separate sections + ] + + +class AkarlinParser(BaseHTMLParser): + """Parser for akarlin.com (Anatoly Karlin's blog).""" + + article_selector = "article, .entry-content, main" + title_selector = "h1.entry-title, h1" + author_selector = ".entry-meta .author, .author-name" + date_selector = ".posted-on .published, .post-date" + date_format = "%B %d, %Y" # "December 31, 2023" format + content_selector = ".entry-content, .post-content, article" + author = "Anatoly Karlin" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".entry-meta", + ".post-navigation", + ".author-bio", + ".related-posts", + ".comments", + ".wp-block-group", # WordPress blocks + "header", + "footer", + ".site-header", + ".site-footer", + ] + + +class AphyrParser(BaseHTMLParser): + """Parser for aphyr.com (Kyle Kingsbury's blog).""" + + article_selector = "article, .post, main" + title_selector = "h1" + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%Y-%m-%d" # "2025-05-21" format + content_selector = ".content, .post-content, article" + author = "Kyle Kingsbury" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".comments", + ".comment-form", + "form", + ".post-navigation", + ".tags", + ".categories", + "header nav", + "footer", + ".copyright", + ] + + +class AppliedDivinityStudiesParser(BaseHTMLParser): + """Parser for applieddivinitystudies.com.""" + + article_selector = "article, .post, main, .content" + title_selector = "h1" + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%Y-%m-%d" # "2025-05-10" format + content_selector = ".content, .post-content, article, main" + author = "Applied Divinity Studies" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".site-header", + ".navigation", + ".footer", + ".site-footer", + ".subscribe", + ".about", + ".archives", + ".previous-post", + ".next-post", + ] + + +class BitsAboutMoneyParser(BaseHTMLParser): + """Parser for bitsaboutmoney.com (Patrick McKenzie's blog).""" + + article_selector = "article, .post, main" + title_selector = "h1" + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%b %d, %Y" + content_selector = ".content, .post-content, article" + author = "Patrick McKenzie (patio11)" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".site-header", + ".navigation", + ".footer", + ".site-footer", + ".newsletter-signup", + ".subscribe", + ".memberships", + ".author-bio", + ".next-post", + ".prev-post", + ] + + +class DanLuuParser(BaseHTMLParser): + """Parser for danluu.com (Dan Luu's technical blog).""" + + article_selector = "main, article, .content" + title_selector = "h1" + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%Y-%m-%d" + content_selector = "main, article, .content" + author = "Dan Luu" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".footer", + ".navigation", + ".site-nav", + ".archive-links", + ".patreon-links", + ".social-links", + ] + + +class McFunleyParser(BaseHTMLParser): + """Parser for mcfunley.com (Dan McKinley's blog).""" + + article_selector = "main, article, .content" + title_selector = "h4, h1" # Uses h4 for titles based on the content + author_selector = ".author, .byline" + date_selector = ".post-heading small, .date, time" + date_format = "%B %d, %Y" # "February 9th, 2017" format - will be handled by ordinal stripping + 
content_selector = "main, article, .content" + author = "Dan McKinley" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".footer", + ".navigation", + ".social-links", + ".copyright", + ] + + +class ExUrbeParser(BaseHTMLParser): + """Parser for exurbe.com (Ada Palmer's history blog).""" + + article_selector = "article, .post, main" + title_selector = "h1, h2.entry-title" + author_selector = ".author, .byline" + date_selector = ".post_date_time .published" + date_format = "%B %d, %Y" # "June 4, 2020" format + content_selector = ".entry-content, .post-content, article" + author = "Ada Palmer" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".widget", + ".sidebar", + ".navigation", + ".site-header", + ".site-footer", + ".entry-meta", + ".post-navigation", + ".related-posts", + ".comments-area", + ".search-form", + ".recommended-posts", + ".categories", + ".tags", + ] + + def _extract_date(self, soup: BeautifulSoup) -> str | None: + """Extract date, handling ordinal formats like 'Mar 5th, 2025'.""" + date = soup.select_one(".published") + if date: + return date.attrs.get("content") # type: ignore + return super()._extract_date(soup) + + +class FlyingMachineStudiosParser(BaseHTMLParser): + """Parser for flyingmachinestudios.com (Daniel Higginbotham's blog).""" + + article_selector = "article, .post, main" + title_selector = "h1" + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%d %B %Y" # "13 August 2019" format + content_selector = ".content, .post-content, article" + author = "Daniel Higginbotham" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".footer", + ".navigation", + ".sidebar", + ".popular-posts", + ".recent-posts", + ".projects", + ".comments", + ".social-sharing", + ] + + +class RiftersParser(BaseHTMLParser): + """Parser for rifters.com (Peter Watts' blog).""" + + article_selector = "article, .post, .entry" + title_selector = "h2.entry-title, h1" + author_selector = ".author, .byline" + date_selector = ".entry-date, .post-date" + date_format = "%d %B %Y" # "12 May 2025" format + content_selector = ".entry-content, .post-content" + author = "Peter Watts" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".sidebar", + ".widget", + ".navigation", + ".site-header", + ".site-footer", + ".entry-meta", + ".post-navigation", + ".comments", + ".related-posts", + ".categories", + ".tags", + ".rss-links", + ] + + def _extract_date(self, soup: BeautifulSoup) -> str | None: + """Extract date, handling ordinal formats like 'Mar 5th, 2025'.""" + date = soup.select_one(".entry-date") + if not date: + return None + date_str = date.text.replace("\n", " ").strip() + if date := parse_date(date_str, "%d %b %Y"): + return date.isoformat() + return None + + +class PaulGrahamParser(BaseHTMLParser): + """Parser for paulgraham.com (Paul Graham's essays).""" + + article_selector = "table, td, body" + title_selector = ( + "img[alt], h1, title" # PG essays often have titles in image alt text + ) + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%B %Y" # "March 2024" format + content_selector = "table td, body" + author = "Paul Graham" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + "img[src*='trans_1x1.gif']", # Remove spacer images + "img[src*='essays-']", # Remove header graphics + ".navigation", + ".header", + ".footer", + ] + + def _extract_title(self, soup: BeautifulSoup) -> str: + """Extract title from image alt text or other sources.""" + # 
Check for title in image alt attribute (common in PG essays) + img_with_alt = soup.find("img", alt=True) + if img_with_alt and isinstance(img_with_alt, Tag): + alt_text = img_with_alt.get("alt") + if alt_text: + return str(alt_text) + + # Fallback to standard title extraction + return extract_title(soup, self.title_selector) + + def _extract_date(self, soup: BeautifulSoup) -> str | None: + """Extract date from essay content.""" + # Look for date patterns in the text content (often at the beginning) + text_content = soup.get_text() + + # Look for patterns like "March 2024" at the start + date_match = re.search(r"\b([A-Z][a-z]+ \d{4})\b", text_content[:500]) + if date_match: + date_str = date_match.group(1) + if date := parse_date(date_str, self.date_format): + return date.isoformat() + + return extract_date(soup, self.date_selector, self.date_format) + + +class PutanumonitParser(BaseHTMLParser): + """Parser for putanumonit.com (Jacob Falkovich's rationality blog).""" + + article_selector = "article, .post, .entry" + title_selector = "h1.entry-title, h1" + author_selector = ".entry-meta .author, .author-name" + date_selector = ".entry-meta .entry-date, .posted-on" + date_format = "%B %d, %Y" # "August 19, 2023" format + content_selector = ".entry-content, .post-content" + author = "Jacob Falkovich" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".widget", + ".sidebar", + ".navigation", + ".site-header", + ".site-footer", + ".entry-meta", + ".post-navigation", + ".related-posts", + ".comments-area", + ".wp-block-group", + ".categories", + ".tags", + ".monthly-archives", + ".recent-posts", + ".recent-comments", + ".subscription-widget-wrap", + ".reblog-subscribe", + ] + + +class TheRedHandFilesParser(BaseHTMLParser): + """Parser for theredhandfiles.com (Nick Cave's Q&A website).""" + + article_selector = "article, .post, main" + title_selector = "h1" + author_selector = "" + date_selector = ".issue-date, .date" + date_format = "%B %Y" # "May 2025" format + content_selector = ".content, .post-content, main" + author = "Nick Cave" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".site-header", + ".navigation", + ".footer", + ".site-footer", + ".sidebar", + ".recent-posts", + ".subscription", + ".ask-question", + ".privacy-policy", + ] + + def _extract_date(self, soup: BeautifulSoup) -> str | None: + """Extract date from issue header.""" + # Look for issue date pattern like "Issue #325 / May 2025" + text_content = soup.get_text() + + # Look for patterns like "Issue #XXX / Month Year" + date_match = re.search(r"Issue #\d+ / ([A-Z][a-z]+ \d{4})", text_content) + if date_match: + date_str = date_match.group(1) + if date := parse_date(date_str, self.date_format): + return date.isoformat() + + # Fallback to parent method + return extract_date(soup, self.date_selector, self.date_format) + + +class RachelByTheBayParser(BaseHTMLParser): + """Parser for rachelbythebay.com technical blog.""" + + article_selector = "body, main, .content" + title_selector = "title, h1" + author_selector = ".author, .byline" + date_selector = ".date, time" + date_format = "%A, %B %d, %Y" + content_selector = "body, main, .content" + author = "Rachel Kroll" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".footer", + ".navigation", + ".sidebar", + ".comments", + ] + + def _extract_date(self, soup: BeautifulSoup) -> str | None: + """Extract date from URL structure if available.""" + # Try to get current URL from canonical link or other sources + canonical = 
soup.find("link", rel="canonical") + if canonical and isinstance(canonical, Tag): + href = canonical.get("href") + if href: + # Look for date pattern in URL like /2025/05/22/ + date_match = re.search(r"/(\d{4})/(\d{2})/(\d{2})/", str(href)) + if date_match: + year, month, day = date_match.groups() + date_str = f"{year}/{month}/{day}" + if date := parse_date(date_str, self.date_format): + return date.isoformat() + + # Fallback to parent method + return extract_date(soup, self.date_selector, self.date_format) + + +class NadiaXyzParser(BaseHTMLParser): + """Parser for nadia.xyz (Nadia Asparouhova's blog).""" + + article_selector = "main, article, body" + title_selector = "h1" + author_selector = ".author, .byline" + date_selector = ".post__date" + date_format = "%B %d, %Y" # "May 3, 2018" format + content_selector = "main, article, body" + author = "Nadia Asparouhova" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".header", + ".navigation", + ".footer", + ".sidebar", + ".menu", + ".nav", + "nav", + ] + + +class BloombergParser(BaseHTMLParser): + """Parser for bloomberg.com.""" + + article_selector = "main, article, body, #content" + title_selector = "h1, title" + author_selector = ".author, .byline, .post-author" + date_selector = ".date, .published, time" + content_selector = "main, article, body, #content" + + remove_selectors = BaseHTMLParser.remove_selectors + [ + ".archive-banner", + ".archive-header", + ".wayback-banner", + ".archive-notice", + "#wm-ipp", # Wayback machine banner + ".archive-toolbar", + ".archive-metadata", + ] + + def _extract_author(self, soup: BeautifulSoup) -> str | None: + if author := soup.find("a", attrs={"rel": "author"}): + return author.text.strip() + return super()._extract_author(soup) + + +PARSER_REGISTRY = { + r"\.substack\.com": SubstackParser, + r"substack\.com": SubstackParser, + r"medium\.com": MediumParser, + r"wordpress\.com": WordPressParser, + r"acoup\.blog": AcoupBlogParser, + r"guzey\.com": GuzeyParser, + r"akarlin\.com": AkarlinParser, + r"aphyr\.com": AphyrParser, + r"applieddivinitystudies\.com": AppliedDivinityStudiesParser, + r"bitsaboutmoney\.com": BitsAboutMoneyParser, + r"danluu\.com": DanLuuParser, + r"mcfunley\.com": McFunleyParser, + r"exurbe\.com": ExUrbeParser, + r"flyingmachinestudios\.com": FlyingMachineStudiosParser, + r"rifters\.com": RiftersParser, + r"paulgraham\.com": PaulGrahamParser, + r"putanumonit\.com": PutanumonitParser, + r"theredhandfiles\.com": TheRedHandFilesParser, + r"rachelbythebay\.com": RachelByTheBayParser, + r"nadia\.xyz": NadiaXyzParser, +} + + +def get_parser_for_url(url: str, html: str) -> BaseHTMLParser: + """Get the appropriate parser for a given URL.""" + domain = urlparse(url).netloc + + for pattern, parser_class in PARSER_REGISTRY.items(): + if re.search(pattern, domain): + return parser_class(url) + + soup = BeautifulSoup(html, "html.parser") + body_select = "body" + # Check if this is an archived page + if contents := soup.select_one("#CONTENT .html"): + body_select = ".body" + soup = contents + + if soup.select_one(f"{body_select} .wp-singular"): + return WordPressParser(url) + + if any( + "https://substackcdn.com" == a.attrs.get("href") # type: ignore + for a in soup.find_all("link", {"rel": "preconnect"}) + if hasattr(a, "attrs") # type: ignore + ): + return SubstackParser(url) + + urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")] # type: ignore + if any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u): # type: ignore + return 
BloombergParser(url)
+
+    return BaseHTMLParser(url)
+
+
+def parse_webpage(url: str) -> Article:
+    """
+    Parse a webpage and extract article content.
+
+    Args:
+        url: URL of the webpage to parse
+
+    Returns:
+        Article object with extracted content and metadata
+    """
+    response = requests.get(
+        url,
+        timeout=30,
+        headers={
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:137.0) Gecko/20100101 Firefox/137.0"
+        },
+    )
+    response.raise_for_status()
+
+    parser = get_parser_for_url(url, response.text)
+
+    return parser.parse(response.text, url)
+
+
+blogs = [
+    "https://acoup.blog/",
+    "https://guzey.com/",
+    "https://akarlin.com/",
+    "https://aphyr.com/",
+    "https://www.applieddivinitystudies.com/",
+    "https://www.bitsaboutmoney.com/",
+    "https://danluu.com/",
+    "https://mcfunley.com/",
+    "https://www.exurbe.com/",
+    "https://www.flyingmachinestudios.com/",
+    "https://www.imightbewrong.org/",
+    "https://www.kvetch.au/",
+    "https://www.overcomingbias.com/",
+    "https://www.rifters.com/crawl/",
+    "https://samkriss.substack.com/",
+    "https://www.paulgraham.com/articles.html",
+    "https://putanumonit.com/",
+    "https://www.richardhanania.com/",
+    "https://skunkledger.substack.com/",
+    "https://taipology.substack.com/",
+    "https://www.theintrinsicperspective.com/",
+    "https://www.strangeloopcanon.com/",
+    "https://slimemoldtimemold.com/",
+    "https://www.theredhandfiles.com/",
+    "https://rachelbythebay.com/w/",
+    "https://zeroinputagriculture.substack.com/",
+    "https://nadia.xyz/posts/",
+    "https://nayafia.substack.com",
+    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
+]
diff --git a/src/memory/common/parsers/html.py b/src/memory/common/parsers/html.py
new file mode 100644
index 0000000..174ea80
--- /dev/null
+++ b/src/memory/common/parsers/html.py
@@ -0,0 +1,362 @@
+from datetime import datetime
+import logging
+import re
+from dataclasses import dataclass, field
+import pathlib
+from typing import Any
+from urllib.parse import urljoin, urlparse
+import hashlib
+
+import requests
+from bs4 import BeautifulSoup, Tag
+from markdownify import markdownify as md
+from PIL import Image as PILImage
+
+from memory.common.settings import FILE_STORAGE_DIR, WEBPAGE_STORAGE_DIR
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Article:
+    """Structured representation of a web article."""
+
+    title: str
+    content: str  # Markdown content
+    author: str | None = None
+    published_date: str | None = None
+    url: str = ""
+    images: list[PILImage.Image] = field(default_factory=list)
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+def get_base_url(url: str) -> str:
+    """Extract base URL from full URL."""
+    parsed = urlparse(url)
+    return f"{parsed.scheme}://{parsed.netloc}"
+
+
+def to_absolute_url(url: str, base_url: str) -> str:
+    """Convert relative URL to absolute URL."""
+    parsed = urlparse(url)
+    if parsed.scheme:
+        return url
+    return urljoin(base_url, url)
+
+
+def remove_unwanted_elements(soup: BeautifulSoup, remove_selectors: list[str]) -> None:
+    """Remove unwanted elements from the soup."""
+    for selector in remove_selectors:
+        for element in soup.select(selector):
+            element.decompose()
+
+
+def extract_title(soup: BeautifulSoup, title_selector: str) -> str:
+    """Extract article title."""
+    for selector in title_selector.split(","):
+        element = soup.select_one(selector.strip())
+        if element and element.get_text(strip=True):
+            return element.get_text(strip=True)
+
+    # Fallback to page title
+    title_tag =
soup.find("title") + return title_tag.get_text(strip=True) if title_tag else "Untitled" + + +def extract_author(soup: BeautifulSoup, author_selector: str) -> str | None: + """Extract article author.""" + for selector in author_selector.split(","): + element = soup.select_one(selector.strip()) + if element: + text = element.get_text(strip=True) + # Clean up common author prefixes + text = re.sub(r"^(by|written by|author:)\s*", "", text, flags=re.IGNORECASE) + if text: + return text + return None + + +def parse_date(text: str, date_format: str = "%Y-%m-%d") -> datetime | None: + """Parse date from text.""" + try: + text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text) + return datetime.strptime(text, date_format) + except ValueError: + return None + + +def extract_date( + soup: BeautifulSoup, date_selector: str, date_format: str = "%Y-%m-%d" +) -> str | None: + """Extract publication date.""" + for selector in date_selector.split(","): + element = soup.select_one(selector.strip()) + if not element: + continue + + datetime_attr = element.get("datetime") + if datetime_attr: + date_str = str(datetime_attr) + if date := parse_date(date_str, date_format): + return date.isoformat() + return date_str + + for text in element.find_all(string=True): + if text and (date := parse_date(str(text).strip(), date_format)): + return date.isoformat() + + return None + + +def extract_content_element( + soup: BeautifulSoup, content_selector: str, article_selector: str +) -> Tag | None: + """Extract main content element.""" + # Try content selectors first + for selector in content_selector.split(","): + element = soup.select_one(selector.strip()) + if element: + return element + + # Fallback to article selector + for selector in article_selector.split(","): + element = soup.select_one(selector.strip()) + if element: + return element + + # Last resort - use body + return soup.body + + +def process_image(url: str, image_dir: pathlib.Path) -> PILImage.Image | None: + url_hash = hashlib.md5(url.encode()).hexdigest() + ext = pathlib.Path(urlparse(url).path).suffix or ".jpg" + filename = f"{url_hash}{ext}" + local_path = image_dir / filename + + # Download if not already cached + if not local_path.exists(): + response = requests.get(url, timeout=30) + response.raise_for_status() + local_path.write_bytes(response.content) + + try: + return PILImage.open(local_path) + except IOError as e: + logger.warning(f"Failed to open image as PIL Image {local_path}: {e}") + return None + + +def process_images( + content: Tag | None, base_url: str, image_dir: pathlib.Path +) -> tuple[Tag | None, list[PILImage.Image]]: + """ + Process all images in content: download them, update URLs, and return PIL Images. 
+ + Returns: + Tuple of (updated_content, list_of_pil_images) + """ + if not content: + return content, [] + + images = [] + + for img_tag in content.find_all("img"): + if not isinstance(img_tag, Tag): + continue + + src = img_tag.get("src", "") + if not src: + continue + + try: + url = to_absolute_url(str(src), base_url) + image = process_image(url, image_dir) + if not image: + continue + + if not image.filename: # type: ignore + continue + + path = pathlib.Path(image.filename) # type: ignore + img_tag["src"] = str(path.relative_to(FILE_STORAGE_DIR.resolve())) + images.append(image) + except Exception as e: + logger.warning(f"Failed to process image {src}: {e}") + continue + + return content, images + + +def convert_to_markdown(content: Tag | None, base_url: str) -> str: + """Convert HTML content to Markdown.""" + if not content: + return "" + + # Update relative URLs to absolute (except for images which were already processed) + for tag in content.find_all("a"): + # Ensure we have a Tag object + if not isinstance(tag, Tag): + continue + + href = tag.get("href") + if href: + tag["href"] = to_absolute_url(str(href), base_url) + + # Convert to markdown + markdown = md(str(content), heading_style="ATX", bullets="-") + + # Clean up excessive newlines + markdown = re.sub(r"\n{3,}", "\n\n", markdown) + + return markdown.strip() + + +def extract_meta_by_pattern( + soup: BeautifulSoup, selector: dict[str, Any], prefix: str = "" +) -> dict[str, str]: + """Extract metadata using CSS selector pattern.""" + metadata = {} + + for tag in soup.find_all("meta", **selector): + if not isinstance(tag, Tag): + continue + + # Determine the key attribute (property for OG, name for others) + key_attr = "property" if "property" in selector else "name" + key = tag.get(key_attr, "") + content = tag.get("content") + + if key and content: + # Remove prefix from key and add custom prefix + clean_key = str(key).replace(prefix.replace(":", ""), "").lstrip(":") + final_key = ( + f"{prefix.replace(':', '_')}{clean_key}" if prefix else clean_key + ) + metadata[final_key] = str(content) + + return metadata + + +def extract_metadata(soup: BeautifulSoup) -> dict[str, Any]: + """Extract additional metadata from the page.""" + metadata = {} + + # Open Graph metadata + og_meta = extract_meta_by_pattern( + soup, {"attrs": {"property": re.compile("^og:")}}, "og:" + ) + metadata.update(og_meta) + + # Twitter metadata + twitter_meta = extract_meta_by_pattern( + soup, {"attrs": {"name": re.compile("^twitter:")}}, "twitter:" + ) + metadata.update(twitter_meta) + + # Standard meta tags + standard_tags = ["description", "author", "keywords", "robots"] + for tag_name in standard_tags: + tag = soup.find("meta", attrs={"name": tag_name}) + if tag and isinstance(tag, Tag): + content = tag.get("content") + if content: + metadata[tag_name] = str(content) + + return metadata + + +class BaseHTMLParser: + """Base class for parsing HTML content from websites.""" + + # CSS selectors - override in subclasses + article_selector: str = "article, main, [role='main']" + title_selector: str = "h1, .title, .post-title" + author_selector: str = ".author, .by-line, .byline" + date_selector: str = "time, .date, .published" + date_format: str = "%Y-%m-%d" + content_selector: str = ".content, .post-content, .entry-content" + author: str | None = None + + # Tags to remove from content + remove_selectors: list[str] = [ + "script", + "style", + "nav", + "aside", + ".comments", + ".social-share", + ".related-posts", + ".advertisement", + ] + + def __init__(self, 
base_url: str | None = None): + self.base_url = base_url + self.image_dir = WEBPAGE_STORAGE_DIR / str(urlparse(base_url).netloc) + self.image_dir.mkdir(parents=True, exist_ok=True) + + def parse(self, html: str, url: str) -> Article: + """Parse HTML content and return structured article data.""" + soup = BeautifulSoup(html, "html.parser") + self.base_url = self.base_url or get_base_url(url) + + metadata = self._extract_metadata(soup) + title = self._extract_title(soup) + author = self.author or self._extract_author(soup) or metadata.get("author") + date = self._extract_date(soup) + + self._remove_unwanted_elements(soup) + content_element = self._extract_content_element(soup) + + updated_content, images = self._process_images(content_element, url) + content = self._convert_to_markdown(updated_content, url) + + return Article( + title=title, + content=content, + author=author, + published_date=date, + url=url, + images=images, + metadata=metadata, + ) + + def _get_base_url(self, url: str) -> str: + """Extract base URL from full URL.""" + return get_base_url(url) + + def _remove_unwanted_elements(self, soup: BeautifulSoup) -> None: + """Remove unwanted elements from the soup.""" + return remove_unwanted_elements(soup, self.remove_selectors) + + def _extract_title(self, soup: BeautifulSoup) -> str: + """Extract article title.""" + return extract_title(soup, self.title_selector) + + def _extract_author(self, soup: BeautifulSoup) -> str | None: + """Extract article author.""" + return extract_author(soup, self.author_selector) + + def _extract_date(self, soup: BeautifulSoup) -> str | None: + """Extract publication date.""" + return extract_date(soup, self.date_selector, self.date_format) + + def _extract_content_element(self, soup: BeautifulSoup) -> Tag | None: + """Extract main content element.""" + return extract_content_element( + soup, self.content_selector, self.article_selector + ) + + def _process_images( + self, content: Tag | None, base_url: str + ) -> tuple[Tag | None, list[PILImage.Image]]: + """Process all images: download, update URLs, return PIL Images.""" + return process_images(content, base_url, self.image_dir) + + def _convert_to_markdown(self, content: Tag | None, base_url: str) -> str: + """Convert HTML content to Markdown.""" + return convert_to_markdown(content, base_url) + + def _extract_metadata(self, soup: BeautifulSoup) -> dict[str, Any]: + """Extract additional metadata from the page.""" + return extract_metadata(soup) diff --git a/src/memory/common/settings.py b/src/memory/common/settings.py index 80481b5..48f7790 100644 --- a/src/memory/common/settings.py +++ b/src/memory/common/settings.py @@ -39,16 +39,24 @@ CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", f"db+{DB_URL}") # File storage settings FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files")) -FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True) CHUNK_STORAGE_DIR = pathlib.Path( os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks") ) -CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True) - COMIC_STORAGE_DIR = pathlib.Path( os.getenv("COMIC_STORAGE_DIR", FILE_STORAGE_DIR / "comics") ) -COMIC_STORAGE_DIR.mkdir(parents=True, exist_ok=True) +WEBPAGE_STORAGE_DIR = pathlib.Path( + os.getenv("WEBPAGE_STORAGE_DIR", FILE_STORAGE_DIR / "webpages") +) + +storage_dirs = [ + FILE_STORAGE_DIR, + CHUNK_STORAGE_DIR, + COMIC_STORAGE_DIR, + WEBPAGE_STORAGE_DIR, +] +for dir in storage_dirs: + dir.mkdir(parents=True, exist_ok=True) # Maximum attachment size to store directly in the 
database (10MB) MAX_INLINE_ATTACHMENT_SIZE = int( diff --git a/tests/memory/common/parsers/test_html.py b/tests/memory/common/parsers/test_html.py new file mode 100644 index 0000000..c0f5d76 --- /dev/null +++ b/tests/memory/common/parsers/test_html.py @@ -0,0 +1,606 @@ +import pathlib +import tempfile +from datetime import datetime +from typing import cast +from unittest.mock import MagicMock, patch +from urllib.parse import urlparse +import re +import hashlib + +import pytest +import requests +from bs4 import BeautifulSoup, Tag +from PIL import Image as PILImage + +from memory.common.parsers.html import ( + Article, + BaseHTMLParser, + convert_to_markdown, + extract_author, + extract_content_element, + extract_date, + extract_meta_by_pattern, + extract_metadata, + extract_title, + get_base_url, + parse_date, + process_image, + process_images, + remove_unwanted_elements, + to_absolute_url, +) + + +@pytest.mark.parametrize( + "url, expected", + [ + ("https://example.com/path", "https://example.com"), + ("http://test.org/page?param=1", "http://test.org"), + ("https://sub.domain.com:8080/", "https://sub.domain.com:8080"), + ("ftp://files.example.com/dir", "ftp://files.example.com"), + ], +) +def test_get_base_url(url, expected): + assert get_base_url(url) == expected + + +@pytest.mark.parametrize( + "url, base_url, expected", + [ + # Already absolute URLs should remain unchanged + ("https://example.com/page", "https://test.com", "https://example.com/page"), + ("http://other.com", "https://test.com", "http://other.com"), + # Relative URLs should be made absolute + ("/path", "https://example.com", "https://example.com/path"), + ("page.html", "https://example.com/dir/", "https://example.com/dir/page.html"), + ("../up", "https://example.com/dir/", "https://example.com/up"), + ("?query=1", "https://example.com/page", "https://example.com/page?query=1"), + ], +) +def test_to_absolute_url(url, base_url, expected): + assert to_absolute_url(url, base_url) == expected + + +def test_remove_unwanted_elements(): + html = """ +
Keep this
+ + +Keep this too
+No h1
", "h1", "Page Title"), + # Multiple h1s - should pick first + ("No title
", "h1", "Untitled"), + ], +) +def test_extract_title(html, selector, expected): + soup = BeautifulSoup(html, "html.parser") + assert extract_title(soup, selector) == expected + + +@pytest.mark.parametrize( + "html, selector, expected", + [ + # Basic author extraction + (" ", ".author", "John Doe"), + # Author with prefix + (" ", ".byline", "Jane Smith"), + # Multiple selectors + ("Bob
", ".author, .writer", "Bob"), + # Case insensitive prefix removal + (" ", ".author", "Alice"), + # No author found + ("No author here
", ".author", None), + # Empty author + ("", ".author", None), + # Author with whitespace + (" ", ".author", "Author Name"), + ], +) +def test_extract_author(html, selector, expected): + soup = BeautifulSoup(html, "html.parser") + assert extract_author(soup, selector) == expected + + +@pytest.mark.parametrize( + "text, date_format, expected", + [ + # Standard date + ("2023-01-15", "%Y-%m-%d", datetime(2023, 1, 15)), + # Different format + ("15/01/2023", "%d/%m/%Y", datetime(2023, 1, 15)), + # With ordinal suffixes + ("15th January 2023", "%d %B %Y", datetime(2023, 1, 15)), + ("1st March 2023", "%d %B %Y", datetime(2023, 3, 1)), + ("22nd December 2023", "%d %B %Y", datetime(2023, 12, 22)), + ("3rd April 2023", "%d %B %Y", datetime(2023, 4, 3)), + # Invalid date + ("invalid date", "%Y-%m-%d", None), + # Wrong format + ("2023-01-15", "%d/%m/%Y", None), + ], +) +def test_parse_date(text, date_format, expected): + assert parse_date(text, date_format) == expected + + +def test_extract_date(): + html = """ +Main content
+Article content
+This is a paragraph with bold text.
+Text content
+More text
+This is the main content of the article.
+It has multiple paragraphs.
+Custom content here.
+Content with image:
+
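For context, here is a minimal usage sketch of the module this diff adds. It assumes the memory.common.parsers package layout shown above; the post URL is a hypothetical example, not one from the diff.

from memory.common.parsers.blogs import parse_webpage

# parse_webpage fetches the page, picks a parser from PARSER_REGISTRY
# (falling back to BaseHTMLParser), and returns the Article dataclass
# defined in html.py.
article = parse_webpage("https://danluu.com/example-post/")  # hypothetical URL

print(article.title, article.author, article.published_date)
print(article.content[:200])                  # Markdown-converted body
print(len(article.images), "images cached")   # images downloaded by process_images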