Add archives fetcher

This commit is contained in:
Daniel O'Connell 2025-05-27 01:24:57 +02:00
parent 27fbfcc548
commit 876fa87725
7 changed files with 1306 additions and 228 deletions

View File

@@ -0,0 +1,301 @@
from dataclasses import dataclass, field
import logging
import re
import time
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
from typing import Generator, cast

from bs4 import BeautifulSoup

from memory.common.parsers.blogs import is_substack
from memory.common.parsers.feeds import (
    DanluuParser,
    HTMLListParser,
    RiftersParser,
    FeedItem,
    FeedParser,
    SubstackAPIParser,
)
from memory.common.parsers.html import (
    fetch_html,
    extract_url,
    get_base_url,
)

logger = logging.getLogger(__name__)


@dataclass
class ArchiveFetcher:
    """Fetches complete backlogs from sites with pagination."""

    parser_class: type[FeedParser]
    start_url: str
    max_pages: int = 100
    delay_between_requests: float = 1.0
    parser_kwargs: dict = field(default_factory=dict)

    def make_parser(self, url: str) -> FeedParser:
        parser = self.parser_class(url=url)
        for key, value in self.parser_kwargs.items():
            setattr(parser, key, value)
        return parser

    def fetch_all_items(self) -> Generator[FeedItem, None, None]:
        """Fetch all items from all pages."""
        visited_urls = set()
        current_url = self.start_url
        page_count = 0
        total_items = 0

        while current_url and page_count < self.max_pages:
            if current_url in visited_urls:
                logger.warning(f"Already visited {current_url}, stopping")
                break

            logger.info(f"Fetching page {page_count + 1}: {current_url}")
            visited_urls.add(current_url)

            try:
                parser = self.make_parser(current_url)
                items = parser.parse_feed()
                if not items:
                    break

                prev_items = total_items
                for item in items:
                    total_items += 1
                    yield item

                if prev_items == total_items:
                    logger.warning(f"No new items found on page {page_count + 1}")
                    break

                current_url = self._find_next_page(parser, page_count)
                if not current_url:
                    logger.info("No more pages found")
                    break

                page_count += 1
                if self.delay_between_requests > 0:
                    time.sleep(self.delay_between_requests)
            except Exception as e:
                logger.error(f"Error processing {current_url}: {e}")
                break

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        return None


@dataclass
class LinkFetcher(ArchiveFetcher):
    per_page: int = 10

    def _find_next_page(self, parser: FeedParser, current_page: int = 0):
        next_page = current_page + 1
        parsed = urlparse(self.start_url)
        params = parse_qs(parsed.query)
        params["offset"] = [str(next_page * self.per_page)]
        params["limit"] = [str(self.per_page)]
        new_query = urlencode(params, doseq=True)
        return urlunparse(parsed._replace(query=new_query))


@dataclass
class HTMLArchiveFetcher(ArchiveFetcher):
    next_page_selectors: list[str] = field(
        default_factory=lambda: [
            'a[rel="next"]',
            ".next a",
            "a.next",
            ".pagination .next",
            ".pager .next",
            "nav.page a:last-of-type",
            ".navigation a:last-of-type",
        ]
    )

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        if not parser.content:
            return None
        soup = BeautifulSoup(parser.content, "html.parser")
        selectors = ",".join(self.next_page_selectors)
        return extract_url(soup, selectors, parser.url)


def html_parser(**kwargs) -> type[HTMLListParser]:
    class ConfiguredHTMLListParser(HTMLListParser):
        def __init__(self, url: str):
            super().__init__(url)
            for key, value in kwargs.items():
                setattr(self, key, value)

    return ConfiguredHTMLListParser


@dataclass
class SubstackArchiveFetcher(LinkFetcher):
    def __post_init__(self):
        if "api/v1/archive" not in self.start_url:
            base_url = get_base_url(self.start_url)
            self.start_url = f"{base_url}/api/v1/archive"


@dataclass
class ACOUPArchiveFetcher(HTMLArchiveFetcher):
    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        if not parser.content:
            return None
        soup = BeautifulSoup(parser.content, "html.parser")
        urls = reversed([i.attrs.get("href") for i in soup.select(".widget_archive a")])
        urls = (cast(str, u) for u in urls if u)
        for url in urls:
            if url.rstrip("/") == parser.url.rstrip("/"):
                return next(urls, None)


@dataclass
class HTMLNextUrlArchiveFetcher(HTMLArchiveFetcher):
    next_url: str = ""

    def __post_init__(self):
        if not self.next_url:
            self.next_url = self.start_url
        if not self.next_url.startswith("http") and not self.next_url.startswith("/"):
            self.next_url = f"{self.start_url}/{self.next_url}"

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        return f"{self.next_url}/{current_page + 1}"


FETCHER_REGISTRY = {
    r"https://putanumonit.com": (
        "https://putanumonit.com/full-archive",
        html_parser(
            item_selector="article p", title_selector="a strong", url_selector="a"
        ),
    ),
    r"https://danluu.com": DanluuParser,
    r"https://www.rifters.com": RiftersParser,
    r"https://rachelbythebay.com": html_parser(
        item_selector="div.post",
        url_selector="a",
    ),
    r"https://guzey.com": (
        "https://guzey.com/archive/",
        html_parser(item_selector="article li"),
    ),
    r"https://aphyr.com": html_parser(
        item_selector="article.post",
        title_selector="h1",
        url_selector="h1 a",
        description_selector=".body",
        date_selector=".meta time",
    ),
    r"https://www.applieddivinitystudies.com": html_parser(
        item_selector="article.article",
        title_selector="header.article-header h1",
        url_selector="header.article-header h1 a",
        description_selector=".article-entry",
        date_selector=".article-meta time",
    ),
    r"https://www.flyingmachinestudios.com": html_parser(
        item_selector="#main #articles li",
        title_selector="header .title",
        description_selector="p",
        date_selector="header .date",
        date_format="%d %B %Y",
    ),
    r"https://slimemoldtimemold.com": html_parser(
        item_selector="article .wp-block-list li", title_selector="a"
    ),
    r"https://www.paulgraham.com": (
        "https://www.paulgraham.com/articles.html",
        html_parser(item_selector="img + font"),
    ),
    r"https://slatestarcodex.com": (
        "https://slatestarcodex.com/archives/",
        html_parser(item_selector="#sya_container li"),
    ),
    r"https://mcfunley.com": (
        "https://mcfunley.com/writing",
        html_parser(item_selector="article", title_selector="h6"),
    ),
    r"https://www.bitsaboutmoney.com": HTMLArchiveFetcher(
        html_parser(
            item_selector="article",
            title_selector="h1",
            description_selector="p",
            date_selector="time",
        ),
        "https://www.bitsaboutmoney.com/archive/",
        next_page_selectors=["nav.pagination a.older-posts"],
    ),
    r"https://acoup.blog": ACOUPArchiveFetcher(
        html_parser(
            item_selector="article",
            title_selector="a",
            description_selector=".entry-content",
            date_selector=".published-on time",
        ),
        "https://acoup.blog/2019/05/",
    ),
    r"https://www.theredhandfiles.com": html_parser(
        item_selector="article", title_selector="h3", description_selector="h2"
    ),
}


def get_archive_fetcher(url: str) -> ArchiveFetcher | None:
    for pattern, fetcher in FETCHER_REGISTRY.items():
        if re.search(pattern, url.rstrip("/")):
            if isinstance(fetcher, ArchiveFetcher):
                return fetcher
            elif isinstance(fetcher, tuple):
                base_url, html_fetcher = fetcher
                return HTMLArchiveFetcher(html_fetcher, base_url)
            else:
                return HTMLArchiveFetcher(fetcher, url)

    html = fetch_html(url)
    soup = BeautifulSoup(html, "html.parser")
    if is_substack(soup):
        return SubstackArchiveFetcher(SubstackAPIParser, url)


feeds = [
    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
    "https://www.rifters.com/crawl/",
    "https://rachelbythebay.com/w/",
    "https://danluu.com/",
    "https://guzey.com",
    "https://aphyr.com/",
    "https://www.applieddivinitystudies.com/",
    "https://www.imightbewrong.org/",
    "https://www.kvetch.au/",
    "https://www.overcomingbias.com/",
    "https://samkriss.substack.com/",
    "https://www.richardhanania.com/",
    "https://skunkledger.substack.com/",
    "https://taipology.substack.com/",
    "https://putanumonit.com/",
    "https://www.flyingmachinestudios.com/",
    "https://www.theintrinsicperspective.com/",
    "https://www.strangeloopcanon.com/",
    "https://slimemoldtimemold.com/",
    "https://zeroinputagriculture.substack.com/",
    "https://nayafia.substack.com",
    "https://www.paulgraham.com/articles.html",
    "https://mcfunley.com/writing",
    "https://www.bitsaboutmoney.com/",
    "https://akarlin.com",
    "https://www.exurbe.com/",
    "https://acoup.blog/",
    "https://www.theredhandfiles.com/",
    "https://karlin.blog/",
    "https://slatestarcodex.com/",
]
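
A minimal usage sketch of the fetcher API defined above. The module path follows the imports in this file and the example URL is arbitrary; this is an illustration, not part of the commit.

# Usage sketch (assumptions: module importable as memory.common.parsers.archives, danluu.com as an example URL).
from memory.common.parsers.archives import get_archive_fetcher

fetcher = get_archive_fetcher("https://danluu.com/")
if fetcher is not None:
    # fetch_all_items() walks page after page, sleeping delay_between_requests between fetches.
    for item in fetcher.fetch_all_items():
        print(item.title, item.url)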

View File

@@ -13,6 +13,9 @@ from memory.common.parsers.html import (
    extract_title,
    extract_date,
    fetch_html,
+    is_wordpress,
+    is_substack,
+    is_bloomberg,
)
@@ -587,24 +590,13 @@ def get_parser_for_url(url: str, html: str) -> BaseHTMLParser:
        return parser_class(url)

    soup = BeautifulSoup(html, "html.parser")
-    body_select = "body"
-    # Check if this is an archived page
-    if contents := soup.select_one("#CONTENT .html"):
-        body_select = ".body"
-        soup = contents
-    if soup.select_one(f"{body_select} .wp-singular"):
+    if is_wordpress(soup):
        return WordPressParser(url)

-    if any(
-        "https://substackcdn.com" == a.attrs.get("href")  # type: ignore
-        for a in soup.find_all("link", {"rel": "preconnect"})
-        if hasattr(a, "attrs")  # type: ignore
-    ):
+    if is_substack(soup):
        return SubstackParser(url)

-    urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")]  # type: ignore
-    if any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u):  # type: ignore
+    if is_bloomberg(soup):
        return BloombergParser(url)

    return BaseHTMLParser(url)
@@ -628,11 +620,11 @@ def parse_webpage(url: str) -> Article:

feeds = [
    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
    "https://www.rifters.com/crawl/",
-    "https://rachelbythebay.com/w/atom.xml",
+    "https://rachelbythebay.com/w/",
    "https://danluu.com/",
-    "https://guzey.com/archive",
-    "https://aphyr.com/posts.atom",
-    "https://www.applieddivinitystudies.com/atom.xml",
+    "https://guzey.com",
+    "https://aphyr.com/",
+    "https://www.applieddivinitystudies.com/",
    "https://www.imightbewrong.org/",
    "https://www.kvetch.au/",
    "https://www.overcomingbias.com/",
@@ -649,9 +641,10 @@ feeds = [
    "https://nayafia.substack.com",
    "https://www.paulgraham.com/articles.html",
    "https://mcfunley.com/writing",
-    "https://www.bitsaboutmoney.com/archive/",
-    "https://akarlin.com/archive/",
+    "https://www.bitsaboutmoney.com/",
+    "https://akarlin.com",
    "https://www.exurbe.com/",
    "https://acoup.blog/",
    "https://www.theredhandfiles.com/",
+    "https://karlin.blog/",
]
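
A short sketch of how the refactored dispatch behaves after this change. The module path (memory.common.parsers.blogs) and the HTML snippet are assumptions; the function names come from the hunk above.

# Sketch only: module path and HTML snippet are assumptions, not from the commit.
from memory.common.parsers.blogs import get_parser_for_url

html = '<head><link rel="preconnect" href="https://substackcdn.com"></head>'
parser = get_parser_for_url("https://example.substack.com/p/post", html)
# is_substack() should match the preconnect link, so a SubstackParser is expected here.
print(type(parser).__name__)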

View File

@@ -1,5 +1,6 @@
from datetime import datetime
import logging
+import json
import re
from dataclasses import dataclass, field
from typing import Any, Generator, Sequence, cast
@@ -20,6 +21,20 @@ from memory.common.parsers.html import (

logger = logging.getLogger(__name__)

+ObjectPath = list[str | int]
+
+
+def select_in(data: Any, path: ObjectPath) -> Any:
+    if not path:
+        return data
+    key, *rest = path
+    try:
+        return select_in(data[key], rest)
+    except (KeyError, TypeError, IndexError):
+        return None
+
+
@dataclass
class FeedItem:
    """Represents a single item from a feed."""
@@ -62,7 +77,7 @@ class FeedParser:
        )

    def valid_item(self, item: FeedItem) -> bool:
-        return True
+        return bool(item.url)

    def parse_feed(self) -> Generator[FeedItem, None, None]:
        """Parse feed content and return list of feed items."""
@@ -100,6 +115,46 @@ class FeedParser:
        return {}


+class JSONParser(FeedParser):
+    title_path: ObjectPath = ["title"]
+    url_path: ObjectPath = ["url"]
+    description_path: ObjectPath = ["description"]
+    date_path: ObjectPath = ["date"]
+    author_path: ObjectPath = ["author"]
+    guid_path: ObjectPath = ["guid"]
+    metadata_path: ObjectPath = ["metadata"]
+
+    def fetch_items(self) -> Sequence[Any]:
+        if not self.content:
+            self.content = cast(str, fetch_html(self.url))
+        try:
+            return json.loads(self.content)
+        except json.JSONDecodeError as e:
+            logger.error(f"Error parsing JSON: {e}")
+            return []
+
+    def extract_title(self, entry: Any) -> str:
+        return select_in(entry, self.title_path)
+
+    def extract_url(self, entry: Any) -> str:
+        return select_in(entry, self.url_path)
+
+    def extract_description(self, entry: Any) -> str:
+        return select_in(entry, self.description_path)
+
+    def extract_date(self, entry: Any) -> datetime:
+        return select_in(entry, self.date_path)
+
+    def extract_author(self, entry: Any) -> str:
+        return select_in(entry, self.author_path)
+
+    def extract_guid(self, entry: Any) -> str:
+        return select_in(entry, self.guid_path)
+
+    def extract_metadata(self, entry: Any) -> dict[str, Any]:
+        return select_in(entry, self.metadata_path)
+
+
class RSSAtomParser(FeedParser):
    """Parser for RSS and Atom feeds using feedparser."""
@@ -237,8 +292,14 @@ class HTMLListParser(FeedParser):
        return extract_date(entry, self.date_selector, self.date_format)


+class SubstackAPIParser(JSONParser):
+    url_path = ["canonical_url"]
+    author_path = ["publishedBylines", 0, "name"]
+    date_path = ["post_date"]
+
+
class DanluuParser(HTMLListParser):
-    skip_patterns = [r"^https://danluu.com/#"]
+    skip_patterns = DEFAULT_SKIP_PATTERNS + [r"^https://danluu\.com/?#"]

    def valid_item(self, item: FeedItem) -> bool:
        return item.url.startswith(self.base_url)
@@ -351,6 +412,13 @@ class RedHandFilesParser(HTMLListParser):
        return ""


+class RiftersParser(HTMLListParser):
+    item_selector = "#content .post"
+    title_selector = "h2 a"
+    url_selector = "h2 a"
+    description_selector = ".entry-content"
+
+
class BloombergAuthorParser(HTMLListParser):
    item_selector = "section#author_page article"
    url_selector = "a[href]"
@@ -393,7 +461,7 @@ def is_rss_feed(content: str) -> bool:
    )


-def extract_url(element: Tag, base_url: str) -> str | None:
+def clean_url(element: Tag, base_url: str) -> str | None:
    if not (href := element.get("href")):
        return None
@@ -409,14 +477,14 @@ def find_feed_link(url: str, soup: BeautifulSoup) -> str | None:
    for link in links:
        if not isinstance(link, Tag):
            continue
-        if not (link_url := extract_url(link, url)):
+        if not (link_url := clean_url(link, url)):
            continue
        if link_url.rstrip("/") != url.rstrip("/"):
            return link_url
    return None


-PARSER_REGISTRY = {
+FEED_REGISTRY = {
    r"https://danluu.com": DanluuParser,
    r"https://guzey.com/archive": GuzeyParser,
    r"https://www.paulgraham.com/articles": PaulGrahamParser,
@@ -427,7 +495,7 @@ PARSER_REGISTRY = {

def get_feed_parser(url: str, check_from: datetime | None = None) -> FeedParser | None:
-    for pattern, parser_class in PARSER_REGISTRY.items():
+    for pattern, parser_class in FEED_REGISTRY.items():
        if re.search(pattern, url.rstrip("/")):
            return parser_class(url=url, since=check_from)
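
A quick illustration of select_in and the path-based extraction it enables. The entry dict is a made-up example; select_in and SubstackAPIParser are the additions shown above.

from memory.common.parsers.feeds import SubstackAPIParser, select_in

entry = {"publishedBylines": [{"name": "Author Name"}], "post_date": "2023-01-15"}
print(select_in(entry, ["publishedBylines", 0, "name"]))  # "Author Name"
print(select_in(entry, ["missing", "key"]))  # None: lookup errors are swallowed
parser = SubstackAPIParser(url="https://example.substack.com/api/v1/archive")
print(parser.extract_date(entry))  # "2023-01-15", via date_path = ["post_date"]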

View File

@@ -110,7 +110,14 @@ def extract_date(
    datetime_attr = element.get("datetime")
    if datetime_attr:
-        for format in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", date_format]:
+        for format in [
+            "%Y-%m-%dT%H:%M:%S.%fZ",
+            "%Y-%m-%dT%H:%M:%S%z",
+            "%Y-%m-%dT%H:%M:%S.%f",
+            "%Y-%m-%dT%H:%M:%S",
+            "%Y-%m-%d",
+            date_format,
+        ]:
            if date := parse_date(str(datetime_attr), format):
                return date
@@ -277,6 +284,47 @@ def extract_metadata(soup: BeautifulSoup) -> dict[str, Any]:
    return metadata


+def extract_url(soup: BeautifulSoup, selectors: str, base_url: str = "") -> str | None:
+    for selector in selectors.split(","):
+        next_link = soup.select_one(selector)
+        if not (next_link and isinstance(next_link, Tag)):
+            continue
+        if not (href := next_link.get("href")):
+            continue
+        return to_absolute_url(str(href), base_url)
+    return None
+
+
+def is_substack(soup: BeautifulSoup | Tag) -> bool:
+    return any(
+        "https://substackcdn.com" == a.attrs.get("href")  # type: ignore
+        for a in soup.find_all("link", {"rel": "preconnect"})
+        if hasattr(a, "attrs")  # type: ignore
+    )
+
+
+def is_wordpress(soup: BeautifulSoup | Tag) -> bool:
+    body_select = "body"
+    # Check if this is an archived page
+    if contents := soup.select_one("#CONTENT .html"):
+        body_select = "#CONTENT .html"
+        soup = contents
+    return bool(soup.select_one(f"{body_select} .wp-singular"))
+
+
+def is_bloomberg(soup: BeautifulSoup | Tag) -> bool:
+    body_select = "body"
+    # Check if this is an archived page
+    if contents := soup.select_one("#CONTENT .html"):
+        body_select = "#CONTENT .html"
+        soup = contents
+    urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")]  # type: ignore
+    return any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u)  # type: ignore
+
+
class BaseHTMLParser:
    """Base class for parsing HTML content from websites."""

View File

@@ -1,176 +1,512 @@
+from unittest.mock import patch
+from urllib.parse import urlparse, parse_qs

import pytest
-from unittest.mock import Mock, patch
-from bs4 import BeautifulSoup

from memory.common.parsers.archives import (
-    ArchiveParser,
-    WordPressArchiveParser,
-    SubstackArchiveParser,
-    get_archive_parser,
+    ArchiveFetcher,
+    LinkFetcher,
+    HTMLArchiveFetcher,
+    SubstackArchiveFetcher,
+    ACOUPArchiveFetcher,
+    HTMLNextUrlArchiveFetcher,
+    html_parser,
+    get_archive_fetcher,
+    FETCHER_REGISTRY,
+)
+from memory.common.parsers.feeds import (
+    FeedItem,
+    FeedParser,
+    HTMLListParser,
+    DanluuParser,
+    SubstackAPIParser,
)
class TestArchiveParser: class MockParser(FeedParser):
def test_init(self): def __init__(
parser = ArchiveParser(url="https://example.com") self, url: str, items: list[FeedItem] | None = None, content: str = ""
assert parser.url == "https://example.com" ):
assert parser._visited_urls == set() super().__init__(url)
assert parser._all_items == [] self.items = items or []
assert parser.max_pages == 100 self.content = content
assert parser.delay_between_requests == 1.0
def test_extract_items_from_page(self): def parse_feed(self):
html = """ return self.items
<div>
<li><a href="/post1">Post 1</a></li>
<li><a href="/post2">Post 2</a></li>
<li><a href="/post1">Post 1</a></li> <!-- Duplicate -->
</div>
"""
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
items = parser._extract_items_from_page(soup)
assert len(items) == 2 # Duplicates should be filtered out
def test_find_next_page_url_with_selector(self):
html = '<div><a class="next" href="/page/2">Next</a></div>'
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
parser.next_page_selector = ".next"
next_url = parser._find_next_page_url(soup, "https://example.com/page/1")
assert next_url == "https://example.com/page/2"
def test_find_next_page_url_heuristic(self):
html = '<div><a rel="next" href="/page/2">Next</a></div>'
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
next_url = parser._find_next_page_url(soup, "https://example.com/page/1")
assert next_url == "https://example.com/page/2"
def test_find_next_page_url_contains_text(self):
html = '<div><a href="/page/2">Next →</a></div>'
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
next_url = parser._find_next_page_heuristic(soup)
assert next_url == "https://example.com/page/2"
def test_find_next_numeric_page(self):
parser = ArchiveParser(url="https://example.com")
parser.page_url_pattern = "/page/{page}"
# Test with existing page number
next_url = parser._find_next_numeric_page("https://example.com/page/3")
assert next_url == "https://example.com/page/4"
# Test without page number (assume page 1)
next_url = parser._find_next_numeric_page("https://example.com/archive")
assert next_url == "https://example.com/archive/page/2"
@patch("memory.common.parsers.archives.fetch_html")
@patch("time.sleep")
def test_fetch_items_pagination(self, mock_sleep, mock_fetch):
# Mock HTML for two pages
page1_html = """
<div>
<li><a href="/post1">Post 1</a></li>
<li><a href="/post2">Post 2</a></li>
<a rel="next" href="/page/2">Next</a>
</div>
"""
page2_html = """
<div>
<li><a href="/post3">Post 3</a></li>
<li><a href="/post4">Post 4</a></li>
</div>
"""
mock_fetch.side_effect = [page1_html, page2_html]
parser = ArchiveParser(url="https://example.com/page/1")
parser.delay_between_requests = 0.1 # Speed up test
items = parser.fetch_items()
assert len(items) == 4
assert mock_fetch.call_count == 2
assert mock_sleep.call_count == 1 # One delay between requests
@patch("memory.common.parsers.archives.fetch_html")
def test_fetch_items_stops_at_max_pages(self, mock_fetch):
# Mock HTML that always has a next page
html_with_next = """
<div>
<li><a href="/post">Post</a></li>
<a rel="next" href="/page/999">Next</a>
</div>
"""
mock_fetch.return_value = html_with_next
parser = ArchiveParser(url="https://example.com/page/1")
parser.max_pages = 3
parser.delay_between_requests = 0 # No delay for test
items = parser.fetch_items()
assert mock_fetch.call_count == 3 # Should stop at max_pages
@patch("memory.common.parsers.archives.fetch_html")
def test_fetch_items_handles_duplicate_urls(self, mock_fetch):
# Mock HTML that creates a cycle
page1_html = """
<div>
<li><a href="/post1">Post 1</a></li>
<a rel="next" href="/page/2">Next</a>
</div>
"""
page2_html = """
<div>
<li><a href="/post2">Post 2</a></li>
<a rel="next" href="/page/1">Back to page 1</a>
</div>
"""
mock_fetch.side_effect = [page1_html, page2_html]
parser = ArchiveParser(url="https://example.com/page/1")
parser.delay_between_requests = 0
items = parser.fetch_items()
assert len(items) == 2
assert mock_fetch.call_count == 2 # Should stop when it hits visited URL
class TestWordPressArchiveParser: def test_archive_fetcher_make_parser():
def test_selectors(self): fetcher = ArchiveFetcher(
parser = WordPressArchiveParser(url="https://example.wordpress.com") parser_class=MockParser,
assert parser.item_selector == "article, .post" start_url="https://example.com",
assert parser.next_page_selector == '.nav-previous a, .next a, a[rel="next"]' parser_kwargs={"custom_attr": "value"},
assert parser.title_selector == ".entry-title a, h1 a, h2 a"
class TestSubstackArchiveParser:
def test_selectors(self):
parser = SubstackArchiveParser(url="https://example.substack.com")
assert parser.item_selector == ".post-preview, .post"
assert parser.next_page_selector == ".pagination .next"
class TestGetArchiveParser:
@pytest.mark.parametrize(
"url,expected_class",
[
("https://example.wordpress.com/archive", WordPressArchiveParser),
("https://example.substack.com/archive", SubstackArchiveParser),
("https://example.com/archive", ArchiveParser), # Default
],
) )
def test_get_archive_parser(self, url, expected_class):
parser = get_archive_parser(url) parser = fetcher.make_parser("https://example.com/page1")
assert isinstance(parser, expected_class)
assert parser.url == url assert isinstance(parser, MockParser)
assert parser.url == "https://example.com/page1"
assert getattr(parser, "custom_attr") == "value"
def test_archive_fetcher_find_next_page_base():
fetcher = ArchiveFetcher(MockParser, "https://example.com")
parser = MockParser("https://example.com")
assert fetcher._find_next_page(parser, 0) is None
@patch("memory.common.parsers.archives.time.sleep")
def test_archive_fetcher_fetch_all_items_single_page(mock_sleep):
items = [
FeedItem(title="Item 1", url="https://example.com/1"),
FeedItem(title="Item 2", url="https://example.com/2"),
]
fetcher = ArchiveFetcher(
parser_class=MockParser,
start_url="https://example.com",
delay_between_requests=0.5,
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_parser = MockParser("https://example.com", items)
mock_make_parser.return_value = mock_parser
result = list(fetcher.fetch_all_items())
assert result == items
mock_make_parser.assert_called_once_with("https://example.com")
mock_sleep.assert_not_called() # No delay for single page
@patch("memory.common.parsers.archives.time.sleep")
def test_archive_fetcher_fetch_all_items_multiple_pages(mock_sleep):
page1_items = [FeedItem(title="Item 1", url="https://example.com/1")]
page2_items = [FeedItem(title="Item 2", url="https://example.com/2")]
class TestFetcher(ArchiveFetcher):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.call_count = 0
def _find_next_page(self, parser, current_page=0):
self.call_count += 1
if self.call_count == 1:
return "https://example.com/page2"
return None
fetcher = TestFetcher(
parser_class=MockParser,
start_url="https://example.com",
delay_between_requests=0.1,
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.side_effect = [
MockParser("https://example.com", page1_items),
MockParser("https://example.com/page2", page2_items),
]
result = list(fetcher.fetch_all_items())
assert result == page1_items + page2_items
assert mock_make_parser.call_count == 2
mock_sleep.assert_called_once_with(0.1)
def test_archive_fetcher_fetch_all_items_max_pages():
class TestFetcher(ArchiveFetcher):
def _find_next_page(self, parser, current_page=0):
return f"https://example.com/page{current_page + 2}"
fetcher = TestFetcher(
parser_class=MockParser,
start_url="https://example.com",
max_pages=2,
delay_between_requests=0,
)
items = [FeedItem(title="Item", url="https://example.com/item")]
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.return_value = MockParser("https://example.com", items)
result = list(fetcher.fetch_all_items())
assert len(result) == 2 # 2 pages * 1 item per page
assert mock_make_parser.call_count == 2
def test_archive_fetcher_fetch_all_items_visited_url():
class TestFetcher(ArchiveFetcher):
def _find_next_page(self, parser, current_page=0):
return "https://example.com" # Return same URL to trigger visited check
fetcher = TestFetcher(MockParser, "https://example.com", delay_between_requests=0)
items = [FeedItem(title="Item", url="https://example.com/item")]
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.return_value = MockParser("https://example.com", items)
result = list(fetcher.fetch_all_items())
assert len(result) == 1 # Only first page processed
mock_make_parser.assert_called_once()
def test_archive_fetcher_fetch_all_items_no_items():
fetcher = ArchiveFetcher(
MockParser, "https://example.com", delay_between_requests=0
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.return_value = MockParser("https://example.com", [])
result = list(fetcher.fetch_all_items())
assert result == []
mock_make_parser.assert_called_once()
def test_archive_fetcher_fetch_all_items_exception():
fetcher = ArchiveFetcher(
MockParser, "https://example.com", delay_between_requests=0
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.side_effect = Exception("Network error")
result = list(fetcher.fetch_all_items())
assert result == []
@pytest.mark.parametrize(
"start_url, per_page, current_page, expected_params",
[
("https://example.com", 10, 0, {"offset": ["10"], "limit": ["10"]}),
(
"https://example.com?existing=value",
20,
1,
{"existing": ["value"], "offset": ["40"], "limit": ["20"]},
),
(
"https://example.com?offset=0&limit=5",
15,
2,
{"offset": ["45"], "limit": ["15"]},
),
],
)
def test_link_fetcher_find_next_page(
start_url, per_page, current_page, expected_params
):
fetcher = LinkFetcher(MockParser, start_url, per_page=per_page)
parser = MockParser(start_url)
next_url = fetcher._find_next_page(parser, current_page)
assert next_url is not None
parsed = urlparse(next_url)
params = parse_qs(parsed.query)
for key, value in expected_params.items():
assert params[key] == value
@pytest.mark.parametrize(
"html, selectors, expected_url",
[
(
'<a rel="next" href="/page2">Next</a>',
['a[rel="next"]'],
"https://example.com/page2",
),
(
'<div class="next"><a href="/page2">Next</a></div>',
[".next a"],
"https://example.com/page2",
),
(
'<a class="next" href="/page2">Next</a>',
["a.next"],
"https://example.com/page2",
),
(
'<div class="pagination"><span class="next"><a href="/page2">Next</a></span></div>',
[".pagination .next"],
None, # This won't match because it's looking for .pagination .next directly
),
(
'<div class="pagination next"><a href="/page2">Next</a></div>',
[".pagination.next"],
None, # This selector isn't in default list
),
(
'<nav class="page"><a href="/page1">1</a><a href="/page2">2</a></nav>',
["nav.page a:last-of-type"],
"https://example.com/page2",
),
("<div>No next link</div>", ['a[rel="next"]'], None),
],
)
def test_html_archive_fetcher_find_next_page(html, selectors, expected_url):
fetcher = HTMLArchiveFetcher(
MockParser, "https://example.com", next_page_selectors=selectors
)
parser = MockParser("https://example.com", content=html)
with patch("memory.common.parsers.archives.extract_url") as mock_extract:
mock_extract.return_value = expected_url
result = fetcher._find_next_page(parser)
if expected_url:
mock_extract.assert_called_once()
assert result == expected_url
else:
# extract_url might still be called but return None
assert result is None
def test_html_archive_fetcher_find_next_page_no_content():
fetcher = HTMLArchiveFetcher(MockParser, "https://example.com")
parser = MockParser("https://example.com", content="")
result = fetcher._find_next_page(parser)
assert result is None
def test_html_parser_factory():
CustomParser = html_parser(
item_selector="article", title_selector="h1", custom_attr="value"
)
parser = CustomParser("https://example.com")
assert isinstance(parser, HTMLListParser)
assert parser.item_selector == "article"
assert parser.title_selector == "h1"
assert getattr(parser, "custom_attr") == "value"
@pytest.mark.parametrize(
"start_url, expected_api_url",
[
("https://example.substack.com", "https://example.substack.com/api/v1/archive"),
(
"https://example.substack.com/posts",
"https://example.substack.com/api/v1/archive",
),
(
"https://example.substack.com/api/v1/archive",
"https://example.substack.com/api/v1/archive",
),
],
)
def test_substack_archive_fetcher_post_init(start_url, expected_api_url):
with patch("memory.common.parsers.archives.get_base_url") as mock_get_base:
mock_get_base.return_value = "https://example.substack.com"
fetcher = SubstackArchiveFetcher(SubstackAPIParser, start_url)
assert fetcher.start_url == expected_api_url
def test_acoup_archive_fetcher_find_next_page():
html = """
<div class="widget_archive">
<a href="https://acoup.blog/2019/04/">April 2019</a>
<a href="https://acoup.blog/2019/05/">May 2019</a>
<a href="https://acoup.blog/2019/06/">June 2019</a>
</div>
"""
fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
parser = MockParser("https://acoup.blog/2019/05/", content=html)
result = fetcher._find_next_page(parser)
assert result == "https://acoup.blog/2019/04/"
def test_acoup_archive_fetcher_find_next_page_no_match():
html = """
<div class="widget_archive">
<a href="https://acoup.blog/2019/04/">April 2019</a>
<a href="https://acoup.blog/2019/06/">June 2019</a>
</div>
"""
fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
parser = MockParser("https://acoup.blog/2019/05/", content=html)
result = fetcher._find_next_page(parser)
assert result is None
def test_acoup_archive_fetcher_find_next_page_no_content():
fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
parser = MockParser("https://acoup.blog/2019/05/", content="")
result = fetcher._find_next_page(parser)
assert result is None
@pytest.mark.parametrize(
"start_url, next_url, expected_next_url",
[
(
"https://example.com",
"",
"https://example.com",
), # Empty next_url defaults to start_url
(
"https://example.com",
"https://other.com/archive",
"https://other.com/archive", # Full URL is preserved
),
(
"https://example.com",
"/archive",
"/archive",
), # Absolute path is preserved
(
"https://example.com",
"archive",
"https://example.com/archive",
), # Relative path gets prepended
],
)
def test_html_next_url_archive_fetcher_post_init(
start_url, next_url, expected_next_url
):
fetcher = HTMLNextUrlArchiveFetcher(MockParser, start_url, next_url=next_url)
assert fetcher.next_url == expected_next_url
def test_html_next_url_archive_fetcher_find_next_page():
fetcher = HTMLNextUrlArchiveFetcher(
MockParser, "https://example.com", next_url="https://example.com/archive"
)
parser = MockParser("https://example.com")
result = fetcher._find_next_page(parser, 2)
assert result == "https://example.com/archive/3"
@pytest.mark.parametrize(
"url, expected_fetcher_type",
[
("https://danluu.com", HTMLArchiveFetcher),
("https://www.rifters.com", HTMLArchiveFetcher),
("https://putanumonit.com", HTMLArchiveFetcher),
("https://acoup.blog", ACOUPArchiveFetcher),
("https://unknown.com", None),
],
)
def test_get_archive_fetcher_registry_matches(url, expected_fetcher_type):
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Not substack</body></html>"
with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
mock_is_substack.return_value = False
fetcher = get_archive_fetcher(url)
if expected_fetcher_type:
assert isinstance(fetcher, expected_fetcher_type)
else:
assert fetcher is None
def test_get_archive_fetcher_tuple_registry():
url = "https://putanumonit.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Not substack</body></html>"
fetcher = get_archive_fetcher(url)
assert isinstance(fetcher, HTMLArchiveFetcher)
assert fetcher.start_url == "https://putanumonit.com/full-archive"
def test_get_archive_fetcher_direct_parser_registry():
url = "https://danluu.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Not substack</body></html>"
fetcher = get_archive_fetcher(url)
assert isinstance(fetcher, HTMLArchiveFetcher)
assert fetcher.parser_class == DanluuParser
assert fetcher.start_url == url
def test_get_archive_fetcher_substack():
url = "https://example.substack.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Substack content</body></html>"
with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
mock_is_substack.return_value = True
fetcher = get_archive_fetcher(url)
assert isinstance(fetcher, SubstackArchiveFetcher)
assert fetcher.parser_class == SubstackAPIParser
def test_get_archive_fetcher_no_match():
url = "https://unknown.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Regular website</body></html>"
with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
mock_is_substack.return_value = False
fetcher = get_archive_fetcher(url)
assert fetcher is None
def test_fetcher_registry_structure():
"""Test that FETCHER_REGISTRY has expected structure."""
assert isinstance(FETCHER_REGISTRY, dict)
for pattern, fetcher in FETCHER_REGISTRY.items():
assert isinstance(pattern, str)
assert (
isinstance(fetcher, type)
and issubclass(fetcher, FeedParser)
or isinstance(fetcher, tuple)
or isinstance(fetcher, ArchiveFetcher)
)
@pytest.mark.parametrize(
"pattern, test_url, should_match",
[
(r"https://danluu.com", "https://danluu.com", True),
(r"https://danluu.com", "https://danluu.com/", True),
(r"https://danluu.com", "https://other.com", False),
(r"https://www.rifters.com", "https://www.rifters.com/crawl", True),
(r"https://putanumonit.com", "https://putanumonit.com/archive", True),
],
)
def test_registry_pattern_matching(pattern, test_url, should_match):
import re
match = re.search(pattern, test_url.rstrip("/"))
assert bool(match) == should_match

View File

@@ -1,10 +1,10 @@
from datetime import datetime
from unittest.mock import MagicMock, patch
-from typing import Any, cast
+from typing import cast
+import json

import pytest
from bs4 import BeautifulSoup, Tag
-import requests

from memory.common.parsers.feeds import (
    FeedItem,
@@ -17,15 +17,160 @@ from memory.common.parsers.feeds import (
    NadiaXyzParser,
    RedHandFilesParser,
    BloombergAuthorParser,
+    JSONParser,
+    SubstackAPIParser,
+    select_in,
+    clean_url,
    is_rss_feed,
-    extract_url,
    find_feed_link,
    get_feed_parser,
-    DEFAULT_SKIP_PATTERNS,
-    PARSER_REGISTRY,
)
@pytest.mark.parametrize(
"data, path, expected",
[
# Basic dictionary access
({"key": "value"}, ["key"], "value"),
({"nested": {"key": "value"}}, ["nested", "key"], "value"),
# List access
(["a", "b", "c"], [1], "b"),
([{"key": "value"}], [0, "key"], "value"),
# Mixed access
(
{"items": [{"name": "first"}, {"name": "second"}]},
["items", 1, "name"],
"second",
),
# Empty path returns original data
({"key": "value"}, [], {"key": "value"}),
# Missing keys return None
({"key": "value"}, ["missing"], None),
({"nested": {}}, ["nested", "missing"], None),
# Index out of bounds returns None
(["a", "b"], [5], None),
# Type errors return None
("string", ["key"], None),
(123, [0], None),
(None, ["key"], None),
# Deep nesting
({"a": {"b": {"c": {"d": "deep"}}}}, ["a", "b", "c", "d"], "deep"),
],
)
def test_select_in(data, path, expected):
assert select_in(data, path) == expected
@patch("memory.common.parsers.feeds.fetch_html")
def test_json_parser_fetch_items_with_content(mock_fetch_html):
content = json.dumps(
[
{"title": "Article 1", "url": "https://example.com/1"},
{"title": "Article 2", "url": "https://example.com/2"},
]
)
parser = JSONParser(url="https://example.com/feed.json", content=content)
items = parser.fetch_items()
assert items == [
{"title": "Article 1", "url": "https://example.com/1"},
{"title": "Article 2", "url": "https://example.com/2"},
]
mock_fetch_html.assert_not_called()
@patch("memory.common.parsers.feeds.fetch_html")
def test_json_parser_fetch_items_without_content(mock_fetch_html):
content = json.dumps([{"title": "Article", "url": "https://example.com/1"}])
mock_fetch_html.return_value = content
parser = JSONParser(url="https://example.com/feed.json")
items = parser.fetch_items()
assert items == [{"title": "Article", "url": "https://example.com/1"}]
mock_fetch_html.assert_called_once_with("https://example.com/feed.json")
@patch("memory.common.parsers.feeds.fetch_html")
def test_json_parser_fetch_items_invalid_json(mock_fetch_html):
mock_fetch_html.return_value = "invalid json content"
parser = JSONParser(url="https://example.com/feed.json")
items = parser.fetch_items()
assert items == []
def test_json_parser_extract_methods():
parser = JSONParser(url="https://example.com")
entry = {
"title": "Test Title",
"url": "https://example.com/article",
"description": "Test description",
"date": "2023-01-15",
"author": "John Doe",
"guid": "unique-123",
"metadata": {"tags": ["tech", "news"]},
}
assert parser.extract_title(entry) == "Test Title"
assert parser.extract_url(entry) == "https://example.com/article"
assert parser.extract_description(entry) == "Test description"
assert parser.extract_date(entry) == "2023-01-15"
assert parser.extract_author(entry) == "John Doe"
assert parser.extract_guid(entry) == "unique-123"
assert parser.extract_metadata(entry) == {"tags": ["tech", "news"]}
def test_json_parser_custom_paths():
parser = JSONParser(url="https://example.com")
parser.title_path = ["content", "headline"]
parser.url_path = ["links", "canonical"]
parser.author_path = ["byline", "name"]
entry = {
"content": {"headline": "Custom Title"},
"links": {"canonical": "https://example.com/custom"},
"byline": {"name": "Jane Smith"},
}
assert parser.extract_title(entry) == "Custom Title"
assert parser.extract_url(entry) == "https://example.com/custom"
assert parser.extract_author(entry) == "Jane Smith"
def test_json_parser_missing_fields():
parser = JSONParser(url="https://example.com")
entry = {} # Empty entry
assert parser.extract_title(entry) is None
assert parser.extract_url(entry) is None
assert parser.extract_description(entry) is None
assert parser.extract_date(entry) is None
assert parser.extract_author(entry) is None
assert parser.extract_guid(entry) is None
assert parser.extract_metadata(entry) is None
def test_json_parser_nested_paths():
parser = JSONParser(url="https://example.com")
parser.title_path = ["article", "header", "title"]
parser.author_path = ["article", "byline", 0, "name"]
entry = {
"article": {
"header": {"title": "Nested Title"},
"byline": [{"name": "First Author"}, {"name": "Second Author"}],
}
}
assert parser.extract_title(entry) == "Nested Title"
assert parser.extract_author(entry) == "First Author"
def test_feed_parser_base_url():
    parser = FeedParser(url="https://example.com/path/to/feed")
    assert parser.base_url == "https://example.com"
@@ -582,7 +727,7 @@ def test_extract_url_function(html, expected):
    element = soup.find("a")
    assert element is not None

-    url = extract_url(cast(Tag, element), "https://example.com")
+    url = clean_url(cast(Tag, element), "https://example.com")
    assert url == expected
@@ -706,33 +851,30 @@ def test_get_feed_parser_with_check_from():
    assert parser.since == check_from


-def test_parser_registry_completeness():
-    """Ensure PARSER_REGISTRY contains expected parsers."""
-    expected_patterns = [
-        r"https://danluu.com",
-        r"https://guzey.com/archive",
-        r"https://www.paulgraham.com/articles",
-        r"https://nadia.xyz/posts",
-        r"https://www.theredhandfiles.com",
-        r"https://archive.ph/.*?/https://www.bloomberg.com/opinion/authors/",
-    ]
-
-    assert len(PARSER_REGISTRY) == len(expected_patterns)
-    for pattern in expected_patterns:
-        assert pattern in PARSER_REGISTRY
-
-
-def test_default_skip_patterns():
-    """Ensure DEFAULT_SKIP_PATTERNS contains expected patterns."""
-    expected_patterns = [
-        r"^#",
-        r"mailto:",
-        r"tel:",
-        r"javascript:",
-        r"\.pdf$",
-        r"\.jpg$",
-        r"\.png$",
-        r"\.gif$",
-    ]
-
-    assert DEFAULT_SKIP_PATTERNS == expected_patterns
+def test_substack_api_parser():
+    parser = SubstackAPIParser(url="https://example.substack.com/api/v1/posts")
+
+    entry = {
+        "title": "Substack Post",
+        "canonical_url": "https://example.substack.com/p/post-slug",
+        "publishedBylines": [{"name": "Author Name"}],
+        "post_date": "2023-01-15T10:30:00Z",
+    }
+
+    assert parser.extract_title(entry) == "Substack Post"
+    assert parser.extract_url(entry) == "https://example.substack.com/p/post-slug"
+    assert parser.extract_author(entry) == "Author Name"
+    assert parser.extract_date(entry) == "2023-01-15T10:30:00Z"
+
+
+def test_substack_api_parser_missing_bylines():
+    parser = SubstackAPIParser(url="https://example.substack.com/api/v1/posts")
+
+    entry = {
+        "title": "Post Without Author",
+        "canonical_url": "https://example.substack.com/p/post",
+        "publishedBylines": [],
+        "post_date": "2023-01-15T10:30:00Z",
+    }
+
+    assert parser.extract_author(entry) is None

View File

@@ -23,7 +23,11 @@ from memory.common.parsers.html import (
    extract_meta_by_pattern,
    extract_metadata,
    extract_title,
+    extract_url,
    get_base_url,
+    is_bloomberg,
+    is_substack,
+    is_wordpress,
    parse_date,
    process_image,
    process_images,
@@ -454,7 +458,7 @@ def test_process_images_empty():
        None, "https://example.com", pathlib.Path("/tmp")
    )
    assert result_content is None
-    assert result_images == []
+    assert result_images == {}


@patch("memory.common.parsers.html.process_image")
@@ -503,6 +507,191 @@ def test_process_images_no_filename(mock_process_image):
    assert not images

@pytest.mark.parametrize(
"html, selectors, base_url, expected",
[
# Basic URL extraction
(
'<a href="/next-page">Next</a>',
"a",
"https://example.com",
"https://example.com/next-page",
),
# Multiple selectors - should pick first matching
(
'<div><a href="/first">First</a><a href="/second">Second</a></div>',
"a",
"https://example.com",
"https://example.com/first",
),
# Multiple selectors with comma separation - span doesn't have href, so falls back to a
(
'<div><span class="next">Span</span><a href="/link">Link</a></div>',
".next, a",
"https://example.com",
"https://example.com/link",
),
# Absolute URL should remain unchanged
(
'<a href="https://other.com/page">External</a>',
"a",
"https://example.com",
"https://other.com/page",
),
# No href attribute
("<a>No href</a>", "a", "https://example.com", None),
# No matching element
("<p>No links</p>", "a", "https://example.com", None),
# Empty href
('<a href="">Empty</a>', "a", "https://example.com", None),
],
)
def test_extract_url(html, selectors, base_url, expected):
soup = BeautifulSoup(html, "html.parser")
assert extract_url(soup, selectors, base_url) == expected
@pytest.mark.parametrize(
"html, expected",
[
# Substack with preconnect link
(
"""
<head>
<link rel="preconnect" href="https://substackcdn.com">
</head>
""",
True,
),
# Multiple preconnect links, one is Substack
(
"""
<head>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://substackcdn.com">
</head>
""",
True,
),
# No Substack preconnect
(
"""
<head>
<link rel="preconnect" href="https://fonts.googleapis.com">
</head>
""",
False,
),
# No preconnect links at all
("<head></head>", False),
# Preconnect without href
('<head><link rel="preconnect"></head>', False),
# Different rel attribute
('<head><link rel="stylesheet" href="https://substackcdn.com"></head>', False),
],
)
def test_is_substack(html, expected):
soup = BeautifulSoup(html, "html.parser")
assert is_substack(soup) == expected
@pytest.mark.parametrize(
"html, expected",
[
# WordPress with wp-singular class on body should be False (looks for content inside body)
('<body class="wp-singular">Content</body>', False),
# WordPress with nested wp-singular
('<body><div class="wp-singular">Content</div></body>', True),
# Archived page with WordPress content
(
"""
<div id="CONTENT">
<div class="html">
<body class="wp-singular">Content</body>
</div>
</div>
""",
True,
),
# No WordPress indicators
('<body><div class="content">Regular content</div></body>', False),
# Empty body
("<body></body>", False),
# No body tag
("<div>No body</div>", False),
],
)
def test_is_wordpress(html, expected):
soup = BeautifulSoup(html, "html.parser")
assert is_wordpress(soup) == expected
@pytest.mark.parametrize(
"html, expected",
[
# Bloomberg with company link
(
"""
<body>
<a href="https://www.bloomberg.com/company/">Bloomberg</a>
</body>
""",
True,
),
# Bloomberg link among other links
(
"""
<body>
<a href="https://example.com">Example</a>
<a href="https://www.bloomberg.com/company/">Bloomberg</a>
<a href="https://other.com">Other</a>
</body>
""",
True,
),
# Archived page with Bloomberg content
(
"""
<div id="CONTENT">
<div class="html">
<body>
<a href="https://www.bloomberg.com/company/">Bloomberg</a>
</body>
</div>
</div>
""",
True,
),
# No Bloomberg links
(
"""
<body>
<a href="https://example.com">Example</a>
<a href="https://other.com">Other</a>
</body>
""",
False,
),
# Bloomberg link but not company page
(
"""
<body>
<a href="https://www.bloomberg.com/news/">Bloomberg News</a>
</body>
""",
False,
),
# No links at all
("<body><p>No links</p></body>", False),
# Links without href
("<body><a>No href</a></body>", False),
],
)
def test_is_bloomberg(html, expected):
soup = BeautifulSoup(html, "html.parser")
assert is_bloomberg(soup) == expected
class TestBaseHTMLParser:
    def test_init_with_base_url(self):
        parser = BaseHTMLParser("https://example.com/path")
@@ -584,7 +773,7 @@ class TestBaseHTMLParser:
    def test_parse_with_images(self, mock_process_images):
        # Mock the image processing to return test data
        mock_image = MagicMock(spec=PILImage.Image)
-        mock_process_images.return_value = (MagicMock(), [mock_image])
+        mock_process_images.return_value = (MagicMock(), {"test_image.jpg": mock_image})

        html = """
        <article>
@@ -600,5 +789,6 @@ class TestBaseHTMLParser:
        article = parser.parse(html, "https://example.com/article")

        assert len(article.images) == 1
-        assert article.images[0] == mock_image
+        assert "test_image.jpg" in article.images
+        assert article.images["test_image.jpg"] == mock_image
        mock_process_images.assert_called_once()