Add archives fetcher

This commit is contained in:
Daniel O'Connell 2025-05-27 01:24:57 +02:00
parent 27fbfcc548
commit 876fa87725
7 changed files with 1306 additions and 228 deletions

View File

@@ -0,0 +1,301 @@
from dataclasses import dataclass, field
import logging
import re
import time
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
from typing import Generator, cast

from bs4 import BeautifulSoup

from memory.common.parsers.blogs import is_substack
from memory.common.parsers.feeds import (
    DanluuParser,
    HTMLListParser,
    RiftersParser,
    FeedItem,
    FeedParser,
    SubstackAPIParser,
)
from memory.common.parsers.html import (
    fetch_html,
    extract_url,
    get_base_url,
)

logger = logging.getLogger(__name__)


@dataclass
class ArchiveFetcher:
    """Fetches complete backlogs from sites with pagination."""

    parser_class: type[FeedParser]
    start_url: str
    max_pages: int = 100
    delay_between_requests: float = 1.0
    parser_kwargs: dict = field(default_factory=dict)

    def make_parser(self, url: str) -> FeedParser:
        parser = self.parser_class(url=url)
        for key, value in self.parser_kwargs.items():
            setattr(parser, key, value)
        return parser

    def fetch_all_items(self) -> Generator[FeedItem, None, None]:
        """Fetch all items from all pages."""
        visited_urls = set()
        current_url = self.start_url
        page_count = 0
        total_items = 0

        while current_url and page_count < self.max_pages:
            if current_url in visited_urls:
                logger.warning(f"Already visited {current_url}, stopping")
                break

            logger.info(f"Fetching page {page_count + 1}: {current_url}")
            visited_urls.add(current_url)

            try:
                parser = self.make_parser(current_url)
                items = parser.parse_feed()
                if not items:
                    break

                prev_items = total_items
                for item in items:
                    total_items += 1
                    yield item

                if prev_items == total_items:
                    logger.warning(f"No new items found on page {page_count + 1}")
                    break

                current_url = self._find_next_page(parser, page_count)
                if not current_url:
                    logger.info("No more pages found")
                    break

                page_count += 1
                if self.delay_between_requests > 0:
                    time.sleep(self.delay_between_requests)
            except Exception as e:
                logger.error(f"Error processing {current_url}: {e}")
                break

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        return None


@dataclass
class LinkFetcher(ArchiveFetcher):
    per_page: int = 10

    def _find_next_page(self, parser: FeedParser, current_page: int = 0):
        next_page = current_page + 1
        parsed = urlparse(self.start_url)
        params = parse_qs(parsed.query)
        params["offset"] = [str(next_page * self.per_page)]
        params["limit"] = [str(self.per_page)]
        new_query = urlencode(params, doseq=True)
        return urlunparse(parsed._replace(query=new_query))


@dataclass
class HTMLArchiveFetcher(ArchiveFetcher):
    next_page_selectors: list[str] = field(
        default_factory=lambda: [
            'a[rel="next"]',
            ".next a",
            "a.next",
            ".pagination .next",
            ".pager .next",
            "nav.page a:last-of-type",
            ".navigation a:last-of-type",
        ]
    )

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        if not parser.content:
            return None
        soup = BeautifulSoup(parser.content, "html.parser")
        selectors = ",".join(self.next_page_selectors)
        return extract_url(soup, selectors, parser.url)


def html_parser(**kwargs) -> type[HTMLListParser]:
    class ConfiguredHTMLListParser(HTMLListParser):
        def __init__(self, url: str):
            super().__init__(url)
            for key, value in kwargs.items():
                setattr(self, key, value)

    return ConfiguredHTMLListParser


@dataclass
class SubstackArchiveFetcher(LinkFetcher):
    def __post_init__(self):
        if "api/v1/archive" not in self.start_url:
            base_url = get_base_url(self.start_url)
            self.start_url = f"{base_url}/api/v1/archive"


@dataclass
class ACOUPArchiveFetcher(HTMLArchiveFetcher):
    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        if not parser.content:
            return None
        soup = BeautifulSoup(parser.content, "html.parser")
        urls = reversed([i.attrs.get("href") for i in soup.select(".widget_archive a")])
        urls = (cast(str, u) for u in urls if u)
        for url in urls:
            if url.rstrip("/") == parser.url.rstrip("/"):
                return next(urls, None)


@dataclass
class HTMLNextUrlArchiveFetcher(HTMLArchiveFetcher):
    next_url: str = ""

    def __post_init__(self):
        if not self.next_url:
            self.next_url = self.start_url
        if not self.next_url.startswith("http") and not self.next_url.startswith("/"):
            self.next_url = f"{self.start_url}/{self.next_url}"

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        return f"{self.next_url}/{current_page + 1}"


FETCHER_REGISTRY = {
    r"https://putanumonit.com": (
        "https://putanumonit.com/full-archive",
        html_parser(
            item_selector="article p", title_selector="a strong", url_selector="a"
        ),
    ),
    r"https://danluu.com": DanluuParser,
    r"https://www.rifters.com": RiftersParser,
    r"https://rachelbythebay.com": html_parser(
        item_selector="div.post",
        url_selector="a",
    ),
    r"https://guzey.com": (
        "https://guzey.com/archive/",
        html_parser(item_selector="article li"),
    ),
    r"https://aphyr.com": html_parser(
        item_selector="article.post",
        title_selector="h1",
        url_selector="h1 a",
        description_selector=".body",
        date_selector=".meta time",
    ),
    r"https://www.applieddivinitystudies.com": html_parser(
        item_selector="article.article",
        title_selector="header.article-header h1",
        url_selector="header.article-header h1 a",
        description_selector=".article-entry",
        date_selector=".article-meta time",
    ),
    r"https://www.flyingmachinestudios.com": html_parser(
        item_selector="#main #articles li",
        title_selector="header .title",
        description_selector="p",
        date_selector="header .date",
        date_format="%d %B %Y",
    ),
    r"https://slimemoldtimemold.com": html_parser(
        item_selector="article .wp-block-list li", title_selector="a"
    ),
    r"https://www.paulgraham.com": (
        "https://www.paulgraham.com/articles.html",
        html_parser(item_selector="img + font"),
    ),
    r"https://slatestarcodex.com": (
        "https://slatestarcodex.com/archives/",
        html_parser(item_selector="#sya_container li"),
    ),
    r"https://mcfunley.com": (
        "https://mcfunley.com/writing",
        html_parser(item_selector="article", title_selector="h6"),
    ),
    r"https://www.bitsaboutmoney.com": HTMLArchiveFetcher(
        html_parser(
            item_selector="article",
            title_selector="h1",
            description_selector="p",
            date_selector="time",
        ),
        "https://www.bitsaboutmoney.com/archive/",
        next_page_selectors=["nav.pagination a.older-posts"],
    ),
    r"https://acoup.blog": ACOUPArchiveFetcher(
        html_parser(
            item_selector="article",
            title_selector="a",
            description_selector=".entry-content",
            date_selector=".published-on time",
        ),
        "https://acoup.blog/2019/05/",
    ),
    r"https://www.theredhandfiles.com": html_parser(
        item_selector="article", title_selector="h3", description_selector="h2"
    ),
}


def get_archive_fetcher(url: str) -> ArchiveFetcher | None:
    for pattern, fetcher in FETCHER_REGISTRY.items():
        if re.search(pattern, url.rstrip("/")):
            if isinstance(fetcher, ArchiveFetcher):
                return fetcher
            elif isinstance(fetcher, tuple):
                base_url, html_fetcher = fetcher
                return HTMLArchiveFetcher(html_fetcher, base_url)
            else:
                return HTMLArchiveFetcher(fetcher, url)

    html = fetch_html(url)
    soup = BeautifulSoup(html, "html.parser")
    if is_substack(soup):
        return SubstackArchiveFetcher(SubstackAPIParser, url)


feeds = [
    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
    "https://www.rifters.com/crawl/",
    "https://rachelbythebay.com/w/",
    "https://danluu.com/",
    "https://guzey.com",
    "https://aphyr.com/",
    "https://www.applieddivinitystudies.com/",
    "https://www.imightbewrong.org/",
    "https://www.kvetch.au/",
    "https://www.overcomingbias.com/",
    "https://samkriss.substack.com/",
    "https://www.richardhanania.com/",
    "https://skunkledger.substack.com/",
    "https://taipology.substack.com/",
    "https://putanumonit.com/",
    "https://www.flyingmachinestudios.com/",
    "https://www.theintrinsicperspective.com/",
    "https://www.strangeloopcanon.com/",
    "https://slimemoldtimemold.com/",
    "https://zeroinputagriculture.substack.com/",
    "https://nayafia.substack.com",
    "https://www.paulgraham.com/articles.html",
    "https://mcfunley.com/writing",
    "https://www.bitsaboutmoney.com/",
    "https://akarlin.com",
    "https://www.exurbe.com/",
    "https://acoup.blog/",
    "https://www.theredhandfiles.com/",
    "https://karlin.blog/",
    "https://slatestarcodex.com/",
]
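
A minimal usage sketch of the fetcher API defined above. The module path follows the imports in this file and the example URL is arbitrary; this is an illustration, not part of the commit.

# Usage sketch (assumptions: module importable as memory.common.parsers.archives, danluu.com as an example URL).
from memory.common.parsers.archives import get_archive_fetcher

fetcher = get_archive_fetcher("https://danluu.com/")
if fetcher is not None:
    # fetch_all_items() walks page after page, sleeping delay_between_requests between fetches.
    for item in fetcher.fetch_all_items():
        print(item.title, item.url)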

View File

@@ -13,6 +13,9 @@ from memory.common.parsers.html import (
    extract_title,
    extract_date,
    fetch_html,
+    is_wordpress,
+    is_substack,
+    is_bloomberg,
)
@@ -587,24 +590,13 @@ def get_parser_for_url(url: str, html: str) -> BaseHTMLParser:
        return parser_class(url)

    soup = BeautifulSoup(html, "html.parser")
-    body_select = "body"
-    # Check if this is an archived page
-    if contents := soup.select_one("#CONTENT .html"):
-        body_select = ".body"
-        soup = contents
-    if soup.select_one(f"{body_select} .wp-singular"):
+    if is_wordpress(soup):
        return WordPressParser(url)

-    if any(
-        "https://substackcdn.com" == a.attrs.get("href")  # type: ignore
-        for a in soup.find_all("link", {"rel": "preconnect"})
-        if hasattr(a, "attrs")  # type: ignore
-    ):
+    if is_substack(soup):
        return SubstackParser(url)

-    urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")]  # type: ignore
-    if any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u):  # type: ignore
+    if is_bloomberg(soup):
        return BloombergParser(url)

    return BaseHTMLParser(url)
@@ -628,11 +620,11 @@ def parse_webpage(url: str) -> Article:

feeds = [
    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
    "https://www.rifters.com/crawl/",
-    "https://rachelbythebay.com/w/atom.xml",
+    "https://rachelbythebay.com/w/",
    "https://danluu.com/",
-    "https://guzey.com/archive",
-    "https://aphyr.com/posts.atom",
-    "https://www.applieddivinitystudies.com/atom.xml",
+    "https://guzey.com",
+    "https://aphyr.com/",
+    "https://www.applieddivinitystudies.com/",
    "https://www.imightbewrong.org/",
    "https://www.kvetch.au/",
    "https://www.overcomingbias.com/",
@@ -649,9 +641,10 @@ feeds = [
    "https://nayafia.substack.com",
    "https://www.paulgraham.com/articles.html",
    "https://mcfunley.com/writing",
-    "https://www.bitsaboutmoney.com/archive/",
-    "https://akarlin.com/archive/",
+    "https://www.bitsaboutmoney.com/",
+    "https://akarlin.com",
    "https://www.exurbe.com/",
    "https://acoup.blog/",
    "https://www.theredhandfiles.com/",
+    "https://karlin.blog/",
]
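
A short sketch of how the refactored dispatch behaves after this change. The module path (memory.common.parsers.blogs) and the HTML snippet are assumptions; the function names come from the hunk above.

# Sketch only: module path and HTML snippet are assumptions, not from the commit.
from memory.common.parsers.blogs import get_parser_for_url

html = '<head><link rel="preconnect" href="https://substackcdn.com"></head>'
parser = get_parser_for_url("https://example.substack.com/p/post", html)
# is_substack() should match the preconnect link, so a SubstackParser is expected here.
print(type(parser).__name__)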

View File

@@ -1,5 +1,6 @@
from datetime import datetime
import logging
+import json
import re
from dataclasses import dataclass, field
from typing import Any, Generator, Sequence, cast
@@ -20,6 +21,20 @@ from memory.common.parsers.html import (

logger = logging.getLogger(__name__)

+ObjectPath = list[str | int]
+
+
+def select_in(data: Any, path: ObjectPath) -> Any:
+    if not path:
+        return data
+    key, *rest = path
+    try:
+        return select_in(data[key], rest)
+    except (KeyError, TypeError, IndexError):
+        return None
+
+
@dataclass
class FeedItem:
    """Represents a single item from a feed."""
@@ -62,7 +77,7 @@ class FeedParser:
        )

    def valid_item(self, item: FeedItem) -> bool:
-        return True
+        return bool(item.url)

    def parse_feed(self) -> Generator[FeedItem, None, None]:
        """Parse feed content and return list of feed items."""
@@ -100,6 +115,46 @@ class FeedParser:
        return {}


+class JSONParser(FeedParser):
+    title_path: ObjectPath = ["title"]
+    url_path: ObjectPath = ["url"]
+    description_path: ObjectPath = ["description"]
+    date_path: ObjectPath = ["date"]
+    author_path: ObjectPath = ["author"]
+    guid_path: ObjectPath = ["guid"]
+    metadata_path: ObjectPath = ["metadata"]
+
+    def fetch_items(self) -> Sequence[Any]:
+        if not self.content:
+            self.content = cast(str, fetch_html(self.url))
+        try:
+            return json.loads(self.content)
+        except json.JSONDecodeError as e:
+            logger.error(f"Error parsing JSON: {e}")
+            return []
+
+    def extract_title(self, entry: Any) -> str:
+        return select_in(entry, self.title_path)
+
+    def extract_url(self, entry: Any) -> str:
+        return select_in(entry, self.url_path)
+
+    def extract_description(self, entry: Any) -> str:
+        return select_in(entry, self.description_path)
+
+    def extract_date(self, entry: Any) -> datetime:
+        return select_in(entry, self.date_path)
+
+    def extract_author(self, entry: Any) -> str:
+        return select_in(entry, self.author_path)
+
+    def extract_guid(self, entry: Any) -> str:
+        return select_in(entry, self.guid_path)
+
+    def extract_metadata(self, entry: Any) -> dict[str, Any]:
+        return select_in(entry, self.metadata_path)
+
+
class RSSAtomParser(FeedParser):
    """Parser for RSS and Atom feeds using feedparser."""
@@ -237,8 +292,14 @@ class HTMLListParser(FeedParser):
        return extract_date(entry, self.date_selector, self.date_format)


+class SubstackAPIParser(JSONParser):
+    url_path = ["canonical_url"]
+    author_path = ["publishedBylines", 0, "name"]
+    date_path = ["post_date"]
+
+
class DanluuParser(HTMLListParser):
-    skip_patterns = [r"^https://danluu.com/#"]
+    skip_patterns = DEFAULT_SKIP_PATTERNS + [r"^https://danluu\.com/?#"]

    def valid_item(self, item: FeedItem) -> bool:
        return item.url.startswith(self.base_url)
@@ -351,6 +412,13 @@ class RedHandFilesParser(HTMLListParser):
        return ""


+class RiftersParser(HTMLListParser):
+    item_selector = "#content .post"
+    title_selector = "h2 a"
+    url_selector = "h2 a"
+    description_selector = ".entry-content"
+
+
class BloombergAuthorParser(HTMLListParser):
    item_selector = "section#author_page article"
    url_selector = "a[href]"
@@ -393,7 +461,7 @@ def is_rss_feed(content: str) -> bool:
    )


-def extract_url(element: Tag, base_url: str) -> str | None:
+def clean_url(element: Tag, base_url: str) -> str | None:
    if not (href := element.get("href")):
        return None
@@ -409,14 +477,14 @@ def find_feed_link(url: str, soup: BeautifulSoup) -> str | None:
    for link in links:
        if not isinstance(link, Tag):
            continue
-        if not (link_url := extract_url(link, url)):
+        if not (link_url := clean_url(link, url)):
            continue
        if link_url.rstrip("/") != url.rstrip("/"):
            return link_url
    return None


-PARSER_REGISTRY = {
+FEED_REGISTRY = {
    r"https://danluu.com": DanluuParser,
    r"https://guzey.com/archive": GuzeyParser,
    r"https://www.paulgraham.com/articles": PaulGrahamParser,
@@ -427,7 +495,7 @@ PARSER_REGISTRY = {

def get_feed_parser(url: str, check_from: datetime | None = None) -> FeedParser | None:
-    for pattern, parser_class in PARSER_REGISTRY.items():
+    for pattern, parser_class in FEED_REGISTRY.items():
        if re.search(pattern, url.rstrip("/")):
            return parser_class(url=url, since=check_from)
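
A quick illustration of select_in and the path-based extraction it enables. The entry dict is a made-up example; select_in and SubstackAPIParser are the additions shown above.

from memory.common.parsers.feeds import SubstackAPIParser, select_in

entry = {"publishedBylines": [{"name": "Author Name"}], "post_date": "2023-01-15"}
print(select_in(entry, ["publishedBylines", 0, "name"]))  # "Author Name"
print(select_in(entry, ["missing", "key"]))  # None: lookup errors are swallowed
parser = SubstackAPIParser(url="https://example.substack.com/api/v1/archive")
print(parser.extract_date(entry))  # "2023-01-15", via date_path = ["post_date"]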

View File

@@ -110,7 +110,14 @@ def extract_date(
    datetime_attr = element.get("datetime")
    if datetime_attr:
-        for format in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", date_format]:
+        for format in [
+            "%Y-%m-%dT%H:%M:%S.%fZ",
+            "%Y-%m-%dT%H:%M:%S%z",
+            "%Y-%m-%dT%H:%M:%S.%f",
+            "%Y-%m-%dT%H:%M:%S",
+            "%Y-%m-%d",
+            date_format,
+        ]:
            if date := parse_date(str(datetime_attr), format):
                return date
@@ -277,6 +284,47 @@ def extract_metadata(soup: BeautifulSoup) -> dict[str, Any]:
    return metadata


+def extract_url(soup: BeautifulSoup, selectors: str, base_url: str = "") -> str | None:
+    for selector in selectors.split(","):
+        next_link = soup.select_one(selector)
+        if not (next_link and isinstance(next_link, Tag)):
+            continue
+        if not (href := next_link.get("href")):
+            continue
+        return to_absolute_url(str(href), base_url)
+    return None
+
+
+def is_substack(soup: BeautifulSoup | Tag) -> bool:
+    return any(
+        "https://substackcdn.com" == a.attrs.get("href")  # type: ignore
+        for a in soup.find_all("link", {"rel": "preconnect"})
+        if hasattr(a, "attrs")  # type: ignore
+    )
+
+
+def is_wordpress(soup: BeautifulSoup | Tag) -> bool:
+    body_select = "body"
+    # Check if this is an archived page
+    if contents := soup.select_one("#CONTENT .html"):
+        body_select = "#CONTENT .html"
+        soup = contents
+    return bool(soup.select_one(f"{body_select} .wp-singular"))
+
+
+def is_bloomberg(soup: BeautifulSoup | Tag) -> bool:
+    body_select = "body"
+    # Check if this is an archived page
+    if contents := soup.select_one("#CONTENT .html"):
+        body_select = "#CONTENT .html"
+        soup = contents
+    urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")]  # type: ignore
+    return any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u)  # type: ignore
+
+
class BaseHTMLParser:
    """Base class for parsing HTML content from websites."""

View File

@@ -1,176 +1,512 @@
+from unittest.mock import patch
+from urllib.parse import urlparse, parse_qs

import pytest
-from unittest.mock import Mock, patch
-from bs4 import BeautifulSoup

from memory.common.parsers.archives import (
-    ArchiveParser,
-    WordPressArchiveParser,
-    SubstackArchiveParser,
-    get_archive_parser,
+    ArchiveFetcher,
+    LinkFetcher,
+    HTMLArchiveFetcher,
+    SubstackArchiveFetcher,
+    ACOUPArchiveFetcher,
+    HTMLNextUrlArchiveFetcher,
+    html_parser,
+    get_archive_fetcher,
+    FETCHER_REGISTRY,
+)
+from memory.common.parsers.feeds import (
+    FeedItem,
+    FeedParser,
+    HTMLListParser,
+    DanluuParser,
+    SubstackAPIParser,
)
class TestArchiveParser: class MockParser(FeedParser):
def test_init(self): def __init__(
parser = ArchiveParser(url="https://example.com") self, url: str, items: list[FeedItem] | None = None, content: str = ""
assert parser.url == "https://example.com" ):
assert parser._visited_urls == set() super().__init__(url)
assert parser._all_items == [] self.items = items or []
assert parser.max_pages == 100 self.content = content
assert parser.delay_between_requests == 1.0
def test_extract_items_from_page(self): def parse_feed(self):
html = """ return self.items
<div>
<li><a href="/post1">Post 1</a></li>
<li><a href="/post2">Post 2</a></li>
<li><a href="/post1">Post 1</a></li> <!-- Duplicate -->
</div>
"""
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
items = parser._extract_items_from_page(soup)
assert len(items) == 2 # Duplicates should be filtered out
def test_find_next_page_url_with_selector(self):
html = '<div><a class="next" href="/page/2">Next</a></div>'
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
parser.next_page_selector = ".next"
next_url = parser._find_next_page_url(soup, "https://example.com/page/1")
assert next_url == "https://example.com/page/2"
def test_find_next_page_url_heuristic(self):
html = '<div><a rel="next" href="/page/2">Next</a></div>'
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
next_url = parser._find_next_page_url(soup, "https://example.com/page/1")
assert next_url == "https://example.com/page/2"
def test_find_next_page_url_contains_text(self):
html = '<div><a href="/page/2">Next →</a></div>'
soup = BeautifulSoup(html, "html.parser")
parser = ArchiveParser(url="https://example.com")
next_url = parser._find_next_page_heuristic(soup)
assert next_url == "https://example.com/page/2"
def test_find_next_numeric_page(self):
parser = ArchiveParser(url="https://example.com")
parser.page_url_pattern = "/page/{page}"
# Test with existing page number
next_url = parser._find_next_numeric_page("https://example.com/page/3")
assert next_url == "https://example.com/page/4"
# Test without page number (assume page 1)
next_url = parser._find_next_numeric_page("https://example.com/archive")
assert next_url == "https://example.com/archive/page/2"
@patch("memory.common.parsers.archives.fetch_html")
@patch("time.sleep")
def test_fetch_items_pagination(self, mock_sleep, mock_fetch):
# Mock HTML for two pages
page1_html = """
<div>
<li><a href="/post1">Post 1</a></li>
<li><a href="/post2">Post 2</a></li>
<a rel="next" href="/page/2">Next</a>
</div>
"""
page2_html = """
<div>
<li><a href="/post3">Post 3</a></li>
<li><a href="/post4">Post 4</a></li>
</div>
"""
mock_fetch.side_effect = [page1_html, page2_html]
parser = ArchiveParser(url="https://example.com/page/1")
parser.delay_between_requests = 0.1 # Speed up test
items = parser.fetch_items()
assert len(items) == 4
assert mock_fetch.call_count == 2
assert mock_sleep.call_count == 1 # One delay between requests
@patch("memory.common.parsers.archives.fetch_html")
def test_fetch_items_stops_at_max_pages(self, mock_fetch):
# Mock HTML that always has a next page
html_with_next = """
<div>
<li><a href="/post">Post</a></li>
<a rel="next" href="/page/999">Next</a>
</div>
"""
mock_fetch.return_value = html_with_next
parser = ArchiveParser(url="https://example.com/page/1")
parser.max_pages = 3
parser.delay_between_requests = 0 # No delay for test
items = parser.fetch_items()
assert mock_fetch.call_count == 3 # Should stop at max_pages
@patch("memory.common.parsers.archives.fetch_html")
def test_fetch_items_handles_duplicate_urls(self, mock_fetch):
# Mock HTML that creates a cycle
page1_html = """
<div>
<li><a href="/post1">Post 1</a></li>
<a rel="next" href="/page/2">Next</a>
</div>
"""
page2_html = """
<div>
<li><a href="/post2">Post 2</a></li>
<a rel="next" href="/page/1">Back to page 1</a>
</div>
"""
mock_fetch.side_effect = [page1_html, page2_html]
parser = ArchiveParser(url="https://example.com/page/1")
parser.delay_between_requests = 0
items = parser.fetch_items()
assert len(items) == 2
assert mock_fetch.call_count == 2 # Should stop when it hits visited URL
class TestWordPressArchiveParser: def test_archive_fetcher_make_parser():
def test_selectors(self): fetcher = ArchiveFetcher(
parser = WordPressArchiveParser(url="https://example.wordpress.com") parser_class=MockParser,
assert parser.item_selector == "article, .post" start_url="https://example.com",
assert parser.next_page_selector == '.nav-previous a, .next a, a[rel="next"]' parser_kwargs={"custom_attr": "value"},
assert parser.title_selector == ".entry-title a, h1 a, h2 a"
class TestSubstackArchiveParser:
def test_selectors(self):
parser = SubstackArchiveParser(url="https://example.substack.com")
assert parser.item_selector == ".post-preview, .post"
assert parser.next_page_selector == ".pagination .next"
class TestGetArchiveParser:
@pytest.mark.parametrize(
"url,expected_class",
[
("https://example.wordpress.com/archive", WordPressArchiveParser),
("https://example.substack.com/archive", SubstackArchiveParser),
("https://example.com/archive", ArchiveParser), # Default
],
) )
def test_get_archive_parser(self, url, expected_class):
parser = get_archive_parser(url) parser = fetcher.make_parser("https://example.com/page1")
assert isinstance(parser, expected_class)
assert parser.url == url assert isinstance(parser, MockParser)
assert parser.url == "https://example.com/page1"
assert getattr(parser, "custom_attr") == "value"
def test_archive_fetcher_find_next_page_base():
fetcher = ArchiveFetcher(MockParser, "https://example.com")
parser = MockParser("https://example.com")
assert fetcher._find_next_page(parser, 0) is None
@patch("memory.common.parsers.archives.time.sleep")
def test_archive_fetcher_fetch_all_items_single_page(mock_sleep):
items = [
FeedItem(title="Item 1", url="https://example.com/1"),
FeedItem(title="Item 2", url="https://example.com/2"),
]
fetcher = ArchiveFetcher(
parser_class=MockParser,
start_url="https://example.com",
delay_between_requests=0.5,
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_parser = MockParser("https://example.com", items)
mock_make_parser.return_value = mock_parser
result = list(fetcher.fetch_all_items())
assert result == items
mock_make_parser.assert_called_once_with("https://example.com")
mock_sleep.assert_not_called() # No delay for single page
@patch("memory.common.parsers.archives.time.sleep")
def test_archive_fetcher_fetch_all_items_multiple_pages(mock_sleep):
page1_items = [FeedItem(title="Item 1", url="https://example.com/1")]
page2_items = [FeedItem(title="Item 2", url="https://example.com/2")]
class TestFetcher(ArchiveFetcher):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.call_count = 0
def _find_next_page(self, parser, current_page=0):
self.call_count += 1
if self.call_count == 1:
return "https://example.com/page2"
return None
fetcher = TestFetcher(
parser_class=MockParser,
start_url="https://example.com",
delay_between_requests=0.1,
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.side_effect = [
MockParser("https://example.com", page1_items),
MockParser("https://example.com/page2", page2_items),
]
result = list(fetcher.fetch_all_items())
assert result == page1_items + page2_items
assert mock_make_parser.call_count == 2
mock_sleep.assert_called_once_with(0.1)
def test_archive_fetcher_fetch_all_items_max_pages():
class TestFetcher(ArchiveFetcher):
def _find_next_page(self, parser, current_page=0):
return f"https://example.com/page{current_page + 2}"
fetcher = TestFetcher(
parser_class=MockParser,
start_url="https://example.com",
max_pages=2,
delay_between_requests=0,
)
items = [FeedItem(title="Item", url="https://example.com/item")]
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.return_value = MockParser("https://example.com", items)
result = list(fetcher.fetch_all_items())
assert len(result) == 2 # 2 pages * 1 item per page
assert mock_make_parser.call_count == 2
def test_archive_fetcher_fetch_all_items_visited_url():
class TestFetcher(ArchiveFetcher):
def _find_next_page(self, parser, current_page=0):
return "https://example.com" # Return same URL to trigger visited check
fetcher = TestFetcher(MockParser, "https://example.com", delay_between_requests=0)
items = [FeedItem(title="Item", url="https://example.com/item")]
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.return_value = MockParser("https://example.com", items)
result = list(fetcher.fetch_all_items())
assert len(result) == 1 # Only first page processed
mock_make_parser.assert_called_once()
def test_archive_fetcher_fetch_all_items_no_items():
fetcher = ArchiveFetcher(
MockParser, "https://example.com", delay_between_requests=0
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.return_value = MockParser("https://example.com", [])
result = list(fetcher.fetch_all_items())
assert result == []
mock_make_parser.assert_called_once()
def test_archive_fetcher_fetch_all_items_exception():
fetcher = ArchiveFetcher(
MockParser, "https://example.com", delay_between_requests=0
)
with patch.object(fetcher, "make_parser") as mock_make_parser:
mock_make_parser.side_effect = Exception("Network error")
result = list(fetcher.fetch_all_items())
assert result == []
@pytest.mark.parametrize(
"start_url, per_page, current_page, expected_params",
[
("https://example.com", 10, 0, {"offset": ["10"], "limit": ["10"]}),
(
"https://example.com?existing=value",
20,
1,
{"existing": ["value"], "offset": ["40"], "limit": ["20"]},
),
(
"https://example.com?offset=0&limit=5",
15,
2,
{"offset": ["45"], "limit": ["15"]},
),
],
)
def test_link_fetcher_find_next_page(
start_url, per_page, current_page, expected_params
):
fetcher = LinkFetcher(MockParser, start_url, per_page=per_page)
parser = MockParser(start_url)
next_url = fetcher._find_next_page(parser, current_page)
assert next_url is not None
parsed = urlparse(next_url)
params = parse_qs(parsed.query)
for key, value in expected_params.items():
assert params[key] == value
@pytest.mark.parametrize(
"html, selectors, expected_url",
[
(
'<a rel="next" href="/page2">Next</a>',
['a[rel="next"]'],
"https://example.com/page2",
),
(
'<div class="next"><a href="/page2">Next</a></div>',
[".next a"],
"https://example.com/page2",
),
(
'<a class="next" href="/page2">Next</a>',
["a.next"],
"https://example.com/page2",
),
(
'<div class="pagination"><span class="next"><a href="/page2">Next</a></span></div>',
[".pagination .next"],
None, # This won't match because it's looking for .pagination .next directly
),
(
'<div class="pagination next"><a href="/page2">Next</a></div>',
[".pagination.next"],
None, # This selector isn't in default list
),
(
'<nav class="page"><a href="/page1">1</a><a href="/page2">2</a></nav>',
["nav.page a:last-of-type"],
"https://example.com/page2",
),
("<div>No next link</div>", ['a[rel="next"]'], None),
],
)
def test_html_archive_fetcher_find_next_page(html, selectors, expected_url):
fetcher = HTMLArchiveFetcher(
MockParser, "https://example.com", next_page_selectors=selectors
)
parser = MockParser("https://example.com", content=html)
with patch("memory.common.parsers.archives.extract_url") as mock_extract:
mock_extract.return_value = expected_url
result = fetcher._find_next_page(parser)
if expected_url:
mock_extract.assert_called_once()
assert result == expected_url
else:
# extract_url might still be called but return None
assert result is None
def test_html_archive_fetcher_find_next_page_no_content():
fetcher = HTMLArchiveFetcher(MockParser, "https://example.com")
parser = MockParser("https://example.com", content="")
result = fetcher._find_next_page(parser)
assert result is None
def test_html_parser_factory():
CustomParser = html_parser(
item_selector="article", title_selector="h1", custom_attr="value"
)
parser = CustomParser("https://example.com")
assert isinstance(parser, HTMLListParser)
assert parser.item_selector == "article"
assert parser.title_selector == "h1"
assert getattr(parser, "custom_attr") == "value"
@pytest.mark.parametrize(
"start_url, expected_api_url",
[
("https://example.substack.com", "https://example.substack.com/api/v1/archive"),
(
"https://example.substack.com/posts",
"https://example.substack.com/api/v1/archive",
),
(
"https://example.substack.com/api/v1/archive",
"https://example.substack.com/api/v1/archive",
),
],
)
def test_substack_archive_fetcher_post_init(start_url, expected_api_url):
with patch("memory.common.parsers.archives.get_base_url") as mock_get_base:
mock_get_base.return_value = "https://example.substack.com"
fetcher = SubstackArchiveFetcher(SubstackAPIParser, start_url)
assert fetcher.start_url == expected_api_url
def test_acoup_archive_fetcher_find_next_page():
html = """
<div class="widget_archive">
<a href="https://acoup.blog/2019/04/">April 2019</a>
<a href="https://acoup.blog/2019/05/">May 2019</a>
<a href="https://acoup.blog/2019/06/">June 2019</a>
</div>
"""
fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
parser = MockParser("https://acoup.blog/2019/05/", content=html)
result = fetcher._find_next_page(parser)
assert result == "https://acoup.blog/2019/04/"
def test_acoup_archive_fetcher_find_next_page_no_match():
html = """
<div class="widget_archive">
<a href="https://acoup.blog/2019/04/">April 2019</a>
<a href="https://acoup.blog/2019/06/">June 2019</a>
</div>
"""
fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
parser = MockParser("https://acoup.blog/2019/05/", content=html)
result = fetcher._find_next_page(parser)
assert result is None
def test_acoup_archive_fetcher_find_next_page_no_content():
fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
parser = MockParser("https://acoup.blog/2019/05/", content="")
result = fetcher._find_next_page(parser)
assert result is None
@pytest.mark.parametrize(
"start_url, next_url, expected_next_url",
[
(
"https://example.com",
"",
"https://example.com",
), # Empty next_url defaults to start_url
(
"https://example.com",
"https://other.com/archive",
"https://other.com/archive", # Full URL is preserved
),
(
"https://example.com",
"/archive",
"/archive",
), # Absolute path is preserved
(
"https://example.com",
"archive",
"https://example.com/archive",
), # Relative path gets prepended
],
)
def test_html_next_url_archive_fetcher_post_init(
start_url, next_url, expected_next_url
):
fetcher = HTMLNextUrlArchiveFetcher(MockParser, start_url, next_url=next_url)
assert fetcher.next_url == expected_next_url
def test_html_next_url_archive_fetcher_find_next_page():
fetcher = HTMLNextUrlArchiveFetcher(
MockParser, "https://example.com", next_url="https://example.com/archive"
)
parser = MockParser("https://example.com")
result = fetcher._find_next_page(parser, 2)
assert result == "https://example.com/archive/3"
@pytest.mark.parametrize(
"url, expected_fetcher_type",
[
("https://danluu.com", HTMLArchiveFetcher),
("https://www.rifters.com", HTMLArchiveFetcher),
("https://putanumonit.com", HTMLArchiveFetcher),
("https://acoup.blog", ACOUPArchiveFetcher),
("https://unknown.com", None),
],
)
def test_get_archive_fetcher_registry_matches(url, expected_fetcher_type):
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Not substack</body></html>"
with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
mock_is_substack.return_value = False
fetcher = get_archive_fetcher(url)
if expected_fetcher_type:
assert isinstance(fetcher, expected_fetcher_type)
else:
assert fetcher is None
def test_get_archive_fetcher_tuple_registry():
url = "https://putanumonit.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Not substack</body></html>"
fetcher = get_archive_fetcher(url)
assert isinstance(fetcher, HTMLArchiveFetcher)
assert fetcher.start_url == "https://putanumonit.com/full-archive"
def test_get_archive_fetcher_direct_parser_registry():
url = "https://danluu.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Not substack</body></html>"
fetcher = get_archive_fetcher(url)
assert isinstance(fetcher, HTMLArchiveFetcher)
assert fetcher.parser_class == DanluuParser
assert fetcher.start_url == url
def test_get_archive_fetcher_substack():
url = "https://example.substack.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Substack content</body></html>"
with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
mock_is_substack.return_value = True
fetcher = get_archive_fetcher(url)
assert isinstance(fetcher, SubstackArchiveFetcher)
assert fetcher.parser_class == SubstackAPIParser
def test_get_archive_fetcher_no_match():
url = "https://unknown.com"
with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
mock_fetch.return_value = "<html><body>Regular website</body></html>"
with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
mock_is_substack.return_value = False
fetcher = get_archive_fetcher(url)
assert fetcher is None
def test_fetcher_registry_structure():
"""Test that FETCHER_REGISTRY has expected structure."""
assert isinstance(FETCHER_REGISTRY, dict)
for pattern, fetcher in FETCHER_REGISTRY.items():
assert isinstance(pattern, str)
assert (
isinstance(fetcher, type)
and issubclass(fetcher, FeedParser)
or isinstance(fetcher, tuple)
or isinstance(fetcher, ArchiveFetcher)
)
@pytest.mark.parametrize(
"pattern, test_url, should_match",
[
(r"https://danluu.com", "https://danluu.com", True),
(r"https://danluu.com", "https://danluu.com/", True),
(r"https://danluu.com", "https://other.com", False),
(r"https://www.rifters.com", "https://www.rifters.com/crawl", True),
(r"https://putanumonit.com", "https://putanumonit.com/archive", True),
],
)
def test_registry_pattern_matching(pattern, test_url, should_match):
import re
match = re.search(pattern, test_url.rstrip("/"))
assert bool(match) == should_match

View File

@@ -1,10 +1,10 @@
from datetime import datetime
from unittest.mock import MagicMock, patch
-from typing import Any, cast
+from typing import cast
+import json

import pytest
from bs4 import BeautifulSoup, Tag
-import requests

from memory.common.parsers.feeds import (
    FeedItem,
@@ -17,15 +17,160 @@ from memory.common.parsers.feeds import (
    NadiaXyzParser,
    RedHandFilesParser,
    BloombergAuthorParser,
+    JSONParser,
+    SubstackAPIParser,
+    select_in,
+    clean_url,
    is_rss_feed,
-    extract_url,
    find_feed_link,
    get_feed_parser,
-    DEFAULT_SKIP_PATTERNS,
-    PARSER_REGISTRY,
)
@pytest.mark.parametrize(
"data, path, expected",
[
# Basic dictionary access
({"key": "value"}, ["key"], "value"),
({"nested": {"key": "value"}}, ["nested", "key"], "value"),
# List access
(["a", "b", "c"], [1], "b"),
([{"key": "value"}], [0, "key"], "value"),
# Mixed access
(
{"items": [{"name": "first"}, {"name": "second"}]},
["items", 1, "name"],
"second",
),
# Empty path returns original data
({"key": "value"}, [], {"key": "value"}),
# Missing keys return None
({"key": "value"}, ["missing"], None),
({"nested": {}}, ["nested", "missing"], None),
# Index out of bounds returns None
(["a", "b"], [5], None),
# Type errors return None
("string", ["key"], None),
(123, [0], None),
(None, ["key"], None),
# Deep nesting
({"a": {"b": {"c": {"d": "deep"}}}}, ["a", "b", "c", "d"], "deep"),
],
)
def test_select_in(data, path, expected):
assert select_in(data, path) == expected
@patch("memory.common.parsers.feeds.fetch_html")
def test_json_parser_fetch_items_with_content(mock_fetch_html):
content = json.dumps(
[
{"title": "Article 1", "url": "https://example.com/1"},
{"title": "Article 2", "url": "https://example.com/2"},
]
)
parser = JSONParser(url="https://example.com/feed.json", content=content)
items = parser.fetch_items()
assert items == [
{"title": "Article 1", "url": "https://example.com/1"},
{"title": "Article 2", "url": "https://example.com/2"},
]
mock_fetch_html.assert_not_called()
@patch("memory.common.parsers.feeds.fetch_html")
def test_json_parser_fetch_items_without_content(mock_fetch_html):
content = json.dumps([{"title": "Article", "url": "https://example.com/1"}])
mock_fetch_html.return_value = content
parser = JSONParser(url="https://example.com/feed.json")
items = parser.fetch_items()
assert items == [{"title": "Article", "url": "https://example.com/1"}]
mock_fetch_html.assert_called_once_with("https://example.com/feed.json")
@patch("memory.common.parsers.feeds.fetch_html")
def test_json_parser_fetch_items_invalid_json(mock_fetch_html):
mock_fetch_html.return_value = "invalid json content"
parser = JSONParser(url="https://example.com/feed.json")
items = parser.fetch_items()
assert items == []
def test_json_parser_extract_methods():
parser = JSONParser(url="https://example.com")
entry = {
"title": "Test Title",
"url": "https://example.com/article",
"description": "Test description",
"date": "2023-01-15",
"author": "John Doe",
"guid": "unique-123",
"metadata": {"tags": ["tech", "news"]},
}
assert parser.extract_title(entry) == "Test Title"
assert parser.extract_url(entry) == "https://example.com/article"
assert parser.extract_description(entry) == "Test description"
assert parser.extract_date(entry) == "2023-01-15"
assert parser.extract_author(entry) == "John Doe"
assert parser.extract_guid(entry) == "unique-123"
assert parser.extract_metadata(entry) == {"tags": ["tech", "news"]}
def test_json_parser_custom_paths():
parser = JSONParser(url="https://example.com")
parser.title_path = ["content", "headline"]
parser.url_path = ["links", "canonical"]
parser.author_path = ["byline", "name"]
entry = {
"content": {"headline": "Custom Title"},
"links": {"canonical": "https://example.com/custom"},
"byline": {"name": "Jane Smith"},
}
assert parser.extract_title(entry) == "Custom Title"
assert parser.extract_url(entry) == "https://example.com/custom"
assert parser.extract_author(entry) == "Jane Smith"
def test_json_parser_missing_fields():
parser = JSONParser(url="https://example.com")
entry = {} # Empty entry
assert parser.extract_title(entry) is None
assert parser.extract_url(entry) is None
assert parser.extract_description(entry) is None
assert parser.extract_date(entry) is None
assert parser.extract_author(entry) is None
assert parser.extract_guid(entry) is None
assert parser.extract_metadata(entry) is None
def test_json_parser_nested_paths():
parser = JSONParser(url="https://example.com")
parser.title_path = ["article", "header", "title"]
parser.author_path = ["article", "byline", 0, "name"]
entry = {
"article": {
"header": {"title": "Nested Title"},
"byline": [{"name": "First Author"}, {"name": "Second Author"}],
}
}
assert parser.extract_title(entry) == "Nested Title"
assert parser.extract_author(entry) == "First Author"
def test_feed_parser_base_url():
    parser = FeedParser(url="https://example.com/path/to/feed")
    assert parser.base_url == "https://example.com"
@@ -582,7 +727,7 @@ def test_extract_url_function(html, expected):
    element = soup.find("a")
    assert element is not None

-    url = extract_url(cast(Tag, element), "https://example.com")
+    url = clean_url(cast(Tag, element), "https://example.com")
    assert url == expected
@@ -706,33 +851,30 @@ def test_get_feed_parser_with_check_from():
    assert parser.since == check_from


-def test_parser_registry_completeness():
-    """Ensure PARSER_REGISTRY contains expected parsers."""
-    expected_patterns = [
-        r"https://danluu.com",
-        r"https://guzey.com/archive",
-        r"https://www.paulgraham.com/articles",
-        r"https://nadia.xyz/posts",
-        r"https://www.theredhandfiles.com",
-        r"https://archive.ph/.*?/https://www.bloomberg.com/opinion/authors/",
-    ]
-
-    assert len(PARSER_REGISTRY) == len(expected_patterns)
-    for pattern in expected_patterns:
-        assert pattern in PARSER_REGISTRY
-
-
-def test_default_skip_patterns():
-    """Ensure DEFAULT_SKIP_PATTERNS contains expected patterns."""
-    expected_patterns = [
-        r"^#",
-        r"mailto:",
-        r"tel:",
-        r"javascript:",
-        r"\.pdf$",
-        r"\.jpg$",
-        r"\.png$",
-        r"\.gif$",
-    ]
-
-    assert DEFAULT_SKIP_PATTERNS == expected_patterns
+def test_substack_api_parser():
+    parser = SubstackAPIParser(url="https://example.substack.com/api/v1/posts")
+
+    entry = {
+        "title": "Substack Post",
+        "canonical_url": "https://example.substack.com/p/post-slug",
+        "publishedBylines": [{"name": "Author Name"}],
+        "post_date": "2023-01-15T10:30:00Z",
+    }
+
+    assert parser.extract_title(entry) == "Substack Post"
+    assert parser.extract_url(entry) == "https://example.substack.com/p/post-slug"
+    assert parser.extract_author(entry) == "Author Name"
+    assert parser.extract_date(entry) == "2023-01-15T10:30:00Z"
+
+
+def test_substack_api_parser_missing_bylines():
+    parser = SubstackAPIParser(url="https://example.substack.com/api/v1/posts")
+
+    entry = {
+        "title": "Post Without Author",
+        "canonical_url": "https://example.substack.com/p/post",
+        "publishedBylines": [],
+        "post_date": "2023-01-15T10:30:00Z",
+    }
+
+    assert parser.extract_author(entry) is None

View File

@@ -23,7 +23,11 @@ from memory.common.parsers.html import (
    extract_meta_by_pattern,
    extract_metadata,
    extract_title,
+    extract_url,
    get_base_url,
+    is_bloomberg,
+    is_substack,
+    is_wordpress,
    parse_date,
    process_image,
    process_images,
@@ -454,7 +458,7 @@ def test_process_images_empty():
        None, "https://example.com", pathlib.Path("/tmp")
    )
    assert result_content is None
-    assert result_images == []
+    assert result_images == {}


@patch("memory.common.parsers.html.process_image")
@@ -503,6 +507,191 @@ def test_process_images_no_filename(mock_process_image):
    assert not images

@pytest.mark.parametrize(
"html, selectors, base_url, expected",
[
# Basic URL extraction
(
'<a href="/next-page">Next</a>',
"a",
"https://example.com",
"https://example.com/next-page",
),
# Multiple selectors - should pick first matching
(
'<div><a href="/first">First</a><a href="/second">Second</a></div>',
"a",
"https://example.com",
"https://example.com/first",
),
# Multiple selectors with comma separation - span doesn't have href, so falls back to a
(
'<div><span class="next">Span</span><a href="/link">Link</a></div>',
".next, a",
"https://example.com",
"https://example.com/link",
),
# Absolute URL should remain unchanged
(
'<a href="https://other.com/page">External</a>',
"a",
"https://example.com",
"https://other.com/page",
),
# No href attribute
("<a>No href</a>", "a", "https://example.com", None),
# No matching element
("<p>No links</p>", "a", "https://example.com", None),
# Empty href
('<a href="">Empty</a>', "a", "https://example.com", None),
],
)
def test_extract_url(html, selectors, base_url, expected):
soup = BeautifulSoup(html, "html.parser")
assert extract_url(soup, selectors, base_url) == expected
@pytest.mark.parametrize(
"html, expected",
[
# Substack with preconnect link
(
"""
<head>
<link rel="preconnect" href="https://substackcdn.com">
</head>
""",
True,
),
# Multiple preconnect links, one is Substack
(
"""
<head>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://substackcdn.com">
</head>
""",
True,
),
# No Substack preconnect
(
"""
<head>
<link rel="preconnect" href="https://fonts.googleapis.com">
</head>
""",
False,
),
# No preconnect links at all
("<head></head>", False),
# Preconnect without href
('<head><link rel="preconnect"></head>', False),
# Different rel attribute
('<head><link rel="stylesheet" href="https://substackcdn.com"></head>', False),
],
)
def test_is_substack(html, expected):
soup = BeautifulSoup(html, "html.parser")
assert is_substack(soup) == expected
@pytest.mark.parametrize(
"html, expected",
[
# WordPress with wp-singular class on body should be False (looks for content inside body)
('<body class="wp-singular">Content</body>', False),
# WordPress with nested wp-singular
('<body><div class="wp-singular">Content</div></body>', True),
# Archived page with WordPress content
(
"""
<div id="CONTENT">
<div class="html">
<body class="wp-singular">Content</body>
</div>
</div>
""",
True,
),
# No WordPress indicators
('<body><div class="content">Regular content</div></body>', False),
# Empty body
("<body></body>", False),
# No body tag
("<div>No body</div>", False),
],
)
def test_is_wordpress(html, expected):
soup = BeautifulSoup(html, "html.parser")
assert is_wordpress(soup) == expected
@pytest.mark.parametrize(
"html, expected",
[
# Bloomberg with company link
(
"""
<body>
<a href="https://www.bloomberg.com/company/">Bloomberg</a>
</body>
""",
True,
),
# Bloomberg link among other links
(
"""
<body>
<a href="https://example.com">Example</a>
<a href="https://www.bloomberg.com/company/">Bloomberg</a>
<a href="https://other.com">Other</a>
</body>
""",
True,
),
# Archived page with Bloomberg content
(
"""
<div id="CONTENT">
<div class="html">
<body>
<a href="https://www.bloomberg.com/company/">Bloomberg</a>
</body>
</div>
</div>
""",
True,
),
# No Bloomberg links
(
"""
<body>
<a href="https://example.com">Example</a>
<a href="https://other.com">Other</a>
</body>
""",
False,
),
# Bloomberg link but not company page
(
"""
<body>
<a href="https://www.bloomberg.com/news/">Bloomberg News</a>
</body>
""",
False,
),
# No links at all
("<body><p>No links</p></body>", False),
# Links without href
("<body><a>No href</a></body>", False),
],
)
def test_is_bloomberg(html, expected):
soup = BeautifulSoup(html, "html.parser")
assert is_bloomberg(soup) == expected
class TestBaseHTMLParser:
    def test_init_with_base_url(self):
        parser = BaseHTMLParser("https://example.com/path")
@@ -584,7 +773,7 @@ class TestBaseHTMLParser:
    def test_parse_with_images(self, mock_process_images):
        # Mock the image processing to return test data
        mock_image = MagicMock(spec=PILImage.Image)
-        mock_process_images.return_value = (MagicMock(), [mock_image])
+        mock_process_images.return_value = (MagicMock(), {"test_image.jpg": mock_image})

        html = """
        <article>
@@ -600,5 +789,6 @@ class TestBaseHTMLParser:
        article = parser.parse(html, "https://example.com/article")

        assert len(article.images) == 1
-        assert article.images[0] == mock_image
+        assert "test_image.jpg" in article.images
+        assert article.images["test_image.jpg"] == mock_image
        mock_process_images.assert_called_once()