Mirror of https://github.com/mruwnik/memory.git, synced 2025-06-08 21:34:42 +02:00

Add archives fetcher

This commit is contained in:
parent 27fbfcc548
commit 876fa87725

src/memory/common/parsers/archives.py (new file, 301 lines)
@@ -0,0 +1,301 @@
from dataclasses import dataclass, field
import logging
import re
import time
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
from typing import Generator, cast

from bs4 import BeautifulSoup
from memory.common.parsers.blogs import is_substack

from memory.common.parsers.feeds import (
    DanluuParser,
    HTMLListParser,
    RiftersParser,
    FeedItem,
    FeedParser,
    SubstackAPIParser,
)
from memory.common.parsers.html import (
    fetch_html,
    extract_url,
    get_base_url,
)

logger = logging.getLogger(__name__)


@dataclass
class ArchiveFetcher:
    """Fetches complete backlogs from sites with pagination."""

    parser_class: type[FeedParser]
    start_url: str
    max_pages: int = 100
    delay_between_requests: float = 1.0
    parser_kwargs: dict = field(default_factory=dict)

    def make_parser(self, url: str) -> FeedParser:
        parser = self.parser_class(url=url)
        for key, value in self.parser_kwargs.items():
            setattr(parser, key, value)
        return parser

    def fetch_all_items(self) -> Generator[FeedItem, None, None]:
        """Fetch all items from all pages."""
        visited_urls = set()
        current_url = self.start_url
        page_count = 0
        total_items = 0

        while current_url and page_count < self.max_pages:
            if current_url in visited_urls:
                logger.warning(f"Already visited {current_url}, stopping")
                break

            logger.info(f"Fetching page {page_count + 1}: {current_url}")
            visited_urls.add(current_url)

            try:
                parser = self.make_parser(current_url)

                items = parser.parse_feed()
                if not items:
                    break

                prev_items = total_items
                for item in items:
                    total_items += 1
                    yield item

                if prev_items == total_items:
                    logger.warning(f"No new items found on page {page_count + 1}")
                    break

                current_url = self._find_next_page(parser, page_count)
                if not current_url:
                    logger.info("No more pages found")
                    break

                page_count += 1

                if self.delay_between_requests > 0:
                    time.sleep(self.delay_between_requests)

            except Exception as e:
                logger.error(f"Error processing {current_url}: {e}")
                break

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        return None


@dataclass
class LinkFetcher(ArchiveFetcher):
    per_page: int = 10

    def _find_next_page(self, parser: FeedParser, current_page: int = 0):
        next_page = current_page + 1
        parsed = urlparse(self.start_url)
        params = parse_qs(parsed.query)
        params["offset"] = [str(next_page * self.per_page)]
        params["limit"] = [str(self.per_page)]

        new_query = urlencode(params, doseq=True)
        return urlunparse(parsed._replace(query=new_query))


@dataclass
class HTMLArchiveFetcher(ArchiveFetcher):
    next_page_selectors: list[str] = field(
        default_factory=lambda: [
            'a[rel="next"]',
            ".next a",
            "a.next",
            ".pagination .next",
            ".pager .next",
            "nav.page a:last-of-type",
            ".navigation a:last-of-type",
        ]
    )

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        if not parser.content:
            return None
        soup = BeautifulSoup(parser.content, "html.parser")
        selectors = ",".join(self.next_page_selectors)
        return extract_url(soup, selectors, parser.url)


def html_parser(**kwargs) -> type[HTMLListParser]:
    class ConfiguredHTMLListParser(HTMLListParser):
        def __init__(self, url: str):
            super().__init__(url)
            for key, value in kwargs.items():
                setattr(self, key, value)

    return ConfiguredHTMLListParser


@dataclass
class SubstackArchiveFetcher(LinkFetcher):
    def __post_init__(self):
        if "api/v1/archive" not in self.start_url:
            base_url = get_base_url(self.start_url)
            self.start_url = f"{base_url}/api/v1/archive"


@dataclass
class ACOUPArchiveFetcher(HTMLArchiveFetcher):
    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        if not parser.content:
            return None
        soup = BeautifulSoup(parser.content, "html.parser")
        urls = reversed([i.attrs.get("href") for i in soup.select(".widget_archive a")])
        urls = (cast(str, u) for u in urls if u)
        for url in urls:
            if url.rstrip("/") == parser.url.rstrip("/"):
                return next(urls, None)


@dataclass
class HTMLNextUrlArchiveFetcher(HTMLArchiveFetcher):
    next_url: str = ""

    def __post_init__(self):
        if not self.next_url:
            self.next_url = self.start_url
        if not self.next_url.startswith("http") and not self.next_url.startswith("/"):
            self.next_url = f"{self.start_url}/{self.next_url}"

    def _find_next_page(self, parser: FeedParser, current_page: int = 0) -> str | None:
        return f"{self.next_url}/{current_page + 1}"


FETCHER_REGISTRY = {
    r"https://putanumonit.com": (
        "https://putanumonit.com/full-archive",
        html_parser(
            item_selector="article p", title_selector="a strong", url_selector="a"
        ),
    ),
    r"https://danluu.com": DanluuParser,
    r"https://www.rifters.com": RiftersParser,
    r"https://rachelbythebay.com": html_parser(
        item_selector="div.post",
        url_selector="a",
    ),
    r"https://guzey.com": (
        "https://guzey.com/archive/",
        html_parser(item_selector="article li"),
    ),
    r"https://aphyr.com": html_parser(
        item_selector="article.post",
        title_selector="h1",
        url_selector="h1 a",
        description_selector=".body",
        date_selector=".meta time",
    ),
    r"https://www.applieddivinitystudies.com": html_parser(
        item_selector="article.article",
        title_selector="header.article-header h1",
        url_selector="header.article-header h1 a",
        description_selector=".article-entry",
        date_selector=".article-meta time",
    ),
    r"https://www.flyingmachinestudios.com": html_parser(
        item_selector="#main #articles li",
        title_selector="header .title",
        description_selector="p",
        date_selector="header .date",
        date_format="%d %B %Y",
    ),
    r"https://slimemoldtimemold.com": html_parser(
        item_selector="article .wp-block-list li", title_selector="a"
    ),
    r"https://www.paulgraham.com": (
        "https://www.paulgraham.com/articles.html",
        html_parser(item_selector="img + font"),
    ),
    r"https://slatestarcodex.com": (
        "https://slatestarcodex.com/archives/",
        html_parser(item_selector="#sya_container li"),
    ),
    r"https://mcfunley.com": (
        "https://mcfunley.com/writing",
        html_parser(item_selector="article", title_selector="h6"),
    ),
    r"https://www.bitsaboutmoney.com": HTMLArchiveFetcher(
        html_parser(
            item_selector="article",
            title_selector="h1",
            description_selector="p",
            date_selector="time",
        ),
        "https://www.bitsaboutmoney.com/archive/",
        next_page_selectors=["nav.pagination a.older-posts"],
    ),
    r"https://acoup.blog": ACOUPArchiveFetcher(
        html_parser(
            item_selector="article",
            title_selector="a",
            description_selector=".entry-content",
            date_selector=".published-on time",
        ),
        "https://acoup.blog/2019/05/",
    ),
    r"https://www.theredhandfiles.com": html_parser(
        item_selector="article", title_selector="h3", description_selector="h2"
    ),
}


def get_archive_fetcher(url: str) -> ArchiveFetcher | None:
    for pattern, fetcher in FETCHER_REGISTRY.items():
        if re.search(pattern, url.rstrip("/")):
            if isinstance(fetcher, ArchiveFetcher):
                return fetcher
            elif isinstance(fetcher, tuple):
                base_url, html_fetcher = fetcher
                return HTMLArchiveFetcher(html_fetcher, base_url)
            else:
                return HTMLArchiveFetcher(fetcher, url)

    html = fetch_html(url)
    soup = BeautifulSoup(html, "html.parser")
    if is_substack(soup):
        return SubstackArchiveFetcher(SubstackAPIParser, url)


feeds = [
    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
    "https://www.rifters.com/crawl/",
    "https://rachelbythebay.com/w/",
    "https://danluu.com/",
    "https://guzey.com",
    "https://aphyr.com/",
    "https://www.applieddivinitystudies.com/",
    "https://www.imightbewrong.org/",
    "https://www.kvetch.au/",
    "https://www.overcomingbias.com/",
    "https://samkriss.substack.com/",
    "https://www.richardhanania.com/",
    "https://skunkledger.substack.com/",
    "https://taipology.substack.com/",
    "https://putanumonit.com/",
    "https://www.flyingmachinestudios.com/",
    "https://www.theintrinsicperspective.com/",
    "https://www.strangeloopcanon.com/",
    "https://slimemoldtimemold.com/",
    "https://zeroinputagriculture.substack.com/",
    "https://nayafia.substack.com",
    "https://www.paulgraham.com/articles.html",
    "https://mcfunley.com/writing",
    "https://www.bitsaboutmoney.com/",
    "https://akarlin.com",
    "https://www.exurbe.com/",
    "https://acoup.blog/",
    "https://www.theredhandfiles.com/",
    "https://karlin.blog/",
    "https://slatestarcodex.com/",
]
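To make the new fetcher's control flow concrete, here is a minimal usage sketch (illustrative only, not part of the commit; it assumes the package above is importable and the target site is reachable):

# Usage sketch: resolve a fetcher from the registry (or the Substack fallback)
# and walk the whole backlog page by page.
from memory.common.parsers.archives import get_archive_fetcher

fetcher = get_archive_fetcher("https://acoup.blog/")
if fetcher is not None:
    for item in fetcher.fetch_all_items():
        print(item.url)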
@@ -13,6 +13,9 @@ from memory.common.parsers.html import (
     extract_title,
     extract_date,
     fetch_html,
+    is_wordpress,
+    is_substack,
+    is_bloomberg,
 )


@@ -587,24 +590,13 @@ def get_parser_for_url(url: str, html: str) -> BaseHTMLParser:
         return parser_class(url)

    soup = BeautifulSoup(html, "html.parser")
-    body_select = "body"
-    # Check if this is an archived page
-    if contents := soup.select_one("#CONTENT .html"):
-        body_select = ".body"
-        soup = contents
-
-    if soup.select_one(f"{body_select} .wp-singular"):
+    if is_wordpress(soup):
         return WordPressParser(url)

-    if any(
-        "https://substackcdn.com" == a.attrs.get("href")  # type: ignore
-        for a in soup.find_all("link", {"rel": "preconnect"})
-        if hasattr(a, "attrs")  # type: ignore
-    ):
+    if is_substack(soup):
         return SubstackParser(url)

-    urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")]  # type: ignore
-    if any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u):  # type: ignore
+    if is_bloomberg(soup):
         return BloombergParser(url)

    return BaseHTMLParser(url)
@@ -628,11 +620,11 @@ def parse_webpage(url: str) -> Article:
 feeds = [
     "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
     "https://www.rifters.com/crawl/",
-    "https://rachelbythebay.com/w/atom.xml",
+    "https://rachelbythebay.com/w/",
     "https://danluu.com/",
-    "https://guzey.com/archive",
-    "https://aphyr.com/posts.atom",
-    "https://www.applieddivinitystudies.com/atom.xml",
+    "https://guzey.come",
+    "https://aphyr.com/",
+    "https://www.applieddivinitystudies.com/",
     "https://www.imightbewrong.org/",
     "https://www.kvetch.au/",
     "https://www.overcomingbias.com/",
@@ -649,9 +641,10 @@ feeds = [
     "https://nayafia.substack.com",
     "https://www.paulgraham.com/articles.html",
     "https://mcfunley.com/writing",
-    "https://www.bitsaboutmoney.com/archive/",
-    "https://akarlin.com/archive/",
+    "https://www.bitsaboutmoney.com/",
+    "https://akarlin.com",
     "https://www.exurbe.com/",
     "https://acoup.blog/",
     "https://www.theredhandfiles.com/",
+    "https://karlin.blog/",
 ]
@@ -1,5 +1,6 @@
 from datetime import datetime
 import logging
+import json
 import re
 from dataclasses import dataclass, field
 from typing import Any, Generator, Sequence, cast
@@ -20,6 +21,20 @@ from memory.common.parsers.html import (
 logger = logging.getLogger(__name__)


+ObjectPath = list[str | int]
+
+
+def select_in(data: Any, path: ObjectPath) -> Any:
+    if not path:
+        return data
+
+    key, *rest = path
+    try:
+        return select_in(data[key], rest)
+    except (KeyError, TypeError, IndexError):
+        return None
+
+
 @dataclass
 class FeedItem:
     """Represents a single item from a feed."""
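For reference, select_in walks nested dicts and lists by key or index and returns None on any miss; a quick behaviour sketch (mirrors the parametrized tests added further down in this commit):

# Behaviour sketch for select_in.
select_in({"items": [{"name": "first"}, {"name": "second"}]}, ["items", 1, "name"])
# -> "second"
select_in({"key": "value"}, ["missing"])
# -> None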
@@ -62,7 +77,7 @@ class FeedParser:
         )

     def valid_item(self, item: FeedItem) -> bool:
-        return True
+        return bool(item.url)

     def parse_feed(self) -> Generator[FeedItem, None, None]:
         """Parse feed content and return list of feed items."""
@@ -100,6 +115,46 @@ class FeedParser:
         return {}


+class JSONParser(FeedParser):
+    title_path: ObjectPath = ["title"]
+    url_path: ObjectPath = ["url"]
+    description_path: ObjectPath = ["description"]
+    date_path: ObjectPath = ["date"]
+    author_path: ObjectPath = ["author"]
+    guid_path: ObjectPath = ["guid"]
+    metadata_path: ObjectPath = ["metadata"]
+
+    def fetch_items(self) -> Sequence[Any]:
+        if not self.content:
+            self.content = cast(str, fetch_html(self.url))
+        try:
+            return json.loads(self.content)
+        except json.JSONDecodeError as e:
+            logger.error(f"Error parsing JSON: {e}")
+            return []
+
+    def extract_title(self, entry: Any) -> str:
+        return select_in(entry, self.title_path)
+
+    def extract_url(self, entry: Any) -> str:
+        return select_in(entry, self.url_path)
+
+    def extract_description(self, entry: Any) -> str:
+        return select_in(entry, self.description_path)
+
+    def extract_date(self, entry: Any) -> datetime:
+        return select_in(entry, self.date_path)
+
+    def extract_author(self, entry: Any) -> str:
+        return select_in(entry, self.author_path)
+
+    def extract_guid(self, entry: Any) -> str:
+        return select_in(entry, self.guid_path)
+
+    def extract_metadata(self, entry: Any) -> dict[str, Any]:
+        return select_in(entry, self.metadata_path)
+
+
 class RSSAtomParser(FeedParser):
     """Parser for RSS and Atom feeds using feedparser."""

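A minimal sketch of how a JSON feed gets mapped through these path attributes; the subclass and its field names here are hypothetical (SubstackAPIParser in the next hunk is the real in-tree example):

# Hypothetical subclass: a JSON API whose entries use different field names
# than the JSONParser defaults.
class ExampleJSONParser(JSONParser):
    title_path = ["headline"]
    url_path = ["links", "self"]
    author_path = ["authors", 0, "display_name"]

parser = ExampleJSONParser(url="https://example.com/feed.json")
entry = {
    "headline": "Hello",
    "links": {"self": "https://example.com/p/1"},
    "authors": [{"display_name": "A. Writer"}],
}
assert parser.extract_title(entry) == "Hello"
assert parser.extract_url(entry) == "https://example.com/p/1"
assert parser.extract_author(entry) == "A. Writer"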
@@ -237,8 +292,14 @@ class HTMLListParser(FeedParser):
         return extract_date(entry, self.date_selector, self.date_format)


+class SubstackAPIParser(JSONParser):
+    url_path = ["canonical_url"]
+    author_path = ["publishedBylines", 0, "name"]
+    date_path = ["post_date"]
+
+
 class DanluuParser(HTMLListParser):
-    skip_patterns = [r"^https://danluu.com/#"]
+    skip_patterns = DEFAULT_SKIP_PATTERNS + [r"^https://danluu\.com/?#"]

     def valid_item(self, item: FeedItem) -> bool:
         return item.url.startswith(self.base_url)
@@ -351,6 +412,13 @@ class RedHandFilesParser(HTMLListParser):
         return ""


+class RiftersParser(HTMLListParser):
+    item_selector = "#content .post"
+    title_selector = "h2 a"
+    url_selector = "h2 a"
+    description_selector = ".entry-content"
+
+
 class BloombergAuthorParser(HTMLListParser):
     item_selector = "section#author_page article"
     url_selector = "a[href]"
@@ -393,7 +461,7 @@ def is_rss_feed(content: str) -> bool:
     )


-def extract_url(element: Tag, base_url: str) -> str | None:
+def clean_url(element: Tag, base_url: str) -> str | None:
     if not (href := element.get("href")):
         return None

@@ -409,14 +477,14 @@ def find_feed_link(url: str, soup: BeautifulSoup) -> str | None:
     for link in links:
         if not isinstance(link, Tag):
             continue
-        if not (link_url := extract_url(link, url)):
+        if not (link_url := clean_url(link, url)):
             continue
         if link_url.rstrip("/") != url.rstrip("/"):
             return link_url
     return None


-PARSER_REGISTRY = {
+FEED_REGISTRY = {
     r"https://danluu.com": DanluuParser,
     r"https://guzey.com/archive": GuzeyParser,
     r"https://www.paulgraham.com/articles": PaulGrahamParser,
@@ -427,7 +495,7 @@ PARSER_REGISTRY = {


 def get_feed_parser(url: str, check_from: datetime | None = None) -> FeedParser | None:
-    for pattern, parser_class in PARSER_REGISTRY.items():
+    for pattern, parser_class in FEED_REGISTRY.items():
         if re.search(pattern, url.rstrip("/")):
             return parser_class(url=url, since=check_from)

@@ -110,7 +110,14 @@ def extract_date(

     datetime_attr = element.get("datetime")
     if datetime_attr:
-        for format in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", date_format]:
+        for format in [
+            "%Y-%m-%dT%H:%M:%S.%fZ",
+            "%Y-%m-%dT%H:%M:%S%z",
+            "%Y-%m-%dT%H:%M:%S.%f",
+            "%Y-%m-%dT%H:%M:%S",
+            "%Y-%m-%d",
+            date_format,
+        ]:
             if date := parse_date(str(datetime_attr), format):
                 return date

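The extra formats cover ISO-8601 timestamps with fractional seconds and timezone offsets. As a quick illustration of which format string matches which value, using only the standard library (not the project's parse_date helper):

# Stdlib-only sketch of the newly supported datetime attribute shapes.
from datetime import datetime

datetime.strptime("2023-01-15T10:30:00.123456Z", "%Y-%m-%dT%H:%M:%S.%fZ")
datetime.strptime("2023-01-15T10:30:00+02:00", "%Y-%m-%dT%H:%M:%S%z")
datetime.strptime("2023-01-15", "%Y-%m-%d")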
@@ -277,6 +284,47 @@ def extract_metadata(soup: BeautifulSoup) -> dict[str, Any]:
     return metadata


+def extract_url(soup: BeautifulSoup, selectors: str, base_url: str = "") -> str | None:
+    for selector in selectors.split(","):
+        next_link = soup.select_one(selector)
+        if not (next_link and isinstance(next_link, Tag)):
+            continue
+
+        if not (href := next_link.get("href")):
+            continue
+
+        return to_absolute_url(str(href), base_url)
+
+    return None
+
+
+def is_substack(soup: BeautifulSoup | Tag) -> bool:
+    return any(
+        "https://substackcdn.com" == a.attrs.get("href")  # type: ignore
+        for a in soup.find_all("link", {"rel": "preconnect"})
+        if hasattr(a, "attrs")  # type: ignore
+    )
+
+
+def is_wordpress(soup: BeautifulSoup | Tag) -> bool:
+    body_select = "body"
+    # Check if this is an archived page
+    if contents := soup.select_one("#CONTENT .html"):
+        body_select = "#CONTENT .html"
+        soup = contents
+    return bool(soup.select_one(f"{body_select} .wp-singular"))
+
+
+def is_bloomberg(soup: BeautifulSoup | Tag) -> bool:
+    body_select = "body"
+    # Check if this is an archived page
+    if contents := soup.select_one("#CONTENT .html"):
+        body_select = "#CONTENT .html"
+        soup = contents
+    urls = [a.attrs.get("href") for a in soup.select(f"{body_select} a")]  # type: ignore
+    return any(u.endswith("https://www.bloomberg.com/company/") for u in urls[:5] if u)  # type: ignore
+
+
 class BaseHTMLParser:
     """Base class for parsing HTML content from websites."""

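For context, the Substack check keys off the preconnect link to substackcdn.com. A small self-contained sketch of the same detection idea, written against bs4 directly rather than the project helper:

# Standalone sketch of the substackcdn.com preconnect check.
from bs4 import BeautifulSoup

html = '<html><head><link rel="preconnect" href="https://substackcdn.com"></head></html>'
soup = BeautifulSoup(html, "html.parser")
print(any(
    link.get("href") == "https://substackcdn.com"
    for link in soup.find_all("link", {"rel": "preconnect"})
))  # True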
@@ -1,176 +1,512 @@
+from unittest.mock import patch
+from urllib.parse import urlparse, parse_qs
+
 import pytest
-from unittest.mock import Mock, patch
-from bs4 import BeautifulSoup

 from memory.common.parsers.archives import (
-    ArchiveParser,
-    WordPressArchiveParser,
-    SubstackArchiveParser,
-    get_archive_parser,
+    ArchiveFetcher,
+    LinkFetcher,
+    HTMLArchiveFetcher,
+    SubstackArchiveFetcher,
+    ACOUPArchiveFetcher,
+    HTMLNextUrlArchiveFetcher,
+    html_parser,
+    get_archive_fetcher,
+    FETCHER_REGISTRY,
+)
+from memory.common.parsers.feeds import (
+    FeedItem,
+    FeedParser,
+    HTMLListParser,
+    DanluuParser,
+    SubstackAPIParser,
 )


-class TestArchiveParser:
-    def test_init(self):
-        parser = ArchiveParser(url="https://example.com")
-        assert parser.url == "https://example.com"
-        assert parser._visited_urls == set()
-        assert parser._all_items == []
-        assert parser.max_pages == 100
-        assert parser.delay_between_requests == 1.0
-
-    def test_extract_items_from_page(self):
-        html = """
-        <div>
-            <li><a href="/post1">Post 1</a></li>
-            <li><a href="/post2">Post 2</a></li>
-            <li><a href="/post1">Post 1</a></li> <!-- Duplicate -->
-        </div>
-        """
-        soup = BeautifulSoup(html, "html.parser")
-        parser = ArchiveParser(url="https://example.com")
-
-        items = parser._extract_items_from_page(soup)
-        assert len(items) == 2  # Duplicates should be filtered out
-
-    def test_find_next_page_url_with_selector(self):
-        html = '<div><a class="next" href="/page/2">Next</a></div>'
-        soup = BeautifulSoup(html, "html.parser")
-        parser = ArchiveParser(url="https://example.com")
-        parser.next_page_selector = ".next"
-
-        next_url = parser._find_next_page_url(soup, "https://example.com/page/1")
-        assert next_url == "https://example.com/page/2"
-
-    def test_find_next_page_url_heuristic(self):
-        html = '<div><a rel="next" href="/page/2">Next</a></div>'
-        soup = BeautifulSoup(html, "html.parser")
-        parser = ArchiveParser(url="https://example.com")
-
-        next_url = parser._find_next_page_url(soup, "https://example.com/page/1")
-        assert next_url == "https://example.com/page/2"
-
-    def test_find_next_page_url_contains_text(self):
-        html = '<div><a href="/page/2">Next →</a></div>'
-        soup = BeautifulSoup(html, "html.parser")
-        parser = ArchiveParser(url="https://example.com")
-
-        next_url = parser._find_next_page_heuristic(soup)
-        assert next_url == "https://example.com/page/2"
-
-    def test_find_next_numeric_page(self):
-        parser = ArchiveParser(url="https://example.com")
-        parser.page_url_pattern = "/page/{page}"
-
-        # Test with existing page number
-        next_url = parser._find_next_numeric_page("https://example.com/page/3")
-        assert next_url == "https://example.com/page/4"
-
-        # Test without page number (assume page 1)
-        next_url = parser._find_next_numeric_page("https://example.com/archive")
-        assert next_url == "https://example.com/archive/page/2"
-
-    @patch("memory.common.parsers.archives.fetch_html")
-    @patch("time.sleep")
-    def test_fetch_items_pagination(self, mock_sleep, mock_fetch):
-        # Mock HTML for two pages
-        page1_html = """
-        <div>
-            <li><a href="/post1">Post 1</a></li>
-            <li><a href="/post2">Post 2</a></li>
-            <a rel="next" href="/page/2">Next</a>
-        </div>
-        """
-        page2_html = """
-        <div>
-            <li><a href="/post3">Post 3</a></li>
-            <li><a href="/post4">Post 4</a></li>
-        </div>
-        """
-
-        mock_fetch.side_effect = [page1_html, page2_html]
-
-        parser = ArchiveParser(url="https://example.com/page/1")
-        parser.delay_between_requests = 0.1  # Speed up test
-
-        items = parser.fetch_items()
-
-        assert len(items) == 4
-        assert mock_fetch.call_count == 2
-        assert mock_sleep.call_count == 1  # One delay between requests
-
-    @patch("memory.common.parsers.archives.fetch_html")
-    def test_fetch_items_stops_at_max_pages(self, mock_fetch):
-        # Mock HTML that always has a next page
-        html_with_next = """
-        <div>
-            <li><a href="/post">Post</a></li>
-            <a rel="next" href="/page/999">Next</a>
-        </div>
-        """
-
-        mock_fetch.return_value = html_with_next
-
-        parser = ArchiveParser(url="https://example.com/page/1")
-        parser.max_pages = 3
-        parser.delay_between_requests = 0  # No delay for test
-
-        items = parser.fetch_items()
-
-        assert mock_fetch.call_count == 3  # Should stop at max_pages
-
-    @patch("memory.common.parsers.archives.fetch_html")
-    def test_fetch_items_handles_duplicate_urls(self, mock_fetch):
-        # Mock HTML that creates a cycle
-        page1_html = """
-        <div>
-            <li><a href="/post1">Post 1</a></li>
-            <a rel="next" href="/page/2">Next</a>
-        </div>
-        """
-        page2_html = """
-        <div>
-            <li><a href="/post2">Post 2</a></li>
-            <a rel="next" href="/page/1">Back to page 1</a>
-        </div>
-        """
-
-        mock_fetch.side_effect = [page1_html, page2_html]
-
-        parser = ArchiveParser(url="https://example.com/page/1")
-        parser.delay_between_requests = 0
-
-        items = parser.fetch_items()
-
-        assert len(items) == 2
-        assert mock_fetch.call_count == 2  # Should stop when it hits visited URL
-
-
-class TestWordPressArchiveParser:
-    def test_selectors(self):
-        parser = WordPressArchiveParser(url="https://example.wordpress.com")
-        assert parser.item_selector == "article, .post"
-        assert parser.next_page_selector == '.nav-previous a, .next a, a[rel="next"]'
-        assert parser.title_selector == ".entry-title a, h1 a, h2 a"
-
-
-class TestSubstackArchiveParser:
-    def test_selectors(self):
-        parser = SubstackArchiveParser(url="https://example.substack.com")
-        assert parser.item_selector == ".post-preview, .post"
-        assert parser.next_page_selector == ".pagination .next"
-
-
-class TestGetArchiveParser:
-    @pytest.mark.parametrize(
-        "url,expected_class",
-        [
-            ("https://example.wordpress.com/archive", WordPressArchiveParser),
-            ("https://example.substack.com/archive", SubstackArchiveParser),
-            ("https://example.com/archive", ArchiveParser),  # Default
-        ],
-    )
-    def test_get_archive_parser(self, url, expected_class):
-        parser = get_archive_parser(url)
-        assert isinstance(parser, expected_class)
-        assert parser.url == url
+class MockParser(FeedParser):
+    def __init__(
+        self, url: str, items: list[FeedItem] | None = None, content: str = ""
+    ):
+        super().__init__(url)
+        self.items = items or []
+        self.content = content
+
+    def parse_feed(self):
+        return self.items
+
+
+def test_archive_fetcher_make_parser():
+    fetcher = ArchiveFetcher(
+        parser_class=MockParser,
+        start_url="https://example.com",
+        parser_kwargs={"custom_attr": "value"},
+    )
+
+    parser = fetcher.make_parser("https://example.com/page1")
+
+    assert isinstance(parser, MockParser)
+    assert parser.url == "https://example.com/page1"
+    assert getattr(parser, "custom_attr") == "value"
+
+
+def test_archive_fetcher_find_next_page_base():
+    fetcher = ArchiveFetcher(MockParser, "https://example.com")
+    parser = MockParser("https://example.com")
+
+    assert fetcher._find_next_page(parser, 0) is None
+
+
+@patch("memory.common.parsers.archives.time.sleep")
+def test_archive_fetcher_fetch_all_items_single_page(mock_sleep):
+    items = [
+        FeedItem(title="Item 1", url="https://example.com/1"),
+        FeedItem(title="Item 2", url="https://example.com/2"),
+    ]
+
+    fetcher = ArchiveFetcher(
+        parser_class=MockParser,
+        start_url="https://example.com",
+        delay_between_requests=0.5,
+    )
+
+    with patch.object(fetcher, "make_parser") as mock_make_parser:
+        mock_parser = MockParser("https://example.com", items)
+        mock_make_parser.return_value = mock_parser
+
+        result = list(fetcher.fetch_all_items())
+
+        assert result == items
+        mock_make_parser.assert_called_once_with("https://example.com")
+        mock_sleep.assert_not_called()  # No delay for single page
+
+
+@patch("memory.common.parsers.archives.time.sleep")
+def test_archive_fetcher_fetch_all_items_multiple_pages(mock_sleep):
+    page1_items = [FeedItem(title="Item 1", url="https://example.com/1")]
+    page2_items = [FeedItem(title="Item 2", url="https://example.com/2")]
+
+    class TestFetcher(ArchiveFetcher):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self.call_count = 0
+
+        def _find_next_page(self, parser, current_page=0):
+            self.call_count += 1
+            if self.call_count == 1:
+                return "https://example.com/page2"
+            return None
+
+    fetcher = TestFetcher(
+        parser_class=MockParser,
+        start_url="https://example.com",
+        delay_between_requests=0.1,
+    )
+
+    with patch.object(fetcher, "make_parser") as mock_make_parser:
+        mock_make_parser.side_effect = [
+            MockParser("https://example.com", page1_items),
+            MockParser("https://example.com/page2", page2_items),
+        ]
+
+        result = list(fetcher.fetch_all_items())
+
+        assert result == page1_items + page2_items
+        assert mock_make_parser.call_count == 2
+        mock_sleep.assert_called_once_with(0.1)
+
+
+def test_archive_fetcher_fetch_all_items_max_pages():
+    class TestFetcher(ArchiveFetcher):
+        def _find_next_page(self, parser, current_page=0):
+            return f"https://example.com/page{current_page + 2}"
+
+    fetcher = TestFetcher(
+        parser_class=MockParser,
+        start_url="https://example.com",
+        max_pages=2,
+        delay_between_requests=0,
+    )
+
+    items = [FeedItem(title="Item", url="https://example.com/item")]
+
+    with patch.object(fetcher, "make_parser") as mock_make_parser:
+        mock_make_parser.return_value = MockParser("https://example.com", items)
+
+        result = list(fetcher.fetch_all_items())
+
+        assert len(result) == 2  # 2 pages * 1 item per page
+        assert mock_make_parser.call_count == 2
+
+
+def test_archive_fetcher_fetch_all_items_visited_url():
+    class TestFetcher(ArchiveFetcher):
+        def _find_next_page(self, parser, current_page=0):
+            return "https://example.com"  # Return same URL to trigger visited check
+
+    fetcher = TestFetcher(MockParser, "https://example.com", delay_between_requests=0)
+    items = [FeedItem(title="Item", url="https://example.com/item")]
+
+    with patch.object(fetcher, "make_parser") as mock_make_parser:
+        mock_make_parser.return_value = MockParser("https://example.com", items)
+
+        result = list(fetcher.fetch_all_items())
+
+        assert len(result) == 1  # Only first page processed
+        mock_make_parser.assert_called_once()
+
+
+def test_archive_fetcher_fetch_all_items_no_items():
+    fetcher = ArchiveFetcher(
+        MockParser, "https://example.com", delay_between_requests=0
+    )
+
+    with patch.object(fetcher, "make_parser") as mock_make_parser:
+        mock_make_parser.return_value = MockParser("https://example.com", [])
+
+        result = list(fetcher.fetch_all_items())
+
+        assert result == []
+        mock_make_parser.assert_called_once()
+
+
+def test_archive_fetcher_fetch_all_items_exception():
+    fetcher = ArchiveFetcher(
+        MockParser, "https://example.com", delay_between_requests=0
+    )
+
+    with patch.object(fetcher, "make_parser") as mock_make_parser:
+        mock_make_parser.side_effect = Exception("Network error")
+
+        result = list(fetcher.fetch_all_items())
+
+        assert result == []
+
+
+@pytest.mark.parametrize(
+    "start_url, per_page, current_page, expected_params",
+    [
+        ("https://example.com", 10, 0, {"offset": ["10"], "limit": ["10"]}),
+        (
+            "https://example.com?existing=value",
+            20,
+            1,
+            {"existing": ["value"], "offset": ["40"], "limit": ["20"]},
+        ),
+        (
+            "https://example.com?offset=0&limit=5",
+            15,
+            2,
+            {"offset": ["45"], "limit": ["15"]},
+        ),
+    ],
+)
+def test_link_fetcher_find_next_page(
+    start_url, per_page, current_page, expected_params
+):
+    fetcher = LinkFetcher(MockParser, start_url, per_page=per_page)
+    parser = MockParser(start_url)
+
+    next_url = fetcher._find_next_page(parser, current_page)
+
+    assert next_url is not None
+    parsed = urlparse(next_url)
+    params = parse_qs(parsed.query)
+
+    for key, value in expected_params.items():
+        assert params[key] == value
+
+
+@pytest.mark.parametrize(
+    "html, selectors, expected_url",
+    [
+        (
+            '<a rel="next" href="/page2">Next</a>',
+            ['a[rel="next"]'],
+            "https://example.com/page2",
+        ),
+        (
+            '<div class="next"><a href="/page2">Next</a></div>',
+            [".next a"],
+            "https://example.com/page2",
+        ),
+        (
+            '<a class="next" href="/page2">Next</a>',
+            ["a.next"],
+            "https://example.com/page2",
+        ),
+        (
+            '<div class="pagination"><span class="next"><a href="/page2">Next</a></span></div>',
+            [".pagination .next"],
+            None,  # This won't match because it's looking for .pagination .next directly
+        ),
+        (
+            '<div class="pagination next"><a href="/page2">Next</a></div>',
+            [".pagination.next"],
+            None,  # This selector isn't in default list
+        ),
+        (
+            '<nav class="page"><a href="/page1">1</a><a href="/page2">2</a></nav>',
+            ["nav.page a:last-of-type"],
+            "https://example.com/page2",
+        ),
+        ("<div>No next link</div>", ['a[rel="next"]'], None),
+    ],
+)
+def test_html_archive_fetcher_find_next_page(html, selectors, expected_url):
+    fetcher = HTMLArchiveFetcher(
+        MockParser, "https://example.com", next_page_selectors=selectors
+    )
+    parser = MockParser("https://example.com", content=html)
+
+    with patch("memory.common.parsers.archives.extract_url") as mock_extract:
+        mock_extract.return_value = expected_url
+
+        result = fetcher._find_next_page(parser)
+
+        if expected_url:
+            mock_extract.assert_called_once()
+            assert result == expected_url
+        else:
+            # extract_url might still be called but return None
+            assert result is None
+
+
+def test_html_archive_fetcher_find_next_page_no_content():
+    fetcher = HTMLArchiveFetcher(MockParser, "https://example.com")
+    parser = MockParser("https://example.com", content="")
+
+    result = fetcher._find_next_page(parser)
+
+    assert result is None
+
+
+def test_html_parser_factory():
+    CustomParser = html_parser(
+        item_selector="article", title_selector="h1", custom_attr="value"
+    )
+
+    parser = CustomParser("https://example.com")
+
+    assert isinstance(parser, HTMLListParser)
+    assert parser.item_selector == "article"
+    assert parser.title_selector == "h1"
+    assert getattr(parser, "custom_attr") == "value"
+
+
+@pytest.mark.parametrize(
+    "start_url, expected_api_url",
+    [
+        ("https://example.substack.com", "https://example.substack.com/api/v1/archive"),
+        (
+            "https://example.substack.com/posts",
+            "https://example.substack.com/api/v1/archive",
+        ),
+        (
+            "https://example.substack.com/api/v1/archive",
+            "https://example.substack.com/api/v1/archive",
+        ),
+    ],
+)
+def test_substack_archive_fetcher_post_init(start_url, expected_api_url):
+    with patch("memory.common.parsers.archives.get_base_url") as mock_get_base:
+        mock_get_base.return_value = "https://example.substack.com"
+
+        fetcher = SubstackArchiveFetcher(SubstackAPIParser, start_url)
+
+    assert fetcher.start_url == expected_api_url
+
+
+def test_acoup_archive_fetcher_find_next_page():
+    html = """
+    <div class="widget_archive">
+        <a href="https://acoup.blog/2019/04/">April 2019</a>
+        <a href="https://acoup.blog/2019/05/">May 2019</a>
+        <a href="https://acoup.blog/2019/06/">June 2019</a>
+    </div>
+    """
+
+    fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
+    parser = MockParser("https://acoup.blog/2019/05/", content=html)
+
+    result = fetcher._find_next_page(parser)
+
+    assert result == "https://acoup.blog/2019/04/"
+
+
+def test_acoup_archive_fetcher_find_next_page_no_match():
+    html = """
+    <div class="widget_archive">
+        <a href="https://acoup.blog/2019/04/">April 2019</a>
+        <a href="https://acoup.blog/2019/06/">June 2019</a>
+    </div>
+    """
+
+    fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
+    parser = MockParser("https://acoup.blog/2019/05/", content=html)
+
+    result = fetcher._find_next_page(parser)
+
+    assert result is None
+
+
+def test_acoup_archive_fetcher_find_next_page_no_content():
+    fetcher = ACOUPArchiveFetcher(MockParser, "https://acoup.blog/2019/05/")
+    parser = MockParser("https://acoup.blog/2019/05/", content="")
+
+    result = fetcher._find_next_page(parser)
+
+    assert result is None
+
+
+@pytest.mark.parametrize(
+    "start_url, next_url, expected_next_url",
+    [
+        (
+            "https://example.com",
+            "",
+            "https://example.com",
+        ),  # Empty next_url defaults to start_url
+        (
+            "https://example.com",
+            "https://other.com/archive",
+            "https://other.com/archive",  # Full URL is preserved
+        ),
+        (
+            "https://example.com",
+            "/archive",
+            "/archive",
+        ),  # Absolute path is preserved
+        (
+            "https://example.com",
+            "archive",
+            "https://example.com/archive",
+        ),  # Relative path gets prepended
+    ],
+)
+def test_html_next_url_archive_fetcher_post_init(
+    start_url, next_url, expected_next_url
+):
+    fetcher = HTMLNextUrlArchiveFetcher(MockParser, start_url, next_url=next_url)
+
+    assert fetcher.next_url == expected_next_url
+
+
+def test_html_next_url_archive_fetcher_find_next_page():
+    fetcher = HTMLNextUrlArchiveFetcher(
+        MockParser, "https://example.com", next_url="https://example.com/archive"
+    )
+    parser = MockParser("https://example.com")
+
+    result = fetcher._find_next_page(parser, 2)
+
+    assert result == "https://example.com/archive/3"
+
+
+@pytest.mark.parametrize(
+    "url, expected_fetcher_type",
+    [
+        ("https://danluu.com", HTMLArchiveFetcher),
+        ("https://www.rifters.com", HTMLArchiveFetcher),
+        ("https://putanumonit.com", HTMLArchiveFetcher),
+        ("https://acoup.blog", ACOUPArchiveFetcher),
+        ("https://unknown.com", None),
+    ],
+)
+def test_get_archive_fetcher_registry_matches(url, expected_fetcher_type):
+    with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
+        mock_fetch.return_value = "<html><body>Not substack</body></html>"
+
+        with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
+            mock_is_substack.return_value = False
+
+            fetcher = get_archive_fetcher(url)
+
+            if expected_fetcher_type:
+                assert isinstance(fetcher, expected_fetcher_type)
+            else:
+                assert fetcher is None
+
+
+def test_get_archive_fetcher_tuple_registry():
+    url = "https://putanumonit.com"
+
+    with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
+        mock_fetch.return_value = "<html><body>Not substack</body></html>"
+
+        fetcher = get_archive_fetcher(url)
+
+        assert isinstance(fetcher, HTMLArchiveFetcher)
+        assert fetcher.start_url == "https://putanumonit.com/full-archive"
+
+
+def test_get_archive_fetcher_direct_parser_registry():
+    url = "https://danluu.com"
+
+    with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
+        mock_fetch.return_value = "<html><body>Not substack</body></html>"
+
+        fetcher = get_archive_fetcher(url)
+
+        assert isinstance(fetcher, HTMLArchiveFetcher)
+        assert fetcher.parser_class == DanluuParser
+        assert fetcher.start_url == url
+
+
+def test_get_archive_fetcher_substack():
+    url = "https://example.substack.com"
+
+    with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
+        mock_fetch.return_value = "<html><body>Substack content</body></html>"
+
+        with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
+            mock_is_substack.return_value = True
+
+            fetcher = get_archive_fetcher(url)
+
+            assert isinstance(fetcher, SubstackArchiveFetcher)
+            assert fetcher.parser_class == SubstackAPIParser
+
+
+def test_get_archive_fetcher_no_match():
+    url = "https://unknown.com"
+
+    with patch("memory.common.parsers.archives.fetch_html") as mock_fetch:
+        mock_fetch.return_value = "<html><body>Regular website</body></html>"
+
+        with patch("memory.common.parsers.archives.is_substack") as mock_is_substack:
+            mock_is_substack.return_value = False
+
+            fetcher = get_archive_fetcher(url)
+
+            assert fetcher is None
+
+
+def test_fetcher_registry_structure():
+    """Test that FETCHER_REGISTRY has expected structure."""
+    assert isinstance(FETCHER_REGISTRY, dict)
+
+    for pattern, fetcher in FETCHER_REGISTRY.items():
+        assert isinstance(pattern, str)
+        assert (
+            isinstance(fetcher, type)
+            and issubclass(fetcher, FeedParser)
+            or isinstance(fetcher, tuple)
+            or isinstance(fetcher, ArchiveFetcher)
+        )
+
+
+@pytest.mark.parametrize(
+    "pattern, test_url, should_match",
+    [
+        (r"https://danluu.com", "https://danluu.com", True),
+        (r"https://danluu.com", "https://danluu.com/", True),
+        (r"https://danluu.com", "https://other.com", False),
+        (r"https://www.rifters.com", "https://www.rifters.com/crawl", True),
+        (r"https://putanumonit.com", "https://putanumonit.com/archive", True),
+    ],
+)
+def test_registry_pattern_matching(pattern, test_url, should_match):
+    import re
+
+    match = re.search(pattern, test_url.rstrip("/"))
+    assert bool(match) == should_match
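Assuming a standard checkout with pytest installed, a focused run of the new archive-fetcher tests would look roughly like: pytest -k archive_fetcher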
@ -1,10 +1,10 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
from typing import Any, cast
|
from typing import cast
|
||||||
|
import json
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
import requests
|
|
||||||
|
|
||||||
from memory.common.parsers.feeds import (
|
from memory.common.parsers.feeds import (
|
||||||
FeedItem,
|
FeedItem,
|
||||||
@ -17,15 +17,160 @@ from memory.common.parsers.feeds import (
|
|||||||
NadiaXyzParser,
|
NadiaXyzParser,
|
||||||
RedHandFilesParser,
|
RedHandFilesParser,
|
||||||
BloombergAuthorParser,
|
BloombergAuthorParser,
|
||||||
|
JSONParser,
|
||||||
|
SubstackAPIParser,
|
||||||
|
select_in,
|
||||||
|
clean_url,
|
||||||
is_rss_feed,
|
is_rss_feed,
|
||||||
extract_url,
|
|
||||||
find_feed_link,
|
find_feed_link,
|
||||||
get_feed_parser,
|
get_feed_parser,
|
||||||
DEFAULT_SKIP_PATTERNS,
|
|
||||||
PARSER_REGISTRY,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"data, path, expected",
|
||||||
|
[
|
||||||
|
# Basic dictionary access
|
||||||
|
({"key": "value"}, ["key"], "value"),
|
||||||
|
({"nested": {"key": "value"}}, ["nested", "key"], "value"),
|
||||||
|
# List access
|
||||||
|
(["a", "b", "c"], [1], "b"),
|
||||||
|
([{"key": "value"}], [0, "key"], "value"),
|
||||||
|
# Mixed access
|
||||||
|
(
|
||||||
|
{"items": [{"name": "first"}, {"name": "second"}]},
|
||||||
|
["items", 1, "name"],
|
||||||
|
"second",
|
||||||
|
),
|
||||||
|
# Empty path returns original data
|
||||||
|
({"key": "value"}, [], {"key": "value"}),
|
||||||
|
# Missing keys return None
|
||||||
|
({"key": "value"}, ["missing"], None),
|
||||||
|
({"nested": {}}, ["nested", "missing"], None),
|
||||||
|
# Index out of bounds returns None
|
||||||
|
(["a", "b"], [5], None),
|
||||||
|
# Type errors return None
|
||||||
|
("string", ["key"], None),
|
||||||
|
(123, [0], None),
|
||||||
|
(None, ["key"], None),
|
||||||
|
# Deep nesting
|
||||||
|
({"a": {"b": {"c": {"d": "deep"}}}}, ["a", "b", "c", "d"], "deep"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_select_in(data, path, expected):
|
||||||
|
assert select_in(data, path) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@patch("memory.common.parsers.feeds.fetch_html")
|
||||||
|
def test_json_parser_fetch_items_with_content(mock_fetch_html):
|
||||||
|
content = json.dumps(
|
||||||
|
[
|
||||||
|
{"title": "Article 1", "url": "https://example.com/1"},
|
||||||
|
{"title": "Article 2", "url": "https://example.com/2"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
parser = JSONParser(url="https://example.com/feed.json", content=content)
|
||||||
|
items = parser.fetch_items()
|
||||||
|
|
||||||
|
assert items == [
|
||||||
|
{"title": "Article 1", "url": "https://example.com/1"},
|
||||||
|
{"title": "Article 2", "url": "https://example.com/2"},
|
||||||
|
]
|
||||||
|
mock_fetch_html.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
@patch("memory.common.parsers.feeds.fetch_html")
|
||||||
|
def test_json_parser_fetch_items_without_content(mock_fetch_html):
|
||||||
|
content = json.dumps([{"title": "Article", "url": "https://example.com/1"}])
|
||||||
|
mock_fetch_html.return_value = content
|
||||||
|
|
||||||
|
parser = JSONParser(url="https://example.com/feed.json")
|
||||||
|
items = parser.fetch_items()
|
||||||
|
|
||||||
|
assert items == [{"title": "Article", "url": "https://example.com/1"}]
|
||||||
|
mock_fetch_html.assert_called_once_with("https://example.com/feed.json")
|
||||||
|
|
||||||
|
|
||||||
|
@patch("memory.common.parsers.feeds.fetch_html")
|
||||||
|
def test_json_parser_fetch_items_invalid_json(mock_fetch_html):
|
||||||
|
mock_fetch_html.return_value = "invalid json content"
|
||||||
|
|
||||||
|
parser = JSONParser(url="https://example.com/feed.json")
|
||||||
|
items = parser.fetch_items()
|
||||||
|
|
||||||
|
assert items == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_parser_extract_methods():
|
||||||
|
parser = JSONParser(url="https://example.com")
|
||||||
|
|
||||||
|
entry = {
|
||||||
|
"title": "Test Title",
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"description": "Test description",
|
||||||
|
"date": "2023-01-15",
|
||||||
|
"author": "John Doe",
|
||||||
|
"guid": "unique-123",
|
||||||
|
"metadata": {"tags": ["tech", "news"]},
|
||||||
|
}
|
||||||
|
|
||||||
|
assert parser.extract_title(entry) == "Test Title"
|
||||||
|
assert parser.extract_url(entry) == "https://example.com/article"
|
||||||
|
assert parser.extract_description(entry) == "Test description"
|
||||||
|
assert parser.extract_date(entry) == "2023-01-15"
|
||||||
|
assert parser.extract_author(entry) == "John Doe"
|
||||||
|
assert parser.extract_guid(entry) == "unique-123"
|
||||||
|
assert parser.extract_metadata(entry) == {"tags": ["tech", "news"]}
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_parser_custom_paths():
|
||||||
|
parser = JSONParser(url="https://example.com")
|
||||||
|
parser.title_path = ["content", "headline"]
|
||||||
|
parser.url_path = ["links", "canonical"]
|
||||||
|
parser.author_path = ["byline", "name"]
|
||||||
|
|
||||||
|
entry = {
|
||||||
|
"content": {"headline": "Custom Title"},
|
||||||
|
"links": {"canonical": "https://example.com/custom"},
|
||||||
|
"byline": {"name": "Jane Smith"},
|
||||||
|
}
|
||||||
|
|
||||||
|
assert parser.extract_title(entry) == "Custom Title"
|
||||||
|
assert parser.extract_url(entry) == "https://example.com/custom"
|
||||||
|
assert parser.extract_author(entry) == "Jane Smith"
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_parser_missing_fields():
|
||||||
|
parser = JSONParser(url="https://example.com")
|
||||||
|
|
||||||
|
entry = {} # Empty entry
|
||||||
|
|
||||||
|
assert parser.extract_title(entry) is None
|
||||||
|
assert parser.extract_url(entry) is None
|
||||||
|
assert parser.extract_description(entry) is None
|
||||||
|
assert parser.extract_date(entry) is None
|
||||||
|
assert parser.extract_author(entry) is None
|
||||||
|
assert parser.extract_guid(entry) is None
|
||||||
|
assert parser.extract_metadata(entry) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_parser_nested_paths():
|
||||||
|
parser = JSONParser(url="https://example.com")
|
||||||
|
parser.title_path = ["article", "header", "title"]
|
||||||
|
parser.author_path = ["article", "byline", 0, "name"]
|
||||||
|
|
||||||
|
entry = {
|
||||||
|
"article": {
|
||||||
|
"header": {"title": "Nested Title"},
|
||||||
|
"byline": [{"name": "First Author"}, {"name": "Second Author"}],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert parser.extract_title(entry) == "Nested Title"
|
||||||
|
assert parser.extract_author(entry) == "First Author"
|
||||||
|
|
||||||
|
|
||||||
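Aside (editorial, not part of the commit): the nested-path tests above imply that JSONParser extractors walk a list of keys and list indices into the decoded JSON and fall back to None on any miss. A minimal sketch of that lookup, using a hypothetical helper name get_path; the actual implementation in feeds.py may differ.

from typing import Any

def get_path(entry: dict, path: list) -> Any:
    # Walk keys and list indices into nested JSON; return None if any step is missing.
    current: Any = entry
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return None
    return current

# get_path({"article": {"byline": [{"name": "First Author"}]}}, ["article", "byline", 0, "name"])
# -> "First Author"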
def test_feed_parser_base_url():
    parser = FeedParser(url="https://example.com/path/to/feed")
    assert parser.base_url == "https://example.com"


@ -582,7 +727,7 @@ def test_extract_url_function(html, expected):
    element = soup.find("a")
    assert element is not None

-    url = extract_url(cast(Tag, element), "https://example.com")
+    url = clean_url(cast(Tag, element), "https://example.com")
    assert url == expected
@ -706,33 +851,30 @@ def test_get_feed_parser_with_check_from():
    assert parser.since == check_from


-def test_parser_registry_completeness():
-    """Ensure PARSER_REGISTRY contains expected parsers."""
-    expected_patterns = [
-        r"https://danluu.com",
-        r"https://guzey.com/archive",
-        r"https://www.paulgraham.com/articles",
-        r"https://nadia.xyz/posts",
-        r"https://www.theredhandfiles.com",
-        r"https://archive.ph/.*?/https://www.bloomberg.com/opinion/authors/",
-    ]
-
-    assert len(PARSER_REGISTRY) == len(expected_patterns)
-    for pattern in expected_patterns:
-        assert pattern in PARSER_REGISTRY
-
-
-def test_default_skip_patterns():
-    """Ensure DEFAULT_SKIP_PATTERNS contains expected patterns."""
-    expected_patterns = [
-        r"^#",
-        r"mailto:",
-        r"tel:",
-        r"javascript:",
-        r"\.pdf$",
-        r"\.jpg$",
-        r"\.png$",
-        r"\.gif$",
-    ]
-
-    assert DEFAULT_SKIP_PATTERNS == expected_patterns
+def test_substack_api_parser():
+    parser = SubstackAPIParser(url="https://example.substack.com/api/v1/posts")
+
+    entry = {
+        "title": "Substack Post",
+        "canonical_url": "https://example.substack.com/p/post-slug",
+        "publishedBylines": [{"name": "Author Name"}],
+        "post_date": "2023-01-15T10:30:00Z",
+    }
+
+    assert parser.extract_title(entry) == "Substack Post"
+    assert parser.extract_url(entry) == "https://example.substack.com/p/post-slug"
+    assert parser.extract_author(entry) == "Author Name"
+    assert parser.extract_date(entry) == "2023-01-15T10:30:00Z"
+
+
+def test_substack_api_parser_missing_bylines():
+    parser = SubstackAPIParser(url="https://example.substack.com/api/v1/posts")
+
+    entry = {
+        "title": "Post Without Author",
+        "canonical_url": "https://example.substack.com/p/post",
+        "publishedBylines": [],
+        "post_date": "2023-01-15T10:30:00Z",
+    }
+
+    assert parser.extract_author(entry) is None
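Aside (editorial, not part of the commit): the two tests above pin down how SubstackAPIParser is expected to read entries from the Substack posts API. A minimal sketch of that field mapping, assuming the parser simply indexes into the JSON entry; the function name is hypothetical and the real parser may differ.

def extract_substack_fields(entry: dict) -> dict:
    # Map a Substack API post object onto generic feed-item fields.
    bylines = entry.get("publishedBylines") or []
    return {
        "title": entry.get("title"),
        "url": entry.get("canonical_url"),
        "author": bylines[0].get("name") if bylines else None,
        "date": entry.get("post_date"),
    }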
@ -23,7 +23,11 @@ from memory.common.parsers.html import (
    extract_meta_by_pattern,
    extract_metadata,
    extract_title,
+    extract_url,
    get_base_url,
+    is_bloomberg,
+    is_substack,
+    is_wordpress,
    parse_date,
    process_image,
    process_images,
@ -454,7 +458,7 @@ def test_process_images_empty():
        None, "https://example.com", pathlib.Path("/tmp")
    )
    assert result_content is None
-    assert result_images == []
+    assert result_images == {}


@patch("memory.common.parsers.html.process_image")
@ -503,6 +507,191 @@ def test_process_images_no_filename(mock_process_image):
    assert not images


@pytest.mark.parametrize(
    "html, selectors, base_url, expected",
    [
        # Basic URL extraction
        (
            '<a href="/next-page">Next</a>',
            "a",
            "https://example.com",
            "https://example.com/next-page",
        ),
        # Multiple selectors - should pick first matching
        (
            '<div><a href="/first">First</a><a href="/second">Second</a></div>',
            "a",
            "https://example.com",
            "https://example.com/first",
        ),
        # Multiple selectors with comma separation - span doesn't have href, so falls back to a
        (
            '<div><span class="next">Span</span><a href="/link">Link</a></div>',
            ".next, a",
            "https://example.com",
            "https://example.com/link",
        ),
        # Absolute URL should remain unchanged
        (
            '<a href="https://other.com/page">External</a>',
            "a",
            "https://example.com",
            "https://other.com/page",
        ),
        # No href attribute
        ("<a>No href</a>", "a", "https://example.com", None),
        # No matching element
        ("<p>No links</p>", "a", "https://example.com", None),
        # Empty href
        ('<a href="">Empty</a>', "a", "https://example.com", None),
    ],
)
def test_extract_url(html, selectors, base_url, expected):
    soup = BeautifulSoup(html, "html.parser")
    assert extract_url(soup, selectors, base_url) == expected
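Aside (editorial, not part of the commit): the parametrized cases above describe extract_url as "first selector match with a non-empty href, resolved against the base URL". A minimal sketch consistent with those cases; the name and structure are my own and the actual html.extract_url may be implemented differently.

from urllib.parse import urljoin

from bs4 import BeautifulSoup

def extract_url_sketch(soup: BeautifulSoup, selectors: str, base_url: str) -> str | None:
    # Return the first non-empty href matched by the CSS selectors, resolved against base_url.
    for element in soup.select(selectors):
        href = element.get("href")
        if href:
            return urljoin(base_url, str(href))
    return None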
@pytest.mark.parametrize(
    "html, expected",
    [
        # Substack with preconnect link
        (
            """
            <head>
                <link rel="preconnect" href="https://substackcdn.com">
            </head>
            """,
            True,
        ),
        # Multiple preconnect links, one is Substack
        (
            """
            <head>
                <link rel="preconnect" href="https://fonts.googleapis.com">
                <link rel="preconnect" href="https://substackcdn.com">
            </head>
            """,
            True,
        ),
        # No Substack preconnect
        (
            """
            <head>
                <link rel="preconnect" href="https://fonts.googleapis.com">
            </head>
            """,
            False,
        ),
        # No preconnect links at all
        ("<head></head>", False),
        # Preconnect without href
        ('<head><link rel="preconnect"></head>', False),
        # Different rel attribute
        ('<head><link rel="stylesheet" href="https://substackcdn.com"></head>', False),
    ],
)
def test_is_substack(html, expected):
    soup = BeautifulSoup(html, "html.parser")
    assert is_substack(soup) == expected
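Aside (editorial, not part of the commit): per the cases above, is_substack keys off a preconnect link to the Substack CDN. A minimal sketch consistent with those expectations, offered as an assumption rather than the repository's actual implementation.

from bs4 import BeautifulSoup

def is_substack_sketch(soup: BeautifulSoup) -> bool:
    # Treat a page as Substack if any <link rel="preconnect"> points at substackcdn.com.
    for link in soup.find_all("link", rel="preconnect"):
        if "substackcdn.com" in (link.get("href") or ""):
            return True
    return False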
@pytest.mark.parametrize(
    "html, expected",
    [
        # WordPress with wp-singular class on body should be False (looks for content inside body)
        ('<body class="wp-singular">Content</body>', False),
        # WordPress with nested wp-singular
        ('<body><div class="wp-singular">Content</div></body>', True),
        # Archived page with WordPress content
        (
            """
            <div id="CONTENT">
                <div class="html">
                    <body class="wp-singular">Content</body>
                </div>
            </div>
            """,
            True,
        ),
        # No WordPress indicators
        ('<body><div class="content">Regular content</div></body>', False),
        # Empty body
        ("<body></body>", False),
        # No body tag
        ("<div>No body</div>", False),
    ],
)
def test_is_wordpress(html, expected):
    soup = BeautifulSoup(html, "html.parser")
    assert is_wordpress(soup) == expected
@pytest.mark.parametrize(
    "html, expected",
    [
        # Bloomberg with company link
        (
            """
            <body>
                <a href="https://www.bloomberg.com/company/">Bloomberg</a>
            </body>
            """,
            True,
        ),
        # Bloomberg link among other links
        (
            """
            <body>
                <a href="https://example.com">Example</a>
                <a href="https://www.bloomberg.com/company/">Bloomberg</a>
                <a href="https://other.com">Other</a>
            </body>
            """,
            True,
        ),
        # Archived page with Bloomberg content
        (
            """
            <div id="CONTENT">
                <div class="html">
                    <body>
                        <a href="https://www.bloomberg.com/company/">Bloomberg</a>
                    </body>
                </div>
            </div>
            """,
            True,
        ),
        # No Bloomberg links
        (
            """
            <body>
                <a href="https://example.com">Example</a>
                <a href="https://other.com">Other</a>
            </body>
            """,
            False,
        ),
        # Bloomberg link but not company page
        (
            """
            <body>
                <a href="https://www.bloomberg.com/news/">Bloomberg News</a>
            </body>
            """,
            False,
        ),
        # No links at all
        ("<body><p>No links</p></body>", False),
        # Links without href
        ("<body><a>No href</a></body>", False),
    ],
)
def test_is_bloomberg(html, expected):
    soup = BeautifulSoup(html, "html.parser")
    assert is_bloomberg(soup) == expected
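Aside (editorial, not part of the commit): the is_bloomberg cases above hinge on an anchor pointing at the Bloomberg company page (bloomberg.com/company), including inside an archive.ph-style wrapper. A minimal sketch consistent with that behaviour; this is my own assumption about the implementation.

from bs4 import BeautifulSoup

def is_bloomberg_sketch(soup: BeautifulSoup) -> bool:
    # Look for any anchor whose href points at the Bloomberg company page.
    return any(
        "bloomberg.com/company" in anchor["href"]
        for anchor in soup.find_all("a", href=True)
    )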
class TestBaseHTMLParser:
    def test_init_with_base_url(self):
        parser = BaseHTMLParser("https://example.com/path")

@ -584,7 +773,7 @@ class TestBaseHTMLParser:
    def test_parse_with_images(self, mock_process_images):
        # Mock the image processing to return test data
        mock_image = MagicMock(spec=PILImage.Image)
-        mock_process_images.return_value = (MagicMock(), [mock_image])
+        mock_process_images.return_value = (MagicMock(), {"test_image.jpg": mock_image})

        html = """
        <article>

@ -600,5 +789,6 @@ class TestBaseHTMLParser:
        article = parser.parse(html, "https://example.com/article")

        assert len(article.images) == 1
-        assert article.images[0] == mock_image
+        assert "test_image.jpg" in article.images
+        assert article.images["test_image.jpg"] == mock_image
        mock_process_images.assert_called_once()