From b68e15d3ab712c3a32ffb919b9db0b4dbaa8f8c1 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Sat, 9 Aug 2025 02:05:41 +0200 Subject: [PATCH] add blogs --- src/memory/common/settings.py | 1 + src/memory/parsers/archives.py | 61 ++++++++++--------- src/memory/parsers/blogs.py | 3 + src/memory/parsers/lesswrong.py | 4 ++ src/memory/workers/ingest.py | 5 ++ src/memory/workers/tasks/forums.py | 25 +++++++- .../memory/workers/tasks/test_forums_tasks.py | 47 +++++++------- 7 files changed, 93 insertions(+), 53 deletions(-) diff --git a/src/memory/common/settings.py b/src/memory/common/settings.py index 53c343e..80c5a75 100644 --- a/src/memory/common/settings.py +++ b/src/memory/common/settings.py @@ -106,6 +106,7 @@ ARTICLE_FEED_SYNC_INTERVAL = int(os.getenv("ARTICLE_FEED_SYNC_INTERVAL", 30 * 60 CLEAN_COLLECTION_INTERVAL = int(os.getenv("CLEAN_COLLECTION_INTERVAL", 24 * 60 * 60)) CHUNK_REINGEST_INTERVAL = int(os.getenv("CHUNK_REINGEST_INTERVAL", 60 * 60)) NOTES_SYNC_INTERVAL = int(os.getenv("NOTES_SYNC_INTERVAL", 15 * 60)) +LESSWRONG_SYNC_INTERVAL = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24)) CHUNK_REINGEST_SINCE_MINUTES = int(os.getenv("CHUNK_REINGEST_SINCE_MINUTES", 60 * 24)) diff --git a/src/memory/parsers/archives.py b/src/memory/parsers/archives.py index fa54874..7a72cac 100644 --- a/src/memory/parsers/archives.py +++ b/src/memory/parsers/archives.py @@ -264,35 +264,38 @@ def get_archive_fetcher(url: str) -> ArchiveFetcher | None: feeds = [ - "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine", - "https://www.rifters.com/crawl/", - "https://rachelbythebay.com/w/", - "https://danluu.com/", - "https://guzey.com", - "https://aphyr.com/", - "https://www.applieddivinitystudies.com/", - "https://www.imightbewrong.org/", - "https://www.kvetch.au/", - "https://www.overcomingbias.com/", - "https://samkriss.substack.com/", - "https://www.richardhanania.com/", - "https://skunkledger.substack.com/", - "https://taipology.substack.com/", - "https://putanumonit.com/", - "https://www.flyingmachinestudios.com/", - "https://www.theintrinsicperspective.com/", - "https://www.strangeloopcanon.com/", - "https://slimemoldtimemold.com/", - "https://zeroinputagriculture.substack.com/", - "https://nayafia.substack.com", - "https://www.paulgraham.com/articles.html", - "https://mcfunley.com/writing", - "https://www.bitsaboutmoney.com/", - "https://akarlin.com", - "https://www.exurbe.com/", - "https://acoup.blog/", - "https://www.theredhandfiles.com/", - "https://karlin.blog/", + # "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine", + # "https://www.rifters.com/crawl/", + # "https://rachelbythebay.com/w/", + # "https://danluu.com/", + # "https://guzey.com", + # "https://aphyr.com/", + # "https://www.applieddivinitystudies.com/", + # "https://www.imightbewrong.org/", + # "https://www.kvetch.au/", + # "https://www.overcomingbias.com/", + # "https://samkriss.substack.com/", + # "https://www.richardhanania.com/", + # "https://skunkledger.substack.com/", + # "https://taipology.substack.com/", + # "https://putanumonit.com/", + # "https://www.flyingmachinestudios.com/", + # "https://www.theintrinsicperspective.com/", + # "https://www.strangeloopcanon.com/", + # "https://slimemoldtimemold.com/", + # "https://zeroinputagriculture.substack.com/", + # "https://nayafia.substack.com", + # "https://www.paulgraham.com/articles.html", + # "https://mcfunley.com/writing", + # "https://www.bitsaboutmoney.com/", + # "https://akarlin.com", + # "https://www.exurbe.com/", + # "https://acoup.blog/", + # "https://www.theredhandfiles.com/", + # "https://karlin.blog/", "https://slatestarcodex.com/", "https://www.astralcodexten.com/", + "https://nayafia.substack.com", + "https://homosabiens.substack.com", + "https://usefulfictions.substack.com", ] diff --git a/src/memory/parsers/blogs.py b/src/memory/parsers/blogs.py index 76a4afd..e7112b6 100644 --- a/src/memory/parsers/blogs.py +++ b/src/memory/parsers/blogs.py @@ -609,6 +609,9 @@ PARSER_REGISTRY = { r"rachelbythebay\.com": RachelByTheBayParser, r"nadia\.xyz": NadiaXyzParser, r"slatestarcodex\.com": SlateStarCodexParser, + r"nayafia\.substack\.com": SubstackParser, + r"homosabiens\.substack\.com": SubstackParser, + r"usefulfictions\.substack\.com": SubstackParser, } diff --git a/src/memory/parsers/lesswrong.py b/src/memory/parsers/lesswrong.py index 125c5c0..1a5299e 100644 --- a/src/memory/parsers/lesswrong.py +++ b/src/memory/parsers/lesswrong.py @@ -237,6 +237,7 @@ def fetch_lesswrong( def fetch_lesswrong_posts( since: datetime | None = None, + until: datetime | None = None, min_karma: int = 10, limit: int = 50, cooldown: float = 0.5, @@ -280,6 +281,9 @@ def fetch_lesswrong_posts( break for post in page_posts: + published_at = post.get("published_at") + if published_at and until and published_at > until: + break yield post last_item = page_posts[-1] diff --git a/src/memory/workers/ingest.py b/src/memory/workers/ingest.py index 01485f6..9813754 100644 --- a/src/memory/workers/ingest.py +++ b/src/memory/workers/ingest.py @@ -8,6 +8,7 @@ from memory.common.celery_app import ( SYNC_ALL_COMICS, SYNC_ALL_ARTICLE_FEEDS, TRACK_GIT_CHANGES, + SYNC_LESSWRONG, ) logger = logging.getLogger(__name__) @@ -38,4 +39,8 @@ app.conf.beat_schedule = { "task": TRACK_GIT_CHANGES, "schedule": settings.NOTES_SYNC_INTERVAL, }, + "sync-lesswrong": { + "task": SYNC_LESSWRONG, + "schedule": settings.LESSWRONG_SYNC_INTERVAL, + }, } diff --git a/src/memory/workers/tasks/forums.py b/src/memory/workers/tasks/forums.py index 85a1332..0c95b41 100644 --- a/src/memory/workers/tasks/forums.py +++ b/src/memory/workers/tasks/forums.py @@ -49,7 +49,8 @@ def sync_lesswrong_post( @app.task(name=SYNC_LESSWRONG) @safe_task_execution def sync_lesswrong( - since: str = (datetime.now() - timedelta(days=30)).isoformat(), + since: str | None = None, + until: str | None = None, min_karma: int = 10, limit: int = 50, cooldown: float = 0.5, @@ -57,9 +58,27 @@ def sync_lesswrong( af: bool = False, tags: list[str] = [], ): + if until: + end_date = datetime.fromisoformat(until) + else: + end_date = datetime.now() - timedelta(hours=8) + logger.info(f"Syncing LessWrong posts since {since}") - start_date = datetime.fromisoformat(since) - posts = fetch_lesswrong_posts(start_date, min_karma, limit, cooldown, max_items, af) + + if since: + start_date = datetime.fromisoformat(since) + else: + start_date = end_date - timedelta(days=30) + + posts = fetch_lesswrong_posts( + since=start_date, + until=end_date, + min_karma=min_karma, + limit=limit, + cooldown=cooldown, + max_items=max_items, + af=af, + ) posts_num, new_posts = 0, 0 with make_session() as session: diff --git a/tests/memory/workers/tasks/test_forums_tasks.py b/tests/memory/workers/tasks/test_forums_tasks.py index ee085d7..065be0f 100644 --- a/tests/memory/workers/tasks/test_forums_tasks.py +++ b/tests/memory/workers/tasks/test_forums_tasks.py @@ -1,5 +1,5 @@ import pytest -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from unittest.mock import Mock, patch from memory.common.db.models import ForumPost @@ -228,17 +228,19 @@ def test_sync_lesswrong_success(mock_fetch, mock_lesswrong_post, db_session): assert result["since"] == "2024-01-01T00:00:00" assert result["min_karma"] == 10 assert result["max_items"] == 100 - assert result["af"] == False + assert not result["af"] - # Verify fetch_lesswrong_posts was called with correct arguments - mock_fetch.assert_called_once_with( - datetime.fromisoformat("2024-01-01T00:00:00"), - 10, # min_karma - 50, # limit - 0.1, # cooldown - 100, # max_items - False, # af - ) + # Verify fetch_lesswrong_posts was called with correct arguments (kwargs) + mock_fetch.assert_called_once() + kwargs = mock_fetch.call_args.kwargs + assert kwargs["since"] == datetime.fromisoformat("2024-01-01T00:00:00") + assert kwargs["min_karma"] == 10 + assert kwargs["limit"] == 50 + assert kwargs["cooldown"] == 0.1 + assert kwargs["max_items"] == 100 + assert kwargs["af"] is False + assert "until" in kwargs + assert isinstance(kwargs["until"], datetime) # Verify sync_lesswrong_post was called for the new post mock_sync_post.delay.assert_called_once_with(mock_lesswrong_post, ["test"]) @@ -343,11 +345,14 @@ def test_sync_lesswrong_since_parameter(mock_fetch, db_session): forums.sync_lesswrong(since="2024-01-01T00:00:00") expected_since = datetime.fromisoformat("2024-01-01T00:00:00") - # Verify fetch was called with correct since date - call_args = mock_fetch.call_args[0] - actual_since = call_args[0] + # Verify fetch was called with correct since date (kwargs) + kwargs = mock_fetch.call_args.kwargs + actual_since = kwargs["since"] assert actual_since == expected_since + assert "until" in kwargs + assert isinstance(kwargs["until"], datetime) + assert kwargs["until"] >= actual_since @pytest.mark.parametrize( @@ -373,14 +378,14 @@ def test_sync_lesswrong_parameters( max_items=500, ) - # Verify fetch was called with correct parameters - call_args = mock_fetch.call_args[0] + # Verify fetch was called with correct parameters (kwargs) + kwargs = mock_fetch.call_args.kwargs - assert call_args[1] == min_karma # min_karma - assert call_args[2] == limit # limit - assert call_args[3] == cooldown # cooldown - assert call_args[4] == 500 # max_items - assert call_args[5] == af_value # af + assert kwargs["min_karma"] == min_karma + assert kwargs["limit"] == limit + assert kwargs["cooldown"] == cooldown + assert kwargs["max_items"] == 500 + assert kwargs["af"] == af_value assert result["min_karma"] == min_karma assert result["af"] == af_value