add blogs

This commit is contained in:
Daniel O'Connell 2025-08-09 02:05:41 +02:00
parent 862251fedb
commit b68e15d3ab
7 changed files with 93 additions and 53 deletions

View File

@ -106,6 +106,7 @@ ARTICLE_FEED_SYNC_INTERVAL = int(os.getenv("ARTICLE_FEED_SYNC_INTERVAL", 30 * 60
CLEAN_COLLECTION_INTERVAL = int(os.getenv("CLEAN_COLLECTION_INTERVAL", 24 * 60 * 60)) CLEAN_COLLECTION_INTERVAL = int(os.getenv("CLEAN_COLLECTION_INTERVAL", 24 * 60 * 60))
CHUNK_REINGEST_INTERVAL = int(os.getenv("CHUNK_REINGEST_INTERVAL", 60 * 60)) CHUNK_REINGEST_INTERVAL = int(os.getenv("CHUNK_REINGEST_INTERVAL", 60 * 60))
NOTES_SYNC_INTERVAL = int(os.getenv("NOTES_SYNC_INTERVAL", 15 * 60)) NOTES_SYNC_INTERVAL = int(os.getenv("NOTES_SYNC_INTERVAL", 15 * 60))
LESSWRONG_SYNC_INTERVAL = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24))
CHUNK_REINGEST_SINCE_MINUTES = int(os.getenv("CHUNK_REINGEST_SINCE_MINUTES", 60 * 24)) CHUNK_REINGEST_SINCE_MINUTES = int(os.getenv("CHUNK_REINGEST_SINCE_MINUTES", 60 * 24))

View File

@ -264,35 +264,38 @@ def get_archive_fetcher(url: str) -> ArchiveFetcher | None:
feeds = [ feeds = [
"https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine", # "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
"https://www.rifters.com/crawl/", # "https://www.rifters.com/crawl/",
"https://rachelbythebay.com/w/", # "https://rachelbythebay.com/w/",
"https://danluu.com/", # "https://danluu.com/",
"https://guzey.com", # "https://guzey.com",
"https://aphyr.com/", # "https://aphyr.com/",
"https://www.applieddivinitystudies.com/", # "https://www.applieddivinitystudies.com/",
"https://www.imightbewrong.org/", # "https://www.imightbewrong.org/",
"https://www.kvetch.au/", # "https://www.kvetch.au/",
"https://www.overcomingbias.com/", # "https://www.overcomingbias.com/",
"https://samkriss.substack.com/", # "https://samkriss.substack.com/",
"https://www.richardhanania.com/", # "https://www.richardhanania.com/",
"https://skunkledger.substack.com/", # "https://skunkledger.substack.com/",
"https://taipology.substack.com/", # "https://taipology.substack.com/",
"https://putanumonit.com/", # "https://putanumonit.com/",
"https://www.flyingmachinestudios.com/", # "https://www.flyingmachinestudios.com/",
"https://www.theintrinsicperspective.com/", # "https://www.theintrinsicperspective.com/",
"https://www.strangeloopcanon.com/", # "https://www.strangeloopcanon.com/",
"https://slimemoldtimemold.com/", # "https://slimemoldtimemold.com/",
"https://zeroinputagriculture.substack.com/", # "https://zeroinputagriculture.substack.com/",
"https://nayafia.substack.com", # "https://nayafia.substack.com",
"https://www.paulgraham.com/articles.html", # "https://www.paulgraham.com/articles.html",
"https://mcfunley.com/writing", # "https://mcfunley.com/writing",
"https://www.bitsaboutmoney.com/", # "https://www.bitsaboutmoney.com/",
"https://akarlin.com", # "https://akarlin.com",
"https://www.exurbe.com/", # "https://www.exurbe.com/",
"https://acoup.blog/", # "https://acoup.blog/",
"https://www.theredhandfiles.com/", # "https://www.theredhandfiles.com/",
"https://karlin.blog/", # "https://karlin.blog/",
"https://slatestarcodex.com/", "https://slatestarcodex.com/",
"https://www.astralcodexten.com/", "https://www.astralcodexten.com/",
"https://nayafia.substack.com",
"https://homosabiens.substack.com",
"https://usefulfictions.substack.com",
] ]

View File

@ -609,6 +609,9 @@ PARSER_REGISTRY = {
r"rachelbythebay\.com": RachelByTheBayParser, r"rachelbythebay\.com": RachelByTheBayParser,
r"nadia\.xyz": NadiaXyzParser, r"nadia\.xyz": NadiaXyzParser,
r"slatestarcodex\.com": SlateStarCodexParser, r"slatestarcodex\.com": SlateStarCodexParser,
r"nayafia\.substack\.com": SubstackParser,
r"homosabiens\.substack\.com": SubstackParser,
r"usefulfictions\.substack\.com": SubstackParser,
} }

View File

@ -237,6 +237,7 @@ def fetch_lesswrong(
def fetch_lesswrong_posts( def fetch_lesswrong_posts(
since: datetime | None = None, since: datetime | None = None,
until: datetime | None = None,
min_karma: int = 10, min_karma: int = 10,
limit: int = 50, limit: int = 50,
cooldown: float = 0.5, cooldown: float = 0.5,
@ -280,6 +281,9 @@ def fetch_lesswrong_posts(
break break
for post in page_posts: for post in page_posts:
published_at = post.get("published_at")
if published_at and until and published_at > until:
break
yield post yield post
last_item = page_posts[-1] last_item = page_posts[-1]

View File

@ -8,6 +8,7 @@ from memory.common.celery_app import (
SYNC_ALL_COMICS, SYNC_ALL_COMICS,
SYNC_ALL_ARTICLE_FEEDS, SYNC_ALL_ARTICLE_FEEDS,
TRACK_GIT_CHANGES, TRACK_GIT_CHANGES,
SYNC_LESSWRONG,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -38,4 +39,8 @@ app.conf.beat_schedule = {
"task": TRACK_GIT_CHANGES, "task": TRACK_GIT_CHANGES,
"schedule": settings.NOTES_SYNC_INTERVAL, "schedule": settings.NOTES_SYNC_INTERVAL,
}, },
"sync-lesswrong": {
"task": SYNC_LESSWRONG,
"schedule": settings.LESSWRONG_SYNC_INTERVAL,
},
} }

View File

@ -49,7 +49,8 @@ def sync_lesswrong_post(
@app.task(name=SYNC_LESSWRONG) @app.task(name=SYNC_LESSWRONG)
@safe_task_execution @safe_task_execution
def sync_lesswrong( def sync_lesswrong(
since: str = (datetime.now() - timedelta(days=30)).isoformat(), since: str | None = None,
until: str | None = None,
min_karma: int = 10, min_karma: int = 10,
limit: int = 50, limit: int = 50,
cooldown: float = 0.5, cooldown: float = 0.5,
@ -57,9 +58,27 @@ def sync_lesswrong(
af: bool = False, af: bool = False,
tags: list[str] = [], tags: list[str] = [],
): ):
if until:
end_date = datetime.fromisoformat(until)
else:
end_date = datetime.now() - timedelta(hours=8)
logger.info(f"Syncing LessWrong posts since {since}") logger.info(f"Syncing LessWrong posts since {since}")
start_date = datetime.fromisoformat(since)
posts = fetch_lesswrong_posts(start_date, min_karma, limit, cooldown, max_items, af) if since:
start_date = datetime.fromisoformat(since)
else:
start_date = end_date - timedelta(days=30)
posts = fetch_lesswrong_posts(
since=start_date,
until=end_date,
min_karma=min_karma,
limit=limit,
cooldown=cooldown,
max_items=max_items,
af=af,
)
posts_num, new_posts = 0, 0 posts_num, new_posts = 0, 0
with make_session() as session: with make_session() as session:

View File

@ -1,5 +1,5 @@
import pytest import pytest
from datetime import datetime, timedelta, timezone from datetime import datetime, timezone
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
from memory.common.db.models import ForumPost from memory.common.db.models import ForumPost
@ -228,17 +228,19 @@ def test_sync_lesswrong_success(mock_fetch, mock_lesswrong_post, db_session):
assert result["since"] == "2024-01-01T00:00:00" assert result["since"] == "2024-01-01T00:00:00"
assert result["min_karma"] == 10 assert result["min_karma"] == 10
assert result["max_items"] == 100 assert result["max_items"] == 100
assert result["af"] == False assert not result["af"]
# Verify fetch_lesswrong_posts was called with correct arguments # Verify fetch_lesswrong_posts was called with correct arguments (kwargs)
mock_fetch.assert_called_once_with( mock_fetch.assert_called_once()
datetime.fromisoformat("2024-01-01T00:00:00"), kwargs = mock_fetch.call_args.kwargs
10, # min_karma assert kwargs["since"] == datetime.fromisoformat("2024-01-01T00:00:00")
50, # limit assert kwargs["min_karma"] == 10
0.1, # cooldown assert kwargs["limit"] == 50
100, # max_items assert kwargs["cooldown"] == 0.1
False, # af assert kwargs["max_items"] == 100
) assert kwargs["af"] is False
assert "until" in kwargs
assert isinstance(kwargs["until"], datetime)
# Verify sync_lesswrong_post was called for the new post # Verify sync_lesswrong_post was called for the new post
mock_sync_post.delay.assert_called_once_with(mock_lesswrong_post, ["test"]) mock_sync_post.delay.assert_called_once_with(mock_lesswrong_post, ["test"])
@ -343,11 +345,14 @@ def test_sync_lesswrong_since_parameter(mock_fetch, db_session):
forums.sync_lesswrong(since="2024-01-01T00:00:00") forums.sync_lesswrong(since="2024-01-01T00:00:00")
expected_since = datetime.fromisoformat("2024-01-01T00:00:00") expected_since = datetime.fromisoformat("2024-01-01T00:00:00")
# Verify fetch was called with correct since date # Verify fetch was called with correct since date (kwargs)
call_args = mock_fetch.call_args[0] kwargs = mock_fetch.call_args.kwargs
actual_since = call_args[0] actual_since = kwargs["since"]
assert actual_since == expected_since assert actual_since == expected_since
assert "until" in kwargs
assert isinstance(kwargs["until"], datetime)
assert kwargs["until"] >= actual_since
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -373,14 +378,14 @@ def test_sync_lesswrong_parameters(
max_items=500, max_items=500,
) )
# Verify fetch was called with correct parameters # Verify fetch was called with correct parameters (kwargs)
call_args = mock_fetch.call_args[0] kwargs = mock_fetch.call_args.kwargs
assert call_args[1] == min_karma # min_karma assert kwargs["min_karma"] == min_karma
assert call_args[2] == limit # limit assert kwargs["limit"] == limit
assert call_args[3] == cooldown # cooldown assert kwargs["cooldown"] == cooldown
assert call_args[4] == 500 # max_items assert kwargs["max_items"] == 500
assert call_args[5] == af_value # af assert kwargs["af"] == af_value
assert result["min_karma"] == min_karma assert result["min_karma"] == min_karma
assert result["af"] == af_value assert result["af"] == af_value