Compare commits


2 Commits

Author             SHA1        Message              Date
EC2 Default User   a2d107fad7  command to add blog  2025-08-09 00:31:54 +00:00
Daniel O'Connell   b68e15d3ab  add blogs            2025-08-09 02:07:49 +02:00
10 changed files with 161 additions and 28 deletions

View File

@@ -41,6 +41,7 @@ UPDATE_METADATA_FOR_ITEM = f"{MAINTENANCE_ROOT}.update_metadata_for_item"
 SYNC_WEBPAGE = f"{BLOGS_ROOT}.sync_webpage"
 SYNC_ARTICLE_FEED = f"{BLOGS_ROOT}.sync_article_feed"
 SYNC_ALL_ARTICLE_FEEDS = f"{BLOGS_ROOT}.sync_all_article_feeds"
+ADD_ARTICLE_FEED = f"{BLOGS_ROOT}.add_article_feed"
 SYNC_WEBSITE_ARCHIVE = f"{BLOGS_ROOT}.sync_website_archive"

View File

@@ -106,6 +106,7 @@ ARTICLE_FEED_SYNC_INTERVAL = int(os.getenv("ARTICLE_FEED_SYNC_INTERVAL", 30 * 60
 CLEAN_COLLECTION_INTERVAL = int(os.getenv("CLEAN_COLLECTION_INTERVAL", 24 * 60 * 60))
 CHUNK_REINGEST_INTERVAL = int(os.getenv("CHUNK_REINGEST_INTERVAL", 60 * 60))
 NOTES_SYNC_INTERVAL = int(os.getenv("NOTES_SYNC_INTERVAL", 15 * 60))
+LESSWRONG_SYNC_INTERVAL = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24))
 CHUNK_REINGEST_SINCE_MINUTES = int(os.getenv("CHUNK_REINGEST_SINCE_MINUTES", 60 * 24))
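The new setting follows the existing pattern: a value in seconds, overridable through the environment. A minimal sketch of tuning it per deployment (the six-hour value is only an example):

```python
import os

# Example only: override the daily default (60 * 60 * 24 seconds) before settings are loaded.
os.environ["LESSWRONG_SYNC_INTERVAL"] = str(6 * 60 * 60)  # every 6 hours

# Same expression as the line added above.
LESSWRONG_SYNC_INTERVAL = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24))
print(LESSWRONG_SYNC_INTERVAL)  # -> 21600
```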

View File

@@ -295,4 +295,7 @@ feeds = [
     "https://karlin.blog/",
     "https://slatestarcodex.com/",
     "https://www.astralcodexten.com/",
+    "https://nayafia.substack.com",
+    "https://homosabiens.substack.com",
+    "https://usefulfictions.substack.com",
 ]

View File

@@ -609,6 +609,9 @@ PARSER_REGISTRY = {
     r"rachelbythebay\.com": RachelByTheBayParser,
     r"nadia\.xyz": NadiaXyzParser,
     r"slatestarcodex\.com": SlateStarCodexParser,
+    r"nayafia\.substack\.com": SubstackParser,
+    r"homosabiens\.substack\.com": SubstackParser,
+    r"usefulfictions\.substack\.com": SubstackParser,
 }
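The registry keys are regex patterns matched against blog URLs, so all three new Substack domains reuse the generic SubstackParser. A hedged sketch of that kind of lookup; `find_parser` and the stub classes below are illustrative only, not the repo's actual resolver:

```python
import re

class SubstackParser: ...          # stand-in for the real parser class
class SlateStarCodexParser: ...    # stand-in

PARSER_REGISTRY = {
    r"slatestarcodex\.com": SlateStarCodexParser,
    r"nayafia\.substack\.com": SubstackParser,
    r"homosabiens\.substack\.com": SubstackParser,
    r"usefulfictions\.substack\.com": SubstackParser,
}

def find_parser(url: str):
    # Return the first parser class whose pattern matches the URL.
    for pattern, parser_cls in PARSER_REGISTRY.items():
        if re.search(pattern, url):
            return parser_cls
    return None

print(find_parser("https://homosabiens.substack.com/p/some-post"))  # -> SubstackParser stub
```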

View File

@@ -237,6 +237,7 @@ def fetch_lesswrong(
 def fetch_lesswrong_posts(
     since: datetime | None = None,
+    until: datetime | None = None,
     min_karma: int = 10,
     limit: int = 50,
     cooldown: float = 0.5,
@@ -280,6 +281,9 @@ def fetch_lesswrong_posts(
             break
         for post in page_posts:
+            published_at = post.get("published_at")
+            if published_at and until and published_at > until:
+                break
             yield post
         last_item = page_posts[-1]
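The new `until` parameter stops iteration as soon as a post is published after the cutoff. A hedged usage sketch; the import path is an assumption, not confirmed by the diff:

```python
from datetime import datetime, timedelta

from memory.parsers.lesswrong import fetch_lesswrong_posts  # assumed import path

since = datetime.now() - timedelta(days=7)
until = datetime.now() - timedelta(hours=8)

# Posts published after `until` trigger the early break added above.
for post in fetch_lesswrong_posts(since=since, until=until, min_karma=10, limit=50):
    print(post.get("published_at"), post.get("title"))
```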

View File

@@ -8,6 +8,7 @@ from memory.common.celery_app import (
     SYNC_ALL_COMICS,
     SYNC_ALL_ARTICLE_FEEDS,
     TRACK_GIT_CHANGES,
+    SYNC_LESSWRONG,
 )

 logger = logging.getLogger(__name__)
@@ -38,4 +39,8 @@ app.conf.beat_schedule = {
         "task": TRACK_GIT_CHANGES,
         "schedule": settings.NOTES_SYNC_INTERVAL,
     },
+    "sync-lesswrong": {
+        "task": SYNC_LESSWRONG,
+        "schedule": settings.LESSWRONG_SYNC_INTERVAL,
+    },
 }

View File

@@ -12,6 +12,7 @@ from memory.common.celery_app import (
     SYNC_WEBPAGE,
     SYNC_ARTICLE_FEED,
     SYNC_ALL_ARTICLE_FEEDS,
+    ADD_ARTICLE_FEED,
     SYNC_WEBSITE_ARCHIVE,
 )
 from memory.workers.tasks.content_processing import (
@@ -169,10 +170,52 @@ def sync_all_article_feeds() -> list[dict]:
     return results


+@app.task(name=ADD_ARTICLE_FEED)
+def add_article_feed(
+    url: str,
+    title: str | None = None,
+    description: str | None = None,
+    tags: Iterable[str] = [],
+    active: bool = True,
+    check_interval: int = 60 * 24,  # 24 hours
+) -> dict:
+    """
+    Add a new ArticleFeed.
+
+    Args:
+        url: URL of the feed
+        title: Title of the feed (optional)
+        description: Description of the feed (optional)
+        tags: Tags to apply to the feed
+        active: Whether the feed is active
+        check_interval: Interval in minutes to check the feed
+
+    Returns:
+        dict: Summary of the added feed
+    """
+    with make_session() as session:
+        feed = session.query(ArticleFeed).filter(ArticleFeed.url == url).first()
+        if feed:
+            logger.info(f"Feed already exists: {url}")
+            return {"status": "error", "error": "Feed already exists"}
+
+        feed = ArticleFeed(
+            url=url,
+            title=title or url,
+            description=description,
+            active=active,
+            check_interval=check_interval,
+            tags=tags,
+        )
+        session.add(feed)
+        session.commit()
+        return {"status": "success", "feed_id": feed.id}
+
+
 @app.task(name=SYNC_WEBSITE_ARCHIVE)
 @safe_task_execution
 def sync_website_archive(
-    url: str, tags: Iterable[str] = [], max_pages: int = 100
+    url: str, tags: Iterable[str] = [], max_pages: int = 100, add_feed: bool = True
 ) -> dict:
     """
     Synchronize all articles from a website's archive.
@@ -187,6 +230,16 @@ def sync_website_archive(
     """
     logger.info(f"Starting archive sync for: {url}")

+    if add_feed:
+        with make_session() as session:
+            feed = session.query(ArticleFeed).filter(ArticleFeed.url == url).first()
+            if not feed:
+                feed = ArticleFeed(
+                    url=url,
+                    title=url,
+                    active=True,
+                )
+
     # Get archive fetcher for the website
     fetcher = get_archive_fetcher(url)
     if not fetcher:
@@ -200,10 +253,10 @@ def sync_website_archive(
     new_articles = 0
     task_ids = []
-    with make_session() as session:
-        for feed_item in fetcher.fetch_all_items():
-            articles_found += 1
+    for feed_item in fetcher.fetch_all_items():
+        articles_found += 1
+        with make_session() as session:
             existing = check_content_exists(session, BlogPost, url=feed_item.url)
             if existing:
                 continue
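A hedged sketch of what calling the new task looks like; the import path and the returned feed id are assumptions for illustration:

```python
from memory.workers.tasks import blogs  # assumed module path for the tasks above

# Direct call for illustration; in production this would normally go through
# blogs.add_article_feed.delay(...) so a worker picks it up.
summary = blogs.add_article_feed(
    url="https://homosabiens.substack.com",
    title="Homo Sabiens",
    tags=["blog", "substack"],
    check_interval=60 * 24,  # minutes, i.e. daily
)
# First call  -> {"status": "success", "feed_id": <new id>}
# Repeat call -> {"status": "error", "error": "Feed already exists"}
print(summary)
```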

View File

@@ -49,7 +49,8 @@ def sync_lesswrong_post(
 @app.task(name=SYNC_LESSWRONG)
 @safe_task_execution
 def sync_lesswrong(
-    since: str = (datetime.now() - timedelta(days=30)).isoformat(),
+    since: str | None = None,
+    until: str | None = None,
     min_karma: int = 10,
     limit: int = 50,
     cooldown: float = 0.5,
@@ -57,9 +58,27 @@ def sync_lesswrong(
     af: bool = False,
     tags: list[str] = [],
 ):
+    if until:
+        end_date = datetime.fromisoformat(until)
+    else:
+        end_date = datetime.now() - timedelta(hours=8)
+
     logger.info(f"Syncing LessWrong posts since {since}")
-    start_date = datetime.fromisoformat(since)
-    posts = fetch_lesswrong_posts(start_date, min_karma, limit, cooldown, max_items, af)
+    if since:
+        start_date = datetime.fromisoformat(since)
+    else:
+        start_date = end_date - timedelta(days=30)
+
+    posts = fetch_lesswrong_posts(
+        since=start_date,
+        until=end_date,
+        min_karma=min_karma,
+        limit=limit,
+        cooldown=cooldown,
+        max_items=max_items,
+        af=af,
+    )
+
     posts_num, new_posts = 0, 0
     with make_session() as session:
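The defaulting above resolves to: `until` is eight hours ago unless given, and `since` is thirty days before `until` unless given. A standalone restatement of just that window computation:

```python
from datetime import datetime, timedelta

def resolve_window(since: str | None = None, until: str | None = None) -> tuple[datetime, datetime]:
    # Mirrors the defaults introduced in sync_lesswrong above.
    end_date = datetime.fromisoformat(until) if until else datetime.now() - timedelta(hours=8)
    start_date = datetime.fromisoformat(since) if since else end_date - timedelta(days=30)
    return start_date, end_date

print(resolve_window())                             # last 30 days, ending 8 hours ago
print(resolve_window(since="2024-01-01T00:00:00"))  # fixed start, same end
```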

View File

@@ -1,5 +1,5 @@
 import pytest
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timezone
 from unittest.mock import Mock, patch

 from memory.common.db.models import ForumPost
@@ -228,17 +228,19 @@ def test_sync_lesswrong_success(mock_fetch, mock_lesswrong_post, db_session):
     assert result["since"] == "2024-01-01T00:00:00"
     assert result["min_karma"] == 10
     assert result["max_items"] == 100
-    assert result["af"] == False
+    assert not result["af"]

-    # Verify fetch_lesswrong_posts was called with correct arguments
-    mock_fetch.assert_called_once_with(
-        datetime.fromisoformat("2024-01-01T00:00:00"),
-        10,  # min_karma
-        50,  # limit
-        0.1,  # cooldown
-        100,  # max_items
-        False,  # af
-    )
+    # Verify fetch_lesswrong_posts was called with correct arguments (kwargs)
+    mock_fetch.assert_called_once()
+    kwargs = mock_fetch.call_args.kwargs
+    assert kwargs["since"] == datetime.fromisoformat("2024-01-01T00:00:00")
+    assert kwargs["min_karma"] == 10
+    assert kwargs["limit"] == 50
+    assert kwargs["cooldown"] == 0.1
+    assert kwargs["max_items"] == 100
+    assert kwargs["af"] is False
+    assert "until" in kwargs
+    assert isinstance(kwargs["until"], datetime)

     # Verify sync_lesswrong_post was called for the new post
     mock_sync_post.delay.assert_called_once_with(mock_lesswrong_post, ["test"])
@@ -343,11 +345,14 @@ def test_sync_lesswrong_since_parameter(mock_fetch, db_session):
     forums.sync_lesswrong(since="2024-01-01T00:00:00")

     expected_since = datetime.fromisoformat("2024-01-01T00:00:00")

-    # Verify fetch was called with correct since date
-    call_args = mock_fetch.call_args[0]
-    actual_since = call_args[0]
+    # Verify fetch was called with correct since date (kwargs)
+    kwargs = mock_fetch.call_args.kwargs
+    actual_since = kwargs["since"]
     assert actual_since == expected_since
+    assert "until" in kwargs
+    assert isinstance(kwargs["until"], datetime)
+    assert kwargs["until"] >= actual_since


 @pytest.mark.parametrize(
@@ -373,14 +378,14 @@ def test_sync_lesswrong_parameters(
         max_items=500,
     )

-    # Verify fetch was called with correct parameters
-    call_args = mock_fetch.call_args[0]
-    assert call_args[1] == min_karma  # min_karma
-    assert call_args[2] == limit  # limit
-    assert call_args[3] == cooldown  # cooldown
-    assert call_args[4] == 500  # max_items
-    assert call_args[5] == af_value  # af
+    # Verify fetch was called with correct parameters (kwargs)
+    kwargs = mock_fetch.call_args.kwargs
+    assert kwargs["min_karma"] == min_karma
+    assert kwargs["limit"] == limit
+    assert kwargs["cooldown"] == cooldown
+    assert kwargs["max_items"] == 500
+    assert kwargs["af"] == af_value

     assert result["min_karma"] == min_karma
     assert result["af"] == af_value

View File

@@ -27,6 +27,7 @@ from memory.common import settings
 from memory.common.celery_app import (
     SYNC_ALL_ARTICLE_FEEDS,
     SYNC_ARTICLE_FEED,
+    ADD_ARTICLE_FEED,
     SYNC_WEBPAGE,
     SYNC_WEBSITE_ARCHIVE,
     SYNC_ALL_COMICS,
@@ -49,6 +50,7 @@ from memory.common.celery_app import (
     UPDATE_METADATA_FOR_ITEM,
     UPDATE_METADATA_FOR_SOURCE_ITEMS,
     SETUP_GIT_NOTES,
+    TRACK_GIT_CHANGES,
     app,
 )
@@ -78,6 +80,7 @@ TASK_MAPPINGS = {
         "sync_article_feed": SYNC_ARTICLE_FEED,
         "sync_all_article_feeds": SYNC_ALL_ARTICLE_FEEDS,
         "sync_website_archive": SYNC_WEBSITE_ARCHIVE,
+        "add_article_feed": ADD_ARTICLE_FEED,
     },
     "comic": {
         "sync_all_comics": SYNC_ALL_COMICS,
@@ -92,6 +95,7 @@ TASK_MAPPINGS = {
     },
     "notes": {
         "setup_git_notes": SETUP_GIT_NOTES,
+        "track_git_changes": TRACK_GIT_CHANGES,
     },
 }
 QUEUE_MAPPINGS = {
@@ -249,6 +253,13 @@ def notes_setup_git_notes(ctx, origin, email, name):
     execute_task(ctx, "notes", "setup_git_notes", origin=origin, email=email, name=name)


+@notes.command("track-git-changes")
+@click.pass_context
+def notes_track_git_changes(ctx):
+    """Track git changes."""
+    execute_task(ctx, "notes", "track_git_changes")
+
+
 @cli.group()
 @click.pass_context
 def maintenance(ctx):
@@ -376,6 +387,34 @@ def blogs_sync_website_archive(ctx, url):
     execute_task(ctx, "blogs", "sync_website_archive", url=url)


+@blogs.command("add-article-feed")
+@click.option("--url", required=True, help="URL of the feed")
+@click.option("--title", help="Title of the feed")
+@click.option("--description", help="Description of the feed")
+@click.option("--tags", help="Comma-separated tags to apply to the feed", default="")
+@click.option("--active", is_flag=True, help="Whether the feed is active")
+@click.option(
+    "--check-interval",
+    type=int,
+    help="Interval in minutes to check the feed",
+    default=60 * 24,  # 24 hours
+)
+@click.pass_context
+def blogs_add_article_feed(ctx, url, title, description, tags, active, check_interval):
+    """Add a new article feed."""
+    execute_task(
+        ctx,
+        "blogs",
+        "add_article_feed",
+        url=url,
+        title=title,
+        description=description,
+        tags=tags.split(","),
+        active=active,
+        check_interval=check_interval,
+    )
+
+
 @cli.group()
 @click.pass_context
 def comic(ctx):
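A hedged example of exercising the new command in-process with click's test runner; the CLI entry-point import path is an assumption:

```python
from click.testing import CliRunner

from memory.workers.cli import cli  # assumed location of the click group

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "blogs", "add-article-feed",
        "--url", "https://nayafia.substack.com",
        "--title", "Example feed",
        "--tags", "blog,substack",
        "--active",
        "--check-interval", str(60 * 24),
    ],
)
print(result.exit_code, result.output)
```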