Compare commits


No commits in common. "a2d107fad76d854597daa3c1b9e434ff88331ce6" and "862251fedbb3bd3f45389ac42906abd6b2c168f0" have entirely different histories.

10 changed files with 28 additions and 161 deletions

View File

@@ -41,7 +41,6 @@ UPDATE_METADATA_FOR_ITEM = f"{MAINTENANCE_ROOT}.update_metadata_for_item"
SYNC_WEBPAGE = f"{BLOGS_ROOT}.sync_webpage"
SYNC_ARTICLE_FEED = f"{BLOGS_ROOT}.sync_article_feed"
SYNC_ALL_ARTICLE_FEEDS = f"{BLOGS_ROOT}.sync_all_article_feeds"
ADD_ARTICLE_FEED = f"{BLOGS_ROOT}.add_article_feed"
SYNC_WEBSITE_ARCHIVE = f"{BLOGS_ROOT}.sync_website_archive"

View File

@@ -106,7 +106,6 @@ ARTICLE_FEED_SYNC_INTERVAL = int(os.getenv("ARTICLE_FEED_SYNC_INTERVAL", 30 * 60
CLEAN_COLLECTION_INTERVAL = int(os.getenv("CLEAN_COLLECTION_INTERVAL", 24 * 60 * 60))
CHUNK_REINGEST_INTERVAL = int(os.getenv("CHUNK_REINGEST_INTERVAL", 60 * 60))
NOTES_SYNC_INTERVAL = int(os.getenv("NOTES_SYNC_INTERVAL", 15 * 60))
LESSWRONG_SYNC_INTERVAL = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24))
CHUNK_REINGEST_SINCE_MINUTES = int(os.getenv("CHUNK_REINGEST_SINCE_MINUTES", 60 * 24))

View File

@@ -295,7 +295,4 @@ feeds = [
"https://karlin.blog/",
"https://slatestarcodex.com/",
"https://www.astralcodexten.com/",
"https://nayafia.substack.com",
"https://homosabiens.substack.com",
"https://usefulfictions.substack.com",
]

View File

@@ -609,9 +609,6 @@ PARSER_REGISTRY = {
r"rachelbythebay\.com": RachelByTheBayParser,
r"nadia\.xyz": NadiaXyzParser,
r"slatestarcodex\.com": SlateStarCodexParser,
r"nayafia\.substack\.com": SubstackParser,
r"homosabiens\.substack\.com": SubstackParser,
r"usefulfictions\.substack\.com": SubstackParser,
}

View File

@@ -237,7 +237,6 @@ def fetch_lesswrong(
def fetch_lesswrong_posts(
since: datetime | None = None,
until: datetime | None = None,
min_karma: int = 10,
limit: int = 50,
cooldown: float = 0.5,
@@ -281,9 +280,6 @@ def fetch_lesswrong_posts(
break
for post in page_posts:
published_at = post.get("published_at")
if published_at and until and published_at > until:
break
yield post
last_item = page_posts[-1]

View File

@@ -8,7 +8,6 @@ from memory.common.celery_app import (
SYNC_ALL_COMICS,
SYNC_ALL_ARTICLE_FEEDS,
TRACK_GIT_CHANGES,
SYNC_LESSWRONG,
)
logger = logging.getLogger(__name__)
@@ -39,8 +38,4 @@ app.conf.beat_schedule = {
"task": TRACK_GIT_CHANGES,
"schedule": settings.NOTES_SYNC_INTERVAL,
},
"sync-lesswrong": {
"task": SYNC_LESSWRONG,
"schedule": settings.LESSWRONG_SYNC_INTERVAL,
},
}

View File

@@ -12,7 +12,6 @@ from memory.common.celery_app import (
SYNC_WEBPAGE,
SYNC_ARTICLE_FEED,
SYNC_ALL_ARTICLE_FEEDS,
ADD_ARTICLE_FEED,
SYNC_WEBSITE_ARCHIVE,
)
from memory.workers.tasks.content_processing import (
@@ -170,52 +169,10 @@ def sync_all_article_feeds() -> list[dict]:
return results
@app.task(name=ADD_ARTICLE_FEED)
def add_article_feed(
url: str,
title: str | None = None,
description: str | None = None,
tags: Iterable[str] = [],
active: bool = True,
check_interval: int = 60 * 24, # 24 hours
) -> dict:
"""
Add a new ArticleFeed.
Args:
url: URL of the feed
title: Title of the feed (optional)
description: Description of the feed (optional)
tags: Tags to apply to the feed
active: Whether the feed is active
check_interval: Interval in minutes to check the feed
Returns:
dict: Summary of the added feed
"""
with make_session() as session:
feed = session.query(ArticleFeed).filter(ArticleFeed.url == url).first()
if feed:
logger.info(f"Feed already exists: {url}")
return {"status": "error", "error": "Feed already exists"}
feed = ArticleFeed(
url=url,
title=title or url,
description=description,
active=active,
check_interval=check_interval,
tags=tags,
)
session.add(feed)
session.commit()
return {"status": "success", "feed_id": feed.id}
@app.task(name=SYNC_WEBSITE_ARCHIVE)
@safe_task_execution
def sync_website_archive(
url: str, tags: Iterable[str] = [], max_pages: int = 100, add_feed: bool = True
url: str, tags: Iterable[str] = [], max_pages: int = 100
) -> dict:
"""
Synchronize all articles from a website's archive.
@@ -230,16 +187,6 @@ def sync_website_archive(
"""
logger.info(f"Starting archive sync for: {url}")
if add_feed:
with make_session() as session:
feed = session.query(ArticleFeed).filter(ArticleFeed.url == url).first()
if not feed:
feed = ArticleFeed(
url=url,
title=url,
active=True,
)
# Get archive fetcher for the website
fetcher = get_archive_fetcher(url)
if not fetcher:
@@ -253,10 +200,10 @@ def sync_website_archive(
new_articles = 0
task_ids = []
with make_session() as session:
for feed_item in fetcher.fetch_all_items():
articles_found += 1
with make_session() as session:
existing = check_content_exists(session, BlogPost, url=feed_item.url)
if existing:
continue

View File

@@ -49,8 +49,7 @@ def sync_lesswrong_post(
@app.task(name=SYNC_LESSWRONG)
@safe_task_execution
def sync_lesswrong(
since: str | None = None,
until: str | None = None,
since: str = (datetime.now() - timedelta(days=30)).isoformat(),
min_karma: int = 10,
limit: int = 50,
cooldown: float = 0.5,
@@ -58,27 +57,9 @@ def sync_lesswrong(
af: bool = False,
tags: list[str] = [],
):
if until:
end_date = datetime.fromisoformat(until)
else:
end_date = datetime.now() - timedelta(hours=8)
logger.info(f"Syncing LessWrong posts since {since}")
if since:
start_date = datetime.fromisoformat(since)
else:
start_date = end_date - timedelta(days=30)
posts = fetch_lesswrong_posts(
since=start_date,
until=end_date,
min_karma=min_karma,
limit=limit,
cooldown=cooldown,
max_items=max_items,
af=af,
)
posts = fetch_lesswrong_posts(start_date, min_karma, limit, cooldown, max_items, af)
posts_num, new_posts = 0, 0
with make_session() as session:

View File

@@ -1,5 +1,5 @@
import pytest
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from unittest.mock import Mock, patch
from memory.common.db.models import ForumPost
@@ -228,19 +228,17 @@ def test_sync_lesswrong_success(mock_fetch, mock_lesswrong_post, db_session):
assert result["since"] == "2024-01-01T00:00:00"
assert result["min_karma"] == 10
assert result["max_items"] == 100
assert not result["af"]
assert result["af"] == False
# Verify fetch_lesswrong_posts was called with correct arguments (kwargs)
mock_fetch.assert_called_once()
kwargs = mock_fetch.call_args.kwargs
assert kwargs["since"] == datetime.fromisoformat("2024-01-01T00:00:00")
assert kwargs["min_karma"] == 10
assert kwargs["limit"] == 50
assert kwargs["cooldown"] == 0.1
assert kwargs["max_items"] == 100
assert kwargs["af"] is False
assert "until" in kwargs
assert isinstance(kwargs["until"], datetime)
# Verify fetch_lesswrong_posts was called with correct arguments
mock_fetch.assert_called_once_with(
datetime.fromisoformat("2024-01-01T00:00:00"),
10, # min_karma
50, # limit
0.1, # cooldown
100, # max_items
False, # af
)
# Verify sync_lesswrong_post was called for the new post
mock_sync_post.delay.assert_called_once_with(mock_lesswrong_post, ["test"])
@@ -345,14 +343,11 @@ def test_sync_lesswrong_since_parameter(mock_fetch, db_session):
forums.sync_lesswrong(since="2024-01-01T00:00:00")
expected_since = datetime.fromisoformat("2024-01-01T00:00:00")
# Verify fetch was called with correct since date (kwargs)
kwargs = mock_fetch.call_args.kwargs
actual_since = kwargs["since"]
# Verify fetch was called with correct since date
call_args = mock_fetch.call_args[0]
actual_since = call_args[0]
assert actual_since == expected_since
assert "until" in kwargs
assert isinstance(kwargs["until"], datetime)
assert kwargs["until"] >= actual_since
@pytest.mark.parametrize(
@@ -378,14 +373,14 @@ def test_sync_lesswrong_parameters(
max_items=500,
)
# Verify fetch was called with correct parameters (kwargs)
kwargs = mock_fetch.call_args.kwargs
# Verify fetch was called with correct parameters
call_args = mock_fetch.call_args[0]
assert kwargs["min_karma"] == min_karma
assert kwargs["limit"] == limit
assert kwargs["cooldown"] == cooldown
assert kwargs["max_items"] == 500
assert kwargs["af"] == af_value
assert call_args[1] == min_karma # min_karma
assert call_args[2] == limit # limit
assert call_args[3] == cooldown # cooldown
assert call_args[4] == 500 # max_items
assert call_args[5] == af_value # af
assert result["min_karma"] == min_karma
assert result["af"] == af_value

View File

@@ -27,7 +27,6 @@ from memory.common import settings
from memory.common.celery_app import (
SYNC_ALL_ARTICLE_FEEDS,
SYNC_ARTICLE_FEED,
ADD_ARTICLE_FEED,
SYNC_WEBPAGE,
SYNC_WEBSITE_ARCHIVE,
SYNC_ALL_COMICS,
@@ -50,7 +49,6 @@ from memory.common.celery_app import (
UPDATE_METADATA_FOR_ITEM,
UPDATE_METADATA_FOR_SOURCE_ITEMS,
SETUP_GIT_NOTES,
TRACK_GIT_CHANGES,
app,
)
@@ -80,7 +78,6 @@ TASK_MAPPINGS = {
"sync_article_feed": SYNC_ARTICLE_FEED,
"sync_all_article_feeds": SYNC_ALL_ARTICLE_FEEDS,
"sync_website_archive": SYNC_WEBSITE_ARCHIVE,
"add_article_feed": ADD_ARTICLE_FEED,
},
"comic": {
"sync_all_comics": SYNC_ALL_COMICS,
@@ -95,7 +92,6 @@ TASK_MAPPINGS = {
},
"notes": {
"setup_git_notes": SETUP_GIT_NOTES,
"track_git_changes": TRACK_GIT_CHANGES,
},
}
QUEUE_MAPPINGS = {
@@ -253,13 +249,6 @@ def notes_setup_git_notes(ctx, origin, email, name):
execute_task(ctx, "notes", "setup_git_notes", origin=origin, email=email, name=name)
@notes.command("track-git-changes")
@click.pass_context
def notes_track_git_changes(ctx):
"""Track git changes."""
execute_task(ctx, "notes", "track_git_changes")
@cli.group()
@click.pass_context
def maintenance(ctx):
@@ -387,34 +376,6 @@ def blogs_sync_website_archive(ctx, url):
execute_task(ctx, "blogs", "sync_website_archive", url=url)
@blogs.command("add-article-feed")
@click.option("--url", required=True, help="URL of the feed")
@click.option("--title", help="Title of the feed")
@click.option("--description", help="Description of the feed")
@click.option("--tags", help="Comma-separated tags to apply to the feed", default="")
@click.option("--active", is_flag=True, help="Whether the feed is active")
@click.option(
"--check-interval",
type=int,
help="Interval in minutes to check the feed",
default=60 * 24, # 24 hours
)
@click.pass_context
def blogs_add_article_feed(ctx, url, title, description, tags, active, check_interval):
"""Add a new article feed."""
execute_task(
ctx,
"blogs",
"add_article_feed",
url=url,
title=title,
description=description,
tags=tags.split(","),
active=active,
check_interval=check_interval,
)
@cli.group()
@click.pass_context
def comic(ctx):