Mirror of https://github.com/mruwnik/memory.git (synced 2025-10-01 20:52:39 +02:00)

Compare commits: 862251fedb...a2d107fad7 (2 commits)

Commits in this range:
- a2d107fad7
- b68e15d3ab
@@ -41,6 +41,7 @@ UPDATE_METADATA_FOR_ITEM = f"{MAINTENANCE_ROOT}.update_metadata_for_item"
 SYNC_WEBPAGE = f"{BLOGS_ROOT}.sync_webpage"
 SYNC_ARTICLE_FEED = f"{BLOGS_ROOT}.sync_article_feed"
 SYNC_ALL_ARTICLE_FEEDS = f"{BLOGS_ROOT}.sync_all_article_feeds"
+ADD_ARTICLE_FEED = f"{BLOGS_ROOT}.add_article_feed"
 SYNC_WEBSITE_ARCHIVE = f"{BLOGS_ROOT}.sync_website_archive"
 
 
@@ -106,6 +106,7 @@ ARTICLE_FEED_SYNC_INTERVAL = int(os.getenv("ARTICLE_FEED_SYNC_INTERVAL", 30 * 60
 CLEAN_COLLECTION_INTERVAL = int(os.getenv("CLEAN_COLLECTION_INTERVAL", 24 * 60 * 60))
 CHUNK_REINGEST_INTERVAL = int(os.getenv("CHUNK_REINGEST_INTERVAL", 60 * 60))
 NOTES_SYNC_INTERVAL = int(os.getenv("NOTES_SYNC_INTERVAL", 15 * 60))
+LESSWRONG_SYNC_INTERVAL = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24))
 
 CHUNK_REINGEST_SINCE_MINUTES = int(os.getenv("CHUNK_REINGEST_SINCE_MINUTES", 60 * 24))
 
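The new interval follows the same environment-override pattern as the surrounding settings. A minimal sketch of how the default resolves and how a deployment could override it (only the variable name and default are taken from the diff):

```python
import os

# With nothing set, int() receives the integer default: one day in seconds.
default = int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24))

# Deployments override via the environment; the value arrives as a string
# and int() parses it.
os.environ["LESSWRONG_SYNC_INTERVAL"] = str(60 * 60 * 12)
assert int(os.getenv("LESSWRONG_SYNC_INTERVAL", 60 * 60 * 24)) == 43200
```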
@@ -295,4 +295,7 @@ feeds = [
     "https://karlin.blog/",
     "https://slatestarcodex.com/",
     "https://www.astralcodexten.com/",
+    "https://nayafia.substack.com",
+    "https://homosabiens.substack.com",
+    "https://usefulfictions.substack.com",
 ]
@@ -609,6 +609,9 @@ PARSER_REGISTRY = {
     r"rachelbythebay\.com": RachelByTheBayParser,
     r"nadia\.xyz": NadiaXyzParser,
     r"slatestarcodex\.com": SlateStarCodexParser,
+    r"nayafia\.substack\.com": SubstackParser,
+    r"homosabiens\.substack\.com": SubstackParser,
+    r"usefulfictions\.substack\.com": SubstackParser,
 }
 
 
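The registry keys read as regular expressions over feed URLs; the lookup itself is not shown in this diff, so the dispatch below is a hypothetical sketch with stand-in parser classes:

```python
import re

# Stand-in parsers; the real classes come from the blogs module.
class SubstackParser: ...
class SlateStarCodexParser: ...

PARSER_REGISTRY = {
    r"slatestarcodex\.com": SlateStarCodexParser,
    r"nayafia\.substack\.com": SubstackParser,
}

def get_parser(url: str):
    # Hypothetical dispatch: the first pattern that matches the URL wins.
    for pattern, parser_cls in PARSER_REGISTRY.items():
        if re.search(pattern, url):
            return parser_cls()
    return None

assert isinstance(get_parser("https://nayafia.substack.com/p/x"), SubstackParser)
```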
@@ -237,6 +237,7 @@ def fetch_lesswrong(
 
 def fetch_lesswrong_posts(
     since: datetime | None = None,
+    until: datetime | None = None,
     min_karma: int = 10,
     limit: int = 50,
     cooldown: float = 0.5,
@@ -280,6 +281,9 @@ def fetch_lesswrong_posts(
             break
 
         for post in page_posts:
+            published_at = post.get("published_at")
+            if published_at and until and published_at > until:
+                break
             yield post
 
         last_item = page_posts[-1]
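The added guard assumes posts arrive sorted by published_at ascending, so the first post past until ends the scan of that page. A self-contained sketch of the same cutoff (hypothetical data; here return ends the whole generator, whereas the diff's break only exits the inner loop):

```python
from datetime import datetime

# Hypothetical pages of posts, already sorted by published_at ascending.
pages = [
    [{"title": "a", "published_at": datetime(2024, 1, 1)},
     {"title": "b", "published_at": datetime(2024, 1, 5)}],
    [{"title": "c", "published_at": datetime(2024, 2, 1)}],
]

def posts_until(pages, until):
    for page_posts in pages:
        for post in page_posts:
            published_at = post.get("published_at")
            # Same guard as the diff: only cut off when both values exist.
            if published_at and until and published_at > until:
                return
            yield post

print([p["title"] for p in posts_until(pages, datetime(2024, 1, 10))])
# -> ['a', 'b']
```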
@@ -8,6 +8,7 @@ from memory.common.celery_app import (
     SYNC_ALL_COMICS,
     SYNC_ALL_ARTICLE_FEEDS,
     TRACK_GIT_CHANGES,
+    SYNC_LESSWRONG,
 )
 
 logger = logging.getLogger(__name__)
@@ -38,4 +39,8 @@ app.conf.beat_schedule = {
         "task": TRACK_GIT_CHANGES,
         "schedule": settings.NOTES_SYNC_INTERVAL,
     },
+    "sync-lesswrong": {
+        "task": SYNC_LESSWRONG,
+        "schedule": settings.LESSWRONG_SYNC_INTERVAL,
+    },
 }
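With this entry, Celery beat enqueues the LessWrong sync on every LESSWRONG_SYNC_INTERVAL tick. A standalone sketch of the schedule shape (the app name and task path are assumptions, not the project's actual values):

```python
from celery import Celery

app = Celery("sketch")

# Celery accepts plain seconds here; 60 * 60 * 24 mirrors the new
# LESSWRONG_SYNC_INTERVAL default of one day.
app.conf.beat_schedule = {
    "sync-lesswrong": {
        "task": "memory.workers.tasks.sync_lesswrong",  # assumed task name
        "schedule": 60 * 60 * 24,
    },
}
```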
@@ -12,6 +12,7 @@ from memory.common.celery_app import (
     SYNC_WEBPAGE,
     SYNC_ARTICLE_FEED,
     SYNC_ALL_ARTICLE_FEEDS,
+    ADD_ARTICLE_FEED,
     SYNC_WEBSITE_ARCHIVE,
 )
 from memory.workers.tasks.content_processing import (
@@ -169,10 +170,52 @@ def sync_all_article_feeds() -> list[dict]:
     return results
 
 
+@app.task(name=ADD_ARTICLE_FEED)
+def add_article_feed(
+    url: str,
+    title: str | None = None,
+    description: str | None = None,
+    tags: Iterable[str] = [],
+    active: bool = True,
+    check_interval: int = 60 * 24,  # 24 hours
+) -> dict:
+    """
+    Add a new ArticleFeed.
+
+    Args:
+        url: URL of the feed
+        title: Title of the feed (optional)
+        description: Description of the feed (optional)
+        tags: Tags to apply to the feed
+        active: Whether the feed is active
+        check_interval: Interval in minutes to check the feed
+
+    Returns:
+        dict: Summary of the added feed
+    """
+    with make_session() as session:
+        feed = session.query(ArticleFeed).filter(ArticleFeed.url == url).first()
+        if feed:
+            logger.info(f"Feed already exists: {url}")
+            return {"status": "error", "error": "Feed already exists"}
+
+        feed = ArticleFeed(
+            url=url,
+            title=title or url,
+            description=description,
+            active=active,
+            check_interval=check_interval,
+            tags=tags,
+        )
+        session.add(feed)
+        session.commit()
+        return {"status": "success", "feed_id": feed.id}
+
+
 @app.task(name=SYNC_WEBSITE_ARCHIVE)
 @safe_task_execution
 def sync_website_archive(
-    url: str, tags: Iterable[str] = [], max_pages: int = 100
+    url: str, tags: Iterable[str] = [], max_pages: int = 100, add_feed: bool = True
 ) -> dict:
     """
     Synchronize all articles from a website's archive.
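A usage sketch for the new task: the return shapes come straight from the function body, while the feed_id value and the .delay() routing are illustrative assumptions:

```python
# Called in-process (e.g. from a worker or a shell):
result = add_article_feed("https://example.com/feed.xml", tags=["blog"])
# -> {"status": "success", "feed_id": 1} on first insert,
# -> {"status": "error", "error": "Feed already exists"} on a repeat.

# Or enqueued like any other Celery task:
add_article_feed.delay("https://example.com/feed.xml", title="Example feed")
```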
@@ -187,6 +230,16 @@ def sync_website_archive(
     """
     logger.info(f"Starting archive sync for: {url}")
 
+    if add_feed:
+        with make_session() as session:
+            feed = session.query(ArticleFeed).filter(ArticleFeed.url == url).first()
+            if not feed:
+                feed = ArticleFeed(
+                    url=url,
+                    title=url,
+                    active=True,
+                )
+
     # Get archive fetcher for the website
     fetcher = get_archive_fetcher(url)
     if not fetcher:
@@ -200,10 +253,10 @@ def sync_website_archive(
     new_articles = 0
     task_ids = []
 
-    for feed_item in fetcher.fetch_all_items():
-        articles_found += 1
+    with make_session() as session:
+        for feed_item in fetcher.fetch_all_items():
+            articles_found += 1
 
-        with make_session() as session:
             existing = check_content_exists(session, BlogPost, url=feed_item.url)
             if existing:
                 continue
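The reshuffle trades one short-lived session per article for a single session spanning the whole crawl. A runnable toy illustrating the difference in session churn (make_session here is a stand-in context manager, not the project's factory):

```python
from contextlib import contextmanager

@contextmanager
def make_session():  # stand-in for the real session factory
    print("open")
    try:
        yield object()
    finally:
        print("close")

items = ["a", "b", "c"]

# Before the change: three open/close cycles, one per item.
for item in items:
    with make_session() as session:
        pass

# After the change: a single open/close pair around the whole loop.
with make_session() as session:
    for item in items:
        pass
```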
@@ -49,7 +49,8 @@ def sync_lesswrong_post(
 @app.task(name=SYNC_LESSWRONG)
 @safe_task_execution
 def sync_lesswrong(
-    since: str = (datetime.now() - timedelta(days=30)).isoformat(),
+    since: str | None = None,
+    until: str | None = None,
     min_karma: int = 10,
     limit: int = 50,
     cooldown: float = 0.5,
@@ -57,9 +58,27 @@ def sync_lesswrong(
     af: bool = False,
     tags: list[str] = [],
 ):
+    if until:
+        end_date = datetime.fromisoformat(until)
+    else:
+        end_date = datetime.now() - timedelta(hours=8)
+
     logger.info(f"Syncing LessWrong posts since {since}")
-    start_date = datetime.fromisoformat(since)
-    posts = fetch_lesswrong_posts(start_date, min_karma, limit, cooldown, max_items, af)
+
+    if since:
+        start_date = datetime.fromisoformat(since)
+    else:
+        start_date = end_date - timedelta(days=30)
+
+    posts = fetch_lesswrong_posts(
+        since=start_date,
+        until=end_date,
+        min_karma=min_karma,
+        limit=limit,
+        cooldown=cooldown,
+        max_items=max_items,
+        af=af,
+    )
 
     posts_num, new_posts = 0, 0
     with make_session() as session:
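The new defaulting yields a closed window: until falls back to eight hours before now, and since to 30 days before the window's end. A worked sketch of just that computation:

```python
from datetime import datetime, timedelta

def window(since: str | None = None, until: str | None = None):
    # Same defaulting as sync_lesswrong: explicit ISO strings win;
    # otherwise end eight hours ago and start 30 days before the end.
    end_date = datetime.fromisoformat(until) if until else datetime.now() - timedelta(hours=8)
    start_date = datetime.fromisoformat(since) if since else end_date - timedelta(days=30)
    return start_date, end_date

start, end = window()
assert end - start == timedelta(days=30)
```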
@@ -1,5 +1,5 @@
 import pytest
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timezone
 from unittest.mock import Mock, patch
 
 from memory.common.db.models import ForumPost
@@ -228,17 +228,19 @@ def test_sync_lesswrong_success(mock_fetch, mock_lesswrong_post, db_session):
     assert result["since"] == "2024-01-01T00:00:00"
     assert result["min_karma"] == 10
     assert result["max_items"] == 100
-    assert result["af"] == False
+    assert not result["af"]
 
-    # Verify fetch_lesswrong_posts was called with correct arguments
-    mock_fetch.assert_called_once_with(
-        datetime.fromisoformat("2024-01-01T00:00:00"),
-        10,  # min_karma
-        50,  # limit
-        0.1,  # cooldown
-        100,  # max_items
-        False,  # af
-    )
+    # Verify fetch_lesswrong_posts was called with correct arguments (kwargs)
+    mock_fetch.assert_called_once()
+    kwargs = mock_fetch.call_args.kwargs
+    assert kwargs["since"] == datetime.fromisoformat("2024-01-01T00:00:00")
+    assert kwargs["min_karma"] == 10
+    assert kwargs["limit"] == 50
+    assert kwargs["cooldown"] == 0.1
+    assert kwargs["max_items"] == 100
+    assert kwargs["af"] is False
+    assert "until" in kwargs
+    assert isinstance(kwargs["until"], datetime)
 
     # Verify sync_lesswrong_post was called for the new post
     mock_sync_post.delay.assert_called_once_with(mock_lesswrong_post, ["test"])
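The rewritten assertions lean on unittest.mock's call_args.args/.kwargs accessors (Python 3.8+), which only record what was actually passed by keyword; that is why the task now calls fetch_lesswrong_posts with keyword arguments. A tiny sketch:

```python
from unittest.mock import Mock

m = Mock()
m(1, b=2)
assert m.call_args.args == (1,)     # positional arguments
assert m.call_args.kwargs == {"b": 2}  # keyword arguments only
```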
@@ -343,11 +345,14 @@ def test_sync_lesswrong_since_parameter(mock_fetch, db_session):
     forums.sync_lesswrong(since="2024-01-01T00:00:00")
     expected_since = datetime.fromisoformat("2024-01-01T00:00:00")
 
-    # Verify fetch was called with correct since date
-    call_args = mock_fetch.call_args[0]
-    actual_since = call_args[0]
+    # Verify fetch was called with correct since date (kwargs)
+    kwargs = mock_fetch.call_args.kwargs
+    actual_since = kwargs["since"]
 
     assert actual_since == expected_since
+    assert "until" in kwargs
+    assert isinstance(kwargs["until"], datetime)
+    assert kwargs["until"] >= actual_since
 
 
 @pytest.mark.parametrize(
@@ -373,14 +378,14 @@ def test_sync_lesswrong_parameters(
         max_items=500,
     )
 
-    # Verify fetch was called with correct parameters
-    call_args = mock_fetch.call_args[0]
+    # Verify fetch was called with correct parameters (kwargs)
+    kwargs = mock_fetch.call_args.kwargs
 
-    assert call_args[1] == min_karma  # min_karma
-    assert call_args[2] == limit  # limit
-    assert call_args[3] == cooldown  # cooldown
-    assert call_args[4] == 500  # max_items
-    assert call_args[5] == af_value  # af
+    assert kwargs["min_karma"] == min_karma
+    assert kwargs["limit"] == limit
+    assert kwargs["cooldown"] == cooldown
+    assert kwargs["max_items"] == 500
+    assert kwargs["af"] == af_value
 
     assert result["min_karma"] == min_karma
     assert result["af"] == af_value
@@ -27,6 +27,7 @@ from memory.common import settings
 from memory.common.celery_app import (
     SYNC_ALL_ARTICLE_FEEDS,
     SYNC_ARTICLE_FEED,
+    ADD_ARTICLE_FEED,
     SYNC_WEBPAGE,
     SYNC_WEBSITE_ARCHIVE,
     SYNC_ALL_COMICS,
@@ -49,6 +50,7 @@ from memory.common.celery_app import (
     UPDATE_METADATA_FOR_ITEM,
     UPDATE_METADATA_FOR_SOURCE_ITEMS,
     SETUP_GIT_NOTES,
+    TRACK_GIT_CHANGES,
     app,
 )
 
@@ -78,6 +80,7 @@ TASK_MAPPINGS = {
         "sync_article_feed": SYNC_ARTICLE_FEED,
         "sync_all_article_feeds": SYNC_ALL_ARTICLE_FEEDS,
         "sync_website_archive": SYNC_WEBSITE_ARCHIVE,
+        "add_article_feed": ADD_ARTICLE_FEED,
     },
     "comic": {
         "sync_all_comics": SYNC_ALL_COMICS,
@@ -92,6 +95,7 @@ TASK_MAPPINGS = {
     },
     "notes": {
         "setup_git_notes": SETUP_GIT_NOTES,
+        "track_git_changes": TRACK_GIT_CHANGES,
     },
 }
 QUEUE_MAPPINGS = {
@@ -249,6 +253,13 @@ def notes_setup_git_notes(ctx, origin, email, name):
     execute_task(ctx, "notes", "setup_git_notes", origin=origin, email=email, name=name)
 
 
+@notes.command("track-git-changes")
+@click.pass_context
+def notes_track_git_changes(ctx):
+    """Track git changes."""
+    execute_task(ctx, "notes", "track_git_changes")
+
+
 @cli.group()
 @click.pass_context
 def maintenance(ctx):
@@ -376,6 +387,34 @@ def blogs_sync_website_archive(ctx, url):
     execute_task(ctx, "blogs", "sync_website_archive", url=url)
 
 
+@blogs.command("add-article-feed")
+@click.option("--url", required=True, help="URL of the feed")
+@click.option("--title", help="Title of the feed")
+@click.option("--description", help="Description of the feed")
+@click.option("--tags", help="Comma-separated tags to apply to the feed", default="")
+@click.option("--active", is_flag=True, help="Whether the feed is active")
+@click.option(
+    "--check-interval",
+    type=int,
+    help="Interval in minutes to check the feed",
+    default=60 * 24,  # 24 hours
+)
+@click.pass_context
+def blogs_add_article_feed(ctx, url, title, description, tags, active, check_interval):
+    """Add a new article feed."""
+    execute_task(
+        ctx,
+        "blogs",
+        "add_article_feed",
+        url=url,
+        title=title,
+        description=description,
+        tags=tags.split(","),
+        active=active,
+        check_interval=check_interval,
+    )
+
+
 @cli.group()
 @click.pass_context
 def comic(ctx):
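Assuming the click groups wire up as shown, the new command can be smoke-tested with click's CliRunner (the import path for cli is an assumption):

```python
from click.testing import CliRunner

from memory.cli import cli  # assumed location of the root click group

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "blogs", "add-article-feed",
        "--url", "https://example.com/feed.xml",
        "--tags", "blog,ai",
        "--active",
    ],
)
print(result.exit_code, result.output)
```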