diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py index 3cdbdc5..f5a5d5a 100644 --- a/src/memory/common/db/models/source_items.py +++ b/src/memory/common/db/models/source_items.py @@ -269,7 +269,7 @@ class Comic(SourceItem): return {k: v for k, v in payload.items() if v is not None} def _chunk_contents(self) -> Sequence[extract.DataChunk]: - image = Image.open(pathlib.Path(cast(str, self.filename))) + image = Image.open(settings.FILE_STORAGE_DIR / cast(str, self.filename)) description = f"{self.title} by {self.author}" return [extract.DataChunk(data=[image, description])] diff --git a/src/memory/common/summarizer.py b/src/memory/common/summarizer.py index 981ced6..85d5454 100644 --- a/src/memory/common/summarizer.py +++ b/src/memory/common/summarizer.py @@ -1,8 +1,9 @@ -import json import logging import traceback from typing import Any +from bs4 import BeautifulSoup + from memory.common import settings, chunker logger = logging.getLogger(__name__) @@ -12,11 +13,13 @@ The following text is already concise. Please identify 3-5 relevant tags that ca Tags should be lowercase and use hyphens instead of spaces, e.g. "machine-learning" instead of "Machine Learning". -Return your response as JSON with this format: -{{ -"summary": "{summary}", -"tags": ["tag1", "tag2", "tag3"] -}} +Return your response as XML with this format: +<summary>{summary}</summary> +<tags> + <tag>tag1</tag> + <tag>tag2</tag> + <tag>tag3</tag> +</tags> Text: {content} @@ -28,17 +31,28 @@ Also provide 3-5 relevant tags that capture the main topics or themes. Tags should be lowercase and use hyphens instead of spaces, e.g. "machine-learning" instead of "Machine Learning". 
-Return your response as JSON with this format: -{{ - "summary": "your summary here", - "tags": ["tag1", "tag2", "tag3"] -}} +Return your response as XML with this format: + +<summary>your summary here</summary> +<tags> + <tag>tag1</tag> + <tag>tag2</tag> + <tag>tag3</tag> +</tags> Text to summarize: {content} """ +def parse_response(response: str) -> dict[str, Any]: + """Parse the response from the summarizer.""" + soup = BeautifulSoup(response, "xml") + summary = soup.find("summary").text + tags = [tag.text for tag in soup.find_all("tag")] + return {"summary": summary, "tags": tags} + + def _call_openai(prompt: str) -> dict[str, Any]: """Call OpenAI API for summarization.""" import openai @@ -58,7 +72,7 @@ def _call_openai(prompt: str) -> dict[str, Any]: temperature=0.3, max_tokens=2048, ) - return json.loads(response.choices[0].message.content or "{}") + return parse_response(response.choices[0].message.content or "") except Exception as e: logger.error(f"OpenAI API error: {e}") raise @@ -73,13 +87,14 @@ def _call_anthropic(prompt: str) -> dict[str, Any]: response = client.messages.create( model=settings.SUMMARIZER_MODEL.split("/")[1], messages=[{"role": "user", "content": prompt}], - system="You are a helpful assistant that creates concise summaries and identifies key topics. Always respond with valid JSON.", + system="You are a helpful assistant that creates concise summaries and identifies key topics. 
Always respond with valid XML.", temperature=0.3, max_tokens=2048, ) - return json.loads(response.content[0].text) + return parse_response(response.content[0].text) except Exception as e: logger.error(f"Anthropic API error: {e}") + logger.error(response.content[0].text) raise diff --git a/src/memory/parsers/archives.py b/src/memory/parsers/archives.py index 762d45b..fa54874 100644 --- a/src/memory/parsers/archives.py +++ b/src/memory/parsers/archives.py @@ -294,4 +294,5 @@ feeds = [ "https://www.theredhandfiles.com/", "https://karlin.blog/", "https://slatestarcodex.com/", + "https://www.astralcodexten.com/", ] diff --git a/src/memory/workers/tasks/comic.py b/src/memory/workers/tasks/comic.py index 9e81706..de75ff3 100644 --- a/src/memory/workers/tasks/comic.py +++ b/src/memory/workers/tasks/comic.py @@ -75,6 +75,7 @@ def sync_comic( published_date: datetime | None = None, ): """Synchronize a comic from a URL.""" + logger.info(f"syncing comic {url}") with make_session() as session: existing_comic = check_content_exists(session, Comic, url=url) if existing_comic: @@ -101,7 +102,7 @@ def sync_comic( url=url, published=published_date, author=author, - filename=filename.resolve().as_posix(), + filename=filename.resolve().relative_to(settings.FILE_STORAGE_DIR).as_posix(), mime_type=mime_type, size=len(response.content), sha256=create_content_hash(f"{image_url}{published_date}"), diff --git a/tools/run_celery_task.py b/tools/run_celery_task.py index 6e87f29..284071f 100644 --- a/tools/run_celery_task.py +++ b/tools/run_celery_task.py @@ -84,6 +84,7 @@ TASK_MAPPINGS = { "sync_smbc": SYNC_SMBC, "sync_xkcd": SYNC_XKCD, "sync_comic": SYNC_COMIC, + "full_sync_comics": "memory.workers.tasks.comic.full_sync_comic", }, "forums": { "sync_lesswrong": SYNC_LESSWRONG, @@ -422,6 +423,13 @@ def comic_sync_comic(ctx, image_url, title, author, published_date): ) +@comic.command("full-sync-comics") +@click.pass_context +def comic_full_sync_comics(ctx): + """Full sync comics.""" + 
execute_task(ctx, "comic", "full_sync_comics") + + @cli.group() @click.pass_context def forums(ctx): @@ -442,7 +450,7 @@ def forums_sync_lesswrong(ctx, since_date, min_karma, limit, cooldown, max_items ctx, "forums", "sync_lesswrong", - since_date=since_date, + since=since_date, min_karma=min_karma, limit=limit, cooldown=cooldown,