diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py
index 3cdbdc5..f5a5d5a 100644
--- a/src/memory/common/db/models/source_items.py
+++ b/src/memory/common/db/models/source_items.py
@@ -269,7 +269,7 @@ class Comic(SourceItem):
return {k: v for k, v in payload.items() if v is not None}
def _chunk_contents(self) -> Sequence[extract.DataChunk]:
- image = Image.open(pathlib.Path(cast(str, self.filename)))
+ # self.filename is joined onto settings.FILE_STORAGE_DIR (assumed to be a
+ # pathlib.Path - TODO confirm). An absolute filename, as produced by the
+ # previous `filename.resolve().as_posix()` storage format, still opens
+ # unchanged because joining an absolute path discards the left-hand side.
+ image = Image.open(settings.FILE_STORAGE_DIR / cast(str, self.filename))
description = f"{self.title} by {self.author}"
return [extract.DataChunk(data=[image, description])]
diff --git a/src/memory/common/summarizer.py b/src/memory/common/summarizer.py
index 981ced6..85d5454 100644
--- a/src/memory/common/summarizer.py
+++ b/src/memory/common/summarizer.py
@@ -1,8 +1,9 @@
-import json
import logging
import traceback
from typing import Any
+from bs4 import BeautifulSoup
+
from memory.common import settings, chunker
logger = logging.getLogger(__name__)
@@ -12,11 +13,13 @@ The following text is already concise. Please identify 3-5 relevant tags that ca
Tags should be lowercase and use hyphens instead of spaces, e.g. "machine-learning" instead of "Machine Learning".
-Return your response as JSON with this format:
-{{
-"summary": "{summary}",
-"tags": ["tag1", "tag2", "tag3"]
-}}
+Return your response as XML with this format:
+<summary>{summary}</summary>
+<tags>
+    <tag>tag1</tag>
+    <tag>tag2</tag>
+    <tag>tag3</tag>
+</tags>
Text:
{content}
@@ -28,17 +31,28 @@ Also provide 3-5 relevant tags that capture the main topics or themes.
Tags should be lowercase and use hyphens instead of spaces, e.g. "machine-learning" instead of "Machine Learning".
-Return your response as JSON with this format:
-{{
- "summary": "your summary here",
- "tags": ["tag1", "tag2", "tag3"]
-}}
+Return your response as XML with this format:
+
+<summary>your summary here</summary>
+<tags>
+    <tag>tag1</tag>
+    <tag>tag2</tag>
+    <tag>tag3</tag>
+</tags>
Text to summarize:
{content}
"""
+def parse_response(response: str) -> dict[str, Any]:
+    """Extract the summary and tags from the summarizer model's reply.
+
+    Args:
+        response: Raw text returned by the model. It is expected to contain a
+            ``<summary>`` element and zero or more ``<tag>`` elements, but it
+            may be empty or malformed.
+
+    Returns:
+        ``{"summary": str, "tags": list[str]}``. A reply without a
+        ``<summary>`` element yields an empty summary instead of raising, and
+        blank ``<tag>`` elements are dropped.
+    """
+    # The reply is a fragment (sibling <summary> and <tag> elements, possibly
+    # surrounded by prose) rather than a single-root XML document, so use the
+    # lenient stdlib-backed parser instead of the "xml" builder, which also
+    # requires lxml to be installed.
+    soup = BeautifulSoup(response or "", "html.parser")
+
+    # find() returns None when the element is absent.
+    summary_element = soup.find("summary")
+    summary = summary_element.get_text().strip() if summary_element is not None else ""
+
+    tags = [text for tag in soup.find_all("tag") if (text := tag.get_text().strip())]
+    return {"summary": summary, "tags": tags}
+
+
def _call_openai(prompt: str) -> dict[str, Any]:
"""Call OpenAI API for summarization."""
import openai
@@ -58,7 +72,7 @@ def _call_openai(prompt: str) -> dict[str, Any]:
temperature=0.3,
max_tokens=2048,
)
- return json.loads(response.choices[0].message.content or "{}")
+ return parse_response(response.choices[0].message.content or "")
except Exception as e:
logger.error(f"OpenAI API error: {e}")
raise
@@ -73,13 +87,21 @@ def _call_anthropic(prompt: str) -> dict[str, Any]:
         response = client.messages.create(
             model=settings.SUMMARIZER_MODEL.split("/")[1],
             messages=[{"role": "user", "content": prompt}],
-            system="You are a helpful assistant that creates concise summaries and identifies key topics. Always respond with valid JSON.",
+            system="You are a helpful assistant that creates concise summaries and identifies key topics. Always respond with valid XML.",
             temperature=0.3,
             max_tokens=2048,
         )
-        return json.loads(response.content[0].text)
+        raw_text = response.content[0].text
+        try:
+            return parse_response(raw_text)
+        except Exception:
+            # Log the unparseable reply here, where raw_text is known to be bound.
+            # Doing it in the outer handler would raise UnboundLocalError whenever
+            # messages.create() itself failed, masking the real API error.
+            logger.error(raw_text)
+            raise
     except Exception as e:
         logger.error(f"Anthropic API error: {e}")
         raise
diff --git a/src/memory/parsers/archives.py b/src/memory/parsers/archives.py
index 762d45b..fa54874 100644
--- a/src/memory/parsers/archives.py
+++ b/src/memory/parsers/archives.py
@@ -294,4 +294,5 @@ feeds = [
"https://www.theredhandfiles.com/",
"https://karlin.blog/",
"https://slatestarcodex.com/",
+ "https://www.astralcodexten.com/",
]
diff --git a/src/memory/workers/tasks/comic.py b/src/memory/workers/tasks/comic.py
index 9e81706..de75ff3 100644
--- a/src/memory/workers/tasks/comic.py
+++ b/src/memory/workers/tasks/comic.py
@@ -75,6 +75,7 @@ def sync_comic(
published_date: datetime | None = None,
):
"""Synchronize a comic from a URL."""
+ logger.info(f"syncing comic {url}")
with make_session() as session:
existing_comic = check_content_exists(session, Comic, url=url)
if existing_comic:
@@ -101,7 +102,9 @@ def sync_comic(
url=url,
published=published_date,
author=author,
- filename=filename.resolve().as_posix(),
+ # Resolve the storage root as well: relative_to() is a purely lexical
+ # comparison, so a symlinked or relative FILE_STORAGE_DIR would raise ValueError.
+ filename=filename.resolve().relative_to(settings.FILE_STORAGE_DIR.resolve()).as_posix(),
mime_type=mime_type,
size=len(response.content),
sha256=create_content_hash(f"{image_url}{published_date}"),
diff --git a/tools/run_celery_task.py b/tools/run_celery_task.py
index 6e87f29..284071f 100644
--- a/tools/run_celery_task.py
+++ b/tools/run_celery_task.py
@@ -84,6 +84,7 @@ TASK_MAPPINGS = {
"sync_smbc": SYNC_SMBC,
"sync_xkcd": SYNC_XKCD,
"sync_comic": SYNC_COMIC,
+ "full_sync_comics": "memory.workers.tasks.comic.full_sync_comic",
},
"forums": {
"sync_lesswrong": SYNC_LESSWRONG,
@@ -422,6 +423,13 @@ def comic_sync_comic(ctx, image_url, title, author, published_date):
)
+@comic.command("full-sync-comics")
+@click.pass_context
+def comic_full_sync_comics(ctx):
+    """Full sync comics."""
+    # Category and task key handed to the shared task runner; presumably
+    # resolved through TASK_MAPPINGS - confirm against execute_task.
+    category, task_key = "comic", "full_sync_comics"
+    execute_task(ctx, category, task_key)
+
+
@cli.group()
@click.pass_context
def forums(ctx):
@@ -442,7 +450,7 @@ def forums_sync_lesswrong(ctx, since_date, min_karma, limit, cooldown, max_items
ctx,
"forums",
"sync_lesswrong",
- since_date=since_date,
+ since=since_date,
min_karma=min_karma,
limit=limit,
cooldown=cooldown,