minor fixes

2025-12-13 15:41:20 +01:00 · 2025-06-04 00:13:35 +00:00 · 2025-06-04 00:13:35 +00:00 · 3d9f8ae55f
commit 3d9f8ae55f
parent ac9bdb1dfc
5 changed files with 42 additions and 17 deletions
--- a/src/memory/common/db/models/source_items.py
+++ b/src/memory/common/db/models/source_items.py
@ -269,7 +269,7 @@ class Comic(SourceItem):
        return {k: v for k, v in payload.items() if v is not None}

    def _chunk_contents(self) -> Sequence[extract.DataChunk]:
-        image = Image.open(pathlib.Path(cast(str, self.filename)))
+        image = Image.open(settings.FILE_STORAGE_DIR / cast(str, self.filename))
        description = f"{self.title} by {self.author}"
        return [extract.DataChunk(data=[image, description])]

--- a/src/memory/common/summarizer.py
+++ b/src/memory/common/summarizer.py
@ -1,8 +1,9 @@
-import json
 import logging
 import traceback
 from typing import Any

+from bs4 import BeautifulSoup
+
 from memory.common import settings, chunker

 logger = logging.getLogger(__name__)
@ -12,11 +13,13 @@ The following text is already concise. Please identify 3-5 relevant tags that ca

 Tags should be lowercase and use hyphens instead of spaces, e.g. "machine-learning" instead of "Machine Learning".

-Return your response as JSON with this format:
-{{
-"summary": "{summary}",
-"tags": ["tag1", "tag2", "tag3"]
-}}
+Return your response as XML with this format:
+<summary>{summary}</summary>
+<tags>
+    <tag>tag1</tag>
+    <tag>tag2</tag>
+    <tag>tag3</tag>
+</tags>

 Text:
 {content}
@ -28,17 +31,28 @@ Also provide 3-5 relevant tags that capture the main topics or themes.

 Tags should be lowercase and use hyphens instead of spaces, e.g. "machine-learning" instead of "Machine Learning".

-Return your response as JSON with this format:
-{{
-    "summary": "your summary here",
-    "tags": ["tag1", "tag2", "tag3"]
-}}
+Return your response as XML with this format:
+
+<summary>your summary here</summary>
+<tags>
+    <tag>tag1</tag>
+    <tag>tag2</tag>
+    <tag>tag3</tag>
+</tags>

 Text to summarize:
 {content}
 """


+def parse_response(response: str) -> dict[str, Any]:
+    """Parse the summarizer's XML-style reply into a summary and tags.
+
+    Args:
+        response: Raw model output, expected to contain a ``<summary>``
+            element and zero or more ``<tag>`` elements.
+
+    Returns:
+        A dict with ``"summary"`` (str, empty when the element is missing)
+        and ``"tags"`` (list of non-empty, whitespace-stripped strings).
+    """
+    # The requested reply is a fragment with two top-level elements (and
+    # models may add stray prose around it), not a single-rooted document,
+    # so parse it leniently with the stdlib-backed parser rather than the
+    # strict "xml" one, which also requires lxml to be installed.
+    soup = BeautifulSoup(response, "html.parser")
+    summary_node = soup.find("summary")
+    # find() returns None when the element is absent (e.g. an empty reply).
+    summary = summary_node.get_text().strip() if summary_node is not None else ""
+    tags = [text for tag in soup.find_all("tag") if (text := tag.get_text().strip())]
+    return {"summary": summary, "tags": tags}
+
+
 def _call_openai(prompt: str) -> dict[str, Any]:
    """Call OpenAI API for summarization."""
    import openai
@ -58,7 +72,7 @@ def _call_openai(prompt: str) -> dict[str, Any]:
            temperature=0.3,
            max_tokens=2048,
        )
-        return json.loads(response.choices[0].message.content or "{}")
+        return parse_response(response.choices[0].message.content or "")
    except Exception as e:
        logger.error(f"OpenAI API error: {e}")
        raise
@ -73,13 +87,14 @@ def _call_anthropic(prompt: str) -> dict[str, Any]:
        response = client.messages.create(
            model=settings.SUMMARIZER_MODEL.split("/")[1],
            messages=[{"role": "user", "content": prompt}],
-            system="You are a helpful assistant that creates concise summaries and identifies key topics. Always respond with valid JSON.",
+            system="You are a helpful assistant that creates concise summaries and identifies key topics. Always respond with valid XML.",
            temperature=0.3,
            max_tokens=2048,
        )
-        return json.loads(response.content[0].text)
+        return parse_response(response.content[0].text)
    except Exception as e:
        logger.error(f"Anthropic API error: {e}")
+        logger.error(response.content[0].text)
        raise


--- a/src/memory/parsers/archives.py
+++ b/src/memory/parsers/archives.py
@ -294,4 +294,5 @@ feeds = [
    "https://www.theredhandfiles.com/",
    "https://karlin.blog/",
    "https://slatestarcodex.com/",
+    "https://www.astralcodexten.com/",
 ]
--- a/src/memory/workers/tasks/comic.py
+++ b/src/memory/workers/tasks/comic.py
@ -75,6 +75,7 @@ def sync_comic(
    published_date: datetime | None = None,
 ):
    """Synchronize a comic from a URL."""
+    logger.info(f"syncing comic {url}")
    with make_session() as session:
        existing_comic = check_content_exists(session, Comic, url=url)
        if existing_comic:
@ -101,7 +102,7 @@ def sync_comic(
        url=url,
        published=published_date,
        author=author,
-        filename=filename.resolve().as_posix(),
+        filename=filename.resolve().relative_to(settings.FILE_STORAGE_DIR).as_posix(),
        mime_type=mime_type,
        size=len(response.content),
        sha256=create_content_hash(f"{image_url}{published_date}"),
--- a/tools/run_celery_task.py
+++ b/tools/run_celery_task.py
@ -84,6 +84,7 @@ TASK_MAPPINGS = {
        "sync_smbc": SYNC_SMBC,
        "sync_xkcd": SYNC_XKCD,
        "sync_comic": SYNC_COMIC,
+        "full_sync_comics": "memory.workers.tasks.comic.full_sync_comic",
    },
    "forums": {
        "sync_lesswrong": SYNC_LESSWRONG,
@ -422,6 +423,13 @@ def comic_sync_comic(ctx, image_url, title, author, published_date):
    )


+@comic.command("full-sync-comics")
+@click.pass_context
+def comic_full_sync_comics(ctx):
+    """Full sync comics."""
+    # "full_sync_comics" is the key this change adds to TASK_MAPPINGS;
+    # execute_task presumably resolves it within the "comic" group.
+    category, task_name = "comic", "full_sync_comics"
+    execute_task(ctx, category, task_name)
+
+
@cli.group()
@click.pass_context
 def forums(ctx):
@ -442,7 +450,7 @@ def forums_sync_lesswrong(ctx, since_date, min_karma, limit, cooldown, max_items
        ctx,
        "forums",
        "sync_lesswrong",
-        since_date=since_date,
+        since=since_date,
        min_karma=min_karma,
        limit=limit,
        cooldown=cooldown,