From 0f15e4e410b1d0f61b3988342f375598b76cb993 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell <github@ahiru.pl>
Date: Tue, 27 May 2025 01:42:22 +0200
Subject: [PATCH] Check all feeds work

---
 .../20250527_013945_rename_rss_feed.py        | 123 ++++++++++++++++++
 src/memory/common/db/models.py                |   8 +-
 src/memory/common/parsers/blogs.py            |  33 -----
 src/memory/common/parsers/feeds.py            |   4 +-
 tests/memory/common/parsers/test_feeds.py     |   6 +-
 5 files changed, 133 insertions(+), 41 deletions(-)
 create mode 100644 db/migrations/versions/20250527_013945_rename_rss_feed.py

diff --git a/db/migrations/versions/20250527_013945_rename_rss_feed.py b/db/migrations/versions/20250527_013945_rename_rss_feed.py
new file mode 100644
index 0000000..8482f55
--- /dev/null
+++ b/db/migrations/versions/20250527_013945_rename_rss_feed.py
@@ -0,0 +1,123 @@
+"""Rename rss feed
+
+Revision ID: f8e6a7f80928
+Revises: d897c6353a84
+Create Date: 2025-05-27 01:39:45.722077
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = "f8e6a7f80928"
+down_revision: Union[str, None] = "d897c6353a84"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Rename the ``rss_feeds`` table to ``article_feeds`` in place.
+
+    The table is renamed rather than dropped and recreated, so feed rows
+    that already exist survive the migration. Its indexes, constraints and
+    id sequence are renamed too, which leaves the same object names that a
+    freshly created ``article_feeds`` table would have.
+
+    Objects renamed after the table itself:
+      * indexes ``rss_feeds_active_idx`` and ``rss_feeds_tags_idx``
+      * constraints ``rss_feeds_pkey`` and ``rss_feeds_url_key``
+      * sequence ``rss_feeds_id_seq`` (only if it exists)
+
+    The raw statements below use PostgreSQL syntax.
+    """
+    # Rename the table first; the constraint renames below address it by its
+    # new name.
+    op.rename_table("rss_feeds", "article_feeds")
+
+    # A table rename leaves index names untouched, so move the secondary
+    # indexes over to their ``article_feeds_*`` names explicitly.
+    op.execute("ALTER INDEX rss_feeds_active_idx RENAME TO article_feeds_active_idx")
+    op.execute("ALTER INDEX rss_feeds_tags_idx RENAME TO article_feeds_tags_idx")
+
+    # Renaming a primary key or unique constraint also renames the index
+    # that backs it, so no separate ALTER INDEX is needed for these.
+    op.execute(
+        "ALTER TABLE article_feeds "
+        "RENAME CONSTRAINT rss_feeds_pkey TO article_feeds_pkey"
+    )
+    op.execute(
+        "ALTER TABLE article_feeds "
+        "RENAME CONSTRAINT rss_feeds_url_key TO article_feeds_url_key"
+    )
+
+    # NOTE(review): assumes ``id`` is fed by the default serial sequence
+    # ``rss_feeds_id_seq`` - confirm against the migration that created the
+    # table. IF EXISTS turns this into a no-op if there is no such sequence.
+    op.execute(
+        "ALTER SEQUENCE IF EXISTS rss_feeds_id_seq RENAME TO article_feeds_id_seq"
+    )
+
+
+def downgrade() -> None:
+    """Rename the ``article_feeds`` table back to ``rss_feeds`` in place.
+
+    The table is renamed rather than dropped and recreated, so the feed rows
+    it holds are carried back to ``rss_feeds`` instead of being discarded.
+
+    Nothing about the table's shape changes. Its columns stay as they are:
+      * ``id``: bigint, primary key
+      * ``url``: text, not null, unique
+      * ``title``: text, nullable
+      * ``description``: text, nullable
+      * ``tags``: text array, not null, defaults to ``{}``
+      * ``last_checked_at``: timestamp with time zone, nullable
+      * ``active``: boolean, not null, defaults to true
+      * ``created_at``: timestamp with time zone, not null, defaults to now()
+      * ``updated_at``: timestamp with time zone, not null, defaults to now()
+
+    After the table itself, these objects get their ``rss_feeds_*`` names
+    back:
+      * index ``article_feeds_active_idx``
+      * index ``article_feeds_tags_idx``
+      * constraint ``article_feeds_pkey``
+      * constraint ``article_feeds_url_key``
+      * sequence ``article_feeds_id_seq`` (only if it exists)
+
+    The constraint and sequence names on the ``article_feeds`` side are the
+    ones PostgreSQL generates by default for an unnamed primary key, an
+    unnamed unique constraint on ``url`` and the sequence of a serial ``id``.
+
+    The raw statements below use PostgreSQL syntax (``ALTER INDEX``,
+    ``ALTER TABLE ... RENAME CONSTRAINT`` and ``ALTER SEQUENCE``).
+    """
+    # Rename the table first; the constraint renames below address it by its
+    # restored name.
+    op.rename_table("article_feeds", "rss_feeds")
+
+    # A table rename leaves index names untouched, so restore the secondary
+    # index names explicitly.
+    op.execute("ALTER INDEX article_feeds_active_idx RENAME TO rss_feeds_active_idx")
+    op.execute("ALTER INDEX article_feeds_tags_idx RENAME TO rss_feeds_tags_idx")
+
+    # Renaming a primary key or unique constraint also renames the index
+    # that backs it, so no separate ALTER INDEX is needed for these.
+    op.execute(
+        "ALTER TABLE rss_feeds "
+        "RENAME CONSTRAINT article_feeds_pkey TO rss_feeds_pkey"
+    )
+    op.execute(
+        "ALTER TABLE rss_feeds "
+        "RENAME CONSTRAINT article_feeds_url_key TO rss_feeds_url_key"
+    )
+
+    # NOTE(review): assumes ``id`` is fed by the default serial sequence
+    # ``article_feeds_id_seq`` - confirm against the migration that created
+    # the table. IF EXISTS turns this into a no-op if there is no such
+    # sequence.
+    op.execute(
+        "ALTER SEQUENCE IF EXISTS article_feeds_id_seq RENAME TO rss_feeds_id_seq"
+    )
diff --git a/src/memory/common/db/models.py b/src/memory/common/db/models.py
index 1cf1417..ab38440 100644
--- a/src/memory/common/db/models.py
+++ b/src/memory/common/db/models.py
@@ -697,8 +697,8 @@ class GithubItem(SourceItem):
     )
 
 
-class RssFeed(Base):
-    __tablename__ = "rss_feeds"
+class ArticleFeed(Base):
+    __tablename__ = "article_feeds"
 
     id = Column(BigInteger, primary_key=True)
     url = Column(Text, nullable=False, unique=True)
@@ -716,8 +716,8 @@ class RssFeed(Base):
 
     # Add indexes
     __table_args__ = (
-        Index("rss_feeds_active_idx", "active", "last_checked_at"),
-        Index("rss_feeds_tags_idx", "tags", postgresql_using="gin"),
+        Index("article_feeds_active_idx", "active", "last_checked_at"),
+        Index("article_feeds_tags_idx", "tags", postgresql_using="gin"),
     )
 
 
diff --git a/src/memory/common/parsers/blogs.py b/src/memory/common/parsers/blogs.py
index 021e718..c99b2e3 100644
--- a/src/memory/common/parsers/blogs.py
+++ b/src/memory/common/parsers/blogs.py
@@ -615,36 +615,3 @@ def parse_webpage(url: str) -> Article:
     html = cast(str, fetch_html(url))
     parser = get_parser_for_url(url, html)
     return parser.parse(html, url)
-
-
-feeds = [
-    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
-    "https://www.rifters.com/crawl/",
-    "https://rachelbythebay.com/w/",
-    "https://danluu.com/",
-    "https://guzey.come",
-    "https://aphyr.com/",
-    "https://www.applieddivinitystudies.com/",
-    "https://www.imightbewrong.org/",
-    "https://www.kvetch.au/",
-    "https://www.overcomingbias.com/",
-    "https://samkriss.substack.com/",
-    "https://www.richardhanania.com/",
-    "https://skunkledger.substack.com/",
-    "https://taipology.substack.com/",
-    "https://putanumonit.com/",
-    "https://www.flyingmachinestudios.com/",
-    "https://www.theintrinsicperspective.com/",
-    "https://www.strangeloopcanon.com/",
-    "https://slimemoldtimemold.com/",
-    "https://zeroinputagriculture.substack.com/",
-    "https://nayafia.substack.com",
-    "https://www.paulgraham.com/articles.html",
-    "https://mcfunley.com/writing",
-    "https://www.bitsaboutmoney.com/",
-    "https://akarlin.com",
-    "https://www.exurbe.com/",
-    "https://acoup.blog/",
-    "https://www.theredhandfiles.com/",
-    "https://karlin.blog/",
-]
diff --git a/src/memory/common/parsers/feeds.py b/src/memory/common/parsers/feeds.py
index 6ea25a4..d352c7e 100644
--- a/src/memory/common/parsers/feeds.py
+++ b/src/memory/common/parsers/feeds.py
@@ -319,7 +319,8 @@ class GuzeyParser(HTMLListParser):
 
 
 class PaulGrahamParser(HTMLListParser):
-    item_selector = "font a[href]"
+    item_selector = "img + font"
+    title_selector = "a"
     skip_patterns = DEFAULT_SKIP_PATTERNS + [
         r"\.txt$",  # Skip text files
         r"turbifycdn\.com",  # Skip CDN links
@@ -329,7 +330,6 @@ class PaulGrahamParser(HTMLListParser):
         # Only include items that are actual essays (relative URLs ending in .html)
         return (
             item.url.endswith(".html")
-            and not item.url.startswith("http")
             and len(item.title) > 5  # Filter out very short titles
         )
 
diff --git a/tests/memory/common/parsers/test_feeds.py b/tests/memory/common/parsers/test_feeds.py
index b8cf541..7981bfc 100644
--- a/tests/memory/common/parsers/test_feeds.py
+++ b/tests/memory/common/parsers/test_feeds.py
@@ -589,10 +589,12 @@ def test_html_list_parser_extract_date_without_selector():
         (
             PaulGrahamParser,
             "https://www.paulgraham.com/articles",
-            [("Long enough title", "essay.html")],
+            [
+                ("Long enough title", "essay.html"),
+                ("Long enough title", "https://other.com/essay.html"),
+            ],
             [
                 ("Short", "essay.html"),
-                ("Long enough title", "https://other.com/essay.html"),
                 ("Long enough title", "document.txt"),
             ],
         ),