From 0f15e4e410b1d0f61b3988342f375598b76cb993 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Tue, 27 May 2025 01:42:22 +0200
Subject: [PATCH] Check all feeds work

---
 .../20250527_013945_rename_rss_feed.py        | 48 +++++++++++++++++++
 src/memory/common/db/models.py                |  8 +-
 src/memory/common/parsers/blogs.py            | 33 -------------
 src/memory/common/parsers/feeds.py            |  6 +-
 tests/memory/common/parsers/test_feeds.py     |  6 +-
 5 files changed, 59 insertions(+), 42 deletions(-)
 create mode 100644 db/migrations/versions/20250527_013945_rename_rss_feed.py

diff --git a/db/migrations/versions/20250527_013945_rename_rss_feed.py b/db/migrations/versions/20250527_013945_rename_rss_feed.py
new file mode 100644
index 0000000..8482f55
--- /dev/null
+++ b/db/migrations/versions/20250527_013945_rename_rss_feed.py
@@ -0,0 +1,48 @@
+"""Rename rss feed
+
+Revision ID: f8e6a7f80928
+Revises: d897c6353a84
+Create Date: 2025-05-27 01:39:45.722077
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = "f8e6a7f80928"
+down_revision: Union[str, None] = "d897c6353a84"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _rename_feed_table(old: str, new: str) -> None:
+    """Rename table ``old`` to ``new``, together with its constraints and indexes.
+
+    The table is renamed in place rather than dropped and re-created, so the
+    feed rows already stored in it survive the migration in both directions.
+    """
+    op.rename_table(old, new)
+    # Renaming a constraint also renames the index that backs it.
+    for suffix in ("pkey", "url_key"):
+        op.execute(
+            f"ALTER TABLE {new} RENAME CONSTRAINT {old}_{suffix} TO {new}_{suffix}"
+        )
+    for suffix in ("active_idx", "tags_idx"):
+        op.execute(f"ALTER INDEX {old}_{suffix} RENAME TO {new}_{suffix}")
+    # Keep the id sequence's name in step with the table; IF EXISTS makes this
+    # a no-op when no sequence exists under the default name.
+    op.execute(f"ALTER SEQUENCE IF EXISTS {old}_id_seq RENAME TO {new}_id_seq")
+
+
+def upgrade() -> None:
+    """Rename ``rss_feeds`` to ``article_feeds``, keeping its rows."""
+    _rename_feed_table("rss_feeds", "article_feeds")
+
+
+def downgrade() -> None:
+    """Rename ``article_feeds`` back to ``rss_feeds``, keeping its rows."""
+    _rename_feed_table("article_feeds", "rss_feeds")
diff --git a/src/memory/common/db/models.py b/src/memory/common/db/models.py
index 1cf1417..ab38440 100644
--- a/src/memory/common/db/models.py
+++ b/src/memory/common/db/models.py
@@ -697,8 +697,8 @@ class GithubItem(SourceItem):
     )
 
 
-class RssFeed(Base):
-    __tablename__ = "rss_feeds"
+class ArticleFeed(Base):
+    __tablename__ = "article_feeds"
 
     id = Column(BigInteger, primary_key=True)
     url = Column(Text, nullable=False, unique=True)
@@ -716,8 +716,8 @@ class RssFeed(Base):
 
     # Add indexes
     __table_args__ = (
-        Index("rss_feeds_active_idx", "active", "last_checked_at"),
-        Index("rss_feeds_tags_idx", "tags", postgresql_using="gin"),
+        Index("article_feeds_active_idx", "active", "last_checked_at"),
+        Index("article_feeds_tags_idx", "tags", postgresql_using="gin"),
     )
 
 
diff --git a/src/memory/common/parsers/blogs.py b/src/memory/common/parsers/blogs.py
index 021e718..c99b2e3 100644
--- a/src/memory/common/parsers/blogs.py
+++ b/src/memory/common/parsers/blogs.py
@@ -615,36 +615,3 @@ def parse_webpage(url: str) -> Article:
     html = cast(str, fetch_html(url))
     parser = get_parser_for_url(url, html)
     return parser.parse(html, url)
-
-
-feeds = [
-    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
-    "https://www.rifters.com/crawl/",
-    "https://rachelbythebay.com/w/",
-    "https://danluu.com/",
-    "https://guzey.come",
-    "https://aphyr.com/",
-    "https://www.applieddivinitystudies.com/",
-    "https://www.imightbewrong.org/",
-    "https://www.kvetch.au/",
-    "https://www.overcomingbias.com/",
-    "https://samkriss.substack.com/",
-    "https://www.richardhanania.com/",
-    "https://skunkledger.substack.com/",
-    "https://taipology.substack.com/",
-    "https://putanumonit.com/",
-    "https://www.flyingmachinestudios.com/",
-    "https://www.theintrinsicperspective.com/",
-    "https://www.strangeloopcanon.com/",
-    "https://slimemoldtimemold.com/",
-    "https://zeroinputagriculture.substack.com/",
-    "https://nayafia.substack.com",
-    "https://www.paulgraham.com/articles.html",
-    "https://mcfunley.com/writing",
-    "https://www.bitsaboutmoney.com/",
-    "https://akarlin.com",
-    "https://www.exurbe.com/",
-    "https://acoup.blog/",
-    "https://www.theredhandfiles.com/",
-    "https://karlin.blog/",
-]
diff --git a/src/memory/common/parsers/feeds.py b/src/memory/common/parsers/feeds.py
index 6ea25a4..d352c7e 100644
--- a/src/memory/common/parsers/feeds.py
+++ b/src/memory/common/parsers/feeds.py
@@ -319,7 +319,8 @@ class GuzeyParser(HTMLListParser):
 
 
 class PaulGrahamParser(HTMLListParser):
-    item_selector = "font a[href]"
+    item_selector = "img + font"
+    title_selector = "a"
     skip_patterns = DEFAULT_SKIP_PATTERNS + [
         r"\.txt$",  # Skip text files
         r"turbifycdn\.com",  # Skip CDN links
@@ -329,7 +330,6 @@ class PaulGrahamParser(HTMLListParser):
-        # Only include items that are actual essays (relative URLs ending in .html)
+        # Only include items that are actual essays (URLs ending in .html)
         return (
             item.url.endswith(".html")
-            and not item.url.startswith("http")
             and len(item.title) > 5  # Filter out very short titles
         )
 
diff --git a/tests/memory/common/parsers/test_feeds.py b/tests/memory/common/parsers/test_feeds.py
index b8cf541..7981bfc 100644
--- a/tests/memory/common/parsers/test_feeds.py
+++ b/tests/memory/common/parsers/test_feeds.py
@@ -589,10 +589,12 @@ def test_html_list_parser_extract_date_without_selector():
         (
             PaulGrahamParser,
             "https://www.paulgraham.com/articles",
-            [("Long enough title", "essay.html")],
+            [
+                ("Long enough title", "essay.html"),
+                ("Long enough title", "https://other.com/essay.html"),
+            ],
             [
                 ("Short", "essay.html"),
-                ("Long enough title", "https://other.com/essay.html"),
                 ("Long enough title", "document.txt"),
             ],
         ),