mirror of
https://github.com/mruwnik/memory.git
synced 2025-06-08 13:24:41 +02:00
Check all feeds work
This commit is contained in:
parent
876fa87725
commit
0f15e4e410
123
db/migrations/versions/20250527_013945_rename_rss_feed.py
Normal file
123
db/migrations/versions/20250527_013945_rename_rss_feed.py
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
"""Rename rss feed
|
||||||
|
|
||||||
|
Revision ID: f8e6a7f80928
|
||||||
|
Revises: d897c6353a84
|
||||||
|
Create Date: 2025-05-27 01:39:45.722077
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
# NOTE: these values form the migration chain and must never be edited
# after the migration has been applied anywhere.
revision: str = "f8e6a7f80928"
# Must match the `revision` of the previous migration file.
down_revision: Union[str, None] = "d897c6353a84"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
def upgrade() -> None:
    """Replace the ``rss_feeds`` table with an identical ``article_feeds`` table.

    NOTE(review): this is a drop-and-recreate "rename" — no data is copied
    from ``rss_feeds`` into ``article_feeds``, so any existing rows are lost.
    Confirm that is intended; ``op.rename_table`` would preserve the data.
    """
    # New table mirrors the old rss_feeds schema (compare downgrade() below).
    op.create_table(
        "article_feeds",
        sa.Column("id", sa.BigInteger(), nullable=False),
        sa.Column("url", sa.Text(), nullable=False),
        sa.Column("title", sa.Text(), nullable=True),
        sa.Column("description", sa.Text(), nullable=True),
        sa.Column("tags", sa.ARRAY(sa.Text()), server_default="{}", nullable=False),
        sa.Column("last_checked_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("active", sa.Boolean(), server_default="true", nullable=False),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("url"),
    )
    # Composite index presumably used to pick active feeds by how recently
    # they were polled — mirrors the old rss_feeds_active_idx.
    op.create_index(
        "article_feeds_active_idx",
        "article_feeds",
        ["active", "last_checked_at"],
        unique=False,
    )
    # GIN index for array queries on the tags column (PostgreSQL-specific).
    op.create_index(
        "article_feeds_tags_idx",
        "article_feeds",
        ["tags"],
        unique=False,
        postgresql_using="gin",
    )
    # Tear down the old table only after the replacement exists:
    # indexes first, then the table itself.
    op.drop_index("rss_feeds_active_idx", table_name="rss_feeds")
    op.drop_index("rss_feeds_tags_idx", table_name="rss_feeds", postgresql_using="gin")
    op.drop_table("rss_feeds")
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Recreate the original ``rss_feeds`` table and drop ``article_feeds``.

    Uses the dialect-specific types/names (``postgresql.TIMESTAMP``,
    explicit constraint names) that an autogenerated reflection of the old
    table produced, so the restored schema matches the pre-migration state.

    NOTE(review): like upgrade(), this drops the current table without
    copying rows — downgrading loses any data in ``article_feeds``.
    """
    op.create_table(
        "rss_feeds",
        sa.Column("id", sa.BIGINT(), autoincrement=True, nullable=False),
        sa.Column("url", sa.TEXT(), autoincrement=False, nullable=False),
        sa.Column("title", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column("description", sa.TEXT(), autoincrement=False, nullable=True),
        sa.Column(
            "tags",
            postgresql.ARRAY(sa.TEXT()),
            # Empty text[] default, spelled as the server-side cast.
            server_default=sa.text("'{}'::text[]"),
            autoincrement=False,
            nullable=False,
        ),
        sa.Column(
            "last_checked_at",
            postgresql.TIMESTAMP(timezone=True),
            autoincrement=False,
            nullable=True,
        ),
        sa.Column(
            "active",
            sa.BOOLEAN(),
            server_default=sa.text("true"),
            autoincrement=False,
            nullable=False,
        ),
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.Column(
            "updated_at",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        # Constraint names pinned so the restored table matches the
        # pre-migration database exactly.
        sa.PrimaryKeyConstraint("id", name="rss_feeds_pkey"),
        sa.UniqueConstraint("url", name="rss_feeds_url_key"),
    )
    op.create_index(
        "rss_feeds_tags_idx",
        "rss_feeds",
        ["tags"],
        unique=False,
        postgresql_using="gin",
    )
    op.create_index(
        "rss_feeds_active_idx", "rss_feeds", ["active", "last_checked_at"], unique=False
    )
    # Remove the new table: indexes first, then the table.
    op.drop_index(
        "article_feeds_tags_idx", table_name="article_feeds", postgresql_using="gin"
    )
    op.drop_index("article_feeds_active_idx", table_name="article_feeds")
    op.drop_table("article_feeds")
|
@ -697,8 +697,8 @@ class GithubItem(SourceItem):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class RssFeed(Base):
|
class ArticleFeed(Base):
|
||||||
__tablename__ = "rss_feeds"
|
__tablename__ = "article_feeds"
|
||||||
|
|
||||||
id = Column(BigInteger, primary_key=True)
|
id = Column(BigInteger, primary_key=True)
|
||||||
url = Column(Text, nullable=False, unique=True)
|
url = Column(Text, nullable=False, unique=True)
|
||||||
@ -716,8 +716,8 @@ class RssFeed(Base):
|
|||||||
|
|
||||||
# Add indexes
|
# Add indexes
|
||||||
__table_args__ = (
|
__table_args__ = (
|
||||||
Index("rss_feeds_active_idx", "active", "last_checked_at"),
|
Index("article_feeds_active_idx", "active", "last_checked_at"),
|
||||||
Index("rss_feeds_tags_idx", "tags", postgresql_using="gin"),
|
Index("article_feeds_tags_idx", "tags", postgresql_using="gin"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -615,36 +615,3 @@ def parse_webpage(url: str) -> Article:
|
|||||||
html = cast(str, fetch_html(url))
|
html = cast(str, fetch_html(url))
|
||||||
parser = get_parser_for_url(url, html)
|
parser = get_parser_for_url(url, html)
|
||||||
return parser.parse(html, url)
|
return parser.parse(html, url)
|
||||||
|
|
||||||
|
|
||||||
# Feed URLs to check (see the parse_webpage() helper above); the commit's
# purpose is "Check all feeds work", so every entry must be a valid URL.
feeds = [
    "https://archive.ph/o/IQUoT/https://www.bloomberg.com/opinion/authors/ARbTQlRLRjE/matthew-s-levine",
    "https://www.rifters.com/crawl/",
    "https://rachelbythebay.com/w/",
    "https://danluu.com/",
    # Fixed: was "https://guzey.come" — ".come" is an invalid TLD typo.
    "https://guzey.com",
    "https://aphyr.com/",
    "https://www.applieddivinitystudies.com/",
    "https://www.imightbewrong.org/",
    "https://www.kvetch.au/",
    "https://www.overcomingbias.com/",
    "https://samkriss.substack.com/",
    "https://www.richardhanania.com/",
    "https://skunkledger.substack.com/",
    "https://taipology.substack.com/",
    "https://putanumonit.com/",
    "https://www.flyingmachinestudios.com/",
    "https://www.theintrinsicperspective.com/",
    "https://www.strangeloopcanon.com/",
    "https://slimemoldtimemold.com/",
    "https://zeroinputagriculture.substack.com/",
    "https://nayafia.substack.com",
    "https://www.paulgraham.com/articles.html",
    "https://mcfunley.com/writing",
    "https://www.bitsaboutmoney.com/",
    "https://akarlin.com",
    "https://www.exurbe.com/",
    "https://acoup.blog/",
    "https://www.theredhandfiles.com/",
    "https://karlin.blog/",
]
|
|
||||||
|
@ -319,7 +319,8 @@ class GuzeyParser(HTMLListParser):
|
|||||||
|
|
||||||
|
|
||||||
class PaulGrahamParser(HTMLListParser):
|
class PaulGrahamParser(HTMLListParser):
|
||||||
item_selector = "font a[href]"
|
item_selector = "img + font"
|
||||||
|
title_selector = "a"
|
||||||
skip_patterns = DEFAULT_SKIP_PATTERNS + [
|
skip_patterns = DEFAULT_SKIP_PATTERNS + [
|
||||||
r"\.txt$", # Skip text files
|
r"\.txt$", # Skip text files
|
||||||
r"turbifycdn\.com", # Skip CDN links
|
r"turbifycdn\.com", # Skip CDN links
|
||||||
@ -329,7 +330,6 @@ class PaulGrahamParser(HTMLListParser):
|
|||||||
# Only include items that are actual essays (relative URLs ending in .html)
|
# Only include items that are actual essays (relative URLs ending in .html)
|
||||||
return (
|
return (
|
||||||
item.url.endswith(".html")
|
item.url.endswith(".html")
|
||||||
and not item.url.startswith("http")
|
|
||||||
and len(item.title) > 5 # Filter out very short titles
|
and len(item.title) > 5 # Filter out very short titles
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -589,10 +589,12 @@ def test_html_list_parser_extract_date_without_selector():
|
|||||||
(
|
(
|
||||||
PaulGrahamParser,
|
PaulGrahamParser,
|
||||||
"https://www.paulgraham.com/articles",
|
"https://www.paulgraham.com/articles",
|
||||||
[("Long enough title", "essay.html")],
|
[
|
||||||
|
("Long enough title", "essay.html"),
|
||||||
|
("Long enough title", "https://other.com/essay.html"),
|
||||||
|
],
|
||||||
[
|
[
|
||||||
("Short", "essay.html"),
|
("Short", "essay.html"),
|
||||||
("Long enough title", "https://other.com/essay.html"),
|
|
||||||
("Long enough title", "document.txt"),
|
("Long enough title", "document.txt"),
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user