From 1538d38bf67234cfc5ad6c5a3dec8bc92f3db025 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Thu, 26 Jun 2025 10:27:00 +0200
Subject: [PATCH] proper parsing for SSC

---

Notes (review):
    Adds SlateStarCodexParser, a BaseHTMLParser subclass that only sets
    class attributes: CSS selectors for the article, title, author, date
    and content nodes, the date_format "%B %d, %Y", a fixed author string,
    and a remove_selectors list that extends
    BaseHTMLParser.remove_selectors with 15 site-specific selectors.
    The second hunk maps the pattern r"slatestarcodex\.com" to the new
    class in PARSER_REGISTRY.

    Not visible in this patch; to be confirmed against BaseHTMLParser:
    - whether the hard-coded `author` or `author_selector` takes
      precedence when both are set;
    - whether date_format is handed to strptime; if so, "%B" matches
      locale-dependent month names;
    - the .pjgm-* class/id names are presumably taken from the site's
      theme markup; verify against a live page.

 src/memory/parsers/blogs.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/memory/parsers/blogs.py b/src/memory/parsers/blogs.py
index 631e19f..76a4afd 100644
--- a/src/memory/parsers/blogs.py
+++ b/src/memory/parsers/blogs.py
@@ -532,6 +532,36 @@ class NadiaXyzParser(BaseHTMLParser):
     ]
 
 
+class SlateStarCodexParser(BaseHTMLParser):
+    """Parser for slatestarcodex.com (Scott Alexander's blog)."""
+
+    article_selector = ".post, .hentry, [id^='post-']"
+    title_selector = "h1.pjgm-posttitle, h1"
+    author_selector = ".author.vcard a, .url.fn.n"
+    date_selector = ".entry-date"
+    date_format = "%B %d, %Y"  # "January 21, 2021" format
+    content_selector = ".pjgm-postcontent"
+    author = "Scott Alexander"
+
+    remove_selectors = BaseHTMLParser.remove_selectors + [
+        ".pjgm-postmeta",
+        ".pjgm-postutility",
+        ".pjgm-navigation",
+        "#pjgm-navbelow",
+        "#comments",
+        ".commentlist",
+        ".widget-area",
+        "#left-sidebar",
+        "#primary",
+        ".sidebar-toggle",
+        ".aar_div",  # Advertisement divs
+        ".pjgm-header",
+        ".pjgm-footer",
+        "#pjgm-menubar",
+        "#pjgm-bigtitle",
+    ]
+
+
 class BloombergParser(BaseHTMLParser):
     """Parser for bloomberg.com."""
 
@@ -578,6 +608,7 @@ PARSER_REGISTRY = {
     r"theredhandfiles\.com": TheRedHandFilesParser,
     r"rachelbythebay\.com": RachelByTheBayParser,
     r"nadia\.xyz": NadiaXyzParser,
+    r"slatestarcodex\.com": SlateStarCodexParser,
 }
 
 