mirror of
https://github.com/mruwnik/memory.git
synced 2025-06-28 15:14:45 +02:00
proper parsing for SSC
This commit is contained in:
parent
4049cf15b4
commit
1538d38bf6
@ -532,6 +532,36 @@ class NadiaXyzParser(BaseHTMLParser):
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class SlateStarCodexParser(BaseHTMLParser):
    """Parser configuration for slatestarcodex.com (Scott Alexander's blog).

    The class only declares CSS selectors and constants; how they are
    consumed is defined by ``BaseHTMLParser`` (not shown here).
    """

    # Where to find the post and its individual parts.
    article_selector = ".post, .hentry, [id^='post-']"
    title_selector = "h1.pjgm-posttitle, h1"
    author_selector = ".author.vcard a, .url.fn.n"
    date_selector = ".entry-date"
    date_format = "%B %d, %Y"  # e.g. "January 21, 2021"
    content_selector = ".pjgm-postcontent"

    # Fixed author name for this site.
    author = "Scott Alexander"

    # Site-specific elements to strip, appended after the base parser's
    # own removal list.
    remove_selectors = [
        *BaseHTMLParser.remove_selectors,
        # "pjgm-" prefixed post metadata and navigation
        ".pjgm-postmeta",
        ".pjgm-postutility",
        ".pjgm-navigation",
        "#pjgm-navbelow",
        # Comment section
        "#comments",
        ".commentlist",
        # Widget / sidebar containers
        ".widget-area",
        "#left-sidebar",
        "#primary",
        ".sidebar-toggle",
        # Advertisement divs
        ".aar_div",
        # "pjgm-" prefixed header, footer, menu bar and big title
        ".pjgm-header",
        ".pjgm-footer",
        "#pjgm-menubar",
        "#pjgm-bigtitle",
    ]
|
||||||
|
|
||||||
|
|
||||||
class BloombergParser(BaseHTMLParser):
|
class BloombergParser(BaseHTMLParser):
|
||||||
"""Parser for bloomberg.com."""
|
"""Parser for bloomberg.com."""
|
||||||
|
|
||||||
@ -578,6 +608,7 @@ PARSER_REGISTRY = {
|
|||||||
r"theredhandfiles\.com": TheRedHandFilesParser,
|
r"theredhandfiles\.com": TheRedHandFilesParser,
|
||||||
r"rachelbythebay\.com": RachelByTheBayParser,
|
r"rachelbythebay\.com": RachelByTheBayParser,
|
||||||
r"nadia\.xyz": NadiaXyzParser,
|
r"nadia\.xyz": NadiaXyzParser,
|
||||||
|
r"slatestarcodex\.com": SlateStarCodexParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user