#!/usr/bin/env python
__license__ = "GPL v3"

import json
from datetime import datetime
from os.path import splitext
from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


class ScientificAmerican(BasicNewsRecipe):
    title = "Scientific American"
    description = (
        "Popular science. Monthly magazine. "
        "Should be downloaded around the middle of each month."
    )
    category = "science"
    __author__ = "Kovid Goyal"
    no_stylesheets = True
    language = "en"
    publisher = "Nature Publishing Group"
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = " [%B %Y]"
    remove_attributes = ["height", "width"]
    masthead_url = (
        "https://static.scientificamerican.com/sciam/assets/Image/newsletter/salogo.png"
    )
    extra_css = """
        [class^="article_dek-"] { font-style:italic; color:#202020; }
        [class^="article_authors-"] { font-size:small; color:#202020; }
        [class^="article__image-"] { font-size:small; text-align:center; }
        [class^="lead_image-"] { font-size:small; text-align:center; }
        [class^="bio-"] { font-size:small; color:#404040; }
        em { color:#202020; }
    """

    needs_subscription = "optional"

    keep_only_tags = [
        prefixed_classes(
            "article_hed- article_dek- article_authors- lead_image- article__content- bio-"
        ),
    ]
    remove_tags = [dict(name=["button", "svg", "iframe", "source"])]

    def get_browser(self, *args):
        # Log in only when credentials are supplied; a subscription is optional.
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open("https://www.scientificamerican.com/account/login/")
            br.select_form(predicate=lambda f: f.attrs.get("id") == "login")
            br["emailAddress"] = self.username
            br["password"] = self.password
            br.submit()
        return br

    def parse_index(self):
        # Locate the current issue link on the front page.
        fp_soup = self.index_to_soup("https://www.scientificamerican.com")
        curr_issue_link = fp_soup.select(".tout_current-issue__cover a")
        if not curr_issue_link:
            self.abort_recipe_processing("Unable to find issue link")
        issue_url = curr_issue_link[0]["href"]

        # The issue metadata is embedded in the page's Next.js __NEXT_DATA__
        # JSON blob.
        soup = self.index_to_soup(issue_url)
        script = soup.find("script", id="__NEXT_DATA__")
        if not script:
            self.abort_recipe_processing("Unable to find script")
        issue_info = (
            json.loads(script.contents[0])
            .get("props", {})
            .get("pageProps", {})
            .get("issue", {})
        )
        if not issue_info:
            self.abort_recipe_processing("Unable to find issue info")

        # Cover image and edition date come from the issue metadata.
        image_id, ext = splitext(issue_info["image"])
        self.cover_url = f"https://static.scientificamerican.com/sciam/cache/file/{image_id}_source{ext}?w=800"
        edition_date = datetime.strptime(issue_info["issue_date"], "%Y-%m-%d")
        self.timefmt = f" [{edition_date:%B %Y}]"

        # Group article previews into feeds: everything in "featured" goes
        # under Features; department pieces are grouped by their category.
        feeds = {}
        for section in ("featured", "departments"):
            for article in issue_info.get("article_previews", {}).get(section, []):
                if section == "featured":
                    feed_name = "Features"
                else:
                    feed_name = article["category"]
                feeds.setdefault(feed_name, []).append(
                    {
                        "title": article["title"],
                        "url": urljoin(
                            "https://www.scientificamerican.com/article/",
                            article["slug"],
                        ),
                        "description": article["summary"],
                    }
                )
        return list(feeds.items())
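

# Usage sketch (not part of the recipe itself): recipes like this are normally
# scheduled from calibre's "Fetch news" GUI, but they can also be run from the
# command line with ebook-convert. The filename below is a hypothetical
# example, and the credential placeholders are assumptions; credentials are
# optional here (needs_subscription = "optional"):
#
#   ebook-convert scientific_american.recipe output.epub \
#       --username you@example.com --password secret
#
# Passing --test to ebook-convert downloads only a couple of articles per
# feed, which is useful when checking changes to parse_index.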