From 9920446eb8afc93dd8c37febdd4d2e414235ef31 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 1 Feb 2025 09:23:29 +0530
Subject: [PATCH] Update spektrum.de

---
Reviewer notes (ignored by `git am`):
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch; hunk line counts were checked against the @@ headers
   and the diffstat, but exact whitespace should be confirmed against the
   repository before applying.
 * In parse_feeds the keyword loop now ends with `break` instead of
   `continue`: `continue` only advanced the keyword loop, so a URL matching
   two keywords called feed.articles.remove(article) twice and raised
   ValueError on the second call.

 recipes/spektrum.recipe | 66 +++++++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/recipes/spektrum.recipe b/recipes/spektrum.recipe
index b9443e23555f..d51a9ec23d31 100644
--- a/recipes/spektrum.recipe
+++ b/recipes/spektrum.recipe
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 ##
-# Written: October 2012 (new coding)
-# Version: 9.0
-# Last update: 2018-02-22
+## Written: October 2012 (new coding)
+## Version: 10.0
+## Last update: 2025-01-15
 ##
 
 from __future__ import absolute_import, division, print_function, unicode_literals
@@ -31,7 +31,7 @@ class Spektrum(BasicNewsRecipe):
     description = u'German online portal of Spektrum der Wissenschaft'
     publisher = 'Spektrum der Wissenschaft Verlagsgesellschaft mbH'
     category = 'science news, Germany'
-    oldest_article = 7
+    oldest_article = 3
     max_articles_per_feed = 100
     no_stylesheets = True
     remove_javascript = True
@@ -39,27 +39,19 @@ class Spektrum(BasicNewsRecipe):
     language = 'de'
     encoding = 'utf8'
     ignore_duplicate_articles = {'title'}
+    scale_news_images_to_device = True
+    compress_news_images = True
 
-    cover_url = 'https://www.spektrum.de/js_css/sde/assets/img/svg/sdw_dark.svg'
+    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Spektrum_der_Wissenschaft_Logo_seit_2016.svg/640px-Spektrum_der_Wissenschaft_Logo_seit_2016.svg.png'
     masthead_url = 'http://www.spektrum.de/fm/861/spektrum.de.png'
 
     feeds = [
-        (
-            u'Spektrum.de',
-            u'http://www.spektrum.de/alias/rss/spektrum-de-rss-feed/996406'
-        ),
-        # (u'Spektrum der Wissenschaft', u'http://www.spektrum.de/alias/rss/spektrum-der-wissenschaft-rss-feed/982623'),
-        # (u'Gehirn & Geist', u'http://www.spektrum.de/alias/rss/gehirn-geist-rss-feed/982626'),
-        (
-            u'Sterne und Weltraum',
-            u'http://www.spektrum.de/alias/rss/sterne-und-weltraum-rss-feed/865248'
-        ),
-        # (u'Meistgelesene Artikel',u'http://www.spektrum.de/alias/rss/spektrum-de-meistgelesene-artikel/1224665'), # AGe 2014-08-21 new
-    ]
+        (u'Spektrum.de', u'http://www.spektrum.de/alias/rss/spektrum-de-rss-feed/996406'),
+    ]
 
     keep_only_tags = [
-        dict(name='article', attrs={'class': 'content'}),
-    ]
+        dict(name='article', attrs={'class':'content'}),classes('callout-box')
+    ]
 
     remove_tags = [
         classes('hide-for-print'),
@@ -71,6 +63,15 @@ class Spektrum(BasicNewsRecipe):
     ]
 
     def parse_feeds(self):
+        unwanted_article_types = [
+            'podcast',
+            'video',
+            'raetsel',
+            'leseprobe',
+            # 'kolumne',
+            # 'rezension',
+            # 'news',
+        ]
         # Call parent's method.
         feeds = BasicNewsRecipe.parse_feeds(self)
         # Loop through all feeds.
@@ -79,16 +80,23 @@ class Spektrum(BasicNewsRecipe):
             for article in feed.articles[:]:
                 if 'VIDEO' in article.title:
                     feed.articles.remove(article)
-                # Remove articles with 'video','podcast' or 'rezension' in the url.
-                elif 'podcast' in article.url:
-                    feed.articles.remove(article)
-                elif 'video' in article.url:
-                    feed.articles.remove(article)
-                elif 'rezension' in article.url:
-                    feed.articles.remove(article)
+                    continue
+                # Remove articles with '..' in the url.
+                for keyword in unwanted_article_types:
+                    if keyword in article.url:
+                        feed.articles.remove(article)
+                        break
+
         return feeds
 
-    def preprocess_html(self, soup, *a):
-        for img in soup.findAll('img', attrs={'data-src': True}):
-            img['src'] = img['data-src']
+    def preprocess_html(self, soup):
+        for noscript in soup.findAll('noscript'):
+            noscript.name = 'div'
         return soup
+
+    def preprocess_raw_html(self, raw, url):
+        # remove articles requiring login and advertisements
+        unwantedtag = 'content pw-premium'
+        if unwantedtag in raw:
+            self.abort_article('Skipping unwanted article with tag:' + unwantedtag)
+        return raw