
Merge pull request #281 from yldoctrine/add_check_certificate_config
Add check_certificate option in configuration to be able to crawl sites not having a valid certificate
fhamborg authored Jul 29, 2024
2 parents 04dca08 + 94e4a9b commit ec05935
Showing 11 changed files with 108 additions and 51 deletions.
2 changes: 1 addition & 1 deletion newsplease/__main__.py
@@ -14,7 +14,7 @@
from elasticsearch import Elasticsearch
from scrapy.utils.log import configure_logging

from .pipeline.pipelines import RedisStorageClient
from newsplease.pipeline.pipelines import RedisStorageClient

cur_path = os.path.dirname(os.path.realpath(__file__))
par_path = os.path.dirname(cur_path)
4 changes: 4 additions & 0 deletions newsplease/config/config.cfg
@@ -41,6 +41,10 @@ fallbacks = {
# In case the check fails, the next crawler in the pipeline will be used
check_crawler_has_urls_to_scan = False

# Check that the site uses a certificate which is valid and not expired
# Default: True
check_certificate = True

# Determines how many hours need to pass since the last download of a webpage
# to be downloaded again by the RssCrawler
# default: 6
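The new option lives in the same [Crawler] section as check_crawler_has_urls_to_scan and defaults to True when it is absent. As a minimal sketch of the intended semantics (news-please uses its own config loader, which is not part of this excerpt, so configparser here is purely illustrative):

    import configparser

    # Illustration only: a missing check_certificate entry must behave like
    # check_certificate = True, i.e. certificates are verified by default.
    config = configparser.ConfigParser(interpolation=None)
    config.read("newsplease/config/config.cfg")
    check_certificate = config.getboolean("Crawler", "check_certificate", fallback=True)

Setting check_certificate = False in this section lets the crawlers below fetch sites whose certificate is self-signed or expired.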
3 changes: 2 additions & 1 deletion newsplease/crawler/spiders/download_crawler.py
@@ -42,13 +42,14 @@ def parse(self, response):
)

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
As long as the url exists, this crawler will work!
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool: Determines whether this crawler works on the given url
"""
return True
3 changes: 2 additions & 1 deletion newsplease/crawler/spiders/gdelt_crawler.py
@@ -104,13 +104,14 @@ def only_extracts_articles():
return True

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Rss Crawler is supported if the url is a valid rss feed
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool: Determines whether this crawler works on the given url
"""

6 changes: 4 additions & 2 deletions newsplease/crawler/spiders/newsplease_spider.py
@@ -9,21 +9,23 @@ class NewspleaseSpider(ABC):

@staticmethod
@abstractmethod
def supports_site(url: str) -> bool:
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Determines if this spider works on the given URL.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool:
"""
pass

@staticmethod
def has_urls_to_scan(url: str) -> bool:
def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
"""
Determines if this spider has any URLs to scan.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool:
"""
return True
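Both extension points of the abstract base class now accept the flag. A hypothetical subclass, purely illustrative and not one of the crawlers in this PR, could honour it through the verify flag of requests:

    import requests

    from newsplease.crawler.spiders.newsplease_spider import NewspleaseSpider

    class ProbeSpider(NewspleaseSpider):
        """Hypothetical example spider, not part of news-please."""

        @staticmethod
        def supports_site(url: str, check_certificate: bool = True) -> bool:
            # With check_certificate=False, sites with self-signed or expired
            # certificates still pass the probe; with True, an SSL failure
            # means the site is rejected.
            try:
                requests.head(url, timeout=10, verify=check_certificate)
                return True
            except requests.RequestException:
                return False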
3 changes: 2 additions & 1 deletion newsplease/crawler/spiders/recursive_crawler.py
@@ -56,13 +56,14 @@ def parse(self, response):
response, self.allowed_domains[0], self.original_url)

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Recursive Crawler are supported by every site!
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool: Determines whether this crawler works on the given url
"""
return True
12 changes: 9 additions & 3 deletions newsplease/crawler/spiders/recursive_sitemap_crawler.py
@@ -35,8 +35,13 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
self.original_url = url

self.allowed_domains = [self.helper.url_extractor.get_allowed_domain(url)]
self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
if config.section("Crawler").get('check_certificate') is not None
else True)
self.sitemap_urls = self.helper.url_extractor.get_sitemap_urls(
url, config.section("Crawler")["sitemap_allow_subdomains"]
domain_url=url,
allow_subdomains=config.section("Crawler")["sitemap_allow_subdomains"],
check_certificate=self.check_certificate,
)
super(RecursiveSitemapCrawler, self).__init__(*args, **kwargs)

@@ -60,14 +65,15 @@ def parse(self, response):
)

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Sitemap-Crawler are supported by every site which have a
Sitemap set in the robots.txt.
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool: Determines whether this crawler works on the given url
"""
return UrlExtractor.sitemap_check(url)
return UrlExtractor.sitemap_check(url=url, check_certificate=check_certificate)
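UrlExtractor.sitemap_check itself gains the new keyword, presumably in url_extractor.py, which is not rendered in this excerpt. Roughly, the check fetches robots.txt and looks for a Sitemap entry; a sketch of how the flag would be threaded into such a fetch, assuming a urllib-based implementation:

    import ssl
    import urllib.error
    import urllib.request
    from urllib.parse import urljoin

    def sitemap_check_sketch(url: str, check_certificate: bool = True) -> bool:
        # Approximation of UrlExtractor.sitemap_check, not the real implementation.
        context = ssl.create_default_context()
        if not check_certificate:
            # Accept self-signed or expired certificates.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        request = urllib.request.Request(
            urljoin(url, "/robots.txt"), headers={"User-Agent": "Mozilla/5.0"}
        )
        try:
            with urllib.request.urlopen(request, context=context, timeout=10) as response:
                body = response.read().decode("utf-8", errors="ignore")
        except (urllib.error.URLError, ssl.SSLError):
            return False
        return "sitemap:" in body.lower()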
40 changes: 22 additions & 18 deletions newsplease/crawler/spiders/rss_crawler.py
@@ -41,6 +41,11 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):

self.ignored_allowed_domain = self.helper.url_extractor \
.get_allowed_domain(url)

self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
if config.section("Crawler").get('check_certificate') is not None
else True)

self.start_urls = [self.helper.url_extractor.get_start_url(url)]

super(RssCrawler, self).__init__(*args, **kwargs)
@@ -63,8 +68,14 @@ def rss_parse(self, response):
"""
for item in response.xpath('//item'):
for url in item.xpath('link/text()').extract():
yield scrapy.Request(url, lambda resp: self.article_parse(
resp, item.xpath('title/text()').extract()[0]))
yield scrapy.Request(
url=url,
callback=lambda resp: self.article_parse(
resp,
item.xpath('title/text()').extract()[0]
),

)
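One thing worth noting about the rewritten yield: the lambda still closes over the loop variable item, exactly as the old one-liner did. An alternative wiring, shown here only as a sketch and not as part of this PR, passes the title through Request.cb_kwargs (Scrapy 1.7+) so it is evaluated once per request; it would drop into rss_parse in place of the loop above:

    for item in response.xpath('//item'):
        for url in item.xpath('link/text()').extract():
            yield scrapy.Request(
                url=url,
                callback=self.article_parse,
                # Bind the title now instead of capturing item in a closure.
                cb_kwargs={"rss_title": item.xpath('title/text()').extract_first()},
            )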

def article_parse(self, response, rss_title=None):
"""
@@ -90,47 +101,40 @@ def only_extracts_articles():
return True

@staticmethod
def get_potential_redirection_from_url(url):
"""Ensure we have the correct URL to check for RSS feed"""
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
url = UrlExtractor.url_to_request_with_agent(url)
redirect_url = opener.open(url).url
return redirect_url

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Rss Crawler are supported if by every site containing an rss feed.
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool: Determines whether this crawler works on the given url
"""

# Follow redirects
redirect_url = RssCrawler.get_potential_redirection_from_url(url)
redirect = UrlExtractor.url_to_request_with_agent(redirect_url)
redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)

# Check if a standard rss feed exists
response = urllib2.urlopen(redirect).read()
response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
return re.search(re_rss, response.decode("utf-8")) is not None

@staticmethod
def has_urls_to_scan(url: str) -> bool:
def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
"""
Check if the RSS feed contains any URL to scan
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool:
"""
redirect_url = RssCrawler.get_potential_redirection_from_url(url)
redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)

response = get(redirect_url)
response = get(url=redirect_url, verify=check_certificate)
scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())

rss_url = UrlExtractor.get_rss_url(scrapy_response)
rss_content = get(rss_url).text
rss_content = get(url=rss_url, verify=check_certificate).text
rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")

urls_to_scan = [
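supports_site and has_urls_to_scan above now delegate to UrlExtractor.follow_redirects and UrlExtractor.request_url, replacing the removed get_potential_redirection_from_url helper; those UrlExtractor changes are in a file not rendered in this excerpt. A rough sketch of what such helpers look like when they honour check_certificate, assuming a urllib-based implementation:

    import ssl
    import urllib.request

    def _ssl_context(check_certificate: bool) -> ssl.SSLContext:
        # Unverified context when certificate checking is disabled, the default
        # verified context otherwise.
        context = ssl.create_default_context()
        if not check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return context

    def follow_redirects(url: str, check_certificate: bool = True) -> str:
        # Return the final URL after any HTTP redirects have been followed.
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request, context=_ssl_context(check_certificate)) as response:
            return response.geturl()

    def request_url(url: str, check_certificate: bool = True):
        # Return an open response object, matching the .read() call used above.
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        return urllib.request.urlopen(request, context=_ssl_context(check_certificate))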
13 changes: 10 additions & 3 deletions newsplease/crawler/spiders/sitemap_crawler.py
@@ -23,15 +23,21 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):

self.config = config
self.helper = helper

self.original_url = url

self.allowed_domains = [
self.helper.url_extractor.get_allowed_domain(
url, config.section("Crawler")["sitemap_allow_subdomains"]
)
]
self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
if config.section("Crawler").get('check_certificate') is not None
else True)
self.sitemap_urls = self.helper.url_extractor.get_sitemap_urls(
url, config.section("Crawler")["sitemap_allow_subdomains"]
domain_url=url,
allow_subdomains=config.section("Crawler")["sitemap_allow_subdomains"],
check_certificate=self.check_certificate,
)

self.log.debug(self.sitemap_urls)
@@ -61,15 +67,16 @@ def only_extracts_articles():
return True

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Sitemap-Crawler are supported by every site which have a
Sitemap set in the robots.txt.
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid and not expired
:return bool: Determines whether this crawler works on the given url
"""

return UrlExtractor.sitemap_check(url)
return UrlExtractor.sitemap_check(url=url, check_certificate=check_certificate)
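The same defaulting expression for check_certificate appears in RssCrawler, RecursiveSitemapCrawler and SitemapCrawler above. A small helper, purely illustrative and not part of this change, would capture that logic in one place:

    def certificate_check_enabled(config) -> bool:
        # Mirrors the inline expression used in the spiders above: verify
        # certificates (True) whenever the option is missing from [Crawler].
        value = config.section("Crawler").get("check_certificate")
        return True if value is None else bool(value)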
(The remaining changed files are not rendered in this excerpt.)
