diff --git a/newsplease/__main__.py b/newsplease/__main__.py
index 4f4a0808..9723d4d3 100644
--- a/newsplease/__main__.py
+++ b/newsplease/__main__.py
@@ -14,7 +14,7 @@
 from elasticsearch import Elasticsearch
 from scrapy.utils.log import configure_logging
 
-from .pipeline.pipelines import RedisStorageClient
+from newsplease.pipeline.pipelines import RedisStorageClient
 
 cur_path = os.path.dirname(os.path.realpath(__file__))
 par_path = os.path.dirname(cur_path)
diff --git a/newsplease/config/config.cfg b/newsplease/config/config.cfg
index d63599c4..95d30134 100644
--- a/newsplease/config/config.cfg
+++ b/newsplease/config/config.cfg
@@ -41,6 +41,10 @@ fallbacks = {
 # In case the check fails, the next crawler in the pipeline will be used
 check_crawler_has_urls_to_scan = False
 
+# Check that the site uses a certificate which is valid and not expired
+# Default: True
+check_certificate = True
+
 # Determines how many hours need to pass since the last download of a webpage
 # to be downloaded again by the RssCrawler
 # default: 6
diff --git a/newsplease/crawler/spiders/download_crawler.py b/newsplease/crawler/spiders/download_crawler.py
index 96de9ebc..7703df13 100644
--- a/newsplease/crawler/spiders/download_crawler.py
+++ b/newsplease/crawler/spiders/download_crawler.py
@@ -42,13 +42,14 @@ def parse(self, response):
         )
 
     @staticmethod
-    def supports_site(url):
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         As long as the url exists, this crawler will work!
 
         Determines if this crawler works on the given url.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines wether this crawler work on the given url
         """
         return True
diff --git a/newsplease/crawler/spiders/gdelt_crawler.py b/newsplease/crawler/spiders/gdelt_crawler.py
index 826e920f..62d8d993 100644
--- a/newsplease/crawler/spiders/gdelt_crawler.py
+++ b/newsplease/crawler/spiders/gdelt_crawler.py
@@ -104,13 +104,14 @@ def only_extracts_articles():
         return True
 
     @staticmethod
-    def supports_site(url):
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         Rss Crawler is supported if the url is a valid rss feed
 
         Determines if this crawler works on the given url.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines wether this crawler work on the given url
         """
diff --git a/newsplease/crawler/spiders/newsplease_spider.py b/newsplease/crawler/spiders/newsplease_spider.py
index 411e6ef3..470b9577 100644
--- a/newsplease/crawler/spiders/newsplease_spider.py
+++ b/newsplease/crawler/spiders/newsplease_spider.py
@@ -9,21 +9,23 @@ class NewspleaseSpider(ABC):
 
     @staticmethod
     @abstractmethod
-    def supports_site(url: str) -> bool:
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         Determines if this spider works on the given URL.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool:
         """
         pass
 
     @staticmethod
-    def has_urls_to_scan(url: str) -> bool:
+    def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
         """
         Determines if this spider has any URLs to scan.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool:
         """
         return True
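A minimal sketch of what a concrete spider now implements against the extended NewspleaseSpider interface; ExampleSpider and its trivial checks are hypothetical and only illustrate the new keyword argument.

from newsplease.crawler.spiders.newsplease_spider import NewspleaseSpider


class ExampleSpider(NewspleaseSpider):
    """Hypothetical spider showing the extended supports_site signature."""

    @staticmethod
    def supports_site(url: str, check_certificate: bool = True) -> bool:
        # A real spider would probe the site here and pass check_certificate
        # on to its HTTP helpers so TLS verification can be skipped when disabled.
        return url.startswith("http")

    @staticmethod
    def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
        # Same default as the base class: assume there is something to scan.
        return True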
diff --git a/newsplease/crawler/spiders/recursive_crawler.py b/newsplease/crawler/spiders/recursive_crawler.py
index 9eeb2fb3..7ef0d694 100644
--- a/newsplease/crawler/spiders/recursive_crawler.py
+++ b/newsplease/crawler/spiders/recursive_crawler.py
@@ -56,13 +56,14 @@ def parse(self, response):
             response, self.allowed_domains[0], self.original_url)
 
     @staticmethod
-    def supports_site(url):
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         Recursive Crawler are supported by every site!
 
         Determines if this crawler works on the given url.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines wether this crawler work on the given url
         """
         return True
diff --git a/newsplease/crawler/spiders/recursive_sitemap_crawler.py b/newsplease/crawler/spiders/recursive_sitemap_crawler.py
index 0a70fd6d..0b68c673 100644
--- a/newsplease/crawler/spiders/recursive_sitemap_crawler.py
+++ b/newsplease/crawler/spiders/recursive_sitemap_crawler.py
@@ -35,8 +35,13 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
         self.original_url = url
 
         self.allowed_domains = [self.helper.url_extractor.get_allowed_domain(url)]
+        self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
+                                  if config.section("Crawler").get('check_certificate') is not None
+                                  else True)
         self.sitemap_urls = self.helper.url_extractor.get_sitemap_urls(
-            url, config.section("Crawler")["sitemap_allow_subdomains"]
+            domain_url=url,
+            allow_subdomains=config.section("Crawler")["sitemap_allow_subdomains"],
+            check_certificate=self.check_certificate,
         )
 
         super(RecursiveSitemapCrawler, self).__init__(*args, **kwargs)
@@ -60,7 +65,7 @@ def parse(self, response):
         )
 
     @staticmethod
-    def supports_site(url):
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         Sitemap-Crawler are supported by every site which have a Sitemap
         set in the robots.txt.
@@ -68,6 +73,7 @@ def supports_site(url):
         Determines if this crawler works on the given url.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines wether this crawler work on the given url
         """
-        return UrlExtractor.sitemap_check(url)
+        return UrlExtractor.sitemap_check(url=url, check_certificate=check_certificate)
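The same three-line fallback for reading check_certificate now appears in several spiders and again in single_crawler.py; a small helper along these lines could centralise it. get_check_certificate below is a hypothetical refactoring sketch, not part of this patch.

def get_check_certificate(crawler_cfg) -> bool:
    """Return the configured check_certificate flag, defaulting to True."""
    value = crawler_cfg.get('check_certificate')
    return bool(value) if value is not None else True


# Roughly equivalent to the inline expression used in the spiders:
# self.check_certificate = get_check_certificate(config.section("Crawler"))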
diff --git a/newsplease/crawler/spiders/rss_crawler.py b/newsplease/crawler/spiders/rss_crawler.py
index a5444c11..9579be12 100644
--- a/newsplease/crawler/spiders/rss_crawler.py
+++ b/newsplease/crawler/spiders/rss_crawler.py
@@ -41,6 +41,11 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
         self.ignored_allowed_domain = self.helper.url_extractor \
             .get_allowed_domain(url)
+
+        self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
+                                  if config.section("Crawler").get('check_certificate') is not None
+                                  else True)
+
         self.start_urls = [self.helper.url_extractor.get_start_url(url)]
 
         super(RssCrawler, self).__init__(*args, **kwargs)
@@ -63,8 +68,13 @@ def rss_parse(self, response):
         """
         for item in response.xpath('//item'):
             for url in item.xpath('link/text()').extract():
-                yield scrapy.Request(url, lambda resp: self.article_parse(
-                    resp, item.xpath('title/text()').extract()[0]))
+                yield scrapy.Request(
+                    url=url,
+                    callback=lambda resp, item=item: self.article_parse(
+                        resp,
+                        item.xpath('title/text()').extract()[0]
+                    ),
+                )
 
     def article_parse(self, response, rss_title=None):
         """
@@ -90,47 +101,40 @@ def only_extracts_articles():
         return True
 
     @staticmethod
-    def get_potential_redirection_from_url(url):
-        """Ensure we have the correct URL to check for RSS feed"""
-        opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
-        url = UrlExtractor.url_to_request_with_agent(url)
-        redirect_url = opener.open(url).url
-        return redirect_url
-
-    @staticmethod
-    def supports_site(url):
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         Rss Crawler are supported if by every site containing an rss feed.
 
         Determines if this crawler works on the given url.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines wether this crawler work on the given url
         """
         # Follow redirects
-        redirect_url = RssCrawler.get_potential_redirection_from_url(url)
-        redirect = UrlExtractor.url_to_request_with_agent(redirect_url)
+        redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)
 
         # Check if a standard rss feed exists
-        response = urllib2.urlopen(redirect).read()
+        response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
         return re.search(re_rss, response.decode("utf-8")) is not None
 
     @staticmethod
-    def has_urls_to_scan(url: str) -> bool:
+    def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
         """
         Check if the RSS feed contains any URL to scan
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool:
         """
-        redirect_url = RssCrawler.get_potential_redirection_from_url(url)
+        redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)
 
-        response = get(redirect_url)
+        response = get(url=redirect_url, verify=check_certificate)
         scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())
 
         rss_url = UrlExtractor.get_rss_url(scrapy_response)
-        rss_content = get(rss_url).text
+        rss_content = get(url=rss_url, verify=check_certificate).text
         rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")
 
         urls_to_scan = [
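For illustration, a rough standalone equivalent of the RSS support check using requests, so the verify flag maps directly onto check_certificate. The re_rss pattern here is a stand-in for the module-level pattern the crawler actually uses, and disabling verification triggers urllib3's InsecureRequestWarning unless it is silenced.

import re

import requests
import urllib3

re_rss = r'type=["\']application/rss\+xml["\']'  # stand-in for the crawler's pattern


def site_has_rss(url: str, check_certificate: bool = True) -> bool:
    if not check_certificate:
        # Suppress the warning requests/urllib3 emit for unverified HTTPS.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # Follow redirects, fetch the landing page, then look for an RSS link.
    response = requests.get(url, verify=check_certificate, allow_redirects=True, timeout=10)
    return re.search(re_rss, response.text) is not None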
diff --git a/newsplease/crawler/spiders/sitemap_crawler.py b/newsplease/crawler/spiders/sitemap_crawler.py
index 87de752e..68daa4f7 100644
--- a/newsplease/crawler/spiders/sitemap_crawler.py
+++ b/newsplease/crawler/spiders/sitemap_crawler.py
@@ -23,6 +23,7 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
         self.config = config
         self.helper = helper
+        self.original_url = url
 
         self.allowed_domains = [
@@ -30,8 +31,13 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
                 url, config.section("Crawler")["sitemap_allow_subdomains"]
             )
         ]
+        self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
+                                  if config.section("Crawler").get('check_certificate') is not None
+                                  else True)
         self.sitemap_urls = self.helper.url_extractor.get_sitemap_urls(
-            url, config.section("Crawler")["sitemap_allow_subdomains"]
+            domain_url=url,
+            allow_subdomains=config.section("Crawler")["sitemap_allow_subdomains"],
+            check_certificate=self.check_certificate,
         )
 
         self.log.debug(self.sitemap_urls)
@@ -61,7 +67,7 @@ def only_extracts_articles():
         return True
 
     @staticmethod
-    def supports_site(url):
+    def supports_site(url: str, check_certificate: bool = True) -> bool:
         """
         Sitemap-Crawler are supported by every site which have a Sitemap
         set in the robots.txt.
@@ -69,7 +75,8 @@ def supports_site(url):
         Determines if this crawler works on the given url.
 
         :param str url: The url to test
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines wether this crawler work on the given url
         """
-        return UrlExtractor.sitemap_check(url)
+        return UrlExtractor.sitemap_check(url=url, check_certificate=check_certificate)
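A simplified sketch of the sitemap detection that UrlExtractor.sitemap_check performs, written with requests so verify maps onto check_certificate; the probing of common sitemap paths is reduced here to robots.txt only.

from urllib.parse import urljoin

import requests


def robots_declares_sitemap(url: str, check_certificate: bool = True) -> bool:
    robots_url = urljoin(url, "/robots.txt")
    try:
        response = requests.get(robots_url, verify=check_certificate, timeout=10)
    except requests.RequestException:
        return False
    # Mirrors the "Sitemap:" lookup done on the robots.txt body.
    return response.ok and "Sitemap:" in response.text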
diff --git a/newsplease/helper_classes/url_extractor.py b/newsplease/helper_classes/url_extractor.py
index 4e1857d3..c40b90df 100644
--- a/newsplease/helper_classes/url_extractor.py
+++ b/newsplease/helper_classes/url_extractor.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import re
+import ssl
 from typing import Optional
 from scrapy.http import Response
 from http.client import HTTPResponse
@@ -62,22 +63,42 @@ def get_subdomain(url: str) -> str:
         ]
 
     @staticmethod
-    def follow_redirects(url: str) -> str:
+    def follow_redirects(url: str, check_certificate: bool = True) -> str:
         """
         Get's the url actual address by following forwards
 
         :param str url: the url to work on
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return str: actual address of url
         """
-        url = UrlExtractor.url_to_request_with_agent(url)
-        opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
-        return opener.open(url).url
+        return UrlExtractor.request_url(url=url, check_certificate=check_certificate).url
 
     @staticmethod
-    def check_sitemap_urls(domain_url: str) -> list[str]:
+    def request_url(url: str, check_certificate: bool = True) -> HTTPResponse:
+        """
+        :param str url: the url to work on
+        :param bool check_certificate: Whether to verify the site's SSL certificate
+        :return HTTPResponse:
+        """
+        request = UrlExtractor.url_to_request_with_agent(url)
+
+        if check_certificate:
+            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
+            return opener.open(request)
+
+        context = ssl.create_default_context()
+        context.check_hostname = False
+        context.verify_mode = ssl.CERT_NONE
+        response = urllib2.urlopen(request, context=context)
+
+        return response
+
+    @staticmethod
+    def check_sitemap_urls(domain_url: str, check_certificate: bool = True) -> list[str]:
         """Check if a set of sitemaps exists for the requested domain
 
         :param str domain_url: The URL to work on
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return list[str] working_sitemap_paths: All available sitemap for the domain_url
         """
         working_sitemap_paths = []
@@ -86,9 +107,8 @@ def check_sitemap_urls(domain_url: str) -> list[str]:
         for sitemap_path in sitemap_patterns:
             # check common patterns
             url_sitemap = urljoin(domain_url, sitemap_path)
-            request = UrlExtractor.url_to_request_with_agent(url_sitemap)
             try:
-                response = urllib2.urlopen(request)
+                response = UrlExtractor.request_url(url=url_sitemap, check_certificate=check_certificate)
                 # Keep sitemaps that exist, including those resulting from redirections
                 if response.getcode() in [200, 301, 308]:
                     logging.debug(f"Found an existing sitemap: {response.url}")
@@ -99,18 +119,19 @@ def check_sitemap_urls(domain_url: str) -> list[str]:
         return working_sitemap_paths
 
     @staticmethod
-    def get_robots_response(url: str, allow_subdomains: bool) -> Optional[HTTPResponse]:
+    def get_robots_response(url: str, allow_subdomains: bool, check_certificate: bool = True) -> Optional[HTTPResponse]:
         """
         Retrieve robots.txt response if it exists
 
         :param str url: the url to work on
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :param bool allow_subdomains: Determines if the robot.txt may be the subdomain's
         :return: the robot.txt's HTTP response or None if it's not retrieved
         """
         redirect_url = UrlExtractor.follow_redirects(
-            url="http://"
-            + UrlExtractor.get_allowed_domain(url, allow_subdomains=allow_subdomains),
+            url="http://" + UrlExtractor.get_allowed_domain(url, allow_subdomains=allow_subdomains),
+            check_certificate=check_certificate
         )
 
         # Get robots.txt
@@ -123,53 +144,58 @@ def get_robots_response(url: str, allow_subdomains: bool) -> Optional[HTTPRespon
         robots_url = "{url.scheme}://{url_netloc}/robots.txt".format(
             url=parsed, url_netloc=url_netloc
         )
-        robots_req = UrlExtractor.url_to_request_with_agent(robots_url)
         try:
-            response = urllib2.urlopen(robots_req)
+            response = UrlExtractor.request_url(url=robots_url, check_certificate=check_certificate)
             if response.getcode() == 200:
                 return response
         except URLError:
             if allow_subdomains:
-                return UrlExtractor.get_robots_response(url=url, allow_subdomains=False)
+                return UrlExtractor.get_robots_response(
+                    url=url,
+                    allow_subdomains=False,
+                    check_certificate=check_certificate
+                )
         return None
 
     @staticmethod
-    def sitemap_check(url: str) -> bool:
+    def sitemap_check(url: str, check_certificate: bool = True) -> bool:
         """
         Sitemap-Crawlers are supported by every site that has a Sitemap
         set in the robots.txt, or any sitemap present in the domain
 
         :param str url: the url to work on
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return bool: Determines if a sitemap exists
         """
         robots_response = UrlExtractor.get_robots_response(
-            url=url, allow_subdomains=True
+            url=url, allow_subdomains=True, check_certificate=check_certificate
         )
         if robots_response and robots_response.getcode() == 200:
             # Check if "Sitemap" is set
             return "Sitemap:" in robots_response.read().decode("utf-8")
 
         # Check if there is an existing sitemap outside of robots.txt
-        sitemap_urls = UrlExtractor.check_sitemap_urls(domain_url=url)
+        sitemap_urls = UrlExtractor.check_sitemap_urls(domain_url=url, check_certificate=check_certificate)
         any_sitemap_found = len(sitemap_urls) > 0
         if not any_sitemap_found:
             logging.warning("Fatal: neither robots.txt nor sitemap found.")
         return any_sitemap_found
 
     @staticmethod
-    def get_sitemap_urls(domain_url: str, allow_subdomains: bool) -> list[str]:
+    def get_sitemap_urls(domain_url: str, allow_subdomains: bool, check_certificate: bool) -> list[str]:
         """Retrieve SitemapCrawler input URLs from robots.txt or sitemaps
 
         :param str domain_url: The URL to work on
         :param bool allow_subdomains: Determines if the robot.txt may be the subdomain's
+        :param bool check_certificate: Whether to verify the site's SSL certificate
         :return list[str]: robots.txt URL or available sitemaps
         """
         robots_response = UrlExtractor.get_robots_response(
-            url=domain_url, allow_subdomains=allow_subdomains
+            url=domain_url, allow_subdomains=allow_subdomains, check_certificate=check_certificate
         )
         if robots_response and robots_response.getcode() == 200:
             return [robots_response.url]
-        return UrlExtractor.check_sitemap_urls(domain_url=domain_url)
+        return UrlExtractor.check_sitemap_urls(domain_url=domain_url, check_certificate=check_certificate)
 
     @staticmethod
     def get_rss_url(response: Response) -> str:
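The new request_url helper boils down to the standard urllib pattern for optionally disabling certificate verification; a self-contained sketch, with a placeholder USER_AGENT standing in for the project's url_to_request_with_agent helper, looks like this.

import ssl
import urllib.request

USER_AGENT = "news-please"  # placeholder; the project builds its own request with an agent


def open_url(url: str, check_certificate: bool = True):
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    if check_certificate:
        # The default opener verifies certificates and follows redirects.
        return urllib.request.urlopen(request)
    # Unverified context: accept self-signed or expired certificates.
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return urllib.request.urlopen(request, context=context)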
diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py
index d4441756..4b339066 100644
--- a/newsplease/single_crawler.py
+++ b/newsplease/single_crawler.py
@@ -197,6 +197,9 @@ def get_crawler(self, crawler: str, url: str):
         :rtype: crawler-class or None
         """
         check_crawler_has_urls_to_scan = self.cfg_crawler.get('check_crawler_has_urls_to_scan')
+        check_certificate = (bool(self.cfg_crawler.get('check_certificate'))
+                             if self.cfg_crawler.get('check_certificate') is not None
+                             else True)
 
         checked_crawlers = []
         while crawler is not None and crawler not in checked_crawlers:
@@ -208,13 +211,15 @@ def get_crawler(self, crawler: str, url: str):
                 return current
 
             try:
-                crawler_supports_site = current.supports_site(url)
+                crawler_supports_site = current.supports_site(url=url, check_certificate=check_certificate)
             except Exception as e:
                 self.log.info(f'Crawler not supported due to: {str(e)}', exc_info=True)
                 crawler_supports_site = False
 
             if crawler_supports_site:
-                if check_crawler_has_urls_to_scan and not current.has_urls_to_scan(url):
+                if (check_crawler_has_urls_to_scan
+                        and not current.has_urls_to_scan(url=url, check_certificate=check_certificate)
+                ):
                     self.log.warning(f"Crawler {crawler} has no url to scan for {url}")
                 else:
                     self.log.debug("Using crawler %s for %s.", crawler, url)
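Taken together, the option flows from config.cfg through single_crawler.py into each spider's supports_site check. A hedged usage sketch, assuming the [Crawler] section parses to a plain dict and that the sitemap spider class is named SitemapCrawler as the file layout above suggests:

from newsplease.crawler.spiders.sitemap_crawler import SitemapCrawler

# Hypothetical excerpt of the parsed [Crawler] section.
crawler_cfg = {"check_certificate": False}

check_certificate = (bool(crawler_cfg.get("check_certificate"))
                     if crawler_cfg.get("check_certificate") is not None
                     else True)

# With verification disabled, a site with a self-signed or expired certificate
# no longer makes the support check raise and fall through to the next crawler.
if SitemapCrawler.supports_site(url="https://example.com", check_certificate=check_certificate):
    print("SitemapCrawler can handle this site")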