Skip to content

Commit

Permalink
Add check_certificate option in configuration to be able to crawl sit…
Browse files Browse the repository at this point in the history
…es not having a valid certificate
  • Loading branch information
yldoctrine committed Jul 26, 2024
1 parent 04dca08 commit 94e4a9b
Show file tree
Hide file tree
Showing 11 changed files with 108 additions and 51 deletions.
2 changes: 1 addition & 1 deletion newsplease/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from elasticsearch import Elasticsearch
from scrapy.utils.log import configure_logging

from .pipeline.pipelines import RedisStorageClient
from newsplease.pipeline.pipelines import RedisStorageClient

cur_path = os.path.dirname(os.path.realpath(__file__))
par_path = os.path.dirname(cur_path)
Expand Down
4 changes: 4 additions & 0 deletions newsplease/config/config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ fallbacks = {
# In case the check fails, the next crawler in the pipeline will be used
check_crawler_has_urls_to_scan = False

# Check that the site uses a certificate which is valid and not expired
# Default: True
check_certificate = True

# Determines how many hours need to pass since the last download of a webpage
# to be downloaded again by the RssCrawler
# default: 6
Expand Down
3 changes: 2 additions & 1 deletion newsplease/crawler/spiders/download_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@ def parse(self, response):
)

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
As long as the url exists, this crawler will work!
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid (not used by this crawler)
:return bool: Determines whether this crawler works on the given url
"""
return True
3 changes: 2 additions & 1 deletion newsplease/crawler/spiders/gdelt_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,14 @@ def only_extracts_articles():
return True

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Rss Crawler is supported if the url is a valid rss feed
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool: Determines whether this crawler works on the given url
"""

Expand Down
6 changes: 4 additions & 2 deletions newsplease/crawler/spiders/newsplease_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,23 @@ class NewspleaseSpider(ABC):

@staticmethod
@abstractmethod
def supports_site(url: str) -> bool:
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Determines if this spider works on the given URL.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool:
"""
pass

@staticmethod
def has_urls_to_scan(url: str) -> bool:
def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
"""
Determines if this spider has any URLs to scan.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool:
"""
return True
3 changes: 2 additions & 1 deletion newsplease/crawler/spiders/recursive_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ def parse(self, response):
response, self.allowed_domains[0], self.original_url)

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
Recursive Crawler are supported by every site!
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid (not used by this crawler)
:return bool: Determines whether this crawler works on the given url
"""
return True
12 changes: 9 additions & 3 deletions newsplease/crawler/spiders/recursive_sitemap_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,13 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
self.original_url = url

self.allowed_domains = [self.helper.url_extractor.get_allowed_domain(url)]
self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
if config.section("Crawler").get('check_certificate') is not None
else True)
self.sitemap_urls = self.helper.url_extractor.get_sitemap_urls(
url, config.section("Crawler")["sitemap_allow_subdomains"]
domain_url=url,
allow_subdomains=config.section("Crawler")["sitemap_allow_subdomains"],
check_certificate=self.check_certificate,
)
super(RecursiveSitemapCrawler, self).__init__(*args, **kwargs)

Expand All @@ -60,14 +65,15 @@ def parse(self, response):
)

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
The sitemap crawler is supported by every site that has a
sitemap set in its robots.txt.
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool: Determines whether this crawler works on the given url
"""
return UrlExtractor.sitemap_check(url)
return UrlExtractor.sitemap_check(url=url, check_certificate=check_certificate)
40 changes: 22 additions & 18 deletions newsplease/crawler/spiders/rss_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):

self.ignored_allowed_domain = self.helper.url_extractor \
.get_allowed_domain(url)

self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
if config.section("Crawler").get('check_certificate') is not None
else True)

self.start_urls = [self.helper.url_extractor.get_start_url(url)]

super(RssCrawler, self).__init__(*args, **kwargs)
Expand All @@ -63,8 +68,14 @@ def rss_parse(self, response):
"""
for item in response.xpath('//item'):
for url in item.xpath('link/text()').extract():
yield scrapy.Request(url, lambda resp: self.article_parse(
resp, item.xpath('title/text()').extract()[0]))
yield scrapy.Request(
url=url,
callback=lambda resp: self.article_parse(
resp,
item.xpath('title/text()').extract()[0]
),

)

def article_parse(self, response, rss_title=None):
"""
Expand All @@ -90,47 +101,40 @@ def only_extracts_articles():
return True

@staticmethod
def get_potential_redirection_from_url(url):
"""Ensure we have the correct URL to check for RSS feed"""
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
url = UrlExtractor.url_to_request_with_agent(url)
redirect_url = opener.open(url).url
return redirect_url

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
The RSS crawler is supported by every site containing an RSS feed.
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool: Determines whether this crawler works on the given url
"""

# Follow redirects
redirect_url = RssCrawler.get_potential_redirection_from_url(url)
redirect = UrlExtractor.url_to_request_with_agent(redirect_url)
redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)

# Check if a standard rss feed exists
response = urllib2.urlopen(redirect).read()
response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
return re.search(re_rss, response.decode("utf-8")) is not None

@staticmethod
def has_urls_to_scan(url: str) -> bool:
def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
"""
Check if the RSS feed contains any URL to scan
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool:
"""
redirect_url = RssCrawler.get_potential_redirection_from_url(url)
redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)

response = get(redirect_url)
response = get(url=redirect_url, verify=check_certificate)
scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())

rss_url = UrlExtractor.get_rss_url(scrapy_response)
rss_content = get(rss_url).text
rss_content = get(url=rss_url, verify=check_certificate).text
rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")

urls_to_scan = [
Expand Down
13 changes: 10 additions & 3 deletions newsplease/crawler/spiders/sitemap_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,21 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):

self.config = config
self.helper = helper

self.original_url = url

self.allowed_domains = [
self.helper.url_extractor.get_allowed_domain(
url, config.section("Crawler")["sitemap_allow_subdomains"]
)
]
self.check_certificate = (bool(config.section("Crawler").get('check_certificate'))
if config.section("Crawler").get('check_certificate') is not None
else True)
self.sitemap_urls = self.helper.url_extractor.get_sitemap_urls(
url, config.section("Crawler")["sitemap_allow_subdomains"]
domain_url=url,
allow_subdomains=config.section("Crawler")["sitemap_allow_subdomains"],
check_certificate=self.check_certificate,
)

self.log.debug(self.sitemap_urls)
Expand Down Expand Up @@ -61,15 +67,16 @@ def only_extracts_articles():
return True

@staticmethod
def supports_site(url):
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
The sitemap crawler is supported by every site that has a
sitemap set in its robots.txt.
Determines if this crawler works on the given url.
:param str url: The url to test
:param bool check_certificate: Whether to check that the site's certificate is valid
:return bool: Determines whether this crawler works on the given url
"""

return UrlExtractor.sitemap_check(url)
return UrlExtractor.sitemap_check(url=url, check_certificate=check_certificate)
Loading

0 comments on commit 94e4a9b

Please sign in to comment.