merge scrapy/quotesbot#8

mikespub-archive · Oct 7, 2021 · ba46530 · ba46530
1 parent 3963960
commit ba46530
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 60 deletions.
diff --git a/quotesbot/quotesbot/items.py b/quotesbot/quotesbot/items.py
@@ -1,9 +1,7 @@
-# -*- coding: utf-8 -*-
-
 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://doc.scrapy.org/en/latest/topics/items.html
 
 import scrapy
 

diff --git a/quotesbot/quotesbot/pipelines.py b/quotesbot/quotesbot/pipelines.py
@@ -1,11 +1,9 @@
-# -*- coding: utf-8 -*-
-
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
 
-class QuotesbotPipeline(object):
+class QuotesbotPipeline:
     def process_item(self, item, spider):
         return item
diff --git a/quotesbot/quotesbot/settings.py b/quotesbot/quotesbot/settings.py
@@ -1,13 +1,11 @@
-# -*- coding: utf-8 -*-
-
 # Scrapy settings for quotesbot project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
 BOT_NAME = 'quotesbot'
 
@@ -16,75 +14,75 @@
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'quotesbot (+http://www.yourdomain.com)'
+# USER_AGENT = 'quotesbot (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
-#    'quotesbot.middlewares.MyCustomSpiderMiddleware': 543,
-#}
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    'quotesbot.middlewares.SpiderMiddleware': 543,
+# }
 
 # Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
 #    'quotesbot.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }
 
 # Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# ITEM_PIPELINES = {
 #    'quotesbot.pipelines.SomePipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings  # noqa
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/quotesbot/quotesbot/spiders/toscrape-css.py b/quotesbot/quotesbot/spiders/toscrape-css.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import scrapy
 
 
@@ -11,12 +10,11 @@ class ToScrapeCSSSpider(scrapy.Spider):
     def parse(self, response):
         for quote in response.css("div.quote"):
             yield {
-                'text': quote.css("span.text::text").extract_first(),
-                'author': quote.css("small.author::text").extract_first(),
-                'tags': quote.css("div.tags > a.tag::text").extract()
+                'text': quote.css("span.text::text").get(),
+                'author': quote.css("small.author::text").get(),
+                'tags': quote.css("div.tags > a.tag::text").getall()
             }
 
-        next_page_url = response.css("li.next > a::attr(href)").extract_first()
+        next_page_url = response.css("li.next > a::attr(href)").get()
         if next_page_url is not None:
-            yield scrapy.Request(response.urljoin(next_page_url))
-
+            yield response.follow(next_page_url)
diff --git a/quotesbot/quotesbot/spiders/toscrape-xpath.py b/quotesbot/quotesbot/spiders/toscrape-xpath.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import scrapy
 
 
@@ -11,12 +10,12 @@ class ToScrapeSpiderXPath(scrapy.Spider):
     def parse(self, response):
         for quote in response.xpath('//div[@class="quote"]'):
             yield {
-                'text': quote.xpath('./span[@class="text"]/text()').extract_first(),
-                'author': quote.xpath('.//small[@class="author"]/text()').extract_first(),
-                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
+                'text': quote.xpath('./span[@class="text"]/text()').get(),
+                'author': quote.xpath('.//small[@class="author"]/text()').get(),
+                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').getall()
             }
 
-        next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
+        next_page_url = response.xpath('//li[@class="next"]/a/@href').get()
         if next_page_url is not None:
-            yield scrapy.Request(response.urljoin(next_page_url))
+            yield response.follow(next_page_url)