diff --git a/crawler/code/Makefile b/crawler/code/Makefile
index 7b1a4afe2..1bc2bc0f7 100644
--- a/crawler/code/Makefile
+++ b/crawler/code/Makefile
@@ -1,4 +1,5 @@
-PYTHON3=python3.4
+# Use whichever Python 3 is default on your system or in your virtualenv.
+PYTHON3=python3
 
 test:
 	$(PYTHON3) crawl.py -q what-if.xkcd.com
diff --git a/crawler/code/requirements.txt b/crawler/code/requirements.txt
index 286f47da5..e9f66ea2f 100644
--- a/crawler/code/requirements.txt
+++ b/crawler/code/requirements.txt
@@ -4,4 +4,4 @@
 #
 # Install this package with "python3 -m pip install -r requirements.txt".
 
-aiohttp>=0.21
+aiohttp>=1.2
diff --git a/crawler/code/supplemental/README-SUPPLEMENTAL.txt b/crawler/code/supplemental/README-SUPPLEMENTAL.txt
index 21f8c0887..dba09322b 100644
--- a/crawler/code/supplemental/README-SUPPLEMENTAL.txt
+++ b/crawler/code/supplemental/README-SUPPLEMENTAL.txt
@@ -1,7 +1,7 @@
 Authors: A. Jesse Jiryu Davis
 Project: Web crawler
 Requirements:
-  * Python 3.3 or 3.4
+  * Python 3.3 or later
 
 Illustrations of an async web crawler built upon:
 
diff --git a/crawler/code/supplemental/blocking-fetch.py b/crawler/code/supplemental/blocking-fetch.py
index 9017bd778..74d164e08 100644
--- a/crawler/code/supplemental/blocking-fetch.py
+++ b/crawler/code/supplemental/blocking-fetch.py
@@ -3,8 +3,8 @@
 
 def threaded_method():
     sock = socket.socket()
-    sock.connect(('xkcd.com', 80))
-    request = 'GET /353/ HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'
+    sock.connect(('aosabook.org', 80))
+    request = 'GET /en/index.html HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'
     sock.send(request.encode('ascii'))
     response = b''
     chunk = sock.recv(4096)
@@ -12,6 +12,6 @@ def threaded_method():
         response += chunk
         chunk = sock.recv(4096)
 
-    print(response)
+    print('Got {} bytes'.format(len(response)))
 
 threaded_method()
diff --git a/crawler/code/supplemental/loop-with-callbacks.py b/crawler/code/supplemental/loop-with-callbacks.py
index 14165ee10..87810b20e 100644
--- a/crawler/code/supplemental/loop-with-callbacks.py
+++ b/crawler/code/supplemental/loop-with-callbacks.py
@@ -29,14 +29,14 @@ def fetch(self):
         self.sock = socket.socket()
         self.sock.setblocking(False)
         try:
-            self.sock.connect(('xkcd.com', 80))
+            self.sock.connect(('aosabook.org', 80))
         except BlockingIOError:
             pass
         selector.register(self.sock.fileno(), EVENT_WRITE, self.connected)
 
     def connected(self, key, mask):
         selector.unregister(key.fd)
-        get = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(self.url)
+        get = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(self.url)
         self.sock.send(get.encode('ascii'))
         selector.register(key.fd, EVENT_READ, self.read_response)
 
@@ -79,9 +79,12 @@ def parse_links(self):
             if parts.scheme not in ('', 'http', 'https'):
                 continue
             host, port = urllib.parse.splitport(parts.netloc)
-            if host and host.lower() not in ('xkcd.com', 'www.xkcd.com'):
+            if host and host.lower() not in ('aosabook.org',
+                                             'www.aosabook.org'):
                 continue
             defragmented, frag = urllib.parse.urldefrag(parts.path)
+            if defragmented == '':
+                defragmented = '/'
             links.add(defragmented)
 
         return links
diff --git a/crawler/code/supplemental/loop-with-coroutines.py b/crawler/code/supplemental/loop-with-coroutines.py
index aa12f5009..90f8cc164 100644
--- a/crawler/code/supplemental/loop-with-coroutines.py
+++ b/crawler/code/supplemental/loop-with-coroutines.py
@@ -18,9 +18,6 @@ def __init__(self):
         self.result = None
         self._callbacks = []
 
-    def result(self):
-        return self.result
-
     def add_done_callback(self, fn):
         self._callbacks.append(fn)
 
@@ -105,8 +102,8 @@ def fetch(self):
         concurrency_achieved = max(concurrency_achieved, len(urls_todo))
 
         sock = socket.socket()
-        yield from connect(sock, ('xkcd.com', 80))
-        get = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(self.url)
+        yield from connect(sock, ('aosabook.org', 80))
+        get = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(self.url)
         sock.send(get.encode('ascii'))
         self.response = yield from read_all(sock)
 
@@ -135,9 +132,13 @@ def _process_response(self):
             if parts.scheme not in ('', 'http', 'https'):
                 continue
             host, port = urllib.parse.splitport(parts.netloc)
-            if host and host.lower() not in ('xkcd.com', 'www.xkcd.com'):
+            if host and host.lower() not in ('aosabook.org',
+                                             'www.aosabook.org'):
                 continue
             defragmented, frag = urllib.parse.urldefrag(parts.path)
+            if defragmented == '':
+                defragmented = '/'
+
             if defragmented not in urls_seen:
                 urls_todo.add(defragmented)
                 urls_seen.add(defragmented)
diff --git a/crawler/code/supplemental/non-blocking-fetch-stupid.py b/crawler/code/supplemental/non-blocking-fetch-stupid.py
index 1e7968de7..918fd7eaf 100644
--- a/crawler/code/supplemental/non-blocking-fetch-stupid.py
+++ b/crawler/code/supplemental/non-blocking-fetch-stupid.py
@@ -4,11 +4,11 @@
 sock = socket.socket()
 sock.setblocking(False)
 try:
-    sock.connect(('xkcd.com', 80))
+    sock.connect(('aosabook.org', 80))
 except BlockingIOError:
     pass
 
-request = 'GET /353/ HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'
+request = 'GET /en/index.html HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'
 encoded = request.encode('ascii')
 
 while True:
@@ -19,3 +19,19 @@
         pass
 
 print('sent')
+
+response = b''
+chunk = b''
+
+while True:
+    try:
+        chunk = sock.recv(4096)
+    except OSError:
+        continue
+
+    if chunk:
+        response += chunk
+    else:
+        break
+
+print('Got {} bytes'.format(len(response)))
diff --git a/crawler/crawler.markdown b/crawler/crawler.markdown
index aa2cea038..97ee0c691 100644
--- a/crawler/crawler.markdown
+++ b/crawler/crawler.markdown
@@ -21,13 +21,13 @@ We can hasten this process by downloading many pages concurrently. As the crawle
 
 ## The Traditional Approach
 
-How do we make the crawler concurrent? Traditionally we would create a thread pool. Each thread would be in charge of downloading one page at a time over a socket. For example, to download a page from `xkcd.com`:
+How do we make the crawler concurrent? Traditionally we would create a thread pool. Each thread would be in charge of downloading one page at a time over a socket. For example, to download a page from `aosabook.org`, the site for the Architecture of Open Source Applications books:
 
 ```python
 def fetch(url):
     sock = socket.socket()
-    sock.connect(('xkcd.com', 80))
-    request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(url)
+    sock.connect(('aosabook.org', 80))
+    request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(url)
     sock.send(request.encode('ascii'))
     response = b''
     chunk = sock.recv(4096)
@@ -60,7 +60,7 @@ before we begin to connect to the server:
 sock = socket.socket()
 sock.setblocking(False)
 try:
-    sock.connect(('xkcd.com', 80))
+    sock.connect(('aosabook.org', 80))
 except BlockingIOError:
     pass
 ```
@@ -70,7 +70,7 @@ Irritatingly, a non-blocking socket throws an exception from `connect`, even whe
 Now our crawler needs a way to know when the connection is established, so it can send the HTTP request. We could simply keep trying in a tight loop:
 
 ```python
-request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(url)
+request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(url)
 encoded = request.encode('ascii')
 
 while True:
@@ -95,7 +95,7 @@ selector = DefaultSelector()
 sock = socket.socket()
 sock.setblocking(False)
 try:
-    sock.connect(('xkcd.com', 80))
+    sock.connect(('aosabook.org', 80))
 except BlockingIOError:
     pass
 
@@ -162,7 +162,7 @@ We begin by calling `Fetcher.fetch`:
         self.sock = socket.socket()
         self.sock.setblocking(False)
         try:
-            self.sock.connect(('xkcd.com', 80))
+            self.sock.connect(('aosabook.org', 80))
         except BlockingIOError:
             pass
 
@@ -175,8 +175,8 @@ We begin by calling `Fetcher.fetch`:
 The `fetch` method begins connecting a socket. But notice the method returns before the connection is established. It must return control to the event loop to wait for the connection. To understand why, imagine our whole application was structured so:
 
 ```python
-# Begin fetching http://xkcd.com/353/
-fetcher = Fetcher('/353/')
+# Begin fetching http://aosabook.org/en/index.html
+fetcher = Fetcher('/en/index.html')
 fetcher.fetch()
 
 while True:
@@ -195,7 +195,7 @@ Here is the implementation of `connected`:
     def connected(self, key, mask):
         print('connected!')
         selector.unregister(key.fd)
-        request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(self.url)
+        request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(self.url)
         self.sock.send(request.encode('ascii'))
 
         # Register the next callback.
@@ -258,8 +258,8 @@ Let us explain what we mean by that. Consider how simply we fetched a URL on a t
 # Blocking version.
 def fetch(url):
     sock = socket.socket()
-    sock.connect(('xkcd.com', 80))
-    request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(url)
+    sock.connect(('aosabook.org', 80))
+    request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(url)
     sock.send(request.encode('ascii'))
     response = b''
     chunk = sock.recv(4096)
@@ -493,7 +493,7 @@ class Fetcher:
         self.sock = socket.socket()
         self.sock.setblocking(False)
         try:
-            self.sock.connect(('xkcd.com', 80))
+            self.sock.connect(('aosabook.org', 80))
         except BlockingIOError:
             pass
         selector.register(self.sock.fileno(),
@@ -512,7 +512,7 @@ The `fetch` method begins connecting a socket, then registers the callback, `con
         sock = socket.socket()
         sock.setblocking(False)
         try:
-            sock.connect(('xkcd.com', 80))
+            sock.connect(('aosabook.org', 80))
         except BlockingIOError:
             pass
 
@@ -549,8 +549,8 @@ class Task:
 
         next_future.add_done_callback(self.step)
 
-# Begin fetching http://xkcd.com/353/
-fetcher = Fetcher('/353/')
+# Begin fetching http://aosabook.org/en/index.html
+fetcher = Fetcher('/en/index.html')
 Task(fetcher.fetch())
 
 loop()
@@ -795,7 +795,7 @@ We collect the workers' shared state in a crawler class, and write the main logi
 ```python
 loop = asyncio.get_event_loop()
 
-crawler = crawling.Crawler('http://xkcd.com',
+crawler = crawling.Crawler('http://aosabook.org',
                            max_redirect=10)
 
 loop.run_until_complete(crawler.crawl())
@@ -924,14 +924,14 @@ We promised to explain why the items in the queue are pairs, like:
 
 ```python
 # URL to fetch, and the number of redirects left.
-('http://xkcd.com/353', 10)
+('http://aosabook.org/en', 10)
 ```
 
 New URLs have ten redirects remaining. Fetching this particular URL results in a redirect to a new location with a trailing slash. We decrement the number of redirects remaining, and put the next location in the queue:
 
 ```python
 # URL with a trailing slash. Nine redirects left.
-('http://xkcd.com/353/', 9)
+('http://aosabook.org/en/', 9)
 ```
 
 The `aiohttp` package we use would follow redirects by default and give us the final response. We tell it not to, however, and handle redirects in the crawler, so it can coalesce redirect paths that lead to the same destination: if we have already seen this URL, it is in ``self.seen_urls`` and we have already started on this path from a different entry point:
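
Note on the path normalization the hunks above add to loop-with-callbacks.py and loop-with-coroutines.py: a link written as a bare host, such as `http://aosabook.org` with no path, parses to an empty path, so without the new `if defragmented == '': defragmented = '/'` branch the crawlers would queue `''` instead of the site root. A minimal standalone sketch of that behavior, not part of the patch; the `normalize` helper name and the sample links are illustrative only:

```python
# Sketch of the normalization step added in this patch; the helper and the
# sample links are illustrative, not taken from the crawler code itself.
import urllib.parse


def normalize(base_url, link):
    # Resolve the link against the page it came from, then keep only the path.
    normalized = urllib.parse.urljoin(base_url, link)
    parts = urllib.parse.urlparse(normalized)
    defragmented, frag = urllib.parse.urldefrag(parts.path)
    if defragmented == '':
        # A bare-host link yields an empty path; queue the root instead.
        defragmented = '/'
    return defragmented


for link in ('http://aosabook.org',      # bare host        -> '/'
             'http://aosabook.org/',     # explicit root    -> '/'
             '/en/index.html#intro'):    # fragment dropped -> '/en/index.html'
    print(link, '->', normalize('/', link))
```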