Crawler corrections January 2017 #245

Open · wants to merge 4 commits into master

3 changes: 2 additions & 1 deletion crawler/code/Makefile
@@ -1,4 +1,5 @@
-PYTHON3=python3.4
+# Use whichever Python 3 is default on your system or in your virtualenv.
+PYTHON3=python3
test:
$(PYTHON3) crawl.py -q what-if.xkcd.com

2 changes: 1 addition & 1 deletion crawler/code/requirements.txt
@@ -4,4 +4,4 @@
#
# Install this package with "python3 -m pip install -r requirements.txt".

-aiohttp>=0.21
+aiohttp>=1.2
2 changes: 1 addition & 1 deletion crawler/code/supplemental/README-SUPPLEMENTAL.txt
@@ -1,7 +1,7 @@
Authors: A. Jesse Jiryu Davis
Project: Web crawler
Requirements:
-* Python 3.3 or 3.4
+* Python 3.3 or later

Illustrations of an async web crawler built upon:

6 changes: 3 additions & 3 deletions crawler/code/supplemental/blocking-fetch.py
@@ -3,15 +3,15 @@

def threaded_method():
sock = socket.socket()
-    sock.connect(('xkcd.com', 80))
-    request = 'GET /353/ HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'
+    sock.connect(('aosabook.org', 80))
+    request = 'GET /en/index.html HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'
sock.send(request.encode('ascii'))
response = b''
chunk = sock.recv(4096)
while chunk:
response += chunk
chunk = sock.recv(4096)

-    print(response)
+    print('Got {} bytes'.format(len(response)))

threaded_method()
9 changes: 6 additions & 3 deletions crawler/code/supplemental/loop-with-callbacks.py
@@ -29,14 +29,14 @@ def fetch(self):
self.sock = socket.socket()
self.sock.setblocking(False)
try:
-            self.sock.connect(('xkcd.com', 80))
+            self.sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass
selector.register(self.sock.fileno(), EVENT_WRITE, self.connected)

def connected(self, key, mask):
selector.unregister(key.fd)
-        get = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(self.url)
+        get = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(self.url)
self.sock.send(get.encode('ascii'))
selector.register(key.fd, EVENT_READ, self.read_response)

@@ -79,9 +79,12 @@ def parse_links(self):
if parts.scheme not in ('', 'http', 'https'):
continue
host, port = urllib.parse.splitport(parts.netloc)
-            if host and host.lower() not in ('xkcd.com', 'www.xkcd.com'):
+            if host and host.lower() not in ('aosabook.org',
+                                             'www.aosabook.org'):
continue
defragmented, frag = urllib.parse.urldefrag(parts.path)
+            if defragmented == '':
+                defragmented = '/'
links.add(defragmented)

return links
13 changes: 7 additions & 6 deletions crawler/code/supplemental/loop-with-coroutines.py
@@ -18,9 +18,6 @@ def __init__(self):
self.result = None
self._callbacks = []

-    def result(self):
-        return self.result

def add_done_callback(self, fn):
self._callbacks.append(fn)

@@ -105,8 +102,8 @@ def fetch(self):
concurrency_achieved = max(concurrency_achieved, len(urls_todo))

sock = socket.socket()
-        yield from connect(sock, ('xkcd.com', 80))
-        get = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(self.url)
+        yield from connect(sock, ('aosabook.org', 80))
+        get = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(self.url)
sock.send(get.encode('ascii'))
self.response = yield from read_all(sock)

@@ -135,9 +132,13 @@ def _process_response(self):
if parts.scheme not in ('', 'http', 'https'):
continue
host, port = urllib.parse.splitport(parts.netloc)
-            if host and host.lower() not in ('xkcd.com', 'www.xkcd.com'):
+            if host and host.lower() not in ('aosabook.org',
+                                             'www.aosabook.org'):
continue
defragmented, frag = urllib.parse.urldefrag(parts.path)
+            if defragmented == '':
+                defragmented = '/'

if defragmented not in urls_seen:
urls_todo.add(defragmented)
urls_seen.add(defragmented)
20 changes: 18 additions & 2 deletions crawler/code/supplemental/non-blocking-fetch-stupid.py
@@ -4,11 +4,11 @@
sock = socket.socket()
sock.setblocking(False)
try:
-    sock.connect(('xkcd.com', 80))
+    sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass

-request = 'GET /353/ HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'
+request = 'GET /en/index.html HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'
encoded = request.encode('ascii')

while True:
@@ -19,3 +19,19 @@
pass

print('sent')

+response = b''
+chunk = b''
+
+while True:
+    try:
+        chunk = sock.recv(4096)
+    except OSError:
+        continue
+
+    if chunk:
+        response += chunk
+    else:
+        break
+
+print('Got {} bytes'.format(len(response)))
38 changes: 19 additions & 19 deletions crawler/crawler.markdown
@@ -21,13 +21,13 @@ We can hasten this process by downloading many pages concurrently. As the crawle

## The Traditional Approach

-How do we make the crawler concurrent? Traditionally we would create a thread pool. Each thread would be in charge of downloading one page at a time over a socket. For example, to download a page from `xkcd.com`:
+How do we make the crawler concurrent? Traditionally we would create a thread pool. Each thread would be in charge of downloading one page at a time over a socket. For example, to download a page from `aosabook.org`, the site for the Architecture of Open Source Applications books:

```python
def fetch(url):
sock = socket.socket()
-    sock.connect(('xkcd.com', 80))
-    request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(url)
+    sock.connect(('aosabook.org', 80))
+    request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(url)
sock.send(request.encode('ascii'))
response = b''
chunk = sock.recv(4096)
@@ -60,7 +60,7 @@ before we begin to connect to the server:
sock = socket.socket()
sock.setblocking(False)
try:
-    sock.connect(('xkcd.com', 80))
+    sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass
```
@@ -70,7 +70,7 @@ Irritatingly, a non-blocking socket throws an exception from `connect`, even whe
Now our crawler needs a way to know when the connection is established, so it can send the HTTP request. We could simply keep trying in a tight loop:

```python
-request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(url)
+request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(url)
encoded = request.encode('ascii')

while True:
@@ -95,7 +95,7 @@ selector = DefaultSelector()
sock = socket.socket()
sock.setblocking(False)
try:
-    sock.connect(('xkcd.com', 80))
+    sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass

@@ -162,7 +162,7 @@ We begin by calling `Fetcher.fetch`:
self.sock = socket.socket()
self.sock.setblocking(False)
try:
-            self.sock.connect(('xkcd.com', 80))
+            self.sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass

@@ -175,8 +175,8 @@
The `fetch` method begins connecting a socket. But notice the method returns before the connection is established. It must return control to the event loop to wait for the connection. To understand why, imagine our whole application was structured so:

```python
-# Begin fetching http://xkcd.com/353/
-fetcher = Fetcher('/353/')
+# Begin fetching http://aosabook.org/en/index.html
+fetcher = Fetcher('/en/index.html')
fetcher.fetch()

while True:
@@ -195,7 +195,7 @@ Here is the implementation of `connected`:
def connected(self, key, mask):
print('connected!')
selector.unregister(key.fd)
-        request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(self.url)
+        request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(self.url)
self.sock.send(request.encode('ascii'))

# Register the next callback.
@@ -258,8 +258,8 @@ Let us explain what we mean by that. Consider how simply we fetched a URL on a t
# Blocking version.
def fetch(url):
sock = socket.socket()
-    sock.connect(('xkcd.com', 80))
-    request = 'GET {} HTTP/1.0\r\nHost: xkcd.com\r\n\r\n'.format(url)
+    sock.connect(('aosabook.org', 80))
+    request = 'GET {} HTTP/1.0\r\nHost: aosabook.org\r\n\r\n'.format(url)
sock.send(request.encode('ascii'))
response = b''
chunk = sock.recv(4096)
@@ -493,7 +493,7 @@ class Fetcher:
self.sock = socket.socket()
self.sock.setblocking(False)
try:
-            self.sock.connect(('xkcd.com', 80))
+            self.sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass
selector.register(self.sock.fileno(),
@@ -512,7 +512,7 @@ The `fetch` method begins connecting a socket, then registers the callback, `con
sock = socket.socket()
sock.setblocking(False)
try:
-            sock.connect(('xkcd.com', 80))
+            sock.connect(('aosabook.org', 80))
except BlockingIOError:
pass

@@ -549,8 +549,8 @@ class Task:

next_future.add_done_callback(self.step)

-# Begin fetching http://xkcd.com/353/
-fetcher = Fetcher('/353/')
+# Begin fetching http://aosabook.org/en/index.html
+fetcher = Fetcher('/en/index.html')
Task(fetcher.fetch())

loop()
@@ -795,7 +795,7 @@ We collect the workers' shared state in a crawler class, and write the main logi
```python
loop = asyncio.get_event_loop()

-crawler = crawling.Crawler('http://xkcd.com',
+crawler = crawling.Crawler('http://aosabook.org',
max_redirect=10)

loop.run_until_complete(crawler.crawl())
@@ -924,14 +924,14 @@ We promised to explain why the items in the queue are pairs, like:

```python
# URL to fetch, and the number of redirects left.
-('http://xkcd.com/353', 10)
+('http://aosabook.org/en', 10)
```

New URLs have ten redirects remaining. Fetching this particular URL results in a redirect to a new location with a trailing slash. We decrement the number of redirects remaining, and put the next location in the queue:

```python
# URL with a trailing slash. Nine redirects left.
-('http://xkcd.com/353/', 9)
+('http://aosabook.org/en/', 9)
```

The `aiohttp` package we use would follow redirects by default and give us the final response. We tell it not to, however, and handle redirects in the crawler, so it can coalesce redirect paths that lead to the same destination: if we have already seen this URL, it is in ``self.seen_urls`` and we have already started on this path from a different entry point:
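
The redirect-handling code itself sits in the collapsed portion of this diff. Below is a rough, hypothetical sketch of the logic the paragraph describes, assuming a `seen_urls` set and a queue of `(url, max_redirect)` pairs as above; the function name `handle_redirect` and its signature are illustrative, not the actual API of the chapter's `crawling.py`.

```python
import urllib.parse


def handle_redirect(response_headers, url, max_redirect, seen_urls, queue):
    """Queue a redirect target ourselves rather than letting aiohttp follow it."""
    next_url = urllib.parse.urljoin(url, response_headers['location'])
    if next_url in seen_urls:
        # Another entry point already led here; the redirect paths coalesce.
        return
    if max_redirect > 0:
        # Put the new location on the queue with one fewer redirect remaining.
        queue.put_nowait((next_url, max_redirect - 1))
        seen_urls.add(next_url)
```

The `seen_urls` check is what coalesces redirect paths: a location reached from two different entry points is queued only once.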