-
Notifications
You must be signed in to change notification settings - Fork 52
/
pages_scrape.py
49 lines (40 loc) · 1.62 KB
/
pages_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import logging
import requests
def scrape(url, extractor):
"""
Function to request and parse a given URL. Returns only the "relevant"
text.
Parameters
----------
url : String.
URL to request and parse.
extractor : Goose class instance.
An instance of Goose that allows for parsing of content.
Returns
-------
text : String.
Parsed text from the specified website.
meta : String.
Parsed meta description of an article. Usually equivalent to the
lede.
"""
logger = logging.getLogger('scraper_log')
try:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"}
page = requests.get(url, headers=headers)
try:
try:
article = extractor.extract(raw_html=page.content)
except UnicodeDecodeError:
article = extractor.extract(raw_html=page.content.decode('utf-8',
errors='replace'))
text = article.cleaned_text
meta = article.meta_description
return text, meta
#Generic error catching is bad
except Exception, e:
print 'There was an error. Check the log file for more information.'
logger.warning('Problem scraping URL: {}. {}.'.format(url, e))
except Exception, e:
print 'There was an error. Check the log file for more information.'
logger.warning('Problem requesting url: {}. {}'.format(url, e))