-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_scraper.py
74 lines (62 loc) · 2.63 KB
/
web_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import wikipediaapi
import concurrent.futures
from tqdm import tqdm
import pandas as pd
def wiki_scrape(topic_name, verbose=True):
    """Scrape a Wikipedia topic page and every page it links to.

    :param topic_name: title of the seed Wikipedia page
    :param verbose: show a tqdm progress bar while links are fetched
    :return: DataFrame with columns page, text, link, categories, topic,
             or None if the seed page does not exist
    """
    def wiki_link(link):
        # Fetch one linked page; return None on any API failure so a
        # single bad link does not abort the whole scrape.
        try:
            page = wiki_api.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
        except Exception:
            # was a bare `except:`; narrowed so Ctrl-C still works
            return None

    wiki_api = wikipediaapi.Wikipedia(language='en',
                                      extract_format=wikipediaapi.ExtractFormat.WIKI)
    page_name = wiki_api.page(topic_name)
    if not page_name.exists():
        print('Page {} does not exist.'.format(topic_name))
        return
    page_links = list(page_name.links.keys())
    progress = tqdm(desc='Links Scraped', unit='',
                    total=len(page_links)) if verbose else None
    sources = [{'page': topic_name, 'text': page_name.text, 'link': page_name.fullurl,
                'categories': list(page_name.categories.keys())}]
    # Network-bound work, so a small thread pool overlaps the HTTP waits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_link = {executor.submit(wiki_link, link): link
                       for link in page_links}
        for future in concurrent.futures.as_completed(future_link):
            data = future.result()
            if data:
                sources.append(data)
            if verbose:
                progress.update(1)
    if verbose:
        progress.close()

    # Drop Wikipedia meta/namespace pages and near-empty pages.
    namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                  'Template', 'Help', 'User', 'Category talk', 'Portal talk')
    sources = pd.DataFrame(sources)
    # BUG FIX: `len(sources['text']) > 20` measured the length of the whole
    # Series (a scalar), so the text filter was all-True or all-False.
    # Compare per-row text length instead.
    sources = sources[(sources['text'].str.len() > 20)
                      & ~(sources['page'].str.startswith(namespaces, na=True))]
    # Strip the leading 'Category:' prefix (9 characters) from each name.
    sources['categories'] = sources.categories.apply(
        lambda x: [y[9:] for y in x])
    sources['topic'] = topic_name
    print('Wikipedia pages scraped:', len(sources))
    return sources
def wiki_page(page_name):
    """
    Get the text of a single wikipedia page.

    :param page_name: title of the Wikipedia page
    :return: dataframe with page name, text, link, and categories,
             or None if the page does not exist
    """
    wiki_api = wikipediaapi.Wikipedia(language='en',
                                      extract_format=wikipediaapi.ExtractFormat.WIKI)
    # BUG FIX: the original rebound `page_name` to the WikipediaPage object,
    # so the 'page' column (and the not-found message) held the page object
    # instead of the title string. Keep the title and the page separate,
    # matching wiki_scrape which stores the name string.
    page = wiki_api.page(page_name)
    if not page.exists():
        print('Page {} does not exist.'.format(page_name))
        return
    page_data = pd.DataFrame({
        'page': page_name,
        'text': page.text,
        'link': page.fullurl,
        # Strip the leading 'Category:' prefix (9 characters) from each name;
        # the outer list makes this a single-row DataFrame.
        'categories': [[y[9:] for y in
                        list(page.categories.keys())]],
    })
    return page_data