#!/usr/bin/env python3
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class LinkFetcher:
    """
    Fetches the links to all blog posts for a given Medium user.
    """

    def __init__(self, driver_type='headless', username='nishparadox'):
        if not username:
            raise ValueError("Invalid username")
        self.username = username
        self.url = "https://medium.com/@{}/latest".format(username)
        self.driver_type = driver_type
        if driver_type == 'headless':
            # Selenium 4 removed Options.set_headless() and the
            # firefox_options keyword; pass the headless flag instead.
            options = Options()
            options.add_argument("--headless")
            self.driver = webdriver.Firefox(options=options)
        elif driver_type == 'chrome':
            self.driver = webdriver.Chrome()
        else:
            self.driver = webdriver.Firefox()

    def _scroll_to_oblivion(self):
        """
        Simulate infinite scroll to load every post on Medium's page.
        """
        pause = 3
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        i = 0
        # self.driver.get_screenshot_as_file("data/screen" + str(i) + ".png")
        while True:
            print("Scrolling...")
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            # Stop once scrolling no longer increases the page height,
            # i.e. no more posts are being loaded.
            if new_height == last_height:
                break
            last_height = new_height
            i += 1
            # self.driver.get_screenshot_as_file("data/screen" + str(i) + ".jpg")
        return last_height

    def _parse(self, html):
        """
        Parse the rendered HTML to get the actual links to the posts.
        """
        soup = BeautifulSoup(html, 'html.parser')
        class_name = 'streamItem streamItem--postPreview js-streamItem'
        divs = soup.find_all('div', {'class': class_name})
        links = []
        regex = re.compile('Read More', re.IGNORECASE)
        for div in divs:
            # `string=` replaces the deprecated `text=` keyword in newer bs4.
            anchors = div.find_all('a', href=True, string=regex)
            if anchors:
                links.append(anchors[0]['href'])
        return links

    def _parse2(self, html):
        """
        The old function to fetch the actual post links.
        Since Medium shifted to React-based rendering, there is no
        stable class name for a post. Instead, fetch all links that
        match the pattern: /p/someid/
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=re.compile(r'.*/p/.*'))
        href_list = set()
        base_url = "https://medium.com{}"
        for link in links:
            href = link['href']
            href_list.add(base_url.format(href))
        return href_list

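    # For example, _parse2 turns a relative href such as "/p/abc123" into
    # the absolute link "https://medium.com/p/abc123" ("abc123" here is an
    # illustrative post id, not a real one).
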
    def _parse3(self, html):
        """
        The latest function to fetch the actual post links.
        Since Medium shifted to React-based rendering, there is no
        stable class name for a post. Instead, fetch all links that
        match the pattern: /@username/post-slug
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Match any href beginning with the user's profile URL.
        pattern = r"https://medium.com/@{}/*".format(self.username)
        links = soup.find_all('a', href=re.compile(pattern))
        href_list = set()
        for link in links:
            href_list.add(link['href'].strip())
        return href_list

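    # _parse3 effectively keeps any absolute href that starts with the
    # user's profile URL, e.g. "https://medium.com/@nishparadox/some-slug"
    # (the slug is illustrative); links to other profiles are discarded.
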
    def get_links(self):
        print("Using driver type :: {}".format(self.driver_type))
        self.driver.get(self.url)
        self._scroll_to_oblivion()
        print("Preparing to fetch available links...")
        return self._parse3(self.driver.page_source)


def main():
    link_fetcher = LinkFetcher('headless', username='nishparadox')
    links = link_fetcher.get_links()
    print(links)
    # Close the browser once the links are collected so the
    # driver session does not linger.
    link_fetcher.driver.quit()


if __name__ == "__main__":
    main()
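

# A minimal usage sketch, assuming geckodriver is on PATH and the target
# profile is public; "links.txt" is an arbitrary output filename. It shows
# one way the fetched set of links might be persisted, closing the driver
# even if fetching fails:
#
#   fetcher = LinkFetcher(username='nishparadox')
#   try:
#       links = fetcher.get_links()
#       with open("links.txt", "w") as f:
#           f.write("\n".join(sorted(links)))
#   finally:
#       fetcher.driver.quit()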