#!/usr/bin/env python3
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class LinkFetcher:
    """
    Fetches the links to all blog posts for a given Medium user.
    """

    def __init__(self, driver_type='headless', username='nishparadox'):
        if not username:
            raise ValueError("Invalid username")
        self.username = username
        self.url = "https://medium.com/@{}/latest".format(username)
        self.driver_type = driver_type
        if driver_type == 'headless':
            # Selenium 4 removed Options.set_headless() and the
            # firefox_options keyword; pass the headless flag instead.
            options = Options()
            options.add_argument("--headless")
            self.driver = webdriver.Firefox(options=options)
        elif driver_type == 'chrome':
            self.driver = webdriver.Chrome()
        else:
            self.driver = webdriver.Firefox()

    def _scroll_to_oblivion(self):
        """
        Simulate infinite scroll to load every post on Medium's page.
        """
        pause = 3
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        i = 0
        # self.driver.get_screenshot_as_file("data/screen" + str(i) + ".png")
        while True:
            print("Scrolling...")
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            # Stop once scrolling no longer increases the page height,
            # i.e. no more posts are being loaded.
            if new_height == last_height:
                break
            last_height = new_height
            i += 1
            # self.driver.get_screenshot_as_file("data/screen" + str(i) + ".jpg")
        return last_height

    def _parse(self, html):
        """
        Parse the rendered HTML to get the actual links to the posts.
        """
        soup = BeautifulSoup(html, 'html.parser')
        class_name = 'streamItem streamItem--postPreview js-streamItem'
        divs = soup.find_all('div', {'class': class_name})
        links = []
        regex = re.compile('Read More', re.IGNORECASE)
        for div in divs:
            # `string=` replaces the deprecated `text=` keyword in newer bs4.
            anchors = div.find_all('a', href=True, string=regex)
            if anchors:
                links.append(anchors[0]['href'])
        return links

    def _parse2(self, html):
        """
        The old function to fetch the actual post links.
        Since Medium shifted to React-based rendering, there is no
        stable class name for a post. Instead, fetch all links that
        match the pattern: /p/someid/
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=re.compile(r'.*/p/.*'))
        href_list = set()
        base_url = "https://medium.com{}"
        for link in links:
            href = link['href']
            href_list.add(base_url.format(href))
        return href_list

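    # For example, _parse2 turns a relative href such as "/p/abc123" into
    # the absolute link "https://medium.com/p/abc123" ("abc123" here is an
    # illustrative post id, not a real one).
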
    def _parse3(self, html):
        """
        The latest function to fetch the actual post links.
        Since Medium shifted to React-based rendering, there is no
        stable class name for a post. Instead, fetch all links that
        match the pattern: /@username/post-slug
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Match any href beginning with the user's profile URL.
        pattern = r"https://medium.com/@{}/*".format(self.username)
        links = soup.find_all('a', href=re.compile(pattern))
        href_list = set()
        for link in links:
            href_list.add(link['href'].strip())
        return href_list

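    # _parse3 effectively keeps any absolute href that starts with the
    # user's profile URL, e.g. "https://medium.com/@nishparadox/some-slug"
    # (the slug is illustrative); links to other profiles are discarded.
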
    def get_links(self):
        print("Using driver type :: {}".format(self.driver_type))
        self.driver.get(self.url)
        self._scroll_to_oblivion()
        print("Preparing to fetch available links...")
        return self._parse3(self.driver.page_source)


def main():
    link_fetcher = LinkFetcher('headless', username='nishparadox')
    links = link_fetcher.get_links()
    print(links)
    # Close the browser once the links are collected so the
    # driver session does not linger.
    link_fetcher.driver.quit()


if __name__ == "__main__":
    main()
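

# A minimal usage sketch, assuming geckodriver is on PATH and the target
# profile is public; "links.txt" is an arbitrary output filename. It shows
# one way the fetched set of links might be persisted, closing the driver
# even if fetching fails:
#
#   fetcher = LinkFetcher(username='nishparadox')
#   try:
#       links = fetcher.get_links()
#       with open("links.txt", "w") as f:
#           f.write("\n".join(sorted(links)))
#   finally:
#       fetcher.driver.quit()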