-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebCrawlerXLSXVerison.py
More file actions
139 lines (111 loc) · 4.36 KB
/
WebCrawlerXLSXVerison.py
File metadata and controls
139 lines (111 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ProcessPoolExecutor, wait
import re,os
import pandas as pd
import numpy as np
import openpyxl
def getChromeDriver():
    """Build a Chrome WebDriver configured for fast, quiet crawling.

    Returns a Selenium Chrome driver with certificate errors ignored,
    the GPU disabled, image loading turned off, and page-triggered
    downloads blocked.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('ignore-certificate-errors')
    #options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # Skip image downloads — the crawler only reads text.
    options.add_argument('--blink-settings=imagesEnabled=false')
    # download_restrictions=3 blocks all downloads pages might trigger.
    prefs = {
        "download_restrictions": 3,
    }
    options.add_experimental_option("prefs", prefs)
    # BUG FIX: Selenium 4 removed the deprecated 'chrome_options' keyword
    # (this file already uses the Selenium-4 Service API); pass 'options='.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    return driver
def getKeyWords(keywordFile):
    """Return the list of keywords, one per line, from *keywordFile*."""
    with open(keywordFile, 'r', encoding="utf8") as handle:
        return handle.read().splitlines()
def getUrls(UrlFile):
    """Load the 'Website' column from the CSV *UrlFile*, skipping blanks."""
    frame = pd.read_csv(UrlFile, encoding="utf8")
    return list(frame['Website'].dropna())
def getSoup(url):
    """Fetch *url* in a fresh Chrome driver and return its parsed HTML.

    Returns a BeautifulSoup of the page source, or None when navigation
    failed or the browser landed on a different URL (i.e. a redirect).
    """
    driver = getChromeDriver()
    soup = None
    try:
        #driver.maximize_window()
        driver.get(url)
        landed_url = driver.current_url
        try:
            # Best-effort: wait briefly for the URL to settle, then
            # dismiss a cookie banner if one is present. Timeouts and
            # missing banners are expected on most sites.
            WebDriverWait(driver, 1).until(EC.url_to_be(url))
            WebDriverWait(driver, 1).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="cookie_action_close_header"]'))).click()
        except Exception:
            pass
        # Only parse pages that stayed on the requested URL.
        if landed_url == url:
            soup = BeautifulSoup(driver.page_source, features='html.parser')
    except Exception:
        # Navigation failed entirely; report "no page" to the caller.
        pass
    finally:
        # Always release the browser, even when navigation raised.
        driver.quit()
    return soup
def findKeywords(url, keywords):
    """Count case-insensitive occurrences of each keyword on the page at *url*.

    Appends one comma-separated line per matched keyword to results.txt in
    the form "netloc,url,keyword,match_count". Pages that fail to load or
    to parse are skipped silently (best-effort crawl).
    """
    soup = getSoup(url)
    rows = []
    if soup is not None:
        try:
            for keyword in keywords:
                # Escape the keyword so regex metacharacters read from the
                # keyword file are matched literally, not as patterns.
                pattern = re.compile(re.escape(keyword), re.IGNORECASE)
                matches = soup.body.find_all(string=pattern)
                if matches:
                    rows.append((urlparse(url).netloc, url, keyword, len(matches)))
        except Exception:
            # e.g. soup.body is None (no <body>); never let one bad page
            # kill the worker process.
            rows = []
    # NOTE(review): fields are not CSV-quoted, so a comma inside a URL or
    # keyword would corrupt results.txt — confirm inputs are comma-free.
    with open('results.txt', 'a+', encoding='utf8') as out:
        for row in rows:
            out.write(','.join(str(field) for field in row) + '\n')
def getHref(url):
    """Collect same-domain links from the page at *url*.

    Appends *url* and the discovered links to urls.txt (one per line) and
    returns the list of discovered absolute URLs.
    """
    soup = getSoup(url)
    suburls = []
    if soup is not None:
        base_netloc = urlparse(url).netloc
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Only follow absolute links that stay on the same host.
            if not href.startswith(("https:", "http:")):
                continue
            if urlparse(href).netloc != base_netloc:
                continue
            # Skip duplicates, links contained in the current URL, and
            # binary assets that cannot usefully be parsed as HTML.
            if href in suburls or href in url:
                continue
            if href.endswith((".png", ".pdf", ".jpg", ".jpeg")):
                continue
            suburls.append(href)
    with open('urls.txt', 'a+') as f:
        f.write(url + '\n')
        # BUG FIX: newlines are added only when writing. The original
        # stored '\n' inside each returned URL, which leaked into the
        # recursive crawl and defeated the duplicate check above
        # ('x' never equals 'x\n').
        f.writelines(s + '\n' for s in suburls)
    return suburls
def getAllHref(url, visited=None):
    """Recursively crawl *url* and every same-domain link reachable from it.

    *visited* tracks URLs already crawled so cyclic links between pages do
    not recurse forever; callers normally omit it. (None default avoids the
    shared-mutable-default pitfall.)
    """
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)
    for link in getHref(url):
        getAllHref(link, visited)
def webCrawler(keywords, urls):
    """Crawl *urls* in parallel, then count *keywords* on every page found.

    Phase 1 discovers same-domain sub-pages into urls.txt; phase 2 scans
    each unique URL for keywords into results.txt; finally the counts are
    pivoted to one row per site / one column per keyword in results.xlsx.
    """
    # Leave one core free for the OS, but never drop below one worker
    # (os.cpu_count()-1 is 0 on a single-core machine).
    workers = max(1, os.cpu_count() - 1)
    # Start from a clean slate so URLs/results from a previous run are
    # not mixed into this one.
    open('urls.txt', 'w').close()
    open('results.txt', 'w', encoding='utf8').close()
    # Phase 1: discover sub-pages of every seed URL in parallel.
    with ProcessPoolExecutor(workers) as executor:
        futures = [executor.submit(getAllHref, url) for url in urls]
        wait(futures)
    with open('urls.txt') as f:
        urlset = sorted(set(line.strip() for line in f))
    # Phase 2: scan every unique page for the keywords.
    with ProcessPoolExecutor(workers) as executor:
        futures = [executor.submit(findKeywords, url, keywords) for url in urlset]
        wait(futures)
    # BUG FIX: results.txt has no header row. Reading it without
    # header=None made pandas consume the FIRST DATA ROW as column names
    # (the later df.columns rename only relabels, it does not restore
    # the lost row). Supply the names explicitly instead.
    df = pd.read_csv('results.txt', encoding='utf8', header=None,
                     names=["BaseUrl", "Url", "KeyWord", "Matches"])
    dfpivot = (df[['BaseUrl', 'KeyWord', 'Matches']]
               .groupby(['BaseUrl', 'KeyWord'])['Matches']
               .sum()
               .unstack('KeyWord', fill_value=0))
    dfpivot.to_excel('results.xlsx')
if __name__ == '__main__':
    # Entry point: load the keyword list and the seed URLs, then crawl.
    keywords_lst = getKeyWords('keyword.txt')
    urls_lst = getUrls('All investors.csv')
    webCrawler(keywords_lst, urls_lst)