-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebCrawlerXLSXVerison.py
More file actions
139 lines (111 loc) · 4.36 KB
/
WebCrawlerXLSXVerison.py
File metadata and controls
139 lines (111 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ProcessPoolExecutor, wait
import re,os
import pandas as pd
import numpy as np
import openpyxl
def getChromeDriver():
    """Build a Chrome WebDriver configured for fast, quiet crawling.

    Returns a Selenium Chrome driver with certificate errors ignored,
    the GPU disabled, image loading turned off, and page-triggered
    downloads blocked.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('ignore-certificate-errors')
    #options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # Skip image downloads — the crawler only reads text.
    options.add_argument('--blink-settings=imagesEnabled=false')
    # download_restrictions=3 blocks all downloads pages might trigger.
    prefs = {
        "download_restrictions": 3,
    }
    options.add_experimental_option("prefs", prefs)
    # BUG FIX: Selenium 4 removed the deprecated 'chrome_options' keyword
    # (this file already uses the Selenium-4 Service API); pass 'options='.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    return driver
def getKeyWords(keywordFile):
    """Return the list of keywords, one per line, from *keywordFile*."""
    with open(keywordFile, 'r', encoding="utf8") as handle:
        return handle.read().splitlines()
def getUrls(UrlFile):
    """Load the 'Website' column from the CSV *UrlFile*, skipping blanks."""
    frame = pd.read_csv(UrlFile, encoding="utf8")
    return list(frame['Website'].dropna())
def getSoup(url):
    """Fetch *url* in a fresh Chrome driver and return its parsed HTML.

    Returns a BeautifulSoup of the page source, or None when navigation
    failed or the browser landed on a different URL (i.e. a redirect).
    """
    driver = getChromeDriver()
    soup = None
    try:
        #driver.maximize_window()
        driver.get(url)
        landed_url = driver.current_url
        try:
            # Best-effort: wait briefly for the URL to settle, then
            # dismiss a cookie banner if one is present. Timeouts and
            # missing banners are expected on most sites.
            WebDriverWait(driver, 1).until(EC.url_to_be(url))
            WebDriverWait(driver, 1).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="cookie_action_close_header"]'))).click()
        except Exception:
            pass
        # Only parse pages that stayed on the requested URL.
        if landed_url == url:
            soup = BeautifulSoup(driver.page_source, features='html.parser')
    except Exception:
        # Navigation failed entirely; report "no page" to the caller.
        pass
    finally:
        # Always release the browser, even when navigation raised.
        driver.quit()
    return soup
def findKeywords(url, keywords):
    """Count case-insensitive occurrences of each keyword on the page at *url*.

    Appends one comma-separated line per matched keyword to results.txt in
    the form "netloc,url,keyword,match_count". Pages that fail to load or
    to parse are skipped silently (best-effort crawl).
    """
    soup = getSoup(url)
    rows = []
    if soup is not None:
        try:
            for keyword in keywords:
                # Escape the keyword so regex metacharacters read from the
                # keyword file are matched literally, not as patterns.
                pattern = re.compile(re.escape(keyword), re.IGNORECASE)
                matches = soup.body.find_all(string=pattern)
                if matches:
                    rows.append((urlparse(url).netloc, url, keyword, len(matches)))
        except Exception:
            # e.g. soup.body is None (no <body>); never let one bad page
            # kill the worker process.
            rows = []
    # NOTE(review): fields are not CSV-quoted, so a comma inside a URL or
    # keyword would corrupt results.txt — confirm inputs are comma-free.
    with open('results.txt', 'a+', encoding='utf8') as out:
        for row in rows:
            out.write(','.join(str(field) for field in row) + '\n')
def getHref(url):
    """Collect same-domain links from the page at *url*.

    Appends *url* and the discovered links to urls.txt (one per line) and
    returns the list of discovered absolute URLs.
    """
    soup = getSoup(url)
    suburls = []
    if soup is not None:
        base_netloc = urlparse(url).netloc
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Only follow absolute links that stay on the same host.
            if not href.startswith(("https:", "http:")):
                continue
            if urlparse(href).netloc != base_netloc:
                continue
            # Skip duplicates, links contained in the current URL, and
            # binary assets that cannot usefully be parsed as HTML.
            if href in suburls or href in url:
                continue
            if href.endswith((".png", ".pdf", ".jpg", ".jpeg")):
                continue
            suburls.append(href)
    with open('urls.txt', 'a+') as f:
        f.write(url + '\n')
        # BUG FIX: newlines are added only when writing. The original
        # stored '\n' inside each returned URL, which leaked into the
        # recursive crawl and defeated the duplicate check above
        # ('x' never equals 'x\n').
        f.writelines(s + '\n' for s in suburls)
    return suburls
def getAllHref(url, visited=None):
    """Recursively crawl *url* and every same-domain link reachable from it.

    *visited* tracks URLs already crawled so cyclic links between pages do
    not recurse forever; callers normally omit it. (None default avoids the
    shared-mutable-default pitfall.)
    """
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)
    for link in getHref(url):
        getAllHref(link, visited)
def webCrawler(keywords, urls):
    """Crawl *urls* in parallel, then count *keywords* on every page found.

    Phase 1 discovers same-domain sub-pages into urls.txt; phase 2 scans
    each unique URL for keywords into results.txt; finally the counts are
    pivoted to one row per site / one column per keyword in results.xlsx.
    """
    # Leave one core free for the OS, but never drop below one worker
    # (os.cpu_count()-1 is 0 on a single-core machine).
    workers = max(1, os.cpu_count() - 1)
    # Start from a clean slate so URLs/results from a previous run are
    # not mixed into this one.
    open('urls.txt', 'w').close()
    open('results.txt', 'w', encoding='utf8').close()
    # Phase 1: discover sub-pages of every seed URL in parallel.
    with ProcessPoolExecutor(workers) as executor:
        futures = [executor.submit(getAllHref, url) for url in urls]
        wait(futures)
    with open('urls.txt') as f:
        urlset = sorted(set(line.strip() for line in f))
    # Phase 2: scan every unique page for the keywords.
    with ProcessPoolExecutor(workers) as executor:
        futures = [executor.submit(findKeywords, url, keywords) for url in urlset]
        wait(futures)
    # BUG FIX: results.txt has no header row. Reading it without
    # header=None made pandas consume the FIRST DATA ROW as column names
    # (the later df.columns rename only relabels, it does not restore
    # the lost row). Supply the names explicitly instead.
    df = pd.read_csv('results.txt', encoding='utf8', header=None,
                     names=["BaseUrl", "Url", "KeyWord", "Matches"])
    dfpivot = (df[['BaseUrl', 'KeyWord', 'Matches']]
               .groupby(['BaseUrl', 'KeyWord'])['Matches']
               .sum()
               .unstack('KeyWord', fill_value=0))
    dfpivot.to_excel('results.xlsx')
if __name__ == '__main__':
    # Entry point: load the keyword list and the seed URLs, then crawl.
    keywords_lst = getKeyWords('keyword.txt')
    urls_lst = getUrls('All investors.csv')
    webCrawler(keywords_lst, urls_lst)