-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDoximity-selenium.py
124 lines (100 loc) · 3.34 KB
/
Doximity-selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from selenium import webdriver # powers the browser interaction
from selenium.webdriver.support.ui import Select # selects menu options
from bs4 import BeautifulSoup # to parse HTML
import csv # to write CSV
import pandas as pd # to see CSV
import time
import os
import random
# Column titles for cardi.csv, in the same order as the fields of each
# data row the script writes.
header = [
    'Name', 'Title', 'Hospital', 'Phone', 'State',
    'Tags', 'Summary', 'Skills', 'City', 'Address',
]
# cardi.csv is opened in append mode so earlier results are kept. Only emit
# the header row when the file is missing or still empty; otherwise every
# re-run would insert another header row in the middle of the data.
_needs_header = (
    not os.path.exists("cardi.csv") or os.path.getsize("cardi.csv") == 0
)
# newline="" lets the csv module control line endings itself, which avoids
# blank lines between rows on Windows.
with open("cardi.csv", "a", newline="", encoding="utf-8") as f:
    csv_w_electro = csv.writer(f)
    if _needs_header:
        csv_w_electro.writerow(header)
# Crawl limits: visit at most MAX_PAGES directory pages and at most
# PROFILES_PER_PAGE randomly chosen profile links from each of them.
MAX_PAGES = 1000
PROFILES_PER_PAGE = 15
SITE_ROOT = "https://www.doximity.com"


def _node_text(node):
    """Return the stripped text of a parsed HTML node, or "" when it is None."""
    return node.text.strip() if node is not None else ""


def _child_text(parent, tag):
    """Return the stripped text of the first ``tag`` inside ``parent``, or ""."""
    if parent is None:
        return ""
    return _node_text(parent.find(tag))


def _next_page_url(page_source):
    """Return the absolute URL of the page's "next_page" link, or "" if none."""
    anchor = BeautifulSoup(page_source, "html5lib").find(
        "a", {"class": "next_page"})
    href = anchor.get("href") if anchor is not None else None
    return SITE_ROOT + href if href else ""


def _scrape_profile(page_source):
    """Parse one profile page into a CSV row.

    Returns the row as a list in the same column order as the module-level
    ``header`` list, or None when the profile should not be recorded: only
    profiles that have a phone number and whose tag line contains "cardi"
    (case-insensitive) are kept. Any field missing from the page becomes "".
    """
    soup = BeautifulSoup(page_source, "html5lib")

    name = _node_text(soup.find("span", {"id": "user_full_name"}))
    if name:
        print(name)
    title = _node_text(soup.find("p", {"itemprop": "jobTitle"}))
    city = _node_text(soup.find("span", {"itemprop": "addressLocality"}))
    state = _node_text(soup.find("span", {"itemprop": "addressRegion"}))
    address = _node_text(soup.find("div", {"class": "col-1-2"}))
    phone = _node_text(soup.find("span", {"itemprop": "telephone"}))

    hospital_section = soup.find(
        "section", {"class": "section hospital-info"})
    if hospital_section is None:
        hospitals = ""
    else:
        hospitals = '; '.join(
            span.text.strip()
            for span in hospital_section.findAll("span", {"itemprop": "name"}))

    summary = _child_text(
        soup.find("section", {"class": "section summary-info"}), "ul")
    skills = _child_text(
        soup.find("div", {"class": "section skills-info"}), "ul")
    tags = _child_text(soup.find("div", {"class": "section"}), "p")

    if not phone or "cardi" not in tags.lower():
        return None
    return [name, title, hospitals, phone, state, tags,
            summary, skills, city, address]


# NOTE(review): webdriver.PhantomJS and find_elements_by_css_selector are
# Selenium 3 APIs that were removed in Selenium 4 -- confirm the pinned
# Selenium version before upgrading.
driver = webdriver.PhantomJS()
next_page = "https://www.doximity.com/directory/md/specialty/thoracic-surgery?from_slug=pub%2Fmichael-peter-kaye-md"
try:
    for _ in range(MAX_PAGES):
        driver.get(next_page)
        # Resolve the following page now, before the driver navigates away
        # to the individual profiles.
        next_page = _next_page_url(driver.page_source)

        links = [a.get_attribute('href')
                 for a in driver.find_elements_by_css_selector(
                     "ul.list-4-col a")]
        # Anchors without an href yield None, which cannot be visited.
        links = [href for href in links if href]
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the sample size on short pages.
        links = random.sample(links, min(PROFILES_PER_PAGE, len(links)))

        for link in links:
            driver.get(link)
            row = _scrape_profile(driver.page_source)
            if row is not None:
                # Re-open and close the file for every row so each result
                # is written to disk as soon as it is scraped.
                with open("cardi.csv", "a", newline="",
                          encoding="utf-8") as f:
                    csv.writer(f).writerow(row)
            # Random pause between profile requests.
            time.sleep(random.randint(1, 3))

        if not next_page:
            # No "next_page" link: this was the last directory page.
            break
        time.sleep(random.randint(1, 3))
finally:
    # Always shut the browser down, even when the crawl is interrupted.
    driver.quit()