Commit 9330ab6 (parent bd40082): 30 changed files with 19,054 additions and 0 deletions.
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
import scrapy


class DiseasesSpider(scrapy.Spider):
    """Crawls the Mayo Clinic A-Z disease index and yields name/link pairs."""
    name = 'diseases'
    allowed_domains = ['www.mayoclinic.org']
    start_urls = ['https://www.mayoclinic.org']

    def parse(self, response):
        # Follow each letter of the A-Z index on the landing page.
        letters = response.xpath("//ol[@class='acces-alpha']/li/a")
        for letter in letters:
            link = letter.xpath(".//@href").get()
            yield response.follow(url=link, callback=self.parse_letter)

    def parse_letter(self, response):
        # Each letter page lists diseases as anchors inside div#index.
        diseases = response.xpath("//div[@id='index']/ol/li/a")
        for disease in diseases:
            yield {
                # Anchor text can be split across nodes; join the pieces.
                "name": ' '.join(disease.xpath(".//text()").getall()),
                "link": 'https://www.mayoclinic.org' + disease.xpath(".//@href").get(),
            }
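The commit does not show how this spider is launched. A minimal in-process runner along the lines of the sketch below (the import path of DiseasesSpider is an assumption) would export one CSV row per yielded item into the diseases.csv file that the next script reads.

# Sketch only: assumes DiseasesSpider is importable from wherever this file lives.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # One CSV row per yielded {"name": ..., "link": ...} item.
    "FEEDS": {"diseases.csv": {"format": "csv"}},
})
process.crawl(DiseasesSpider)
process.start()  # blocks until the crawl finishes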
@@ -0,0 +1,72 @@
import csv
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup, NavigableString, Tag

# The page sections this script extracts from each disease article.
SECTIONS = ['Symptoms', 'Causes', 'Risk factors', 'Complications', 'Prevention']


def parse(soup):
    """Collect the text under each relevant <h2> of a Mayo Clinic disease page."""
    out = {}
    for header in soup.find_all('h2'):
        if len(header.text) == 0:
            break
        if header.text not in SECTIONS:
            continue
        key = header.text
        content = []
        nextNode = header
        # Walk the siblings after the heading until the next section begins.
        while True:
            nextNode = nextNode.nextSibling
            if nextNode is None:
                break
            if isinstance(nextNode, NavigableString):
                continue
            if isinstance(nextNode, Tag):
                if nextNode.name == "h2":  # the next section starts here
                    break
                if nextNode.name == "div":  # skip pop-up window for illustration and alt-text
                    continue
                if len(nextNode.get_text(strip=True)) != 0:
                    content.append(nextNode.get_text(strip=False).strip())
        out[key] = '\n'.join(content)
    # Guarantee every expected column exists even when a section is missing.
    for section in SECTIONS:
        out.setdefault(section, 'None')
    return out
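# A quick sanity check of parse() on a toy snippet (HTML invented for
# illustration, not taken from the commit):
#
#     from bs4 import BeautifulSoup
#     snippet = "<h2>Symptoms</h2><p>Fever.</p><p>Cough.</p><h2>Causes</h2><p>A virus.</p>"
#     parse(BeautifulSoup(snippet, 'html.parser'))
#     # {'Symptoms': 'Fever.\nCough.', 'Causes': 'A virus.',
#     #  'Risk factors': 'None', 'Complications': 'None', 'Prevention': 'None'}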
df = pd.read_csv("diseases.csv")

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'}

# newline='' is the documented mode for csv files; build the writer once with
# a fixed field order so every row lines up with the header even when a page
# is missing some sections.
with open('diseases_parsed.csv', 'w', encoding='utf8', newline='') as f:
    w = csv.DictWriter(f, fieldnames=SECTIONS)
    w.writeheader()
    for i, url in enumerate(df['link'].tolist(), start=1):
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        soup = BeautifulSoup(resp.read(), 'html.parser')
        w.writerow(parse(soup))
        if i % 100 == 0:
            print('Done: ' + str(i))

parsed = pd.read_csv("diseases_parsed.csv")

# Pages were fetched in diseases.csv row order, so join the two frames on the index.
df2 = df.merge(parsed, how='inner', left_index=True, right_index=True)

df2.to_csv('for_jingyi.csv')
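As written, the fetch loop aborts on the first network error. A small helper along these lines (the name fetch_with_retries and its parameters are invented for illustration, not part of the commit) is one way to harden it:

import time
import urllib.error
import urllib.request

def fetch_with_retries(url, headers, attempts=3, delay=5.0):
    # Hypothetical helper: retry transient failures a few times per URL.
    for attempt in range(attempts):
        try:
            req = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(req).read()
        except urllib.error.URLError:
            if attempt == attempts - 1:
                raise  # out of retries; surface the error
            time.sleep(delay)  # brief pause before the next attempt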
@@ -0,0 +1,62 @@
import scrapy
import logging
from urllib.parse import urljoin


class QuotesSpider(scrapy.Spider):
    """Crawls the Orpha.net alphabetical index and scrapes each disease page."""
    name = 'disease_web_scrape'
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'RETRY_TIMES': 100,
    }
    start_urls = [
        'https://www.orpha.net/consor/cgi-bin/Disease_Search.php?lng=EN&search=Disease_Search_List'
    ]

    def parse(self, response):
        base_url = 'https://www.orpha.net/consor/cgi-bin/'
        # Follow every letter of the alphabetical list.
        for alphabet in response.xpath('//h3[text() = "Alphabetical list"]/following-sibling::div[1]//li/a'):
            list_href = alphabet.xpath('./@href').get()
            yield scrapy.Request(urljoin(base_url, list_href), callback=self.parse_list)

    def parse_list(self, response):
        logging.info("List: " + response.request.url)
        base_url = 'https://www.orpha.net/consor/cgi-bin/'
        for disease in response.xpath('//div[@id = "result-box"]/ul/li/a'):
            disease_name = disease.xpath('./text()').get()
            disease_url = disease.xpath('./@href').get()
            request = scrapy.Request(urljoin(base_url, disease_url),
                                     callback=self.parse_content, errback=self.errback)
            # Carry the list-page name through to the detail-page callback.
            request.meta['disease'] = disease_name
            yield request

    def parse_content(self, response):
        disease_shortname = response.meta['disease']
        disease_name = response.xpath("//h2[3]/text()").get()
        disease_id = response.xpath("//div[@class = 'idcard artBlock']/h3/text()").get()
        definition = response.xpath('//div[@class = "definition"]/section/p/text()').get()
        prevalence = response.xpath("//ul[@class = 'idData']//em[text()='Prevalence: ']/following-sibling::strong/text()").get()
        UMLS = response.xpath("//ul[@class = 'idData']//em[text()='UMLS: ']/following-sibling::strong/text()").get()
        epidemiology = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Epidemiology')]/following-sibling::section[1]/p/text()").get()
        clinical_description = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Clinical description')]/following-sibling::section[1]/p/text()").get()
        management_and_treatment = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Management and treatment')]/following-sibling::section[1]/p/text()").get()
        yield {
            'disease_shortname': disease_shortname,
            'disease_name': disease_name,
            'disease_id': disease_id,
            'definition': definition,
            'prevalence': prevalence,
            'UMLS': UMLS,
            'epidemiology': epidemiology,
            'clinical_description': clinical_description,
            'management_and_treatment': management_and_treatment,
        }

    def errback(self, failure):
        # Scrapy calls errbacks with a twisted Failure, not a Response; the
        # failed request (including its meta) is available as failure.request.
        logging.info("ERROR: " + failure.request.meta['disease'])
        # Re-queue the page; dont_filter stops the dupefilter from dropping it.
        yield failure.request.replace(dont_filter=True)
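A sketch for running this spider in-process and spot-checking the result; the feed filename orpha_diseases.jl is an assumption, since the commit does not show how the crawl is launched:

from scrapy.crawler import CrawlerProcess
import pandas as pd

process = CrawlerProcess(settings={
    # One JSON object per line, one line per yielded disease record.
    'FEEDS': {'orpha_diseases.jl': {'format': 'jsonlines'}},
})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl completes

records = pd.read_json('orpha_diseases.jl', lines=True)
print(records.isna().sum())  # fields where the XPath matched nothing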