Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
payalchandak committed Apr 24, 2022
1 parent bd40082 commit 9330ab6
Show file tree
Hide file tree
Showing 30 changed files with 19,054 additions and 0 deletions.
2,667 changes: 2,667 additions & 0 deletions case_study/autism.ipynb

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions datasets/feature_extraction/disease/mayo/diseases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
import scrapy


class DiseasesSpider(scrapy.Spider):
    """Collect disease names and links from the Mayo Clinic A-Z index."""

    name = 'diseases'
    allowed_domains = ['www.mayoclinic.org']
    start_urls = ['https://www.mayoclinic.org']

    def parse(self, response):
        # Follow each letter of the A-Z disease index to its listing page.
        for anchor in response.xpath("//ol[@class='acces-alpha']/li/a"):
            href = anchor.xpath(".//@href").get()
            yield response.follow(url=href, callback=self.parse_letter)

    def parse_letter(self, response):
        # Emit one {name, link} record per disease listed under this letter.
        for entry in response.xpath("//div[@id='index']/ol/li/a"):
            text_parts = entry.xpath(".//text()").getall()
            href = entry.xpath(".//@href").get()
            yield {
                "name": ' '.join(text_parts),
                "link": 'https://www.mayoclinic.org' + href,
            }
72 changes: 72 additions & 0 deletions datasets/feature_extraction/disease/mayo/mayo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pandas as pd
from bs4 import BeautifulSoup, NavigableString, Tag
import urllib.request
import csv


def parse(soup):
    """Extract the standard content sections from a Mayo Clinic disease page.

    Walks every <h2> in the page; for each recognized section header,
    collects the text of sibling tags up to the next <h2>.

    Parameters:
        soup: a parsed HTML document (BeautifulSoup) for one disease page.

    Returns:
        dict mapping each of the five section names ('Symptoms', 'Causes',
        'Risk factors', 'Complications', 'Prevention') to its newline-joined
        text, or the string 'None' when the page lacks that section, so the
        downstream CSV header is always stable.
    """
    sections = ['Symptoms', 'Causes', 'Risk factors', 'Complications', 'Prevention']
    out = {}
    for header in soup.find_all('h2'):
        nextNode = header
        # An empty <h2> marks the end of the useful article body; stop scanning.
        if len(nextNode.text) == 0:
            break
        if nextNode.text not in sections:
            continue
        key = nextNode.text
        content = []
        while True:
            nextNode = nextNode.nextSibling
            if nextNode is None:
                break
            if isinstance(nextNode, NavigableString):
                continue
            if isinstance(nextNode, Tag):
                if nextNode.name == "h2":
                    # Next section header: this section's content is complete.
                    break
                if nextNode.name == "div":  # skip pop-up window for illustration and alt-text
                    continue
                elif len(nextNode.get_text(strip=True).strip()) != 0:
                    content.append(nextNode.get_text(strip=False).strip())
        out[key] = '\n'.join(content)
    # Fill any sections the page did not have, replacing five copy-pasted
    # `if 'X' not in list(out.keys())` guards from the original.
    for key in sections:
        out.setdefault(key, 'None')
    return out

df = pd.read_csv("diseases.csv")

# Desktop Safari user-agent so the site serves the normal HTML pages.
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'}

# Fetch and parse every disease page, streaming one CSV row per page.
# newline='' is required by the csv module to avoid blank rows on Windows.
with open('diseases_parsed.csv', 'w', encoding='utf8', newline='') as f:
    w = None  # DictWriter is created once, from the first record's keys.
    for i, url in enumerate(df['link'].tolist(), start=1):
        req = urllib.request.Request(url, headers=headers)
        # Close the HTTP response promptly instead of leaking the socket.
        with urllib.request.urlopen(req) as resp:
            soup = BeautifulSoup(resp.read(), 'html.parser')
        mydict = parse(soup)
        if w is None:
            w = csv.DictWriter(f, mydict.keys())
            w.writeheader()
        w.writerow(mydict)
        if i % 100 == 0:
            print('Done: ' + str(i))


parsed = pd.read_csv("diseases_parsed.csv")

# Join the scraped sections back onto the name/link table by row order
# (both frames were produced from the same ordered list of links).
df2 = df.merge(parsed, how='inner', left_index=True, right_index=True)

df2.to_csv('for_jingyi.csv')



62 changes: 62 additions & 0 deletions datasets/feature_extraction/disease/orphanet/orpha.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import scrapy
import logging
from urllib.parse import urljoin

class QuotesSpider(scrapy.Spider):
    """Scrape disease metadata from Orphanet's alphabetical disease index."""

    name = 'disease_web_scrape'
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'RETRY_TIMES': '100'
    }
    start_urls = [
        'https://www.orpha.net/consor/cgi-bin/Disease_Search.php?lng=EN&search=Disease_Search_List'
    ]

    def parse(self, response):
        """Follow each letter of the 'Alphabetical list' to its disease listing."""
        base_url = 'https://www.orpha.net/consor/cgi-bin/'
        for alphabet in response.xpath('//h3[text() = "Alphabetical list"]/following-sibling::div[1]//li/a'):
            list_href = alphabet.xpath('./@href').get()
            yield scrapy.Request(urljoin(base_url, list_href), callback=self.parse_list)

    def parse_list(self, response):
        """Request every disease page on one letter's listing, tagging each
        request with the short name shown in the index."""
        logging.info(("List: " + response.request.url))
        base_url = 'https://www.orpha.net/consor/cgi-bin/'
        for disease in response.xpath('//div[@id = "result-box"]/ul/li/a'):
            disease_name = disease.xpath('./text()').get()
            disease_url = disease.xpath('./@href').get()
            request = scrapy.Request(urljoin(base_url, disease_url), callback=self.parse_content, errback=self.errback)
            request.meta['disease'] = disease_name
            yield request

    def parse_content(self, response):
        """Yield one record of identifiers and descriptive sections per disease page.

        Any field absent from the page comes back as None from .get().
        """
        disease_shortname = response.meta['disease']
        disease_name = response.xpath("//h2[3]/text()").get()
        disease_id = response.xpath("//div[@class = 'idcard artBlock']/h3/text()").get()
        definition = response.xpath('//div[@class = "definition"]/section/p/text()').get()
        prevalence = response.xpath("//ul[@class = 'idData']//em[text()='Prevalence: ']/following-sibling::strong/text()").get()
        UMLS = response.xpath("//ul[@class = 'idData']//em[text()='UMLS: ']/following-sibling::strong/text()").get()
        epidemiology = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Epidemiology')]/following-sibling::section[1]/p/text()").get()
        clinical_description = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Clinical description')]/following-sibling::section[1]/p/text()").get()
        management_and_treatment = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Management and treatment')]/following-sibling::section[1]/p/text()").get()
        yield{
            'disease_shortname': disease_shortname,
            'disease_name': disease_name,
            'disease_id': disease_id,
            'definition': definition,
            'prevalence': prevalence,
            'UMLS': UMLS,
            'epidemiology': epidemiology,
            'clinical_description': clinical_description,
            'management_and_treatment': management_and_treatment
        }

    def errback(self, failure):
        """Log a failed disease request and re-issue it.

        Scrapy passes a twisted Failure here, not a Response; the original
        request (and its meta, carrying the disease name) live on
        failure.request.  The original code read response.meta/response.request
        and built a bare retry request with no meta, which would both drop the
        disease name (KeyError in parse_content) and be discarded by the
        duplicate filter.
        """
        request = failure.request
        logging.info(("ERROR: " + request.meta['disease']))
        # replace() keeps url, callback, errback and meta; dont_filter lets
        # the identical URL through the dupe filter on retry.
        yield request.replace(dont_filter=True)
Loading

0 comments on commit 9330ab6

Please sign in to comment.