Commit 9330ab6 (parent bd40082): 30 changed files with 19,054 additions and 0 deletions.
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
import scrapy


class DiseasesSpider(scrapy.Spider):
    """Crawls the Mayo Clinic A-Z disease index and yields name/link pairs."""
    name = 'diseases'
    allowed_domains = ['www.mayoclinic.org']
    start_urls = ['https://www.mayoclinic.org']

    def parse(self, response):
        # Follow each letter of the A-Z index on the landing page.
        letters = response.xpath("//ol[@class='acces-alpha']/li/a")
        for letter in letters:
            link = letter.xpath(".//@href").get()
            yield response.follow(url=link, callback=self.parse_letter)

    def parse_letter(self, response):
        # Each letter page lists diseases as anchors inside div#index.
        diseases = response.xpath("//div[@id='index']/ol/li/a")
        for disease in diseases:
            yield {
                # Anchor text can be split across nodes; join the pieces.
                "name": ' '.join(disease.xpath(".//text()").getall()),
                "link": 'https://www.mayoclinic.org' + disease.xpath(".//@href").get(),
            }
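The commit does not show how this spider is launched. A minimal in-process runner along the lines of the sketch below (the import path of DiseasesSpider is an assumption) would export one CSV row per yielded item into the diseases.csv file that the next script reads.

# Sketch only: assumes DiseasesSpider is importable from wherever this file lives.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # One CSV row per yielded {"name": ..., "link": ...} item.
    "FEEDS": {"diseases.csv": {"format": "csv"}},
})
process.crawl(DiseasesSpider)
process.start()  # blocks until the crawl finishes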
@@ -0,0 +1,72 @@
import csv
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup, NavigableString, Tag

# The page sections this script extracts from each disease article.
SECTIONS = ['Symptoms', 'Causes', 'Risk factors', 'Complications', 'Prevention']


def parse(soup):
    """Collect the text under each relevant <h2> of a Mayo Clinic disease page."""
    out = {}
    for header in soup.find_all('h2'):
        if len(header.text) == 0:
            break
        if header.text not in SECTIONS:
            continue
        key = header.text
        content = []
        nextNode = header
        # Walk the siblings after the heading until the next section begins.
        while True:
            nextNode = nextNode.nextSibling
            if nextNode is None:
                break
            if isinstance(nextNode, NavigableString):
                continue
            if isinstance(nextNode, Tag):
                if nextNode.name == "h2":  # the next section starts here
                    break
                if nextNode.name == "div":  # skip pop-up window for illustration and alt-text
                    continue
                if len(nextNode.get_text(strip=True)) != 0:
                    content.append(nextNode.get_text(strip=False).strip())
        out[key] = '\n'.join(content)
    # Guarantee every expected column exists even when a section is missing.
    for section in SECTIONS:
        out.setdefault(section, 'None')
    return out
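# A quick sanity check of parse() on a toy snippet (HTML invented for
# illustration, not taken from the commit):
#
#     from bs4 import BeautifulSoup
#     snippet = "<h2>Symptoms</h2><p>Fever.</p><p>Cough.</p><h2>Causes</h2><p>A virus.</p>"
#     parse(BeautifulSoup(snippet, 'html.parser'))
#     # {'Symptoms': 'Fever.\nCough.', 'Causes': 'A virus.',
#     #  'Risk factors': 'None', 'Complications': 'None', 'Prevention': 'None'}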
df = pd.read_csv("diseases.csv")

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'}

# newline='' is the documented mode for csv files; build the writer once with
# a fixed field order so every row lines up with the header even when a page
# is missing some sections.
with open('diseases_parsed.csv', 'w', encoding='utf8', newline='') as f:
    w = csv.DictWriter(f, fieldnames=SECTIONS)
    w.writeheader()
    for i, url in enumerate(df['link'].tolist(), start=1):
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        soup = BeautifulSoup(resp.read(), 'html.parser')
        w.writerow(parse(soup))
        if i % 100 == 0:
            print('Done: ' + str(i))

parsed = pd.read_csv("diseases_parsed.csv")

# Pages were fetched in diseases.csv row order, so join the two frames on the index.
df2 = df.merge(parsed, how='inner', left_index=True, right_index=True)

df2.to_csv('for_jingyi.csv')
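As written, the fetch loop aborts on the first network error. A small helper along these lines (the name fetch_with_retries and its parameters are invented for illustration, not part of the commit) is one way to harden it:

import time
import urllib.error
import urllib.request

def fetch_with_retries(url, headers, attempts=3, delay=5.0):
    # Hypothetical helper: retry transient failures a few times per URL.
    for attempt in range(attempts):
        try:
            req = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(req).read()
        except urllib.error.URLError:
            if attempt == attempts - 1:
                raise  # out of retries; surface the error
            time.sleep(delay)  # brief pause before the next attempt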
@@ -0,0 +1,62 @@
import scrapy
import logging
from urllib.parse import urljoin


class QuotesSpider(scrapy.Spider):
    """Crawls the Orpha.net alphabetical index and scrapes each disease page."""
    name = 'disease_web_scrape'
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'RETRY_TIMES': 100,
    }
    start_urls = [
        'https://www.orpha.net/consor/cgi-bin/Disease_Search.php?lng=EN&search=Disease_Search_List'
    ]

    def parse(self, response):
        base_url = 'https://www.orpha.net/consor/cgi-bin/'
        # Follow every letter of the alphabetical list.
        for alphabet in response.xpath('//h3[text() = "Alphabetical list"]/following-sibling::div[1]//li/a'):
            list_href = alphabet.xpath('./@href').get()
            yield scrapy.Request(urljoin(base_url, list_href), callback=self.parse_list)

    def parse_list(self, response):
        logging.info("List: " + response.request.url)
        base_url = 'https://www.orpha.net/consor/cgi-bin/'
        for disease in response.xpath('//div[@id = "result-box"]/ul/li/a'):
            disease_name = disease.xpath('./text()').get()
            disease_url = disease.xpath('./@href').get()
            request = scrapy.Request(urljoin(base_url, disease_url),
                                     callback=self.parse_content, errback=self.errback)
            # Carry the list-page name through to the detail-page callback.
            request.meta['disease'] = disease_name
            yield request

    def parse_content(self, response):
        disease_shortname = response.meta['disease']
        disease_name = response.xpath("//h2[3]/text()").get()
        disease_id = response.xpath("//div[@class = 'idcard artBlock']/h3/text()").get()
        definition = response.xpath('//div[@class = "definition"]/section/p/text()').get()
        prevalence = response.xpath("//ul[@class = 'idData']//em[text()='Prevalence: ']/following-sibling::strong/text()").get()
        UMLS = response.xpath("//ul[@class = 'idData']//em[text()='UMLS: ']/following-sibling::strong/text()").get()
        epidemiology = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Epidemiology')]/following-sibling::section[1]/p/text()").get()
        clinical_description = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Clinical description')]/following-sibling::section[1]/p/text()").get()
        management_and_treatment = response.xpath("//div[@class = 'articleInfo']/h3[contains(text(), 'Management and treatment')]/following-sibling::section[1]/p/text()").get()
        yield {
            'disease_shortname': disease_shortname,
            'disease_name': disease_name,
            'disease_id': disease_id,
            'definition': definition,
            'prevalence': prevalence,
            'UMLS': UMLS,
            'epidemiology': epidemiology,
            'clinical_description': clinical_description,
            'management_and_treatment': management_and_treatment,
        }

    def errback(self, failure):
        # Scrapy calls errbacks with a twisted Failure, not a Response; the
        # failed request (including its meta) is available as failure.request.
        logging.info("ERROR: " + failure.request.meta['disease'])
        # Re-queue the page; dont_filter stops the dupefilter from dropping it.
        yield failure.request.replace(dont_filter=True)
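A sketch for running this spider in-process and spot-checking the result; the feed filename orpha_diseases.jl is an assumption, since the commit does not show how the crawl is launched:

from scrapy.crawler import CrawlerProcess
import pandas as pd

process = CrawlerProcess(settings={
    # One JSON object per line, one line per yielded disease record.
    'FEEDS': {'orpha_diseases.jl': {'format': 'jsonlines'}},
})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl completes

records = pd.read_json('orpha_diseases.jl', lines=True)
print(records.isna().sum())  # fields where the XPath matched nothing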