From 95f93c2f245693dcb8e7fcc6aaaa46f6f5e70f3a Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Mon, 18 May 2020 22:30:14 -0700 Subject: [PATCH 01/39] i think I got series data for the cases --- marin_scraper.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 marin_scraper.py diff --git a/marin_scraper.py b/marin_scraper.py new file mode 100644 index 00000000..06296984 --- /dev/null +++ b/marin_scraper.py @@ -0,0 +1,82 @@ +# is there a way to get historical data +# is there a way to have a script click on the "download csv" files? +#!/usr/bin/env python3 +import csv +import json +import numpy as np + +# build csv parsing functionality first, then scrape the metadata, and then figure out how to download the CSVs (seleniuim) +# throw error when the button isn't correctly pressed +# do a final check +def get_county_data(): + url = 'https://coronavirus.marinhhs.org/surveillance' + case_csv = '/Users/angelakwon/Downloads/data-Eq6Es.csv' + """Main method for populating county data""" + with open('data-covid19-sfbayarea/data-model.json') as template: + model = json.load(template) + + #model['name'] = + #model['update_time'] = + model['source_url'] = url + #model['meta_from_source'] = + # make sure to get the comments below the data + #model['meta_from_baypd'] + model["series"]["cases"] = get_case_series(case_csv) + #cases - new cases for that day, cumul_cases - total number of cases which have occurred + #model["series"]["deaths"] = get_death_series() + #model["series"]["tests"] = get_test_series() + + #print(model) + +def get_case_series(csv_): + #series = [{key:[]} for key in ["date", "cases", "cumul_cases"]] + series = [] + with open(csv_, mode = 'r') as case_csv: + csv_reader = csv.DictReader(case_csv) + csv_headers = list(next(csv_reader).keys()) # TO-DO: Make it work without hard coding the keys + case_history = [] + for row in csv_reader: + # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed + daily = {} + daily["date"] = row["Date"] # TO-DO: need to format the date properly + case_history.append(int(row["Total Cases"])) + daily["cumul_cases"] = row["Total Cases"] + series.append(daily) + + case_history_diff = np.diff(case_history) + case_history_diff = np.insert(case_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually + # is it ok to assume they will be the same value? + for val, case_num in enumerate(case_history_diff): + series[val]["cases"] = case_num + print(series) + # needs to return a list of dictionaries of time series + +#def get_death_series(csv): + + +#def get_test_series(): + + +#def get_case_totals_gender(): + +#def get_case_totals_age(): + +#def get_case_totals_race_eth(): + +#def get_case_totals_category(): + + +#def get_death_totals_gender(): + +#def get_death_totals_age(): + +#def get_death_totals_race_eth(): + +#def get_death_totals_underlying(): + +#def get_death_totals_transmission(): + +# population totals + +get_county_data() +# figure out a way to run the scraper through the command line \ No newline at end of file From 46d23cf51c7543b13220b2c942db2ef74f3359de Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Mon, 25 May 2020 20:55:49 -0700 Subject: [PATCH 02/39] tried a variety of things to download csvs, eventually selected the right element and clicked on the right elt, but nothing downloaded. 
switching methods anyway, so going to scrap this code --- data-model.json | 4 +- marin_scraper.py | 140 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 130 insertions(+), 14 deletions(-) diff --git a/data-model.json b/data-model.json index a9c9f207..d4bd9f1e 100644 --- a/data-model.json +++ b/data-model.json @@ -6,12 +6,12 @@ "meta_from_baypd": "STORE IMPORTANT NOTES ABOUT OUR METHODS HERE", "series": { "cases": [ - { "date": "yyyy-mm-dd", "cases": -1, "cumul_cases": -1}, + { "date": "yyyy-mm-dd", "cases": -1, "cumul_cases": -1 }, { "date": "yyyy-mm-dd", "cases": -1, "cumul_cases": -1 } ], "deaths": [ { "date": "yyyy-mm-dd", "deaths": -1, "cumul_deaths": -1 }, - { "date": "yyyy-mm-dd", "deaths": -1, "cumul_deaths": -1} + { "date": "yyyy-mm-dd", "deaths": -1, "cumul_deaths": -1 } ], "tests": [ { diff --git a/marin_scraper.py b/marin_scraper.py index 06296984..ba6757d8 100644 --- a/marin_scraper.py +++ b/marin_scraper.py @@ -4,15 +4,17 @@ import csv import json import numpy as np +from selenium import webdriver +from bs4 import BeautifulSoup -# build csv parsing functionality first, then scrape the metadata, and then figure out how to download the CSVs (seleniuim) +# GOING TO WORK ON: then scrape the metadata, and then figure out how to download the CSVs (seleniuim) # throw error when the button isn't correctly pressed # do a final check def get_county_data(): url = 'https://coronavirus.marinhhs.org/surveillance' case_csv = '/Users/angelakwon/Downloads/data-Eq6Es.csv' """Main method for populating county data""" - with open('data-covid19-sfbayarea/data-model.json') as template: + with open('/Users/angelakwon/Desktop/data-covid19-sfbayarea/data-model.json') as template: model = json.load(template) #model['name'] = @@ -22,14 +24,102 @@ def get_county_data(): # make sure to get the comments below the data #model['meta_from_baypd'] model["series"]["cases"] = get_case_series(case_csv) - #cases - new cases for that day, cumul_cases - total number of cases which have occurred - #model["series"]["deaths"] = get_death_series() - #model["series"]["tests"] = get_test_series() + model["series"]["deaths"] = get_death_series(case_csv) + #model["series"]["tests"] = get_test_series(case_csv) #print(model) + #print(get_metadata(url)) + print(download_csvs(url)) + +def download_csvs(url): + # div class = "dw-chart-notes" + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + driver.implicitly_wait(30) + driver.get(url) + + iframe_list = driver.find_elements_by_tag_name('iframe') + driver.switch_to.frame(iframe_list[3]) + driver.implicitly_wait(30) + for elt in driver.find_elements_by_tag_name('a'): + if not elt.is_displayed(): + elt.sendkeys(Keys.RETURN) # got the element to be clicked on, but... not downloading anything? + #print(driver.find_elements_by_css_selector('.dw-chart-footer .dw-data-link')) + #.class1 .class2 + #print(driver.find_elements_by_css_selector('div.footer > a')) + + #link = driver.find_element_by_tag_name('a') + #print(driver.find_elements_by_tag_name('a')) # there are two + #print(driver.find_elements_by_css_selector('#datawrapper-chart-Eq6Es a')) # I think this selection is wrong + + #datawrapper-chart-Eq6Es a + + #print(driver.find_elements_by_class_name('dw-data-link')) + #driver.maximize_window() + #print("Element is visible? " + str(driver.find_element_by_tag_name('a').is_displayed())) + + #print("Element is visible? 
" + str(driver.find_element_by_class_name('dw-data-link').is_displayed())) + + #WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a'))) + #ActionChains(driver).move_to_element(link).click(link).perform() + # both of these produce the same kind of error + + + + # so the reason I was getting a null error was because... + # you can't select an iframe using the name? not sure. + + # 1. try to switch to the iframe using the name or id. + # The name is datawrapper-chart-Eq6Es + #return driver.switch_to.frame('datawrapper-chart-Eq6Es') + # returns None + + # 2. Try to switch to the iframe using the frame index + #return driver.switch_to.frame(3) + # returns None + # maybe it's lowkey nested, so let's just get the first frame. + #return driver.switch_to.frame(1) + + # 3. Try XPATH - recommended in a good number of SO posts. + #csv = driver.find_element_by_xpath("/////////////iframe[1]") + # seems too deep to be found by xpath? + #return csv + + + # Then search for the element by class name/tag name. + #driver.find_element(By.TAG_NAME, 'a').click() + + # all csvs have the a tag, and class=dw-data-link + #print(driver.find_element_by_class_name("dw-data-link")) + + # x path is probably not the best way, but let's give it a try + # the click button is within the within an iframe. + # I need to switch to the iframe, and then click, + + + # Then leave the iframe + driver.switch_to.default_content() + return 'Done downloading' + + #print(driver.find_element(By.TAG_NAME, 'button').click()) + + # Clicking on it should be simple + # print(soup.find_all("a", class_ = "dw-data-link")) + + +def get_metadata(url): + # I think I want to return a list + + notes = [] + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + driver.implicitly_wait(30) + driver.get(url) + soup = BeautifulSoup(driver.page_source, 'html5lib') + soup.find_all('p') + # THEN use the getText function + + driver.quit() # does this close the tab? def get_case_series(csv_): - #series = [{key:[]} for key in ["date", "cases", "cumul_cases"]] series = [] with open(csv_, mode = 'r') as case_csv: csv_reader = csv.DictReader(case_csv) @@ -45,16 +135,42 @@ def get_case_series(csv_): case_history_diff = np.diff(case_history) case_history_diff = np.insert(case_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually - # is it ok to assume they will be the same value? for val, case_num in enumerate(case_history_diff): series[val]["cases"] = case_num - print(series) - # needs to return a list of dictionaries of time series - -#def get_death_series(csv): + return series +def get_death_series(csv_): + series = [] + with open(csv_, mode = 'r') as case_csv: + csv_reader = csv.DictReader(case_csv) + csv_headers = list(next(csv_reader).keys()) # TO-DO: Make it work without hard coding the keys + case_history = [] + for row in csv_reader: + # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed + daily = {} + daily["date"] = row["Date"] # TO-DO: need to format the date properly + case_history.append(int(row["Total Deaths"])) + daily["cumul_deaths"] = row["Total Deaths"] + series.append(daily) + + case_history_diff = np.diff(case_history) + case_history_diff = np.insert(case_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually + for val, case_num in enumerate(case_history_diff): + series[val]["deaths"] = case_num # should I change up the order of the keys? 
+ return series #def get_test_series(): + # "date": "yyyy-mm-dd", + # "tests": -1, + # "positive": -1, + # "negative": -1, + # "pending": -1, + # "cumul_tests": -1, + # "cumul_pos": -1, + # "cumul_neg": -1, + # "cumul_pend": -1 + #save the first row as values + #need to keep track of pos and negative, but no values for pending #def get_case_totals_gender(): @@ -74,7 +190,7 @@ def get_case_series(csv_): #def get_death_totals_underlying(): -#def get_death_totals_transmission(): +#def get_death_totals_transmission(): # not sure if this information exists # population totals From e3e23577c9012a65541db5d2c899358ee6a161b0 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Mon, 25 May 2020 21:03:27 -0700 Subject: [PATCH 03/39] added in Rob's suggested code --- marin_scraper.py | 89 ++++++++++++------------------------------------ 1 file changed, 22 insertions(+), 67 deletions(-) diff --git a/marin_scraper.py b/marin_scraper.py index ba6757d8..1bffe12d 100644 --- a/marin_scraper.py +++ b/marin_scraper.py @@ -6,6 +6,7 @@ import numpy as np from selenium import webdriver from bs4 import BeautifulSoup +from urllib.parse import unquote_plus # GOING TO WORK ON: then scrape the metadata, and then figure out how to download the CSVs (seleniuim) # throw error when the button isn't correctly pressed @@ -29,81 +30,35 @@ def get_county_data(): #print(model) #print(get_metadata(url)) - print(download_csvs(url)) + print(extract_csvs(url)) -def download_csvs(url): +def extract_csvs(url): # div class = "dw-chart-notes" - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + # can I leave this blank, will virtual env take care of it? driver.implicitly_wait(30) driver.get(url) - iframe_list = driver.find_elements_by_tag_name('iframe') - driver.switch_to.frame(iframe_list[3]) - driver.implicitly_wait(30) - for elt in driver.find_elements_by_tag_name('a'): - if not elt.is_displayed(): - elt.sendkeys(Keys.RETURN) # got the element to be clicked on, but... not downloading anything? - #print(driver.find_elements_by_css_selector('.dw-chart-footer .dw-data-link')) - #.class1 .class2 - #print(driver.find_elements_by_css_selector('div.footer > a')) - - #link = driver.find_element_by_tag_name('a') - #print(driver.find_elements_by_tag_name('a')) # there are two - #print(driver.find_elements_by_css_selector('#datawrapper-chart-Eq6Es a')) # I think this selection is wrong - - #datawrapper-chart-Eq6Es a - - #print(driver.find_elements_by_class_name('dw-data-link')) - #driver.maximize_window() - #print("Element is visible? " + str(driver.find_element_by_tag_name('a').is_displayed())) - - #print("Element is visible? " + str(driver.find_element_by_class_name('dw-data-link').is_displayed())) - - #WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a'))) - #ActionChains(driver).move_to_element(link).click(link).perform() - # both of these produce the same kind of error - - - - # so the reason I was getting a null error was because... - # you can't select an iframe using the name? not sure. - - # 1. try to switch to the iframe using the name or id. - # The name is datawrapper-chart-Eq6Es - #return driver.switch_to.frame('datawrapper-chart-Eq6Es') - # returns None - - # 2. Try to switch to the iframe using the frame index - #return driver.switch_to.frame(3) - # returns None - # maybe it's lowkey nested, so let's just get the first frame. - #return driver.switch_to.frame(1) - - # 3. 
Try XPATH - recommended in a good number of SO posts. - #csv = driver.find_element_by_xpath("/////////////iframe[1]") - # seems too deep to be found by xpath? - #return csv - - - # Then search for the element by class name/tag name. - #driver.find_element(By.TAG_NAME, 'a').click() - - # all csvs have the a tag, and class=dw-data-link - #print(driver.find_element_by_class_name("dw-data-link")) - - # x path is probably not the best way, but let's give it a try - # the click button is within the within an iframe. - # I need to switch to the iframe, and then click, - + chart_id = 'tyXjV' + frame = driver.find_element_by_css_selector(f'iframe[src^="//datawrapper.dwcdn.net/{chart_id}/"]') + driver.switch_to.frame(frame) + # Grab the raw data out of the link's href attribute + csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() + + # Deal with the data + if csv_data.startswith('data:'): + media, data = csv_data[5:].split(',', 1) + # Will likely always have this kind of data type + if media != 'application/octet-stream;charset=utf-8': + raise ValueError(f'Cannot handle media type "{media}"') + csv_string = unquote_plus(data) + print(csv_string) # Then leave the iframe driver.switch_to.default_content() - return 'Done downloading' - - #print(driver.find_element(By.TAG_NAME, 'button').click()) - - # Clicking on it should be simple - # print(soup.find_all("a", class_ = "dw-data-link")) + #return 'Done downloading' def get_metadata(url): From 9893f63c8dac846ec3adad0f7e9b1ec81624ddf3 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Wed, 27 May 2020 00:26:59 -0700 Subject: [PATCH 04/39] revised csv parsing logic now that I'm working with a csv_string --- marin_scraper.py | 70 ++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/marin_scraper.py b/marin_scraper.py index 1bffe12d..29e3ce00 100644 --- a/marin_scraper.py +++ b/marin_scraper.py @@ -8,57 +8,63 @@ from bs4 import BeautifulSoup from urllib.parse import unquote_plus -# GOING TO WORK ON: then scrape the metadata, and then figure out how to download the CSVs (seleniuim) -# throw error when the button isn't correctly pressed + # do a final check def get_county_data(): url = 'https://coronavirus.marinhhs.org/surveillance' - case_csv = '/Users/angelakwon/Downloads/data-Eq6Es.csv' + #case_csv = '/Users/angelakwon/Downloads/data-Eq6Es.csv' """Main method for populating county data""" with open('/Users/angelakwon/Desktop/data-covid19-sfbayarea/data-model.json') as template: + # TO-DO: Need to change the location here to work for anyone..look at other scrapers model = json.load(template) + csvs = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": None, "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd", "transmission": None} + # There are two separate race_eth csvs, but I think I picked the more comprehensive one. + # The other one has id: 6RXFj + # TO-DO: Do they have + # NOTE: they used to have a pos/neg test csv, but it seems to be gone now. + # also, their graph doesn't show pos/neg. 
+ #model['name'] = #model['update_time'] = model['source_url'] = url #model['meta_from_source'] = # make sure to get the comments below the data #model['meta_from_baypd'] - model["series"]["cases"] = get_case_series(case_csv) - model["series"]["deaths"] = get_death_series(case_csv) + model["series"]["cases"] = get_case_series(csvs["cases"], url) + #model["series"]["deaths"] = get_death_series(csvs["deaths"], url) #model["series"]["tests"] = get_test_series(case_csv) #print(model) #print(get_metadata(url)) - print(extract_csvs(url)) + print(model["series"]["cases"]) -def extract_csvs(url): +def extract_csvs(chart_id, url): # div class = "dw-chart-notes" driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # can I leave this blank, will virtual env take care of it? driver.implicitly_wait(30) driver.get(url) - chart_id = 'tyXjV' frame = driver.find_element_by_css_selector(f'iframe[src^="//datawrapper.dwcdn.net/{chart_id}/"]') driver.switch_to.frame(frame) # Grab the raw data out of the link's href attribute csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') # Switch back to the parent frame to "reset" the context driver.switch_to.parent_frame() - + # Deal with the data if csv_data.startswith('data:'): - media, data = csv_data[5:].split(',', 1) - # Will likely always have this kind of data type - if media != 'application/octet-stream;charset=utf-8': - raise ValueError(f'Cannot handle media type "{media}"') - csv_string = unquote_plus(data) - print(csv_string) + media, data = csv_data[5:].split(',', 1) + # Will likely always have this kind of data type + if media != 'application/octet-stream;charset=utf-8': + raise ValueError(f'Cannot handle media type "{media}"') + csv_string = unquote_plus(data) + # Then leave the iframe driver.switch_to.default_content() - #return 'Done downloading' + return csv_string def get_metadata(url): @@ -74,19 +80,24 @@ def get_metadata(url): driver.quit() # does this close the tab? 
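(Answering the comment just above: driver.quit() does more than close the tab. It closes every window and shuts down the chromedriver process, while driver.close() closes only the current window. quit() is the right teardown for a scraper; a minimal sketch of the usual pattern:

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # ... scrape ...
    finally:
        driver.quit()  # ends the whole session, not just the active tab

The try/finally keeps a failed selector lookup from leaving an orphaned browser process behind.)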
-def get_case_series(csv_): +def get_case_series(chart_id, url): + csv_ = extract_csvs(chart_id, url) series = [] - with open(csv_, mode = 'r') as case_csv: - csv_reader = csv.DictReader(case_csv) - csv_headers = list(next(csv_reader).keys()) # TO-DO: Make it work without hard coding the keys - case_history = [] - for row in csv_reader: - # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed - daily = {} - daily["date"] = row["Date"] # TO-DO: need to format the date properly - case_history.append(int(row["Total Cases"])) - daily["cumul_cases"] = row["Total Cases"] - series.append(daily) + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + #print(keys) + + case_history = [] + + # TO-DO: Double check this in morning, then transfer over logic to deaths + for row in csv_strs[1:]: + # # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed + daily = {} + daily["date"] = row.split(',')[0] # TO-DO: need to format the date properly + case_history.append(int(row.split(',')[1])) + daily["cumul_cases"] = row.split(',')[1] + series.append(daily) case_history_diff = np.diff(case_history) case_history_diff = np.insert(case_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually @@ -94,7 +105,8 @@ def get_case_series(csv_): series[val]["cases"] = case_num return series -def get_death_series(csv_): +def get_death_series(chart_id, url): + csv_ = extract_csvs(chart_id, url) series = [] with open(csv_, mode = 'r') as case_csv: csv_reader = csv.DictReader(case_csv) From a5a66d806853fcb40addc329e0ad9f65119478e3 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Mon, 1 Jun 2020 23:07:59 -0700 Subject: [PATCH 05/39] finished breakdown parsings --- marin_scraper.py | 161 ++++++++++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 58 deletions(-) diff --git a/marin_scraper.py b/marin_scraper.py index 29e3ce00..32b82d70 100644 --- a/marin_scraper.py +++ b/marin_scraper.py @@ -1,5 +1,3 @@ -# is there a way to get historical data -# is there a way to have a script click on the "download csv" files? #!/usr/bin/env python3 import csv import json @@ -7,23 +5,22 @@ from selenium import webdriver from bs4 import BeautifulSoup from urllib.parse import unquote_plus +from datetime import datetime # do a final check def get_county_data(): - url = 'https://coronavirus.marinhhs.org/surveillance' - #case_csv = '/Users/angelakwon/Downloads/data-Eq6Es.csv' """Main method for populating county data""" + + url = 'https://coronavirus.marinhhs.org/surveillance' with open('/Users/angelakwon/Desktop/data-covid19-sfbayarea/data-model.json') as template: - # TO-DO: Need to change the location here to work for anyone..look at other scrapers + # TO-DO: Need to change this to github location model = json.load(template) csvs = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": None, "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd", "transmission": None} - # There are two separate race_eth csvs, but I think I picked the more comprehensive one. - # The other one has id: 6RXFj - # TO-DO: Do they have # NOTE: they used to have a pos/neg test csv, but it seems to be gone now. # also, their graph doesn't show pos/neg. + # population totals and transmission data missing. 
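(For the fields the county does not publish at all, tests at this point, transmission, and population totals, one option is to leave the template's placeholders untouched rather than invent values; the data-model.json shown earlier already uses -1 as its "unknown" sentinel. A sketch of making that explicit, assuming the template keys from data-model.json:

    model["series"]["tests"] = []  # no pos/neg test CSV is currently published
    # transmission and population totals: keep the template's -1 placeholders

Keeping "no data" distinct from a literal zero saves downstream consumers from misreading the feed.)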
#model['name'] = #model['update_time'] = @@ -31,22 +28,24 @@ def get_county_data(): #model['meta_from_source'] = # make sure to get the comments below the data #model['meta_from_baypd'] - model["series"]["cases"] = get_case_series(csvs["cases"], url) + #model["series"]["cases"] = get_case_series(csvs["cases"], url) #model["series"]["deaths"] = get_death_series(csvs["deaths"], url) #model["series"]["tests"] = get_test_series(case_csv) - - #print(model) - #print(get_metadata(url)) - print(model["series"]["cases"]) + #model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(csvs["age"], url) + #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(csvs["gender"], url) + model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(csvs["race_eth"], url) + print(model["case_totals"]["race_eth"], model["death_totals"]["race_eth"]) + def extract_csvs(chart_id, url): - # div class = "dw-chart-notes" - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') - # can I leave this blank, will virtual env take care of it? + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # can I leave this blank, will virtual env take care of it? + driver.implicitly_wait(30) driver.get(url) - frame = driver.find_element_by_css_selector(f'iframe[src^="//datawrapper.dwcdn.net/{chart_id}/"]') + #frame = driver.find_element_by_css_selector(f'iframe[src^="//datawrapper.dwcdn.net/{chart_id}/"]') + # the link changed - now it attaches a random number after the chart id so I needed to change the attribute. + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') driver.switch_to.frame(frame) # Grab the raw data out of the link's href attribute csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') @@ -75,10 +74,10 @@ def get_metadata(url): driver.implicitly_wait(30) driver.get(url) soup = BeautifulSoup(driver.page_source, 'html5lib') - soup.find_all('p') + soup.find_all('p') .getText() # THEN use the getText function - driver.quit() # does this close the tab? 
+ driver.quit() def get_case_series(chart_id, url): csv_ = extract_csvs(chart_id, url) @@ -90,13 +89,15 @@ def get_case_series(chart_id, url): case_history = [] - # TO-DO: Double check this in morning, then transfer over logic to deaths for row in csv_strs[1:]: # # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed daily = {} - daily["date"] = row.split(',')[0] # TO-DO: need to format the date properly + #daily["date"] = row.split(',')[0] + date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + # TO-DO: need to format the date properly case_history.append(int(row.split(',')[1])) - daily["cumul_cases"] = row.split(',')[1] + daily["cumul_cases"] = int(row.split(',')[1]) series.append(daily) case_history_diff = np.diff(case_history) @@ -108,58 +109,102 @@ def get_case_series(chart_id, url): def get_death_series(chart_id, url): csv_ = extract_csvs(chart_id, url) series = [] - with open(csv_, mode = 'r') as case_csv: - csv_reader = csv.DictReader(case_csv) - csv_headers = list(next(csv_reader).keys()) # TO-DO: Make it work without hard coding the keys - case_history = [] - for row in csv_reader: - # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed - daily = {} - daily["date"] = row["Date"] # TO-DO: need to format the date properly - case_history.append(int(row["Total Deaths"])) - daily["cumul_deaths"] = row["Total Deaths"] - series.append(daily) + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + death_history = [] + + for row in csv_strs[1:]: + # # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed + daily = {} + date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + death_history.append(int(row.split(',')[4])) + daily["cumul_deaths"] = int(row.split(',')[4]) + series.append(daily) - case_history_diff = np.diff(case_history) - case_history_diff = np.insert(case_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually - for val, case_num in enumerate(case_history_diff): - series[val]["deaths"] = case_num # should I change up the order of the keys? 
+ death_history_diff = np.diff(death_history) + death_history_diff = np.insert(death_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually + for val, death_num in enumerate(death_history_diff): + series[val]["deaths"] = death_num return series -#def get_test_series(): - # "date": "yyyy-mm-dd", - # "tests": -1, - # "positive": -1, - # "negative": -1, - # "pending": -1, - # "cumul_tests": -1, - # "cumul_pos": -1, - # "cumul_neg": -1, - # "cumul_pend": -1 - #save the first row as values - #need to keep track of pos and negative, but no values for pending +def get_breakdown_age(chart_id, url): + """ Gets breakdown of cases and deaths by age """ + csv_ = extract_csvs(chart_id, url) + c_brkdown = [] + d_brkdown = [] -#def get_case_totals_gender(): + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') # don't know if this is actually needed -#def get_case_totals_age(): + + for row in csv_strs[1:]: + c_age = {} + d_age = {} + c_age["group"] = row.split(',')[0] + c_age["raw_count"] = int(row.split(',')[2]) + d_age["group"] = row.split(',')[0] + d_age["raw_count"] = int(row.split(',')[4]) + c_brkdown.append(c_age) + d_brkdown.append(d_age) + + return c_brkdown, d_brkdown + +def get_breakdown_gender(chart_id, url): + """ Gets breakdown of cases and deaths by gender """ + csv_ = extract_csvs(chart_id, url) -#def get_case_totals_race_eth(): + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') # don't know if this is actually needed -#def get_case_totals_category(): + c_gender = {} + d_gender = {} + + for row in csv_strs[1:]: + split = row.split(',') + gender = split[0].lower() + c_gender[gender] = int(split[2]) + d_gender[gender] = int(split[4]) + # check to see what other scrapers have done with missing data model values + return c_gender, d_gender -#def get_death_totals_gender(): -#def get_death_totals_age(): +def get_breakdown_race_eth(chart_id, url): + csv_ = extract_csvs(chart_id, url) -#def get_death_totals_race_eth(): + csv_strs = csv_.splitlines() + key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", + "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"} + # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race" + + c_race_eth = {} + d_race_eth = {} + + for row in csv_strs[1:]: + split = row.split(',') + race_eth = split[0].lower() + if race_eth not in key_mapping: + print("New race_eth group") + else: + c_race_eth[key_mapping[race_eth]] = int(split[2]) + d_race_eth[key_mapping[race_eth]] = int(split[6]) + # check to see what other scrapers have done with missing data model values + + return c_race_eth, d_race_eth + +#def get_breakdown_transmission(): #def get_death_totals_underlying(): -#def get_death_totals_transmission(): # not sure if this information exists -# population totals +#def get_test_series(): + get_county_data() + + # figure out a way to run the scraper through the command line \ No newline at end of file From a27073f6ae3debfaed5b038121b0f36e42799cef Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Tue, 9 Jun 2020 21:29:04 -0700 Subject: [PATCH 06/39] finalized series and test scraping methods with function annotations and raised exceptions --- marin_scraper.py | 202 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 143 insertions(+), 59 deletions(-) diff --git 
a/marin_scraper.py b/marin_scraper.py index 32b82d70..8b088124 100644 --- a/marin_scraper.py +++ b/marin_scraper.py @@ -2,50 +2,52 @@ import csv import json import numpy as np +from typing import List, Dict, Tuple from selenium import webdriver from bs4 import BeautifulSoup from urllib.parse import unquote_plus from datetime import datetime +import re +#from .utils import get_data_model -# do a final check -def get_county_data(): +def get_county_data() -> Dict: """Main method for populating county data""" url = 'https://coronavirus.marinhhs.org/surveillance' with open('/Users/angelakwon/Desktop/data-covid19-sfbayarea/data-model.json') as template: - # TO-DO: Need to change this to github location model = json.load(template) + #model = get_data_model() - csvs = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": None, "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd", "transmission": None} - # NOTE: they used to have a pos/neg test csv, but it seems to be gone now. - # also, their graph doesn't show pos/neg. + chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} # population totals and transmission data missing. - #model['name'] = - #model['update_time'] = + model['name'] = "Alameda" + model['update_time'] = datetime.today().isoformat() + # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) model['source_url'] = url - #model['meta_from_source'] = - # make sure to get the comments below the data - #model['meta_from_baypd'] - #model["series"]["cases"] = get_case_series(csvs["cases"], url) - #model["series"]["deaths"] = get_death_series(csvs["deaths"], url) - #model["series"]["tests"] = get_test_series(case_csv) - #model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(csvs["age"], url) - #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(csvs["gender"], url) - model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(csvs["race_eth"], url) - print(model["case_totals"]["race_eth"], model["death_totals"]["race_eth"]) + #model['meta_from_source'] = get_metadata(url, chart_ids) + model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) + model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) + #model["series"]["tests"] = get_test_series(case_csv) -def extract_csvs(chart_id, url): - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # can I leave this blank, will virtual env take care of it? 
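(Answering the question in the comment just above: a virtualenv only isolates Python packages, so it will not supply a browser driver. The hard-coded path can still go, though. webdriver.Chrome() with no argument searches PATH for chromedriver, and the third-party webdriver-manager package can download a matching binary at run time. A sketch, assuming webdriver-manager is installed:

    from selenium import webdriver
    from webdriver_manager.chrome import ChromeDriverManager

    driver = webdriver.Chrome(ChromeDriverManager().install())  # no machine-specific path

Either route removes the /Users/angelakwon/... dependency that keeps this script from running on other machines.)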
+ print(model) + +def extract_csvs(chart_id: str, url: str) -> str: + """This method extracts the csv string from the data wrapper charts.""" + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + # need to figure out how to change the webdriver driver.implicitly_wait(30) driver.get(url) - #frame = driver.find_element_by_css_selector(f'iframe[src^="//datawrapper.dwcdn.net/{chart_id}/"]') - # the link changed - now it attaches a random number after the chart id so I needed to change the attribute. frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + driver.switch_to.frame(frame) # Grab the raw data out of the link's href attribute csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') @@ -59,92 +61,130 @@ def extract_csvs(chart_id, url): if media != 'application/octet-stream;charset=utf-8': raise ValueError(f'Cannot handle media type "{media}"') csv_string = unquote_plus(data) - # Then leave the iframe driver.switch_to.default_content() - return csv_string + return csv_string -def get_metadata(url): - # I think I want to return a list - +def get_metadata(url: str, chart_ids: str) -> Tuple: notes = [] - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # change this to point to Github one driver.implicitly_wait(30) driver.get(url) soup = BeautifulSoup(driver.page_source, 'html5lib') - soup.find_all('p') .getText() - # THEN use the getText function + metadata = [] + + to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] + chart_metadata = [] + + for text in to_be_matched: + target = soup.find('h4',text=text) + if not target: + raise ValueError('Cannot handle this header.') + for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag + metadata += sib.text + + + # Metadata for each csv file I pull. There's probably a better way to organize this. + for chart_id in chart_ids.values(): + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + driver.switch_to.frame(frame) + for c in driver.find_elements_by_class_name('dw-chart-notes'): + chart_metadata.append(soup.find('div').getText()) + + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() driver.quit() + # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
+ return metadata, list(set(chart_metadata)) -def get_case_series(chart_id, url): +def get_case_series(chart_id: str, url: str) -> List: + """This method extracts the date, number of cumulative cases, and new cases.""" csv_ = extract_csvs(chart_id, url) series = [] csv_strs = csv_.splitlines() keys = csv_strs[0].split(',') - #print(keys) + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + raise ValueError('The headers have changed') + case_history = [] for row in csv_strs[1:]: - # # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed daily = {} - #daily["date"] = row.split(',')[0] + # Grab the date in the first column date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') daily["date"] = date_time_obj.isoformat() - # TO-DO: need to format the date properly + # Collect the case totals in order to compute the change in cases per day case_history.append(int(row.split(',')[1])) + # Grab the cumulative number in the fifth column daily["cumul_cases"] = int(row.split(',')[1]) series.append(daily) case_history_diff = np.diff(case_history) - case_history_diff = np.insert(case_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually + # there will be no calculated difference for the first day, so adding it in manually + case_history_diff = np.insert(case_history_diff, 0, 0) + # adding the case differences into the series for val, case_num in enumerate(case_history_diff): series[val]["cases"] = case_num return series -def get_death_series(chart_id, url): +def get_death_series(chart_id: str, url: str) -> List: + """This method extracts the date, number of cumulative deaths, and new deaths.""" csv_ = extract_csvs(chart_id, url) series = [] csv_strs = csv_.splitlines() keys = csv_strs[0].split(',') - + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + raise ValueError('The headers have changed.') + death_history = [] for row in csv_strs[1:]: - # # TO-DO: throw an exception if there are more than the expected number of headers, or when order has changed daily = {} + # Grab the date in the first column date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') daily["date"] = date_time_obj.isoformat() + # Collect the death totals in order to compute the change in deaths per day death_history.append(int(row.split(',')[4])) + # Grab the cumulative number in the fifth column daily["cumul_deaths"] = int(row.split(',')[4]) series.append(daily) death_history_diff = np.diff(death_history) - death_history_diff = np.insert(death_history_diff, 0, 0) # there will be no calculated difference for the first day, so adding it in manually + # there will be no calculated difference for the first day, so adding it in manually + death_history_diff = np.insert(death_history_diff, 0, 0) + # adding the case differences into the series for val, death_num in enumerate(death_history_diff): series[val]["deaths"] = death_num return series - -def get_breakdown_age(chart_id, url): - """ Gets breakdown of cases and deaths by age """ +def get_breakdown_age(chart_id: str, url: str) -> Tuple: + """This method gets the breakdown of cases and deaths by age.""" csv_ = extract_csvs(chart_id, url) c_brkdown = [] d_brkdown = [] csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') # don't know if this is actually needed + keys = csv_strs[0].split(',') - + if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 
'Deaths']: + raise ValueError('The headers have changed') + + ages = ['0-18', '19-34', '35-49', '50-64', '65'] for row in csv_strs[1:]: c_age = {} d_age = {} + # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # Each new row has data for a different age group. c_age["group"] = row.split(',')[0] + print(type(c_age["group"])) + if c_age["group"] not in ages: + raise ValueError('The age groups have changed.') c_age["raw_count"] = int(row.split(',')[2]) d_age["group"] = row.split(',')[0] d_age["raw_count"] = int(row.split(',')[4]) @@ -153,30 +193,42 @@ def get_breakdown_age(chart_id, url): return c_brkdown, d_brkdown -def get_breakdown_gender(chart_id, url): - """ Gets breakdown of cases and deaths by gender """ +def get_breakdown_gender(chart_id: str, url: str) -> Tuple: + """This method gets the breakdown of cases and deaths by gender.""" csv_ = extract_csvs(chart_id, url) csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') # don't know if this is actually needed + keys = csv_strs[0].split(',') + if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: + raise ValueError('The headers have changed.') + genders = ['Male', 'Female'] c_gender = {} d_gender = {} for row in csv_strs[1:]: + # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # Each new row has data for a different gender. split = row.split(',') gender = split[0].lower() + if gender not in genders: + return ValueError('The genders have changed.') c_gender[gender] = int(split[2]) d_gender[gender] = int(split[4]) - # check to see what other scrapers have done with missing data model values return c_gender, d_gender +def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: + """This method gets the breakdown of cases and deaths by race/ethnicity.""" -def get_breakdown_race_eth(chart_id, url): csv_ = extract_csvs(chart_id, url) csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: + raise ValueError("The headers have changed.") + key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"} # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race" @@ -188,23 +240,55 @@ def get_breakdown_race_eth(chart_id, url): split = row.split(',') race_eth = split[0].lower() if race_eth not in key_mapping: - print("New race_eth group") + raise ValueError("The race_eth groups have changed.") else: c_race_eth[key_mapping[race_eth]] = int(split[2]) d_race_eth[key_mapping[race_eth]] = int(split[6]) - # check to see what other scrapers have done with missing data model values return c_race_eth, d_race_eth -#def get_breakdown_transmission(): - -#def get_death_totals_underlying(): +def get_test_series(chart_id: str, url: str) -> Tuple: + """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" + csv_ = extract_csvs(chart_id, url) + series = [] -#def get_test_series(): + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + test_history = [] + + # Grab 
the dates, which are in the header + for entry in csv_strs[:1][0].split(',')[1:]: + # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') + daily = {} + date_time_obj = datetime.strptime(entry, '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + series.append(daily) + + # Grab the positive test result numbers, which is in the second row. + # [1:] is included to make sure that 'Positive Tests' is not captured. + p_entries = csv_strs[1:2][0].split(',')[1:] + + # initialize values + cumul_pos = int(p_entries[0]) + series[0]["positive"] = int(p_entries[0]) + series[0]["cumul_pos"] = cumul_pos + index = 1 + + while index < len(series): + # dictionary gets weird + day = series[index] + curr = int(p_entries[index]) + day["positive"] = int(curr) + cumul_pos += curr + day["cumul_pos"] = cumul_pos + index += 1 + + # Grab the negative test result numbers, which is in the third row. + # "negative", "cumul_neg" + + return series get_county_data() - - -# figure out a way to run the scraper through the command line \ No newline at end of file From 30ccf08d6be54f2cbdbea649067bdd55f2305018 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Wed, 10 Jun 2020 16:48:27 -0700 Subject: [PATCH 07/39] fixed the bug so that only chart notes from the charts I'm looking at are pulled, and put in get_data_model function that Elaine wrote --- marin_scraper.py | 73 +++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/marin_scraper.py b/marin_scraper.py index 8b088124..0c52ed44 100644 --- a/marin_scraper.py +++ b/marin_scraper.py @@ -9,24 +9,21 @@ from datetime import datetime import re -#from .utils import get_data_model +from .utils import get_data_model def get_county_data() -> Dict: """Main method for populating county data""" url = 'https://coronavirus.marinhhs.org/surveillance' - with open('/Users/angelakwon/Desktop/data-covid19-sfbayarea/data-model.json') as template: - model = json.load(template) - #model = get_data_model() + model = get_data_model() chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} # population totals and transmission data missing. - - model['name'] = "Alameda" + model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) 
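(Two datetime notes on this block. datetime.today() is naive, so the isoformat string carries no timezone; an explicit UTC stamp is unambiguous. Also, the series dates parsed below are full datetimes, while data-model.json specifies plain "yyyy-mm-dd" strings. A sketch of both fixes, standard library only:

    from datetime import datetime, timezone

    update_time = datetime.now(timezone.utc).isoformat()  # carries a '+00:00' offset
    row_date = datetime.strptime('6/10/2020', '%m/%d/%Y').date().isoformat()  # '2020-06-10'

Taking .date() before .isoformat() is what trims the spurious 'T00:00:00' from the series dates.)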
model['source_url'] = url - #model['meta_from_source'] = get_metadata(url, chart_ids) + model['meta_from_source'] = get_metadata(url, chart_ids) model["series"]["cases"] = get_case_series(chart_ids["cases"], url) model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) model["series"]["tests"] = get_test_series(chart_ids["tests"], url) @@ -34,8 +31,6 @@ def get_county_data() -> Dict: model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) - #model["series"]["tests"] = get_test_series(case_csv) - print(model) def extract_csvs(chart_id: str, url: str) -> str: @@ -83,20 +78,22 @@ def get_metadata(url: str, chart_ids: str) -> Tuple: if not target: raise ValueError('Cannot handle this header.') for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag - metadata += sib.text + # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? + metadata += [sib.text] - - # Metadata for each csv file I pull. There's probably a better way to organize this. + # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. for chart_id in chart_ids.values(): frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') driver.switch_to.frame(frame) + # The metadata for the charts is located in elements with the class `dw-chart-notes' for c in driver.find_elements_by_class_name('dw-chart-notes'): - chart_metadata.append(soup.find('div').getText()) + chart_metadata.append(c.text) # Switch back to the parent frame to "reset" the context driver.switch_to.parent_frame() - + driver.quit() + # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. return metadata, list(set(chart_metadata)) @@ -175,14 +172,13 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed') - ages = ['0-18', '19-34', '35-49', '50-64', '65'] + ages = ['0-18', '19-34', '35-49', '50-64', '65+'] for row in csv_strs[1:]: c_age = {} d_age = {} # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. # Each new row has data for a different age group. c_age["group"] = row.split(',')[0] - print(type(c_age["group"])) if c_age["group"] not in ages: raise ValueError('The age groups have changed.') c_age["raw_count"] = int(row.split(',')[2]) @@ -202,7 +198,7 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple: if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed.') - genders = ['Male', 'Female'] + genders = ['male', 'female'] c_gender = {} d_gender = {} @@ -266,29 +262,42 @@ def get_test_series(chart_id: str, url: str) -> Tuple: daily["date"] = date_time_obj.isoformat() series.append(daily) - + # The slicing makes this if statement hard to look at... there must be a better way? + if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': + raise ValueError('The kinds of tests have changed.') + # Grab the positive test result numbers, which is in the second row. # [1:] is included to make sure that 'Positive Tests' is not captured. 
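(A readability note on the slicing in this function: csv_strs[:1][0] is just csv_strs[0], csv_strs[1:2][0] is csv_strs[1], and csv_strs[2:][0] is csv_strs[2]. The slice-then-index form adds no safety, since both spellings raise IndexError on a too-short list. A minimal equivalent for the two rows grabbed below:

    p_entries = csv_strs[1].split(',')[1:]  # second row: positive-test counts
    n_entries = csv_strs[2].split(',')[1:]  # third row: negative-test counts

The [1:] still drops the row label, as the comment above describes.)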
p_entries = csv_strs[1:2][0].split(',')[1:] + n_entries = csv_strs[2:][0].split(',')[1:] + + get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) + get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) + + return series - # initialize values - cumul_pos = int(p_entries[0]) - series[0]["positive"] = int(p_entries[0]) - series[0]["cumul_pos"] = cumul_pos - index = 1 +def get_test_series_helper(series: list, entries: list, keys: list) -> List: + """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" + + # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. + + # there's probably a more efficient way to do all of this, but I just wasn't sure. + cumul = int(entries[0]) + series[0][keys[0]] = int(entries[0]) + series[0][keys[1]] = cumul + index = 1 while index < len(series): - # dictionary gets weird + # get a particular day day = series[index] - curr = int(p_entries[index]) - day["positive"] = int(curr) - cumul_pos += curr - day["cumul_pos"] = cumul_pos + curr = int(entries[index]) + # get pos/neg test count + day[keys[0]] = int(curr) + # add that day's pos/neg test count to get cumulative number of positive tests + cumul += curr + day[keys[1]] = cumul index += 1 + return series - # Grab the negative test result numbers, which is in the third row. - # "negative", "cumul_neg" - - return series get_county_data() From cb9947c3da2eeef4b9052643ca59d91e29de0a54 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Wed, 10 Jun 2020 16:54:10 -0700 Subject: [PATCH 08/39] moved marin scraper to folder --- covid19_sfbayarea/data/marin_scraper.py | 303 ++++++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 covid19_sfbayarea/data/marin_scraper.py diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin_scraper.py new file mode 100644 index 00000000..0c52ed44 --- /dev/null +++ b/covid19_sfbayarea/data/marin_scraper.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +import csv +import json +import numpy as np +from typing import List, Dict, Tuple +from selenium import webdriver +from bs4 import BeautifulSoup +from urllib.parse import unquote_plus +from datetime import datetime +import re + +from .utils import get_data_model + +def get_county_data() -> Dict: + """Main method for populating county data""" + + url = 'https://coronavirus.marinhhs.org/surveillance' + model = get_data_model() + + chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} + # population totals and transmission data missing. + model['name'] = "Marin County" + model['update_time'] = datetime.today().isoformat() + # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) 
+ model['source_url'] = url + model['meta_from_source'] = get_metadata(url, chart_ids) + model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) + model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) + + print(model) + +def extract_csvs(chart_id: str, url: str) -> str: + """This method extracts the csv string from the data wrapper charts.""" + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') + # need to figure out how to change the webdriver + + driver.implicitly_wait(30) + driver.get(url) + + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + + driver.switch_to.frame(frame) + # Grab the raw data out of the link's href attribute + csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() + + # Deal with the data + if csv_data.startswith('data:'): + media, data = csv_data[5:].split(',', 1) + # Will likely always have this kind of data type + if media != 'application/octet-stream;charset=utf-8': + raise ValueError(f'Cannot handle media type "{media}"') + csv_string = unquote_plus(data) + + # Then leave the iframe + driver.switch_to.default_content() + + return csv_string + +def get_metadata(url: str, chart_ids: str) -> Tuple: + notes = [] + driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # change this to point to Github one + driver.implicitly_wait(30) + driver.get(url) + soup = BeautifulSoup(driver.page_source, 'html5lib') + metadata = [] + + to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] + chart_metadata = [] + + for text in to_be_matched: + target = soup.find('h4',text=text) + if not target: + raise ValueError('Cannot handle this header.') + for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag + # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? + metadata += [sib.text] + + # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. + for chart_id in chart_ids.values(): + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + driver.switch_to.frame(frame) + # The metadata for the charts is located in elements with the class `dw-chart-notes' + for c in driver.find_elements_by_class_name('dw-chart-notes'): + chart_metadata.append(c.text) + + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() + + driver.quit() + + # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
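(One caveat on the set() dedupe below: sets are unordered, so the chart notes can come back in a different order on every run, which makes diffs of the output noisy. An order-preserving equivalent, as a sketch:

    return metadata, list(dict.fromkeys(chart_metadata))  # dedupes, keeps first-seen order

dict.fromkeys preserves insertion order on Python 3.7+.)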
+ return metadata, list(set(chart_metadata)) + +def get_case_series(chart_id: str, url: str) -> List: + """This method extracts the date, number of cumulative cases, and new cases.""" + csv_ = extract_csvs(chart_id, url) + series = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + raise ValueError('The headers have changed') + + case_history = [] + + for row in csv_strs[1:]: + daily = {} + # Grab the date in the first column + date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + # Collect the case totals in order to compute the change in cases per day + case_history.append(int(row.split(',')[1])) + # Grab the cumulative number in the fifth column + daily["cumul_cases"] = int(row.split(',')[1]) + series.append(daily) + + case_history_diff = np.diff(case_history) + # there will be no calculated difference for the first day, so adding it in manually + case_history_diff = np.insert(case_history_diff, 0, 0) + # adding the case differences into the series + for val, case_num in enumerate(case_history_diff): + series[val]["cases"] = case_num + return series + +def get_death_series(chart_id: str, url: str) -> List: + """This method extracts the date, number of cumulative deaths, and new deaths.""" + csv_ = extract_csvs(chart_id, url) + series = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + raise ValueError('The headers have changed.') + + death_history = [] + + for row in csv_strs[1:]: + daily = {} + # Grab the date in the first column + date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + # Collect the death totals in order to compute the change in deaths per day + death_history.append(int(row.split(',')[4])) + # Grab the cumulative number in the fifth column + daily["cumul_deaths"] = int(row.split(',')[4]) + series.append(daily) + + death_history_diff = np.diff(death_history) + # there will be no calculated difference for the first day, so adding it in manually + death_history_diff = np.insert(death_history_diff, 0, 0) + # adding the case differences into the series + for val, death_num in enumerate(death_history_diff): + series[val]["deaths"] = death_num + return series + +def get_breakdown_age(chart_id: str, url: str) -> Tuple: + """This method gets the breakdown of cases and deaths by age.""" + csv_ = extract_csvs(chart_id, url) + c_brkdown = [] + d_brkdown = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: + raise ValueError('The headers have changed') + + ages = ['0-18', '19-34', '35-49', '50-64', '65+'] + for row in csv_strs[1:]: + c_age = {} + d_age = {} + # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # Each new row has data for a different age group. 
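(A general caveat for the row.split(',') parsing used throughout this file: a bare comma split breaks if a field is ever quoted and contains a comma, and this loop also re-splits the same row several times. The csv module, already imported at the top, handles both; a sketch of an equivalent loop:

    for cells in csv.reader(csv_strs[1:]):  # quote-aware, and splits each row once
        c_brkdown.append({"group": cells[0], "raw_count": int(cells[2])})
        d_brkdown.append({"group": cells[0], "raw_count": int(cells[4])})

The age-group validation would slot in unchanged against cells[0].)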
+ c_age["group"] = row.split(',')[0] + if c_age["group"] not in ages: + raise ValueError('The age groups have changed.') + c_age["raw_count"] = int(row.split(',')[2]) + d_age["group"] = row.split(',')[0] + d_age["raw_count"] = int(row.split(',')[4]) + c_brkdown.append(c_age) + d_brkdown.append(d_age) + + return c_brkdown, d_brkdown + +def get_breakdown_gender(chart_id: str, url: str) -> Tuple: + """This method gets the breakdown of cases and deaths by gender.""" + csv_ = extract_csvs(chart_id, url) + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: + raise ValueError('The headers have changed.') + + genders = ['male', 'female'] + c_gender = {} + d_gender = {} + + for row in csv_strs[1:]: + # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # Each new row has data for a different gender. + split = row.split(',') + gender = split[0].lower() + if gender not in genders: + return ValueError('The genders have changed.') + c_gender[gender] = int(split[2]) + d_gender[gender] = int(split[4]) + + return c_gender, d_gender + +def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: + """This method gets the breakdown of cases and deaths by race/ethnicity.""" + + csv_ = extract_csvs(chart_id, url) + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: + raise ValueError("The headers have changed.") + + key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", + "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"} + # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race" + + c_race_eth = {} + d_race_eth = {} + + for row in csv_strs[1:]: + split = row.split(',') + race_eth = split[0].lower() + if race_eth not in key_mapping: + raise ValueError("The race_eth groups have changed.") + else: + c_race_eth[key_mapping[race_eth]] = int(split[2]) + d_race_eth[key_mapping[race_eth]] = int(split[6]) + + return c_race_eth, d_race_eth + +def get_test_series(chart_id: str, url: str) -> Tuple: + """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" + + csv_ = extract_csvs(chart_id, url) + series = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + test_history = [] + + # Grab the dates, which are in the header + for entry in csv_strs[:1][0].split(',')[1:]: + # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') + daily = {} + date_time_obj = datetime.strptime(entry, '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + series.append(daily) + + # The slicing makes this if statement hard to look at... there must be a better way? + if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': + raise ValueError('The kinds of tests have changed.') + + # Grab the positive test result numbers, which is in the second row. + # [1:] is included to make sure that 'Positive Tests' is not captured. 
+ p_entries = csv_strs[1:2][0].split(',')[1:] + n_entries = csv_strs[2:][0].split(',')[1:] + + get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) + get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) + + return series + +def get_test_series_helper(series: list, entries: list, keys: list) -> List: + """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" + + # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. + + # there's probably a more efficient way to do all of this, but I just wasn't sure. + cumul = int(entries[0]) + series[0][keys[0]] = int(entries[0]) + series[0][keys[1]] = cumul + index = 1 + + while index < len(series): + # get a particular day + day = series[index] + curr = int(entries[index]) + # get pos/neg test count + day[keys[0]] = int(curr) + # add that day's pos/neg test count to get cumulative number of positive tests + cumul += curr + day[keys[1]] = cumul + index += 1 + return series + + +get_county_data() From b53c84e9ed6e4cc0c4e9cfdb3cad03248fa84439 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Wed, 10 Jun 2020 17:07:00 -0700 Subject: [PATCH 09/39] deleted extra copy of marin_scraper.py --- marin_scraper.py | 303 ----------------------------------------------- 1 file changed, 303 deletions(-) delete mode 100644 marin_scraper.py diff --git a/marin_scraper.py b/marin_scraper.py deleted file mode 100644 index 0c52ed44..00000000 --- a/marin_scraper.py +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env python3 -import csv -import json -import numpy as np -from typing import List, Dict, Tuple -from selenium import webdriver -from bs4 import BeautifulSoup -from urllib.parse import unquote_plus -from datetime import datetime -import re - -from .utils import get_data_model - -def get_county_data() -> Dict: - """Main method for populating county data""" - - url = 'https://coronavirus.marinhhs.org/surveillance' - model = get_data_model() - - chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} - # population totals and transmission data missing. - model['name'] = "Marin County" - model['update_time'] = datetime.today().isoformat() - # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) 
- model['source_url'] = url - model['meta_from_source'] = get_metadata(url, chart_ids) - model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) - model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) - model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) - - print(model) - -def extract_csvs(chart_id: str, url: str) -> str: - """This method extracts the csv string from the data wrapper charts.""" - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') - # need to figure out how to change the webdriver - - driver.implicitly_wait(30) - driver.get(url) - - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - - driver.switch_to.frame(frame) - # Grab the raw data out of the link's href attribute - csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - # Deal with the data - if csv_data.startswith('data:'): - media, data = csv_data[5:].split(',', 1) - # Will likely always have this kind of data type - if media != 'application/octet-stream;charset=utf-8': - raise ValueError(f'Cannot handle media type "{media}"') - csv_string = unquote_plus(data) - - # Then leave the iframe - driver.switch_to.default_content() - - return csv_string - -def get_metadata(url: str, chart_ids: str) -> Tuple: - notes = [] - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # change this to point to Github one - driver.implicitly_wait(30) - driver.get(url) - soup = BeautifulSoup(driver.page_source, 'html5lib') - metadata = [] - - to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] - chart_metadata = [] - - for text in to_be_matched: - target = soup.find('h4',text=text) - if not target: - raise ValueError('Cannot handle this header.') - for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag - # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? - metadata += [sib.text] - - # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. - for chart_id in chart_ids.values(): - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - driver.switch_to.frame(frame) - # The metadata for the charts is located in elements with the class `dw-chart-notes' - for c in driver.find_elements_by_class_name('dw-chart-notes'): - chart_metadata.append(c.text) - - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - driver.quit() - - # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
- return metadata, list(set(chart_metadata)) - -def get_case_series(chart_id: str, url: str) -> List: - """This method extracts the date, number of cumulative cases, and new cases.""" - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - raise ValueError('The headers have changed') - - case_history = [] - - for row in csv_strs[1:]: - daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the case totals in order to compute the change in cases per day - case_history.append(int(row.split(',')[1])) - # Grab the cumulative number in the fifth column - daily["cumul_cases"] = int(row.split(',')[1]) - series.append(daily) - - case_history_diff = np.diff(case_history) - # there will be no calculated difference for the first day, so adding it in manually - case_history_diff = np.insert(case_history_diff, 0, 0) - # adding the case differences into the series - for val, case_num in enumerate(case_history_diff): - series[val]["cases"] = case_num - return series - -def get_death_series(chart_id: str, url: str) -> List: - """This method extracts the date, number of cumulative deaths, and new deaths.""" - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - raise ValueError('The headers have changed.') - - death_history = [] - - for row in csv_strs[1:]: - daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the death totals in order to compute the change in deaths per day - death_history.append(int(row.split(',')[4])) - # Grab the cumulative number in the fifth column - daily["cumul_deaths"] = int(row.split(',')[4]) - series.append(daily) - - death_history_diff = np.diff(death_history) - # there will be no calculated difference for the first day, so adding it in manually - death_history_diff = np.insert(death_history_diff, 0, 0) - # adding the case differences into the series - for val, death_num in enumerate(death_history_diff): - series[val]["deaths"] = death_num - return series - -def get_breakdown_age(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by age.""" - csv_ = extract_csvs(chart_id, url) - c_brkdown = [] - d_brkdown = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: - raise ValueError('The headers have changed') - - ages = ['0-18', '19-34', '35-49', '50-64', '65+'] - for row in csv_strs[1:]: - c_age = {} - d_age = {} - # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different age group. 
- c_age["group"] = row.split(',')[0] - if c_age["group"] not in ages: - raise ValueError('The age groups have changed.') - c_age["raw_count"] = int(row.split(',')[2]) - d_age["group"] = row.split(',')[0] - d_age["raw_count"] = int(row.split(',')[4]) - c_brkdown.append(c_age) - d_brkdown.append(d_age) - - return c_brkdown, d_brkdown - -def get_breakdown_gender(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by gender.""" - csv_ = extract_csvs(chart_id, url) - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: - raise ValueError('The headers have changed.') - - genders = ['male', 'female'] - c_gender = {} - d_gender = {} - - for row in csv_strs[1:]: - # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different gender. - split = row.split(',') - gender = split[0].lower() - if gender not in genders: - return ValueError('The genders have changed.') - c_gender[gender] = int(split[2]) - d_gender[gender] = int(split[4]) - - return c_gender, d_gender - -def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by race/ethnicity.""" - - csv_ = extract_csvs(chart_id, url) - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: - raise ValueError("The headers have changed.") - - key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", - "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"} - # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race" - - c_race_eth = {} - d_race_eth = {} - - for row in csv_strs[1:]: - split = row.split(',') - race_eth = split[0].lower() - if race_eth not in key_mapping: - raise ValueError("The race_eth groups have changed.") - else: - c_race_eth[key_mapping[race_eth]] = int(split[2]) - d_race_eth[key_mapping[race_eth]] = int(split[6]) - - return c_race_eth, d_race_eth - -def get_test_series(chart_id: str, url: str) -> Tuple: - """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" - - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - test_history = [] - - # Grab the dates, which are in the header - for entry in csv_strs[:1][0].split(',')[1:]: - # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') - daily = {} - date_time_obj = datetime.strptime(entry, '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - series.append(daily) - - # The slicing makes this if statement hard to look at... there must be a better way? - if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': - raise ValueError('The kinds of tests have changed.') - - # Grab the positive test result numbers, which is in the second row. - # [1:] is included to make sure that 'Positive Tests' is not captured. 
- p_entries = csv_strs[1:2][0].split(',')[1:] - n_entries = csv_strs[2:][0].split(',')[1:] - - get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) - get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) - - return series - -def get_test_series_helper(series: list, entries: list, keys: list) -> List: - """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" - - # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. - - # there's probably a more efficient way to do all of this, but I just wasn't sure. - cumul = int(entries[0]) - series[0][keys[0]] = int(entries[0]) - series[0][keys[1]] = cumul - index = 1 - - while index < len(series): - # get a particular day - day = series[index] - curr = int(entries[index]) - # get pos/neg test count - day[keys[0]] = int(curr) - # add that day's pos/neg test count to get cumulative number of positive tests - cumul += curr - day[keys[1]] = cumul - index += 1 - return series - - -get_county_data() From c108ac14184b528a34cb93942a756fa66e1a0e92 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Fri, 19 Jun 2020 23:26:30 -0700 Subject: [PATCH 10/39] converted tab to 4 spaces --- covid19_sfbayarea/data/marin_scraper.py | 524 ++++++++++++------------ 1 file changed, 263 insertions(+), 261 deletions(-) diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin_scraper.py index 0c52ed44..b326dab0 100644 --- a/covid19_sfbayarea/data/marin_scraper.py +++ b/covid19_sfbayarea/data/marin_scraper.py @@ -9,295 +9,297 @@ from datetime import datetime import re +from ..webdriver import get_firefox from .utils import get_data_model +#Can you also please make sure to use 4-space (not tab) indentation, so this is consistent with the other files? def get_county_data() -> Dict: - """Main method for populating county data""" - - url = 'https://coronavirus.marinhhs.org/surveillance' - model = get_data_model() - - chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} - # population totals and transmission data missing. - model['name'] = "Marin County" - model['update_time'] = datetime.today().isoformat() - # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) - model['source_url'] = url - model['meta_from_source'] = get_metadata(url, chart_ids) - model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) - model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) - model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) - - print(model) + """Main method for populating county data""" + + url = 'https://coronavirus.marinhhs.org/surveillance' + model = get_data_model() + + chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} + # population totals and transmission data missing. + model['name'] = "Marin County" + model['update_time'] = datetime.today().isoformat() + # No actual update time on their website? 
They update most charts daily (so the isoformat is only partially correct.) + model['source_url'] = url + model['meta_from_source'] = get_metadata(url, chart_ids) + model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) + model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) + + print(model) def extract_csvs(chart_id: str, url: str) -> str: - """This method extracts the csv string from the data wrapper charts.""" - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') - # need to figure out how to change the webdriver - - driver.implicitly_wait(30) - driver.get(url) - - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - - driver.switch_to.frame(frame) - # Grab the raw data out of the link's href attribute - csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - # Deal with the data - if csv_data.startswith('data:'): - media, data = csv_data[5:].split(',', 1) - # Will likely always have this kind of data type - if media != 'application/octet-stream;charset=utf-8': - raise ValueError(f'Cannot handle media type "{media}"') - csv_string = unquote_plus(data) - - # Then leave the iframe - driver.switch_to.default_content() - - return csv_string + """This method extracts the csv string from the data wrapper charts.""" + driver = get_firefox() + # need to figure out how to change the webdriver + + driver.implicitly_wait(30) + driver.get(url) + + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + + driver.switch_to.frame(frame) + # Grab the raw data out of the link's href attribute + csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() + + # Deal with the data + if csv_data.startswith('data:'): + media, data = csv_data[5:].split(',', 1) + # Will likely always have this kind of data type + if media != 'application/octet-stream;charset=utf-8': + raise ValueError(f'Cannot handle media type "{media}"') + csv_string = unquote_plus(data) + + # Then leave the iframe + driver.switch_to.default_content() + + return csv_string def get_metadata(url: str, chart_ids: str) -> Tuple: - notes = [] - driver = webdriver.Chrome('/Users/angelakwon/Downloads/chromedriver') # change this to point to Github one - driver.implicitly_wait(30) - driver.get(url) - soup = BeautifulSoup(driver.page_source, 'html5lib') - metadata = [] - - to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] - chart_metadata = [] - - for text in to_be_matched: - target = soup.find('h4',text=text) - if not target: - raise ValueError('Cannot handle this header.') - for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag - # 
Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? - metadata += [sib.text] - - # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. - for chart_id in chart_ids.values(): - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - driver.switch_to.frame(frame) - # The metadata for the charts is located in elements with the class `dw-chart-notes' - for c in driver.find_elements_by_class_name('dw-chart-notes'): - chart_metadata.append(c.text) - - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - driver.quit() - - # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. - return metadata, list(set(chart_metadata)) + notes = [] + driver = get_firefox() + driver.implicitly_wait(30) + driver.get(url) + soup = BeautifulSoup(driver.page_source, 'html5lib') + metadata = [] + + to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] + chart_metadata = [] + + for text in to_be_matched: + target = soup.find('h4',text=text) + if not target: + raise ValueError('Cannot handle this header.') + for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag + # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? + metadata += [sib.text] + + # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. + for chart_id in chart_ids.values(): + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + driver.switch_to.frame(frame) + # The metadata for the charts is located in elements with the class `dw-chart-notes' + for c in driver.find_elements_by_class_name('dw-chart-notes'): + chart_metadata.append(c.text) + + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() + + driver.quit() + + # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
+ return metadata, list(set(chart_metadata)) def get_case_series(chart_id: str, url: str) -> List: - """This method extracts the date, number of cumulative cases, and new cases.""" - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - raise ValueError('The headers have changed') - - case_history = [] - - for row in csv_strs[1:]: - daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the case totals in order to compute the change in cases per day - case_history.append(int(row.split(',')[1])) - # Grab the cumulative number in the fifth column - daily["cumul_cases"] = int(row.split(',')[1]) - series.append(daily) - - case_history_diff = np.diff(case_history) - # there will be no calculated difference for the first day, so adding it in manually - case_history_diff = np.insert(case_history_diff, 0, 0) - # adding the case differences into the series - for val, case_num in enumerate(case_history_diff): - series[val]["cases"] = case_num - return series + """This method extracts the date, number of cumulative cases, and new cases.""" + csv_ = extract_csvs(chart_id, url) + series = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + raise ValueError('The headers have changed') + + case_history = [] + + for row in csv_strs[1:]: + daily = {} + # Grab the date in the first column + date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + # Collect the case totals in order to compute the change in cases per day + case_history.append(int(row.split(',')[1])) + # Grab the cumulative number in the fifth column + daily["cumul_cases"] = int(row.split(',')[1]) + series.append(daily) + + case_history_diff = np.diff(case_history) + # there will be no calculated difference for the first day, so adding it in manually + case_history_diff = np.insert(case_history_diff, 0, 0) + # adding the case differences into the series + for val, case_num in enumerate(case_history_diff): + series[val]["cases"] = case_num + return series def get_death_series(chart_id: str, url: str) -> List: - """This method extracts the date, number of cumulative deaths, and new deaths.""" - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - raise ValueError('The headers have changed.') - - death_history = [] - - for row in csv_strs[1:]: - daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the death totals in order to compute the change in deaths per day - death_history.append(int(row.split(',')[4])) - # Grab the cumulative number in the fifth column - daily["cumul_deaths"] = int(row.split(',')[4]) - series.append(daily) - - death_history_diff = np.diff(death_history) - # there will be no calculated difference for the first day, so adding it in manually - death_history_diff = np.insert(death_history_diff, 0, 0) - # adding the case differences into the series - for val, death_num in enumerate(death_history_diff): - 
series[val]["deaths"] = death_num - return series + """This method extracts the date, number of cumulative deaths, and new deaths.""" + csv_ = extract_csvs(chart_id, url) + series = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + raise ValueError('The headers have changed.') + + death_history = [] + + for row in csv_strs[1:]: + daily = {} + # Grab the date in the first column + date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + # Collect the death totals in order to compute the change in deaths per day + death_history.append(int(row.split(',')[4])) + # Grab the cumulative number in the fifth column + daily["cumul_deaths"] = int(row.split(',')[4]) + series.append(daily) + + death_history_diff = np.diff(death_history) + # there will be no calculated difference for the first day, so adding it in manually + death_history_diff = np.insert(death_history_diff, 0, 0) + # adding the case differences into the series + for val, death_num in enumerate(death_history_diff): + series[val]["deaths"] = death_num + return series def get_breakdown_age(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by age.""" - csv_ = extract_csvs(chart_id, url) - c_brkdown = [] - d_brkdown = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: - raise ValueError('The headers have changed') - - ages = ['0-18', '19-34', '35-49', '50-64', '65+'] - for row in csv_strs[1:]: - c_age = {} - d_age = {} - # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different age group. - c_age["group"] = row.split(',')[0] - if c_age["group"] not in ages: - raise ValueError('The age groups have changed.') - c_age["raw_count"] = int(row.split(',')[2]) - d_age["group"] = row.split(',')[0] - d_age["raw_count"] = int(row.split(',')[4]) - c_brkdown.append(c_age) - d_brkdown.append(d_age) - - return c_brkdown, d_brkdown + """This method gets the breakdown of cases and deaths by age.""" + csv_ = extract_csvs(chart_id, url) + c_brkdown = [] + d_brkdown = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: + raise ValueError('The headers have changed') + + ages = ['0-18', '19-34', '35-49', '50-64', '65+'] + for row in csv_strs[1:]: + c_age = {} + d_age = {} + # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # Each new row has data for a different age group. 
+ c_age["group"] = row.split(',')[0] + if c_age["group"] not in ages: + raise ValueError('The age groups have changed.') + c_age["raw_count"] = int(row.split(',')[2]) + d_age["group"] = row.split(',')[0] + d_age["raw_count"] = int(row.split(',')[4]) + c_brkdown.append(c_age) + d_brkdown.append(d_age) + + return c_brkdown, d_brkdown def get_breakdown_gender(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by gender.""" - csv_ = extract_csvs(chart_id, url) - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: - raise ValueError('The headers have changed.') - - genders = ['male', 'female'] - c_gender = {} - d_gender = {} - - for row in csv_strs[1:]: - # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different gender. - split = row.split(',') - gender = split[0].lower() - if gender not in genders: - return ValueError('The genders have changed.') - c_gender[gender] = int(split[2]) - d_gender[gender] = int(split[4]) - - return c_gender, d_gender + """This method gets the breakdown of cases and deaths by gender.""" + csv_ = extract_csvs(chart_id, url) + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: + raise ValueError('The headers have changed.') + + genders = ['male', 'female'] + c_gender = {} + d_gender = {} + + for row in csv_strs[1:]: + # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # Each new row has data for a different gender. + split = row.split(',') + gender = split[0].lower() + if gender not in genders: + return ValueError('The genders have changed.') + c_gender[gender] = int(split[2]) + d_gender[gender] = int(split[4]) + + return c_gender, d_gender def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by race/ethnicity.""" + """This method gets the breakdown of cases and deaths by race/ethnicity.""" - csv_ = extract_csvs(chart_id, url) + csv_ = extract_csvs(chart_id, url) - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: - raise ValueError("The headers have changed.") + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: + raise ValueError("The headers have changed.") - key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", + key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"} # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race" - c_race_eth = {} - d_race_eth = {} - - for row in csv_strs[1:]: - split = row.split(',') - race_eth = split[0].lower() - if race_eth not in key_mapping: - raise ValueError("The race_eth groups have changed.") - else: - 
c_race_eth[key_mapping[race_eth]] = int(split[2]) - d_race_eth[key_mapping[race_eth]] = int(split[6]) + c_race_eth = {} + d_race_eth = {} + + for row in csv_strs[1:]: + split = row.split(',') + race_eth = split[0].lower() + if race_eth not in key_mapping: + raise ValueError("The race_eth groups have changed.") + else: + c_race_eth[key_mapping[race_eth]] = int(split[2]) + d_race_eth[key_mapping[race_eth]] = int(split[6]) - return c_race_eth, d_race_eth + return c_race_eth, d_race_eth def get_test_series(chart_id: str, url: str) -> Tuple: - """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" - - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - test_history = [] - - # Grab the dates, which are in the header - for entry in csv_strs[:1][0].split(',')[1:]: - # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') - daily = {} - date_time_obj = datetime.strptime(entry, '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - series.append(daily) - - # The slicing makes this if statement hard to look at... there must be a better way? - if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': - raise ValueError('The kinds of tests have changed.') - - # Grab the positive test result numbers, which is in the second row. - # [1:] is included to make sure that 'Positive Tests' is not captured. - p_entries = csv_strs[1:2][0].split(',')[1:] - n_entries = csv_strs[2:][0].split(',')[1:] - - get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) - get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) - - return series + """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" + + csv_ = extract_csvs(chart_id, url) + series = [] + + csv_strs = csv_.splitlines() + keys = csv_strs[0].split(',') + + test_history = [] + + # Grab the dates, which are in the header + for entry in csv_strs[:1][0].split(',')[1:]: + # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') + daily = {} + date_time_obj = datetime.strptime(entry, '%m/%d/%Y') + daily["date"] = date_time_obj.isoformat() + series.append(daily) + + # The slicing makes this if statement hard to look at... there must be a better way? + if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': + raise ValueError('The kinds of tests have changed.') + + # Grab the positive test result numbers, which is in the second row. + # [1:] is included to make sure that 'Positive Tests' is not captured. + p_entries = csv_strs[1:2][0].split(',')[1:] + n_entries = csv_strs[2:][0].split(',')[1:] + + get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) + get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) + + return series def get_test_series_helper(series: list, entries: list, keys: list) -> List: - """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" - - # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. - - # there's probably a more efficient way to do all of this, but I just wasn't sure. 
- cumul = int(entries[0]) - series[0][keys[0]] = int(entries[0]) - series[0][keys[1]] = cumul - index = 1 - - while index < len(series): - # get a particular day - day = series[index] - curr = int(entries[index]) - # get pos/neg test count - day[keys[0]] = int(curr) - # add that day's pos/neg test count to get cumulative number of positive tests - cumul += curr - day[keys[1]] = cumul - index += 1 - return series + """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" + + # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. + + # there's probably a more efficient way to do all of this, but I just wasn't sure. + cumul = int(entries[0]) + series[0][keys[0]] = int(entries[0]) + series[0][keys[1]] = cumul + index = 1 + + while index < len(series): + # get a particular day + day = series[index] + curr = int(entries[index]) + # get pos/neg test count + day[keys[0]] = int(curr) + # add that day's pos/neg test count to get cumulative number of positive tests + cumul += curr + day[keys[1]] = cumul + index += 1 + return series get_county_data() From c6a969fb31f9a429d64b0b1c844c7a7db0fca550 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Fri, 19 Jun 2020 23:41:42 -0700 Subject: [PATCH 11/39] raised error for wrong kind of href --- covid19_sfbayarea/data/marin_scraper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin_scraper.py index b326dab0..1f6be284 100644 --- a/covid19_sfbayarea/data/marin_scraper.py +++ b/covid19_sfbayarea/data/marin_scraper.py @@ -11,7 +11,6 @@ from ..webdriver import get_firefox from .utils import get_data_model -#Can you also please make sure to use 4-space (not tab) indentation, so this is consistent with the other files? 
def get_county_data() -> Dict: """Main method for populating county data""" @@ -58,6 +57,8 @@ def extract_csvs(chart_id: str, url: str) -> str: if media != 'application/octet-stream;charset=utf-8': raise ValueError(f'Cannot handle media type "{media}"') csv_string = unquote_plus(data) + else: + raise ValueError('Cannot handle this csv_data href') # Then leave the iframe driver.switch_to.default_content() From 0156f85e62a92c2da61a3df87388b9821a6947ca Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sat, 27 Jun 2020 22:23:49 -0700 Subject: [PATCH 12/39] Update covid19_sfbayarea/data/marin_scraper.py changed method name to get_county() Co-authored-by: Rob Brackett --- covid19_sfbayarea/data/marin_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin_scraper.py index 0c52ed44..c26c75bc 100644 --- a/covid19_sfbayarea/data/marin_scraper.py +++ b/covid19_sfbayarea/data/marin_scraper.py @@ -11,7 +11,7 @@ from .utils import get_data_model -def get_county_data() -> Dict: +def get_county() -> Dict: """Main method for populating county data""" url = 'https://coronavirus.marinhhs.org/surveillance' From 029f36781620f0df752e827e77ceaa7724c01c89 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sat, 27 Jun 2020 22:24:21 -0700 Subject: [PATCH 13/39] changed module name --- covid19_sfbayarea/data/{marin_scraper.py => marin.py} | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) rename covid19_sfbayarea/data/{marin_scraper.py => marin.py} (98%) diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin.py similarity index 98% rename from covid19_sfbayarea/data/marin_scraper.py rename to covid19_sfbayarea/data/marin.py index 1f6be284..29ee317c 100644 --- a/covid19_sfbayarea/data/marin_scraper.py +++ b/covid19_sfbayarea/data/marin.py @@ -9,8 +9,8 @@ from datetime import datetime import re -from ..webdriver import get_firefox -from .utils import get_data_model +from .webdriver import get_firefox +from utils import get_data_model def get_county_data() -> Dict: """Main method for populating county data""" @@ -65,7 +65,7 @@ def extract_csvs(chart_id: str, url: str) -> str: return csv_string -def get_metadata(url: str, chart_ids: str) -> Tuple: +def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple: notes = [] driver = get_firefox() driver.implicitly_wait(30) @@ -81,6 +81,7 @@ def get_metadata(url: str, chart_ids: str) -> Tuple: if not target: raise ValueError('Cannot handle this header.') for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag + ### FIXXX ####### # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? 
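+            # (Hedged sketch: if the next tag is always a <p>, target.find_next_sibling('p')
+            # would grab that first paragraph directly and drop the [:1] slice; soup.select('h4 + p')
+            # also works, but it matches a <p> after any <h4>, not just this target.)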
metadata += [sib.text] From a36b2c05ed7571b6d1019aab712dffa5f70b1f3c Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sat, 27 Jun 2020 22:41:08 -0700 Subject: [PATCH 14/39] renamed file, will rename at the end lol --- covid19_sfbayarea/data/{marin.py => marin_scraper.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename covid19_sfbayarea/data/{marin.py => marin_scraper.py} (100%) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin_scraper.py similarity index 100% rename from covid19_sfbayarea/data/marin.py rename to covid19_sfbayarea/data/marin_scraper.py From 0aadb08ca44612c00353e0a10058fcc9a203a6e4 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sat, 27 Jun 2020 22:45:15 -0700 Subject: [PATCH 15/39] renamed county function, added scraper to init file --- covid19_sfbayarea/data/__init__.py | 2 +- covid19_sfbayarea/data/marin_scraper.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/covid19_sfbayarea/data/__init__.py b/covid19_sfbayarea/data/__init__.py index 948e6dc4..4e2dd291 100644 --- a/covid19_sfbayarea/data/__init__.py +++ b/covid19_sfbayarea/data/__init__.py @@ -5,7 +5,7 @@ scrapers: Dict[str, Any] = { 'alameda': alameda, # 'contra_costa': None, - # 'marin': None, + 'marin': marin_scraper, # 'napa': None, 'san_francisco': san_francisco # 'san_mateo': None, diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin_scraper.py index 29ee317c..1bcc9144 100644 --- a/covid19_sfbayarea/data/marin_scraper.py +++ b/covid19_sfbayarea/data/marin_scraper.py @@ -12,7 +12,7 @@ from .webdriver import get_firefox from utils import get_data_model -def get_county_data() -> Dict: +def get_county() -> Dict: """Main method for populating county data""" url = 'https://coronavirus.marinhhs.org/surveillance' @@ -32,7 +32,7 @@ def get_county_data() -> Dict: model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) - print(model) + return model def extract_csvs(chart_id: str, url: str) -> str: """This method extracts the csv string from the data wrapper charts.""" @@ -304,4 +304,4 @@ def get_test_series_helper(series: list, entries: list, keys: list) -> List: return series -get_county_data() +get_county() From 37880a6d0cd2b21381da76387b02bb3afbe41805 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sat, 27 Jun 2020 22:52:49 -0700 Subject: [PATCH 16/39] pls ignore previous renaming commits, this is the actual commit to properly use the scraper script --- covid19_sfbayarea/data/__init__.py | 2 +- covid19_sfbayarea/data/marin.py | 307 +++++++++++++++++++++++++++++ 2 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 covid19_sfbayarea/data/marin.py diff --git a/covid19_sfbayarea/data/__init__.py b/covid19_sfbayarea/data/__init__.py index 4e2dd291..9e18cf0e 100644 --- a/covid19_sfbayarea/data/__init__.py +++ b/covid19_sfbayarea/data/__init__.py @@ -5,7 +5,7 @@ scrapers: Dict[str, Any] = { 'alameda': alameda, # 'contra_costa': None, - 'marin': marin_scraper, + 'marin': marin, # 'napa': None, 'san_francisco': san_francisco # 'san_mateo': None, diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py new file mode 100644 index 00000000..1bcc9144 --- /dev/null +++ b/covid19_sfbayarea/data/marin.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 +import csv +import json +import numpy as np +from typing import List, Dict, Tuple 
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from urllib.parse import unquote_plus
+from datetime import datetime
+import re
+
+from ..webdriver import get_firefox
+from .utils import get_data_model
+
+def get_county() -> Dict:
+    """Main method for populating county data"""
+
+    url = 'https://coronavirus.marinhhs.org/surveillance'
+    model = get_data_model()
+
+    chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"}
+    # population totals and transmission data missing.
+    model['name'] = "Marin County"
+    model['update_time'] = datetime.today().isoformat()
+    # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.)
+    model['source_url'] = url
+    model['meta_from_source'] = get_metadata(url, chart_ids)
+    model["series"]["cases"] = get_case_series(chart_ids["cases"], url)
+    model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url)
+    model["series"]["tests"] = get_test_series(chart_ids["tests"], url)
+    model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url)
+    model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url)
+    model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url)
+
+    return model
+
+def extract_csvs(chart_id: str, url: str) -> str:
+    """This method extracts the csv string from the data wrapper charts."""
+    driver = get_firefox()
+
+    driver.implicitly_wait(30)
+    driver.get(url)
+
+    frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]')
+
+    driver.switch_to.frame(frame)
+    # Grab the raw data out of the link's href attribute
+    csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href')
+    # Switch back to the parent frame to "reset" the context
+    driver.switch_to.parent_frame()
+
+    # Deal with the data
+    if csv_data.startswith('data:'):
+        media, data = csv_data[5:].split(',', 1)
+        # Will likely always have this kind of data type
+        if media != 'application/octet-stream;charset=utf-8':
+            raise ValueError(f'Cannot handle media type "{media}"')
+        csv_string = unquote_plus(data)
+    else:
+        raise ValueError('Cannot handle this csv_data href')
+
+    # Then leave the iframe
+    driver.switch_to.default_content()
+
+    return csv_string
+
+def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple:
+    notes = []
+    driver = get_firefox()
+    driver.implicitly_wait(30)
+    driver.get(url)
+    soup = BeautifulSoup(driver.page_source, 'html5lib')
+    metadata = []
+
+    to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity ']
+    chart_metadata = []
+
+    for text in to_be_matched:
+        target = soup.find('h4', text=text)
+        if not target:
+            raise ValueError('Cannot handle this header.')
+        for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag
+            ### FIXXX #######
+            # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows?
+            metadata += [sib.text]
+
+    # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this.
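+    # (One possible reorganization, sketched only: a small helper that, given a chart id,
+    # switches into its iframe and returns the note texts, so this loop becomes
+    #     chart_metadata = [note for cid in chart_ids.values() for note in chart_notes(driver, cid)]
+    # `chart_notes` is hypothetical; the inline loop below is kept as-is.)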
+    for chart_id in chart_ids.values():
+        frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]')
+        driver.switch_to.frame(frame)
+        # The metadata for the charts is located in elements with the class `dw-chart-notes'
+        for c in driver.find_elements_by_class_name('dw-chart-notes'):
+            chart_metadata.append(c.text)
+
+        # Switch back to the parent frame to "reset" the context
+        driver.switch_to.parent_frame()
+
+    driver.quit()
+
+    # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings.
+    return metadata, list(set(chart_metadata))
+
+def get_case_series(chart_id: str, url: str) -> List:
+    """This method extracts the date, number of cumulative cases, and new cases."""
+    csv_ = extract_csvs(chart_id, url)
+    series = []
+
+    csv_strs = csv_.splitlines()
+    keys = csv_strs[0].split(',')
+
+    if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']:
+        raise ValueError('The headers have changed')
+
+    case_history = []
+
+    for row in csv_strs[1:]:
+        daily = {}
+        # Grab the date in the first column
+        date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y')
+        daily["date"] = date_time_obj.isoformat()
+        # Collect the case totals in order to compute the change in cases per day
+        case_history.append(int(row.split(',')[1]))
+        # Grab the cumulative number in the second column
+        daily["cumul_cases"] = int(row.split(',')[1])
+        series.append(daily)
+
+    case_history_diff = np.diff(case_history)
+    # there will be no calculated difference for the first day, so adding it in manually
+    case_history_diff = np.insert(case_history_diff, 0, 0)
+    # adding the case differences into the series
+    for val, case_num in enumerate(case_history_diff):
+        series[val]["cases"] = case_num
+    return series
+
+def get_death_series(chart_id: str, url: str) -> List:
+    """This method extracts the date, number of cumulative deaths, and new deaths."""
+    csv_ = extract_csvs(chart_id, url)
+    series = []
+
+    csv_strs = csv_.splitlines()
+    keys = csv_strs[0].split(',')
+    if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']:
+        raise ValueError('The headers have changed.')
+
+    death_history = []
+
+    for row in csv_strs[1:]:
+        daily = {}
+        # Grab the date in the first column
+        date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y')
+        daily["date"] = date_time_obj.isoformat()
+        # Collect the death totals in order to compute the change in deaths per day
+        death_history.append(int(row.split(',')[4]))
+        # Grab the cumulative number in the fifth column
+        daily["cumul_deaths"] = int(row.split(',')[4])
+        series.append(daily)
+
+    death_history_diff = np.diff(death_history)
+    # there will be no calculated difference for the first day, so adding it in manually
+    death_history_diff = np.insert(death_history_diff, 0, 0)
+    # adding the death differences into the series
+    for val, death_num in enumerate(death_history_diff):
+        series[val]["deaths"] = death_num
+    return series
+
+def get_breakdown_age(chart_id: str, url: str) -> Tuple:
+    """This method gets the breakdown of cases and deaths by age."""
+    csv_ = extract_csvs(chart_id, url)
+    c_brkdown = []
+    d_brkdown = []
+
+    csv_strs = csv_.splitlines()
+    keys = csv_strs[0].split(',')
+
+    if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']:
+        raise ValueError('The headers have changed')
+
+    ages = ['0-18', '19-34', '35-49', '50-64', '65+']
+    for row in csv_strs[1:]:
+        c_age = {}
+        d_age = {}
+        # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths.
+        # Each new row has data for a different age group.
+        c_age["group"] = row.split(',')[0]
+        if c_age["group"] not in ages:
+            raise ValueError('The age groups have changed.')
+        c_age["raw_count"] = int(row.split(',')[2])
+        d_age["group"] = row.split(',')[0]
+        d_age["raw_count"] = int(row.split(',')[4])
+        c_brkdown.append(c_age)
+        d_brkdown.append(d_age)
+
+    return c_brkdown, d_brkdown
+
+def get_breakdown_gender(chart_id: str, url: str) -> Tuple:
+    """This method gets the breakdown of cases and deaths by gender."""
+    csv_ = extract_csvs(chart_id, url)
+
+    csv_strs = csv_.splitlines()
+    keys = csv_strs[0].split(',')
+    if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']:
+        raise ValueError('The headers have changed.')
+
+    genders = ['male', 'female']
+    c_gender = {}
+    d_gender = {}
+
+    for row in csv_strs[1:]:
+        # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths.
+        # Each new row has data for a different gender.
+        split = row.split(',')
+        gender = split[0].lower()
+        if gender not in genders:
+            raise ValueError('The genders have changed.')
+        c_gender[gender] = int(split[2])
+        d_gender[gender] = int(split[4])
+
+    return c_gender, d_gender
+
+def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple:
+    """This method gets the breakdown of cases and deaths by race/ethnicity."""
+
+    csv_ = extract_csvs(chart_id, url)
+
+    csv_strs = csv_.splitlines()
+    keys = csv_strs[0].split(',')
+
+    if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']:
+        raise ValueError("The headers have changed.")
+
+    key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic",
+        "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"}
+    # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race"
+
+    c_race_eth = {}
+    d_race_eth = {}
+
+    for row in csv_strs[1:]:
+        split = row.split(',')
+        race_eth = split[0].lower()
+        if race_eth not in key_mapping:
+            raise ValueError("The race_eth groups have changed.")
+        else:
+            c_race_eth[key_mapping[race_eth]] = int(split[2])
+            d_race_eth[key_mapping[race_eth]] = int(split[6])
+
+    return c_race_eth, d_race_eth
+
+def get_test_series(chart_id: str, url: str) -> Tuple:
+    """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests."""
+
+    csv_ = extract_csvs(chart_id, url)
+    series = []
+
+    csv_strs = csv_.splitlines()
+    keys = csv_strs[0].split(',')
+
+    test_history = []
+
+    # Grab the dates, which are in the header
+    for entry in csv_strs[:1][0].split(',')[1:]:
+        # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date')
+        daily = {}
+        date_time_obj = datetime.strptime(entry, '%m/%d/%Y')
+        daily["date"] = date_time_obj.isoformat()
+        series.append(daily)
+
+    # The slicing makes this if statement hard to look at... there must be a better way?
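+    # (A hedged tidier sketch, not applied here: split the rows once and compare the
+    # first field directly, e.g.
+    #     rows = [line.split(',') for line in csv_strs]
+    #     if rows[1][0] != 'Positive Tests' or rows[2][0] != 'Negative Tests':
+    #         raise ValueError('The kinds of tests have changed.')
+    # The original slicing is kept below.)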
+ if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': + raise ValueError('The kinds of tests have changed.') + + # Grab the positive test result numbers, which is in the second row. + # [1:] is included to make sure that 'Positive Tests' is not captured. + p_entries = csv_strs[1:2][0].split(',')[1:] + n_entries = csv_strs[2:][0].split(',')[1:] + + get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) + get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) + + return series + +def get_test_series_helper(series: list, entries: list, keys: list) -> List: + """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" + + # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. + + # there's probably a more efficient way to do all of this, but I just wasn't sure. + cumul = int(entries[0]) + series[0][keys[0]] = int(entries[0]) + series[0][keys[1]] = cumul + index = 1 + + while index < len(series): + # get a particular day + day = series[index] + curr = int(entries[index]) + # get pos/neg test count + day[keys[0]] = int(curr) + # add that day's pos/neg test count to get cumulative number of positive tests + cumul += curr + day[keys[1]] = cumul + index += 1 + return series + + +get_county() From 9b9051054e473ff516a9959c871fa432150af83b Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sat, 27 Jun 2020 22:53:29 -0700 Subject: [PATCH 17/39] removing file with the wrong name --- covid19_sfbayarea/data/marin_scraper.py | 307 ------------------------ 1 file changed, 307 deletions(-) delete mode 100644 covid19_sfbayarea/data/marin_scraper.py diff --git a/covid19_sfbayarea/data/marin_scraper.py b/covid19_sfbayarea/data/marin_scraper.py deleted file mode 100644 index 1bcc9144..00000000 --- a/covid19_sfbayarea/data/marin_scraper.py +++ /dev/null @@ -1,307 +0,0 @@ -#!/usr/bin/env python3 -import csv -import json -import numpy as np -from typing import List, Dict, Tuple -from selenium import webdriver -from bs4 import BeautifulSoup -from urllib.parse import unquote_plus -from datetime import datetime -import re - -from .webdriver import get_firefox -from utils import get_data_model - -def get_county() -> Dict: - """Main method for populating county data""" - - url = 'https://coronavirus.marinhhs.org/surveillance' - model = get_data_model() - - chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} - # population totals and transmission data missing. - model['name'] = "Marin County" - model['update_time'] = datetime.today().isoformat() - # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) 
- model['source_url'] = url - model['meta_from_source'] = get_metadata(url, chart_ids) - model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) - model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) - model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) - - return model - -def extract_csvs(chart_id: str, url: str) -> str: - """This method extracts the csv string from the data wrapper charts.""" - driver = get_firefox() - # need to figure out how to change the webdriver - - driver.implicitly_wait(30) - driver.get(url) - - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - - driver.switch_to.frame(frame) - # Grab the raw data out of the link's href attribute - csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - # Deal with the data - if csv_data.startswith('data:'): - media, data = csv_data[5:].split(',', 1) - # Will likely always have this kind of data type - if media != 'application/octet-stream;charset=utf-8': - raise ValueError(f'Cannot handle media type "{media}"') - csv_string = unquote_plus(data) - else: - raise ValueError('Cannot handle this csv_data href') - - # Then leave the iframe - driver.switch_to.default_content() - - return csv_string - -def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple: - notes = [] - driver = get_firefox() - driver.implicitly_wait(30) - driver.get(url) - soup = BeautifulSoup(driver.page_source, 'html5lib') - metadata = [] - - to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] - chart_metadata = [] - - for text in to_be_matched: - target = soup.find('h4',text=text) - if not target: - raise ValueError('Cannot handle this header.') - for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag - ### FIXXX ####### - # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? - metadata += [sib.text] - - # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. - for chart_id in chart_ids.values(): - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - driver.switch_to.frame(frame) - # The metadata for the charts is located in elements with the class `dw-chart-notes' - for c in driver.find_elements_by_class_name('dw-chart-notes'): - chart_metadata.append(c.text) - - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - driver.quit() - - # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
- return metadata, list(set(chart_metadata)) - -def get_case_series(chart_id: str, url: str) -> List: - """This method extracts the date, number of cumulative cases, and new cases.""" - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - raise ValueError('The headers have changed') - - case_history = [] - - for row in csv_strs[1:]: - daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the case totals in order to compute the change in cases per day - case_history.append(int(row.split(',')[1])) - # Grab the cumulative number in the fifth column - daily["cumul_cases"] = int(row.split(',')[1]) - series.append(daily) - - case_history_diff = np.diff(case_history) - # there will be no calculated difference for the first day, so adding it in manually - case_history_diff = np.insert(case_history_diff, 0, 0) - # adding the case differences into the series - for val, case_num in enumerate(case_history_diff): - series[val]["cases"] = case_num - return series - -def get_death_series(chart_id: str, url: str) -> List: - """This method extracts the date, number of cumulative deaths, and new deaths.""" - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - raise ValueError('The headers have changed.') - - death_history = [] - - for row in csv_strs[1:]: - daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the death totals in order to compute the change in deaths per day - death_history.append(int(row.split(',')[4])) - # Grab the cumulative number in the fifth column - daily["cumul_deaths"] = int(row.split(',')[4]) - series.append(daily) - - death_history_diff = np.diff(death_history) - # there will be no calculated difference for the first day, so adding it in manually - death_history_diff = np.insert(death_history_diff, 0, 0) - # adding the case differences into the series - for val, death_num in enumerate(death_history_diff): - series[val]["deaths"] = death_num - return series - -def get_breakdown_age(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by age.""" - csv_ = extract_csvs(chart_id, url) - c_brkdown = [] - d_brkdown = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: - raise ValueError('The headers have changed') - - ages = ['0-18', '19-34', '35-49', '50-64', '65+'] - for row in csv_strs[1:]: - c_age = {} - d_age = {} - # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different age group. 
- c_age["group"] = row.split(',')[0] - if c_age["group"] not in ages: - raise ValueError('The age groups have changed.') - c_age["raw_count"] = int(row.split(',')[2]) - d_age["group"] = row.split(',')[0] - d_age["raw_count"] = int(row.split(',')[4]) - c_brkdown.append(c_age) - d_brkdown.append(d_age) - - return c_brkdown, d_brkdown - -def get_breakdown_gender(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by gender.""" - csv_ = extract_csvs(chart_id, url) - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: - raise ValueError('The headers have changed.') - - genders = ['male', 'female'] - c_gender = {} - d_gender = {} - - for row in csv_strs[1:]: - # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different gender. - split = row.split(',') - gender = split[0].lower() - if gender not in genders: - return ValueError('The genders have changed.') - c_gender[gender] = int(split[2]) - d_gender[gender] = int(split[4]) - - return c_gender, d_gender - -def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: - """This method gets the breakdown of cases and deaths by race/ethnicity.""" - - csv_ = extract_csvs(chart_id, url) - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: - raise ValueError("The headers have changed.") - - key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", - "american indian/alaska native": "Native_Amer", "native hawaiian/pacific islander": "Pacific_Islander", "white": "White", "asian": "Asian", "multi or other race": "Multi or Other Race"} - # "Multiple_Race", "Other" are not separate in this data set - they are one value under "Multi or Other Race" - - c_race_eth = {} - d_race_eth = {} - - for row in csv_strs[1:]: - split = row.split(',') - race_eth = split[0].lower() - if race_eth not in key_mapping: - raise ValueError("The race_eth groups have changed.") - else: - c_race_eth[key_mapping[race_eth]] = int(split[2]) - d_race_eth[key_mapping[race_eth]] = int(split[6]) - - return c_race_eth, d_race_eth - -def get_test_series(chart_id: str, url: str) -> Tuple: - """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" - - csv_ = extract_csvs(chart_id, url) - series = [] - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - - test_history = [] - - # Grab the dates, which are in the header - for entry in csv_strs[:1][0].split(',')[1:]: - # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') - daily = {} - date_time_obj = datetime.strptime(entry, '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - series.append(daily) - - # The slicing makes this if statement hard to look at... there must be a better way? - if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': - raise ValueError('The kinds of tests have changed.') - - # Grab the positive test result numbers, which is in the second row. - # [1:] is included to make sure that 'Positive Tests' is not captured. 
- p_entries = csv_strs[1:2][0].split(',')[1:] - n_entries = csv_strs[2:][0].split(',')[1:] - - get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) - get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) - - return series - -def get_test_series_helper(series: list, entries: list, keys: list) -> List: - """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" - - # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. - - # there's probably a more efficient way to do all of this, but I just wasn't sure. - cumul = int(entries[0]) - series[0][keys[0]] = int(entries[0]) - series[0][keys[1]] = cumul - index = 1 - - while index < len(series): - # get a particular day - day = series[index] - curr = int(entries[index]) - # get pos/neg test count - day[keys[0]] = int(curr) - # add that day's pos/neg test count to get cumulative number of positive tests - cumul += curr - day[keys[1]] = cumul - index += 1 - return series - - -get_county() From 8b2f8b96b49ddec29b4f97082e370fb61ac812cd Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Sun, 28 Jun 2020 23:49:15 -0700 Subject: [PATCH 18/39] added import to init statement --- covid19_sfbayarea/data/__init__.py | 1 + covid19_sfbayarea/data/marin.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/covid19_sfbayarea/data/__init__.py b/covid19_sfbayarea/data/__init__.py index 9e18cf0e..3a347060 100644 --- a/covid19_sfbayarea/data/__init__.py +++ b/covid19_sfbayarea/data/__init__.py @@ -1,6 +1,7 @@ from typing import Dict, Any from . import alameda from . import san_francisco +from . import marin scrapers: Dict[str, Any] = { 'alameda': alameda, diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 1bcc9144..f6118bef 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -22,6 +22,8 @@ def get_county() -> Dict: # population totals and transmission data missing. model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() + model["meta_from_baypd"] = "There's no actual update time on their website. Not all charts are updated daily." + # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) model['source_url'] = url model['meta_from_source'] = get_metadata(url, chart_ids) From 39ce2bba22b666b1bc4020e2ea5d16b94ac47187 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Tue, 30 Jun 2020 23:40:43 -0700 Subject: [PATCH 19/39] used soup.select('h4+p') instead of find_next_sibling + threw error --- covid19_sfbayarea/data/marin.py | 45 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index f6118bef..f060c7fc 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import csv import json -import numpy as np +#import numpy as np from typing import List, Dict, Tuple from selenium import webdriver from bs4 import BeautifulSoup @@ -9,8 +9,8 @@ from datetime import datetime import re -from .webdriver import get_firefox -from utils import get_data_model +from ..webdriver import get_firefox +from .utils import get_data_model def get_county() -> Dict: """Main method for populating county data""" @@ -27,12 +27,12 @@ def get_county() -> Dict: # No actual update time on their website? 
They update most charts daily (so the isoformat is only partially correct.) model['source_url'] = url model['meta_from_source'] = get_metadata(url, chart_ids) - model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) - model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) - model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) + #model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + #model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + #model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) + #model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) return model @@ -50,7 +50,8 @@ def extract_csvs(chart_id: str, url: str) -> str: # Grab the raw data out of the link's href attribute csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() + # I think I can delete this b/c I switch back to the default content below + #driver.switch_to.parent_frame() # Deal with the data if csv_data.startswith('data:'): @@ -76,16 +77,20 @@ def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple: metadata = [] to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] - chart_metadata = [] + + chart_metadata = set() for text in to_be_matched: - target = soup.find('h4',text=text) - if not target: - raise ValueError('Cannot handle this header.') - for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag - ### FIXXX ####### - # Is it more efficient to use something like (soup object).select('h1 + p') to grab the first paragraph that follows? - metadata += [sib.text] + #target = soup.find('h4',text=text) + #if not target: + #raise ValueError('Cannot handle this header.') + if soup.select('h4 + p')[0].text: + metadata += [soup.select('h4 + p')[0].text] + else: + raise ValueError('Location of metadata has changed.') + + #for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag + #metadata += [sib.text] # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. 
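
One caveat on the `soup.select('h4 + p')` approach just above: `select('h4 + p')[0]` always returns the paragraph after the *first* matching h4, so every pass of the loop collects the same text no matter which header in `to_be_matched` is being handled. A sketch of a per-header lookup that keeps the new error handling, assuming each target h4 is directly followed by its paragraph:

    for text in to_be_matched:
        target = soup.find('h4', text=text)
        paragraph = target.find_next_sibling('p') if target else None
        if paragraph is None:
            raise ValueError('Location of metadata has changed.')
        metadata += [paragraph.text]
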
for chart_id in chart_ids.values():
         frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]')
         driver.switch_to.frame(frame)
         # The metadata for the charts is located in elements with the class `dw-chart-notes'
         for c in driver.find_elements_by_class_name('dw-chart-notes'):
-            chart_metadata.append(c.text)
+            chart_metadata.add(c.text)
 
         # Switch back to the parent frame to "reset" the context
         driver.switch_to.parent_frame()
@@ -101,7 +106,7 @@ def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple:
     driver.quit()
 
     # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings.
-    return metadata, list(set(chart_metadata))
+    return metadata, list(chart_metadata)
 
 def get_case_series(chart_id: str, url: str) -> List:

From 7521dfbf99796ef0f4be8e0a75dee737d9c719d1 Mon Sep 17 00:00:00 2001
From: kwonangela7
Date: Tue, 7 Jul 2020 00:16:25 -0700
Subject: [PATCH 20/39] fixed get_case_series to use csv module, not use
 numpy, and use the proper date object.

---
 covid19_sfbayarea/data/marin.py | 58 +++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py
index f060c7fc..7b8854b1 100644
--- a/covid19_sfbayarea/data/marin.py
+++ b/covid19_sfbayarea/data/marin.py
@@ -27,7 +27,7 @@ def get_county() -> Dict:
     # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.)
     model['source_url'] = url
     model['meta_from_source'] = get_metadata(url, chart_ids)
-    #model["series"]["cases"] = get_case_series(chart_ids["cases"], url)
+    model["series"]["cases"] = get_case_series(chart_ids["cases"], url)
     #model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url)
     #model["series"]["tests"] = get_test_series(chart_ids["tests"], url)
     #model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url)
     #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url)
     #model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url)
 
@@ -110,32 +110,55 @@ def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple:
 
 def get_case_series(chart_id: str, url: str) -> List:
     """This method extracts the date, number of cumulative cases, and new cases."""
-    csv_ = extract_csvs(chart_id, url)
+    csv_str = extract_csvs(chart_id, url)
+    csv_reader = csv.DictReader(csv_str.splitlines())
     series = []
 
-    csv_strs = csv_.splitlines()
-    keys = csv_strs[0].split(',')
+    #csv_strs = csv_.splitlines()
+    keys = csv_reader.fieldnames
+    next(csv_reader)
+
+    #keys = csv_strs[0].split(',')
 
     if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']:
         raise ValueError('The headers have changed')
 
     case_history = []
 
-    for row in csv_strs[1:]:
-        daily = {}
-        # Grab the date in the first column
-        date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y')
-        daily["date"] = date_time_obj.isoformat()
-        # Collect the case totals in order to compute the change in cases per day
-        case_history.append(int(row.split(',')[1]))
-        # Grab the cumulative number in the fifth column
-        daily["cumul_cases"] = int(row.split(',')[1])
-        series.append(daily)
-
-    case_history_diff = np.diff(case_history)
-    # there will be no calculated difference for the first day, so adding it in manually
-    case_history_diff = np.insert(case_history_diff, 0, 0)
-    # adding the case differences into the series
+    # for row in csv_strs[1:]:
+    #     daily = {}
+    #     # Grab the date in the first column
+    #     date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y')
+    #     daily["date"] = date_time_obj.isoformat()
+    #     # Collect the case totals in order to compute the change in cases per day
+    #     case_history.append(int(row.split(',')[1]))
+    #     # Grab the cumulative number in the fifth column
+    #     daily["cumul_cases"] = int(row.split(',')[1])
+    #     series.append(daily)
+
+    # case_history_diff = np.diff(case_history)
+    # # there will be no calculated difference for the first day, so adding it in manually
+    # case_history_diff = np.insert(case_history_diff, 0, 0)
+    # # adding the case differences into the series
+    # for val, case_num in enumerate(case_history_diff):
+    # 
series[val]["cases"] = case_num + # return series + + for row in csv_reader: daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the case totals in order to compute the change in cases per day - case_history.append(int(row.split(',')[1])) - # Grab the cumulative number in the fifth column - daily["cumul_cases"] = int(row.split(',')[1]) + date_time_obj = datetime.strptime(row['Date'], '%m/%d/%Y') + daily["date"] = date_time_obj.strftime('%Y-%m-%d') + case_history.append(int(row["Total Cases"])) + daily["cumul_cases"] = int(row["Total Cases"]) series.append(daily) - - case_history_diff = np.diff(case_history) - # there will be no calculated difference for the first day, so adding it in manually - case_history_diff = np.insert(case_history_diff, 0, 0) - # adding the case differences into the series + + case_history_diff = [] + # Since i'm substracting pairwise elements, I don't want to go too far in the array. + for i in range(0, len(case_history)-1): + case_history_diff.append((int(case_history[i+1]) - int(case_history[i])) + int(series[0]["cumul_cases"])) + # from what I've seen, series[0]["cumul_cases"] will be 0, but I shouldn't assume that. + case_history_diff.insert(0, int(series[0]["cumul_cases"])) + for val, case_num in enumerate(case_history_diff): series[val]["cases"] = case_num return series @@ -148,6 +171,7 @@ def get_death_series(chart_id: str, url: str) -> List: csv_strs = csv_.splitlines() keys = csv_strs[0].split(',') if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: + print(keys) raise ValueError('The headers have changed.') death_history = [] From 1e3fcbcb13101a9f1e4656fa42cb770fc585db86 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Wed, 8 Jul 2020 00:19:25 -0700 Subject: [PATCH 21/39] fixed case and deaths series data + breakdown functions to use csv module instead of manual parsing --- covid19_sfbayarea/data/marin.py | 148 +++++++++++++++----------------- 1 file changed, 69 insertions(+), 79 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 7b8854b1..4adf5bdc 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -28,9 +28,9 @@ def get_county() -> Dict: model['source_url'] = url model['meta_from_source'] = get_metadata(url, chart_ids) model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - #model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - #model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) #model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) @@ -49,10 +49,8 @@ def extract_csvs(chart_id: str, url: str) -> str: driver.switch_to.frame(frame) # Grab the raw data out of the link's href attribute csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - # Switch back to the parent frame to "reset" the context - # I think I can delete this b/c I switch back to the default content 
below - #driver.switch_to.parent_frame() - + + # Deal with the data if csv_data.startswith('data:'): media, data = csv_data[5:].split(',', 1) @@ -114,46 +112,24 @@ def get_case_series(chart_id: str, url: str) -> List: csv_reader = csv.DictReader(csv_str.splitlines()) series = [] - #csv_strs = csv_.splitlines() keys = csv_reader.fieldnames - next(csv_reader) - - #keys = csv_strs[0].split(',') if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: raise ValueError('The headers have changed') case_history = [] - # for row in csv_strs[1:]: - # daily = {} - # # Grab the date in the first column - # date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - # daily["date"] = date_time_obj.isoformat() - # # Collect the case totals in order to compute the change in cases per day - # case_history.append(int(row.split(',')[1])) - # # Grab the cumulative number in the fifth column - # daily["cumul_cases"] = int(row.split(',')[1]) - # series.append(daily) - - # case_history_diff = np.diff(case_history) - # # there will be no calculated difference for the first day, so adding it in manually - # case_history_diff = np.insert(case_history_diff, 0, 0) - # # adding the case differences into the series - # for val, case_num in enumerate(case_history_diff): - # series[val]["cases"] = case_num - # return series - for row in csv_reader: daily = {} date_time_obj = datetime.strptime(row['Date'], '%m/%d/%Y') daily["date"] = date_time_obj.strftime('%Y-%m-%d') + # Collect the case totals in order to compute the change in cases per day case_history.append(int(row["Total Cases"])) daily["cumul_cases"] = int(row["Total Cases"]) series.append(daily) case_history_diff = [] - # Since i'm substracting pairwise elements, I don't want to go too far in the array. + # Since i'm substracting pairwise elements, I need to adjust the range so I don't get an off by one error. for i in range(0, len(case_history)-1): case_history_diff.append((int(case_history[i+1]) - int(case_history[i])) + int(series[0]["cumul_cases"])) # from what I've seen, series[0]["cumul_cases"] will be 0, but I shouldn't assume that. 
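
One caution about the loop just above: the `+ int(series[0]["cumul_cases"])` offset is added to every pairwise difference, not only seeded into the first day, so each day's count would be inflated whenever the first cumulative total is nonzero (the comment acknowledges assuming it is zero). A minimal sketch of the plain pairwise form, assuming the chart's first row really marks the start of the series, so day one's new-case count is its cumulative total:

    case_history_diff = [case_history[0]]  # day one has no prior day to subtract
    for prev_total, curr_total in zip(case_history, case_history[1:]):
        case_history_diff.append(curr_total - prev_total)
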
@@ -165,71 +141,78 @@ def get_case_series(chart_id: str, url: str) -> List: def get_death_series(chart_id: str, url: str) -> List: """This method extracts the date, number of cumulative deaths, and new deaths.""" - csv_ = extract_csvs(chart_id, url) + csv_str = extract_csvs(chart_id, url) + csv_reader = csv.DictReader(csv_str.splitlines()) series = [] - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') + keys = csv_reader.fieldnames if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: - print(keys) raise ValueError('The headers have changed.') death_history = [] - for row in csv_strs[1:]: + for row in csv_reader: daily = {} - # Grab the date in the first column - date_time_obj = datetime.strptime(row.split(',')[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - # Collect the death totals in order to compute the change in deaths per day - death_history.append(int(row.split(',')[4])) - # Grab the cumulative number in the fifth column - daily["cumul_deaths"] = int(row.split(',')[4]) + date_time_obj = datetime.strptime(row['Date'], '%m/%d/%Y') + daily["date"] = date_time_obj.strftime('%Y-%m-%d') + # Collect the case totals in order to compute the change in cases per day + death_history.append(int(row["Total Deaths"])) + daily["cumul_deaths"] = int(row["Total Deaths"]) series.append(daily) - - death_history_diff = np.diff(death_history) - # there will be no calculated difference for the first day, so adding it in manually - death_history_diff = np.insert(death_history_diff, 0, 0) - # adding the case differences into the series - for val, death_num in enumerate(death_history_diff): - series[val]["deaths"] = death_num + + death_history_diff = [] + # Since i'm substracting pairwise elements, I need to adjust the range so I don't get an off by one error. + for i in range(0, len(death_history)-1): + death_history_diff.append((int(death_history[i+1]) - int(death_history[i])) + int(series[0]["cumul_deaths"])) + # from what I've seen, series[0]["cumul_cases"] will be 0, but I shouldn't assume that. + death_history_diff.insert(0, int(series[0]["cumul_deaths"])) + + for val, case_num in enumerate(death_history_diff): + series[val]["deaths"] = case_num return series def get_breakdown_age(chart_id: str, url: str) -> Tuple: """This method gets the breakdown of cases and deaths by age.""" - csv_ = extract_csvs(chart_id, url) + csv_str = extract_csvs(chart_id, url) + csv_reader = csv.DictReader(csv_str.splitlines()) c_brkdown = [] d_brkdown = [] - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') + keys = csv_reader.fieldnames if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed') ages = ['0-18', '19-34', '35-49', '50-64', '65+'] - for row in csv_strs[1:]: + new_ages = ['0_to_18', '19_to_34', '35_to_49', '50_to_64', '65_and_older'] + + # TO-DO: should probably make this a key mapping like I do for race_eth + + for row in csv_reader: c_age = {} d_age = {} - # Extracting the age group and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # Each new row has data for a different age group. - c_age["group"] = row.split(',')[0] + # Extracting the age group and the raw count for both cases and deaths. 
+ c_age["group"] = row['Age Category'] if c_age["group"] not in ages: raise ValueError('The age groups have changed.') - c_age["raw_count"] = int(row.split(',')[2]) - d_age["group"] = row.split(',')[0] - d_age["raw_count"] = int(row.split(',')[4]) - c_brkdown.append(c_age) - d_brkdown.append(d_age) + else: + c_age["raw_count"] = int(row["Cases"]) + d_age["group"] = row['Age Category'] + d_age["raw_count"] = int(row["Deaths"]) + c_brkdown.append(c_age) + d_brkdown.append(d_age) + + c_age["group"].keys = new_ages + d_age["group"].keys = new_ages return c_brkdown, d_brkdown def get_breakdown_gender(chart_id: str, url: str) -> Tuple: """This method gets the breakdown of cases and deaths by gender.""" - csv_ = extract_csvs(chart_id, url) + csv_str = extract_csvs(chart_id, url) + csv_reader = csv.DictReader(csv_str.splitlines()) - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') + keys = csv_reader.fieldnames if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed.') @@ -237,25 +220,33 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple: c_gender = {} d_gender = {} - for row in csv_strs[1:]: + for row in csv_reader: # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. # Each new row has data for a different gender. - split = row.split(',') - gender = split[0].lower() + gender = row["Gender"].lower() if gender not in genders: return ValueError('The genders have changed.') - c_gender[gender] = int(split[2]) - d_gender[gender] = int(split[4]) + c_gender[gender] = int(row["Cases"]) + d_gender[gender] = int(row["Deaths"]) + + # for row in csv_strs[1:]: + # # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. + # # Each new row has data for a different gender. 
+ # split = row.split(',') + # gender = split[0].lower() + # if gender not in genders: + # return ValueError('The genders have changed.') + # c_gender[gender] = int(split[2]) + # d_gender[gender] = int(split[4]) return c_gender, d_gender def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: """This method gets the breakdown of cases and deaths by race/ethnicity.""" - csv_ = extract_csvs(chart_id, url) - - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') + csv_str = extract_csvs(chart_id, url) + csv_reader = csv.DictReader(csv_str.splitlines()) + keys = csv_reader.fieldnames if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: raise ValueError("The headers have changed.") @@ -266,15 +257,14 @@ def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: c_race_eth = {} d_race_eth = {} - - for row in csv_strs[1:]: - split = row.split(',') - race_eth = split[0].lower() + + for row in csv_reader: + race_eth = row["Race/Ethnicity"].lower() if race_eth not in key_mapping: raise ValueError("The race_eth groups have changed.") else: - c_race_eth[key_mapping[race_eth]] = int(split[2]) - d_race_eth[key_mapping[race_eth]] = int(split[6]) + c_race_eth[key_mapping[race_eth]] = int(row["Case Count"]) + d_race_eth[key_mapping[race_eth]] = int(row["Death Count"]) return c_race_eth, d_race_eth From 5b04be90c25c85810a0559f08dea74a3c7c72e7d Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Fri, 10 Jul 2020 23:03:11 -0700 Subject: [PATCH 22/39] testing to get the most recent commits on this branch --- covid19_sfbayarea/data/marin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index f060c7fc..e59c7314 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -9,6 +9,7 @@ from datetime import datetime import re +#testing from ..webdriver import get_firefox from .utils import get_data_model From 2ba527302bdcf58cd88435849f0f5b0f7323927c Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Wed, 15 Jul 2020 00:25:10 -0700 Subject: [PATCH 23/39] simplified test logic --- covid19_sfbayarea/data/marin.py | 128 +++++++++++++++++++------------- 1 file changed, 78 insertions(+), 50 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 985790b0..7b3a03a8 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -27,10 +27,10 @@ def get_county() -> Dict: # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) 
model['source_url'] = url - model['meta_from_source'] = get_metadata(url, chart_ids) - model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + #model['meta_from_source'] = get_metadata(url, chart_ids) + #model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + #model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) #model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) @@ -114,6 +114,8 @@ def get_case_series(chart_id: str, url: str) -> List: series = [] keys = csv_reader.fieldnames + + # TO-DO: is it possible to do 112, 113 and 116 with a context manager to reduce amount of code throughout this file? if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: raise ValueError('The headers have changed') @@ -184,8 +186,8 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed') - ages = ['0-18', '19-34', '35-49', '50-64', '65+'] - new_ages = ['0_to_18', '19_to_34', '35_to_49', '50_to_64', '65_and_older'] + ages = ['0-18', '19-34', '35-49', '50-64', '65-79', '80-94', '95+' ] + new_ages = ['0_to_18', '19_to_34', '35_to_49', '50_to_64', '65_to_79', '80_to_94', '95_and_older'] # TO-DO: should probably make this a key mapping like I do for race_eth @@ -195,7 +197,7 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: # Extracting the age group and the raw count for both cases and deaths. c_age["group"] = row['Age Category'] if c_age["group"] not in ages: - raise ValueError('The age groups have changed.') + raise ValueError(str(c_age["group"]) + ' is not in the list of age groups. 
The age groups have changed.') else: c_age["raw_count"] = int(row["Cases"]) d_age["group"] = row['Age Category'] @@ -203,8 +205,9 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: c_brkdown.append(c_age) d_brkdown.append(d_age) - c_age["group"].keys = new_ages - d_age["group"].keys = new_ages + index = 0 + for (c_group, d_group) in zip(c_age, d_age): + return c_group, d_group return c_brkdown, d_brkdown @@ -212,8 +215,8 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple: """This method gets the breakdown of cases and deaths by gender.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) - keys = csv_reader.fieldnames + if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed.') @@ -273,57 +276,82 @@ def get_test_series(chart_id: str, url: str) -> Tuple: """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" csv_ = extract_csvs(chart_id, url) - series = [] - csv_strs = csv_.splitlines() - keys = csv_strs[0].split(',') - + #csv_reader = csv.DictReader(csv_str.splitlines()) + #keys = csv_reader.fieldnames # this table is flipped so, the "keys" are actually the some of the data values (the dates) + test_history = [] - - # Grab the dates, which are in the header - for entry in csv_strs[:1][0].split(',')[1:]: - # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') + keys = [row.split(',')[0] for row in csv_strs] + dates, positives, negatives = [row.split(',')[1:] for row in csv_strs] + # I thought this should be 1: instead of :1 but why is :1 right? + # :1 is not right, but it doesn't give an error + + series = zip(dates, positives, negatives) + test_series = [] + + cumul_pos = 0 + cumul_neg = 0 + for entry in series: daily = {} - date_time_obj = datetime.strptime(entry, '%m/%d/%Y') + date_time_obj = datetime.strptime(entry[0], '%m/%d/%Y') daily["date"] = date_time_obj.isoformat() - series.append(daily) + daily["positive"] = int(entry[1]) + cumul_pos += daily["positive"] + daily["cumul_pos"] = cumul_pos + daily["negative"] = int(entry[2]) + cumul_neg += daily["negative"] + daily["cumul_neg"] = cumul_neg + test_series.append(daily) + return test_series - # The slicing makes this if statement hard to look at... there must be a better way? - if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': - raise ValueError('The kinds of tests have changed.') - # Grab the positive test result numbers, which is in the second row. - # [1:] is included to make sure that 'Positive Tests' is not captured. - p_entries = csv_strs[1:2][0].split(',')[1:] - n_entries = csv_strs[2:][0].split(',')[1:] + ########################## OLD CODE ################################################ - get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) - get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) + # Grab the dates, which are in the header + # for entry in csv_strs[:1][0].split(',')[1:]: + # # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') + # daily = {} + # date_time_obj = datetime.strptime(entry, '%m/%d/%Y') + # daily["date"] = date_time_obj.isoformat() + # series.append(daily) + + # # The slicing makes this if statement hard to look at... there must be a better way? 
+ # if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': + # raise ValueError('The kinds of tests have changed.') + + # # Grab the positive test result numbers, which is in the second row. + # # [1:] is included to make sure that 'Positive Tests' is not captured. + # p_entries = csv_strs[1:2][0].split(',')[1:] + # n_entries = csv_strs[2:][0].split(',')[1:] + + # get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) + # get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) - return series + # return series + ################################################ -def get_test_series_helper(series: list, entries: list, keys: list) -> List: - """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" +# def get_test_series_helper(series: list, entries: list, keys: list) -> List: +# """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" - # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. +# # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. - # there's probably a more efficient way to do all of this, but I just wasn't sure. - cumul = int(entries[0]) - series[0][keys[0]] = int(entries[0]) - series[0][keys[1]] = cumul - index = 1 - - while index < len(series): - # get a particular day - day = series[index] - curr = int(entries[index]) - # get pos/neg test count - day[keys[0]] = int(curr) - # add that day's pos/neg test count to get cumulative number of positive tests - cumul += curr - day[keys[1]] = cumul - index += 1 - return series +# # there's probably a more efficient way to do all of this, but I just wasn't sure. +# cumul = int(entries[0]) +# series[0][keys[0]] = int(entries[0]) +# series[0][keys[1]] = cumul +# index = 1 + +# while index < len(series): +# # get a particular day +# day = series[index] +# curr = int(entries[index]) +# # get pos/neg test count +# day[keys[0]] = int(curr) +# # add that day's pos/neg test count to get cumulative number of positive tests +# cumul += curr +# day[keys[1]] = cumul +# index += 1 +# return series get_county() From d574680ae8bec22dc241a95dd9df4461b52f46a9 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Wed, 15 Jul 2020 18:49:43 -0700 Subject: [PATCH 24/39] fixed testing data logic, fixed age mappings. 
The raw counts for age groups don't match up with the numbers on the spreadsheet I just downloaded but not sure why --- covid19_sfbayarea/data/marin.py | 116 ++++++++------------------------ 1 file changed, 27 insertions(+), 89 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 7b3a03a8..33f7e028 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -1,15 +1,10 @@ #!/usr/bin/env python3 import csv -import json -#import numpy as np from typing import List, Dict, Tuple -from selenium import webdriver from bs4 import BeautifulSoup from urllib.parse import unquote_plus from datetime import datetime -import re - -#testing + from ..webdriver import get_firefox from .utils import get_data_model @@ -30,11 +25,10 @@ def get_county() -> Dict: #model['meta_from_source'] = get_metadata(url, chart_ids) #model["series"]["cases"] = get_case_series(chart_ids["cases"], url) #model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) #model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) - return model def extract_csvs(chart_id: str, url: str) -> str: @@ -68,7 +62,6 @@ def extract_csvs(chart_id: str, url: str) -> str: return csv_string def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple: - notes = [] driver = get_firefox() driver.implicitly_wait(30) driver.get(url) @@ -186,29 +179,23 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed') - ages = ['0-18', '19-34', '35-49', '50-64', '65-79', '80-94', '95+' ] - new_ages = ['0_to_18', '19_to_34', '35_to_49', '50_to_64', '65_to_79', '80_to_94', '95_and_older'] - - # TO-DO: should probably make this a key mapping like I do for race_eth + key_mapping = {"0-18": "0_to_18", "19-34": "19_to_34", "35-49": "35_to_49", "50-64": "50_to_64", "65-79": "65_to_79", "80-94": "80_to_94", "95+": "95_and_older"} for row in csv_reader: c_age = {} d_age = {} # Extracting the age group and the raw count for both cases and deaths. - c_age["group"] = row['Age Category'] - if c_age["group"] not in ages: + c_age["group"], d_age["group"] = row['Age Category'], row['Age Category'] + if c_age["group"] not in key_mapping: raise ValueError(str(c_age["group"]) + ' is not in the list of age groups. 
The age groups have changed.') else: + c_age["group"] = key_mapping[c_age["group"]] c_age["raw_count"] = int(row["Cases"]) - d_age["group"] = row['Age Category'] + d_age["group"] = key_mapping[d_age["group"]] d_age["raw_count"] = int(row["Deaths"]) c_brkdown.append(c_age) d_brkdown.append(d_age) - index = 0 - for (c_group, d_group) in zip(c_age, d_age): - return c_group, d_group - return c_brkdown, d_brkdown def get_breakdown_gender(chart_id: str, url: str) -> Tuple: @@ -252,7 +239,7 @@ def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: csv_reader = csv.DictReader(csv_str.splitlines()) keys = csv_reader.fieldnames - if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Case Count', 'Percent of Cases', 'Hospitalization Count', 'Percent of Hospitalizations', 'Death Count', 'Percent of Deaths']: + if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Cases', 'Case Percent', 'Hospitalizations', 'Hospitalizations Percent', 'Deaths', 'Deaths Percent']: raise ValueError("The headers have changed.") key_mapping = {"black/african american":"African_Amer", "hispanic/latino": "Latinx_or_Hispanic", @@ -267,8 +254,8 @@ def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: if race_eth not in key_mapping: raise ValueError("The race_eth groups have changed.") else: - c_race_eth[key_mapping[race_eth]] = int(row["Case Count"]) - d_race_eth[key_mapping[race_eth]] = int(row["Death Count"]) + c_race_eth[key_mapping[race_eth]] = int(row["Cases"]) + d_race_eth[key_mapping[race_eth]] = int(row["Deaths"]) return c_race_eth, d_race_eth @@ -277,81 +264,32 @@ def get_test_series(chart_id: str, url: str) -> Tuple: csv_ = extract_csvs(chart_id, url) csv_strs = csv_.splitlines() - #csv_reader = csv.DictReader(csv_str.splitlines()) - #keys = csv_reader.fieldnames # this table is flipped so, the "keys" are actually the some of the data values (the dates) - test_history = [] - keys = [row.split(',')[0] for row in csv_strs] dates, positives, negatives = [row.split(',')[1:] for row in csv_strs] - # I thought this should be 1: instead of :1 but why is :1 right? - # :1 is not right, but it doesn't give an error - + # I think this should be 1: instead of :1 series = zip(dates, positives, negatives) + test_series = [] cumul_pos = 0 cumul_neg = 0 for entry in series: daily = {} - date_time_obj = datetime.strptime(entry[0], '%m/%d/%Y') - daily["date"] = date_time_obj.isoformat() - daily["positive"] = int(entry[1]) - cumul_pos += daily["positive"] - daily["cumul_pos"] = cumul_pos - daily["negative"] = int(entry[2]) - cumul_neg += daily["negative"] - daily["cumul_neg"] = cumul_neg - test_series.append(daily) + # I'm not sure why, but I just found out that some of the test series have a 'null' value (in the spot where the number of positive tests is), so I needed to account for that here. + # At least for now, it's only present at the end, so I just break out of the loop and return the test series. 
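
If a 'null' ever showed up mid-series rather than only at the tail, skipping that day would be safer than breaking out of the loop; a sketch, assuming dropped days are acceptable to the data model:

    if 'null' in (entry[1], entry[2]):
        continue  # skip a day with missing counts rather than truncate the series
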
+ if entry[1] != 'null': + date_time_obj = datetime.strptime(entry[0], '%m/%d/%Y') + daily["date"] = date_time_obj.strftime('%Y-%m-%d') + daily["positive"] = int(entry[1]) + cumul_pos += daily["positive"] + daily["negative"] = int(entry[2]) + cumul_neg += daily["negative"] + daily["cumul_pos"] = cumul_pos + daily["cumul_neg"] = cumul_neg + test_series.append(daily) + else: + break + return test_series - - ########################## OLD CODE ################################################ - - # Grab the dates, which are in the header - # for entry in csv_strs[:1][0].split(',')[1:]: - # # need to exclude very first item in the csv_strs[:1][0].split(',') list (which is the value 'Date') - # daily = {} - # date_time_obj = datetime.strptime(entry, '%m/%d/%Y') - # daily["date"] = date_time_obj.isoformat() - # series.append(daily) - - # # The slicing makes this if statement hard to look at... there must be a better way? - # if csv_strs[1:2][0].split(',')[:1][0] != 'Positive Tests' and csv_strs[2:][0].split(',')[:1][0] != 'Negative Tests': - # raise ValueError('The kinds of tests have changed.') - - # # Grab the positive test result numbers, which is in the second row. - # # [1:] is included to make sure that 'Positive Tests' is not captured. - # p_entries = csv_strs[1:2][0].split(',')[1:] - # n_entries = csv_strs[2:][0].split(',')[1:] - - # get_test_series_helper(series, p_entries, ['positive', 'cumul_pos']) - # get_test_series_helper(series, n_entries, ['negative', 'cumul_neg']) - - # return series - ################################################ - -# def get_test_series_helper(series: list, entries: list, keys: list) -> List: -# """This method helps get the pos/neg test count and the cumulative pos/neg test count.""" - -# # initialize values cumulative number, the positive/negative and cumul_pos/neg values for the first day, and the index needed for the while loop. - -# # there's probably a more efficient way to do all of this, but I just wasn't sure. -# cumul = int(entries[0]) -# series[0][keys[0]] = int(entries[0]) -# series[0][keys[1]] = cumul -# index = 1 - -# while index < len(series): -# # get a particular day -# day = series[index] -# curr = int(entries[index]) -# # get pos/neg test count -# day[keys[0]] = int(curr) -# # add that day's pos/neg test count to get cumulative number of positive tests -# cumul += curr -# day[keys[1]] = cumul -# index += 1 -# return series - - get_county() From 0b94cc4dfbb6f42b409e530194171e9f524cc41c Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Thu, 16 Jul 2020 23:43:39 -0700 Subject: [PATCH 25/39] fixed linter errors --- covid19_sfbayarea/data/marin.py | 72 ++++++++++++++------------------- 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 33f7e028..09d40cb0 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import csv from typing import List, Dict, Tuple -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup # type: ignore from urllib.parse import unquote_plus from datetime import datetime @@ -19,16 +19,14 @@ def get_county() -> Dict: model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() model["meta_from_baypd"] = "There's no actual update time on their website. Not all charts are updated daily." - - # No actual update time on their website? They update most charts daily (so the isoformat is only partially correct.) 
model['source_url'] = url - #model['meta_from_source'] = get_metadata(url, chart_ids) - #model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - #model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) - #model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) - #model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) + # model['meta_from_source'] = get_metadata(url, chart_ids) + # model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + # model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + # model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + # model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) + model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) return model def extract_csvs(chart_id: str, url: str) -> str: @@ -61,7 +59,7 @@ def extract_csvs(chart_id: str, url: str) -> str: return csv_string -def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple: +def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: driver = get_firefox() driver.implicitly_wait(30) driver.get(url) @@ -104,11 +102,11 @@ def get_case_series(chart_id: str, url: str) -> List: """This method extracts the date, number of cumulative cases, and new cases.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) - series = [] + series: list = list() - keys = csv_reader.fieldnames + # use a function for this or context manager / function - # TO-DO: is it possible to do 112, 113 and 116 with a context manager to reduce amount of code throughout this file? 
+ keys = csv_reader.fieldnames if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: raise ValueError('The headers have changed') @@ -116,12 +114,12 @@ def get_case_series(chart_id: str, url: str) -> List: case_history = [] for row in csv_reader: - daily = {} + daily: dict = dict() date_time_obj = datetime.strptime(row['Date'], '%m/%d/%Y') daily["date"] = date_time_obj.strftime('%Y-%m-%d') # Collect the case totals in order to compute the change in cases per day case_history.append(int(row["Total Cases"])) - daily["cumul_cases"] = int(row["Total Cases"]) + daily["cumul_cases"] = int(row["Total Cases"]) series.append(daily) case_history_diff = [] @@ -132,14 +130,14 @@ def get_case_series(chart_id: str, url: str) -> List: case_history_diff.insert(0, int(series[0]["cumul_cases"])) for val, case_num in enumerate(case_history_diff): - series[val]["cases"] = case_num + series[val]["cases"] = case_num return series def get_death_series(chart_id: str, url: str) -> List: """This method extracts the date, number of cumulative deaths, and new deaths.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) - series = [] + series: list = list() keys = csv_reader.fieldnames if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: @@ -148,7 +146,7 @@ def get_death_series(chart_id: str, url: str) -> List: death_history = [] for row in csv_reader: - daily = {} + daily: dict = dict() date_time_obj = datetime.strptime(row['Date'], '%m/%d/%Y') daily["date"] = date_time_obj.strftime('%Y-%m-%d') # Collect the case totals in order to compute the change in cases per day @@ -167,12 +165,12 @@ def get_death_series(chart_id: str, url: str) -> List: series[val]["deaths"] = case_num return series -def get_breakdown_age(chart_id: str, url: str) -> Tuple: +def get_breakdown_age(chart_id: str, url: str) -> Tuple[List, List]: """This method gets the breakdown of cases and deaths by age.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) - c_brkdown = [] - d_brkdown = [] + c_brkdown: list = list() + d_brkdown: list = list() keys = csv_reader.fieldnames @@ -182,8 +180,8 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: key_mapping = {"0-18": "0_to_18", "19-34": "19_to_34", "35-49": "35_to_49", "50-64": "50_to_64", "65-79": "65_to_79", "80-94": "80_to_94", "95+": "95_and_older"} for row in csv_reader: - c_age = {} - d_age = {} + c_age: dict = dict() + d_age: dict = dict() # Extracting the age group and the raw count for both cases and deaths. c_age["group"], d_age["group"] = row['Age Category'], row['Age Category'] if c_age["group"] not in key_mapping: @@ -198,7 +196,7 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple: return c_brkdown, d_brkdown -def get_breakdown_gender(chart_id: str, url: str) -> Tuple: +def get_breakdown_gender(chart_id: str, url: str) -> Tuple[Dict, Dict]: """This method gets the breakdown of cases and deaths by gender.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) @@ -216,23 +214,14 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple: # Each new row has data for a different gender. gender = row["Gender"].lower() if gender not in genders: - return ValueError('The genders have changed.') + return ValueError("The genders have changed.") # type: ignore + # is doing this bad practice? 
mypy doesn't have an issue with the error on line 244 so not sure why this one causes an error c_gender[gender] = int(row["Cases"]) - d_gender[gender] = int(row["Deaths"]) - - # for row in csv_strs[1:]: - # # Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths. - # # Each new row has data for a different gender. - # split = row.split(',') - # gender = split[0].lower() - # if gender not in genders: - # return ValueError('The genders have changed.') - # c_gender[gender] = int(split[2]) - # d_gender[gender] = int(split[4]) + d_gender[gender] = int(row["Deaths"]) return c_gender, d_gender -def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: +def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple[Dict, Dict]: """This method gets the breakdown of cases and deaths by race/ethnicity.""" csv_str = extract_csvs(chart_id, url) @@ -259,22 +248,21 @@ def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple: return c_race_eth, d_race_eth -def get_test_series(chart_id: str, url: str) -> Tuple: +def get_test_series(chart_id: str, url: str) -> List: """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" csv_ = extract_csvs(chart_id, url) csv_strs = csv_.splitlines() dates, positives, negatives = [row.split(',')[1:] for row in csv_strs] - # I think this should be 1: instead of :1 series = zip(dates, positives, negatives) - test_series = [] + test_series: list = list() cumul_pos = 0 cumul_neg = 0 for entry in series: - daily = {} + daily: dict = dict() # I'm not sure why, but I just found out that some of the test series have a 'null' value (in the spot where the number of positive tests is), so I needed to account for that here. # At least for now, it's only present at the end, so I just break out of the loop and return the test series. if entry[1] != 'null': From eb079bef8e6af3e9e7c3cc1b531213b028fd00a6 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Thu, 16 Jul 2020 23:53:50 -0700 Subject: [PATCH 26/39] ready to write up code in context managers tomorrow --- covid19_sfbayarea/data/marin.py | 36 ++++++++++++++------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 09d40cb0..24c0eb00 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -20,20 +20,18 @@ def get_county() -> Dict: model['update_time'] = datetime.today().isoformat() model["meta_from_baypd"] = "There's no actual update time on their website. Not all charts are updated daily." 
model['source_url'] = url - # model['meta_from_source'] = get_metadata(url, chart_ids) - # model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - # model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - # model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - # model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + model['meta_from_source'] = get_metadata(url, chart_ids) + model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) return model def extract_csvs(chart_id: str, url: str) -> str: """This method extracts the csv string from the data wrapper charts.""" - driver = get_firefox() - # need to figure out how to change the webdriver - + driver = get_firefox() driver.implicitly_wait(30) driver.get(url) @@ -71,18 +69,12 @@ def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: chart_metadata = set() for text in to_be_matched: - #target = soup.find('h4',text=text) - #if not target: - #raise ValueError('Cannot handle this header.') if soup.select('h4 + p')[0].text: metadata += [soup.select('h4 + p')[0].text] else: raise ValueError('Location of metadata has changed.') - #for sib in target.find_next_siblings()[:1]: # I only want the first paragraph tag - #metadata += [sib.text] - - # Metadata for each chart visualizing the data of the csv file I'll pull. There's probably a better way to organize this. + # Metadata for each chart visualizing the data of the csv file I'll pull. for chart_id in chart_ids.values(): frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') driver.switch_to.frame(frame) @@ -102,12 +94,13 @@ def get_case_series(chart_id: str, url: str) -> List: """This method extracts the date, number of cumulative cases, and new cases.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) - series: list = list() # use a function for this or context manager / function keys = csv_reader.fieldnames - + + series: list = list() + if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: raise ValueError('The headers have changed') @@ -137,9 +130,10 @@ def get_death_series(chart_id: str, url: str) -> List: """This method extracts the date, number of cumulative deaths, and new deaths.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) + keys = csv_reader.fieldnames + series: list = list() - keys = csv_reader.fieldnames if keys != ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths']: raise ValueError('The headers have changed.') @@ -155,7 +149,7 @@ def get_death_series(chart_id: str, url: str) -> List: series.append(daily) death_history_diff = [] - # Since i'm substracting pairwise elements, I need to adjust the range so I don't get an off by one error. + # Since I'm substracting pairwise elements, I need to adjust the range so I don't get an off by one error. 
for i in range(0, len(death_history)-1): death_history_diff.append((int(death_history[i+1]) - int(death_history[i])) + int(series[0]["cumul_deaths"])) # from what I've seen, series[0]["cumul_cases"] will be 0, but I shouldn't assume that. @@ -169,11 +163,11 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple[List, List]: """This method gets the breakdown of cases and deaths by age.""" csv_str = extract_csvs(chart_id, url) csv_reader = csv.DictReader(csv_str.splitlines()) + keys = csv_reader.fieldnames + c_brkdown: list = list() d_brkdown: list = list() - keys = csv_reader.fieldnames - if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: raise ValueError('The headers have changed') From 862a240a01fcce719c3849804e1c0bb2c03e2999 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Sat, 18 Jul 2020 00:05:01 -0700 Subject: [PATCH 27/39] rewrote metadata and extract csv functions using context managers --- covid19_sfbayarea/data/marin.py | 152 +++++++++++++++++++++++--------- 1 file changed, 108 insertions(+), 44 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 24c0eb00..dd1f5474 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -4,6 +4,7 @@ from bs4 import BeautifulSoup # type: ignore from urllib.parse import unquote_plus from datetime import datetime +from contextlib import contextmanager from ..webdriver import get_firefox from .utils import get_data_model @@ -16,19 +17,89 @@ def get_county() -> Dict: chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} # population totals and transmission data missing. + driver = get_firefox() + model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() model["meta_from_baypd"] = "There's no actual update time on their website. Not all charts are updated daily." 
model['source_url'] = url - model['meta_from_source'] = get_metadata(url, chart_ids) - model["series"]["cases"] = get_case_series(chart_ids["cases"], url) - model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) - model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) - model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) - model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) + #model['meta_from_source'] = get_metadata(url, chart_ids) + model['meta_from_source'] = get_chart_meta(url, driver, chart_ids) + + # model["series"]["cases"] = get_case_series(chart_ids["cases"], url) + # model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) + # model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + # model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) + # model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) + # model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) return model +@contextmanager +def chart_frame(driver, chart_id: str): + frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') + driver.switch_to.frame(frame) + try: + yield frame + finally: + driver.switch_to.default_content() + +def get_chart_data(url, driver, chart_id: str) -> List[str]: + """This method extracts parsed csv data from the csv linked in the data wrapper charts.""" + driver = get_firefox() + driver.implicitly_wait(30) + driver.get(url) + + with chart_frame(driver, chart_id): + csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') + # Deal with the data + if csv_data.startswith('data:'): + media, data = csv_data[5:].split(',', 1) + # Will likely always have this kind of data type + if media != 'application/octet-stream;charset=utf-8': + raise ValueError(f'Cannot handle media type "{media}"') + csv_string = unquote_plus(data) + csv_data = csv_string.splitlines() + else: + raise ValueError('Cannot handle this csv_data href') + + return csv_data + +def get_chart_meta(url, driver, chart_ids: Dict[str, str]) -> List: + """This method gets all the metadata underneath the data wrapper charts.""" + driver = get_firefox() + driver.implicitly_wait(30) + driver.get(url) + soup = BeautifulSoup(driver.page_source, 'html5lib') + metadata = [] + + to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] + + chart_metadata = set() + + for text in to_be_matched: + if soup.select('h4 + p')[0].text: + metadata += [soup.select('h4 + p')[0].text] + else: + raise ValueError('Location of metadata has changed.') + + # Metadata for each chart visualizing the data of the csv file I'll pull. 
+ for chart_id in chart_ids.values(): + with chart_frame(driver, chart_id): + notes = driver.find_elements_by_class_name('dw-chart-notes') + chart_metadata = list({note.text for note in notes}) + + # The metadata for the charts is located in elements with the class `dw-chart-notes' + for c in driver.find_elements_by_class_name('dw-chart-notes'): + chart_metadata.add(c.text) + + # Switch back to the parent frame to "reset" the context + driver.switch_to.parent_frame() + + driver.quit() + + # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. + return metadata, list(chart_metadata) + def extract_csvs(chart_id: str, url: str) -> str: """This method extracts the csv string from the data wrapper charts.""" driver = get_firefox() @@ -41,7 +112,6 @@ def extract_csvs(chart_id: str, url: str) -> str: # Grab the raw data out of the link's href attribute csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - # Deal with the data if csv_data.startswith('data:'): media, data = csv_data[5:].split(',', 1) @@ -58,21 +128,6 @@ def extract_csvs(chart_id: str, url: str) -> str: return csv_string def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: - driver = get_firefox() - driver.implicitly_wait(30) - driver.get(url) - soup = BeautifulSoup(driver.page_source, 'html5lib') - metadata = [] - - to_be_matched = ['Total Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] - - chart_metadata = set() - - for text in to_be_matched: - if soup.select('h4 + p')[0].text: - metadata += [soup.select('h4 + p')[0].text] - else: - raise ValueError('Location of metadata has changed.') # Metadata for each chart visualizing the data of the csv file I'll pull. for chart_id in chart_ids.values(): @@ -90,12 +145,13 @@ def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
return metadata, list(chart_metadata) -def get_case_series(chart_id: str, url: str) -> List: +def get_case_series(chart_id: str, url: str, driver) -> List: """This method extracts the date, number of cumulative cases, and new cases.""" - csv_str = extract_csvs(chart_id, url) - csv_reader = csv.DictReader(csv_str.splitlines()) - - # use a function for this or context manager / function + + csv_data = get_chart_data(url, driver, chart_id) + csv_reader = csv.DictReader(csv_data) + # csv_str = extract_csvs(chart_id, url) + # csv_reader = csv.DictReader(csv_str.splitlines()) keys = csv_reader.fieldnames @@ -126,10 +182,12 @@ def get_case_series(chart_id: str, url: str) -> List: series[val]["cases"] = case_num return series -def get_death_series(chart_id: str, url: str) -> List: +def get_death_series(chart_id: str, url: str, driver) -> List: """This method extracts the date, number of cumulative deaths, and new deaths.""" - csv_str = extract_csvs(chart_id, url) - csv_reader = csv.DictReader(csv_str.splitlines()) + csv_data = get_chart_data(url, driver, chart_id) + csv_reader = csv.DictReader(csv_data) + # csv_str = extract_csvs(chart_id, url) + # csv_reader = csv.DictReader(csv_str.splitlines()) keys = csv_reader.fieldnames series: list = list() @@ -159,10 +217,12 @@ def get_death_series(chart_id: str, url: str) -> List: series[val]["deaths"] = case_num return series -def get_breakdown_age(chart_id: str, url: str) -> Tuple[List, List]: +def get_breakdown_age(chart_id: str, url: str, driver) -> Tuple[List, List]: """This method gets the breakdown of cases and deaths by age.""" - csv_str = extract_csvs(chart_id, url) - csv_reader = csv.DictReader(csv_str.splitlines()) + csv_data = get_chart_data(url, driver, chart_id) + csv_reader = csv.DictReader(csv_data) + # csv_str = extract_csvs(chart_id, url) + # csv_reader = csv.DictReader(csv_str.splitlines()) keys = csv_reader.fieldnames c_brkdown: list = list() @@ -190,10 +250,12 @@ def get_breakdown_age(chart_id: str, url: str) -> Tuple[List, List]: return c_brkdown, d_brkdown -def get_breakdown_gender(chart_id: str, url: str) -> Tuple[Dict, Dict]: +def get_breakdown_gender(chart_id: str, url: str, driver) -> Tuple[Dict, Dict]: """This method gets the breakdown of cases and deaths by gender.""" - csv_str = extract_csvs(chart_id, url) - csv_reader = csv.DictReader(csv_str.splitlines()) + csv_data = get_chart_data(url, driver, chart_id) + csv_reader = csv.DictReader(csv_data) + # csv_str = extract_csvs(chart_id, url) + # csv_reader = csv.DictReader(csv_str.splitlines()) keys = csv_reader.fieldnames if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']: @@ -215,11 +277,13 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple[Dict, Dict]: return c_gender, d_gender -def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple[Dict, Dict]: +def get_breakdown_race_eth(chart_id: str, url: str, driver) -> Tuple[Dict, Dict]: """This method gets the breakdown of cases and deaths by race/ethnicity.""" - csv_str = extract_csvs(chart_id, url) - csv_reader = csv.DictReader(csv_str.splitlines()) + csv_data = get_chart_data(url, driver, chart_id) + csv_reader = csv.DictReader(csv_data) + #csv_str = extract_csvs(chart_id, url) + #csv_reader = csv.DictReader(csv_str.splitlines()) keys = csv_reader.fieldnames if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Cases', 'Case Percent', 'Hospitalizations', 'Hospitalizations Percent', 'Deaths', 'Deaths Percent']: @@ -244,11 +308,11 @@ def get_breakdown_race_eth(chart_id: str, url: str) -> 
Tuple[Dict, Dict]: def get_test_series(chart_id: str, url: str) -> List: """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" + csv_data = get_chart_data(url, driver, chart_id) + # csv_ = extract_csvs(chart_id, url) + # csv_strs = csv_.splitlines() - csv_ = extract_csvs(chart_id, url) - csv_strs = csv_.splitlines() - - dates, positives, negatives = [row.split(',')[1:] for row in csv_strs] + dates, positives, negatives = [row.split(',')[1:] for row in csv_data] series = zip(dates, positives, negatives) test_series: list = list() From 153d3796034e2ea4b57a7fb916a05e5aa30c0be8 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Wed, 22 Jul 2020 18:42:56 -0700 Subject: [PATCH 28/39] fixed half of metadata function, not sure what's wrong with the other part --- covid19_sfbayarea/data/marin.py | 100 +++++++++++++++++--------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index dd1f5474..c692f1c4 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -24,7 +24,7 @@ def get_county() -> Dict: model["meta_from_baypd"] = "There's no actual update time on their website. Not all charts are updated daily." model['source_url'] = url #model['meta_from_source'] = get_metadata(url, chart_ids) - model['meta_from_source'] = get_chart_meta(url, driver, chart_ids) + model['meta_from_source'] = get_chart_meta(url, chart_ids) # model["series"]["cases"] = get_case_series(chart_ids["cases"], url) # model["series"]["deaths"] = get_death_series(chart_ids["deaths"], url) @@ -44,61 +44,65 @@ def chart_frame(driver, chart_id: str): driver.switch_to.default_content() def get_chart_data(url, driver, chart_id: str) -> List[str]: - """This method extracts parsed csv data from the csv linked in the data wrapper charts.""" - driver = get_firefox() - driver.implicitly_wait(30) - driver.get(url) + """This method extracts parsed csv data from the csv linked in the data wrapper charts.""" + with get_firefox() as driver: + driver.implicitly_wait(30) + driver.get(url) - with chart_frame(driver, chart_id): - csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - # Deal with the data - if csv_data.startswith('data:'): - media, data = csv_data[5:].split(',', 1) - # Will likely always have this kind of data type - if media != 'application/octet-stream;charset=utf-8': - raise ValueError(f'Cannot handle media type "{media}"') - csv_string = unquote_plus(data) - csv_data = csv_string.splitlines() - else: - raise ValueError('Cannot handle this csv_data href') + with chart_frame(driver, chart_id): + csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') + # Deal with the data + if csv_data.startswith('data:'): + media, data = csv_data[5:].split(',', 1) + # Will likely always have this kind of data type + if media != 'application/octet-stream;charset=utf-8': + raise ValueError(f'Cannot handle media type "{media}"') + csv_string = unquote_plus(data) + csv_data = csv_string.splitlines() + else: + raise ValueError('Cannot handle this csv_data href') return csv_data -def get_chart_meta(url, driver, chart_ids: Dict[str, str]) -> List: - """This method gets all the metadata underneath the data wrapper charts.""" - driver = get_firefox() - driver.implicitly_wait(30) - driver.get(url) - soup = BeautifulSoup(driver.page_source, 'html5lib') - metadata = [] - - to_be_matched = ['Total 
Cases, Recovered, Hospitalizations and Deaths by Date Reported', 'Daily Count of Positive Results and Total Tests for Marin County Residents by Test Date ', 'Cases, Hospitalizations, and Deaths by Age, Gender and Race/Ethnicity '] - - chart_metadata = set() - - for text in to_be_matched: - if soup.select('h4 + p')[0].text: - metadata += [soup.select('h4 + p')[0].text] - else: - raise ValueError('Location of metadata has changed.') - - # Metadata for each chart visualizing the data of the csv file I'll pull. - for chart_id in chart_ids.values(): - with chart_frame(driver, chart_id): - notes = driver.find_elements_by_class_name('dw-chart-notes') - chart_metadata = list({note.text for note in notes}) - - # The metadata for the charts is located in elements with the class `dw-chart-notes' - for c in driver.find_elements_by_class_name('dw-chart-notes'): - chart_metadata.add(c.text) +def get_chart_meta(url, chart_ids: Dict[str, str]) -> List: + """This method gets all the metadata underneath the data wrapper charts and the metadata.""" + with get_firefox() as driver: + driver.implicitly_wait(30) + driver.get(url) + soup = BeautifulSoup(driver.page_source, 'html5lib') + metadata = set() + + chart_metadata = set() - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() + for soup_obj in soup.findAll('div', attrs={"class":"surveillance-data-text"}): + if soup_obj.findAll('p'): + metadata = set({paragraph.text.replace("\u2014","").replace("\u00a0", "").replace("\u2019","") for paragraph in soup_obj.findAll('p')}) + else: + raise ValueError('Metadata location has changed.') - driver.quit() + # Metadata for each chart visualizing the data of the csv file I'll pull. + + # new function + for chart_id in chart_ids.values(): + with chart_frame(driver, chart_id): + for soup_obj in soup.findAll('div', attrs={"class": 'notes-block'}): + #chart_metadata = soup_obj + if soup_obj.findAll('span'): + chart_metadata = set({obj.text for obj in soup_obj.findAll('span')}) + else: + raise ValueError('Metadata location has changed.') + + # Switch back to the parent frame to "reset" the context + #driver.switch_to.parent_frame() # I think this is handled by the context manager + + # old function - to be deleted + # for chart_id in chart_ids.values(): + # with chart_frame(driver, chart_id): + # notes = driver.find_elements_by_class_name('dw-chart-notes') + # chart_metadata = list({note.text for note in notes}) # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. - return metadata, list(chart_metadata) + return list(metadata), list(chart_metadata) def extract_csvs(chart_id: str, url: str) -> str: """This method extracts the csv string from the data wrapper charts.""" From 90e75ee10b26f20312d25a3231c4c6406560ec0f Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Tue, 18 Aug 2020 00:05:06 -0700 Subject: [PATCH 29/39] fixed metadata function - finallygit add covid19_sfbayarea/data/marin.py just wasn't waiting long enough.. 
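A note on the timing fix in the diff below: it settles for an unconditional `time.sleep(5)`, even though the same diff also imports WebDriverWait, expected_conditions, and By. `driver.get()` only blocks until the document itself loads, not until Datawrapper's JavaScript has rendered the chart notes, which is why some extra wait is needed at all. A minimal sketch of the explicit-wait alternative, assuming the notes land in a div with class "notes-block" (the same class the new code scrapes):

```python
# Sketch only: wait for the notes to actually render instead of sleeping a fixed 5s.
# Assumes the notes appear in a div with class "notes-block", per the diff below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def wait_for_notes(driver, timeout: int = 30):
    # Returns as soon as a notes block exists; raises TimeoutException otherwise.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'notes-block'))
    )
```

An explicit wait pays only as long as the render actually takes and fails loudly on a dead page, where a fixed sleep always costs the full five seconds and can still race a slow load.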
--- covid19_sfbayarea/data/marin.py | 120 +++++++++++++------------------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index c692f1c4..96d6d83d 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -5,6 +5,12 @@ from urllib.parse import unquote_plus from datetime import datetime from contextlib import contextmanager +from requests import get +import time +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + from ..webdriver import get_firefox from .utils import get_data_model @@ -15,15 +21,27 @@ def get_county() -> Dict: url = 'https://coronavirus.marinhhs.org/surveillance' model = get_data_model() - chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "tests": '2Hgir', "age": "VOeBm", "gender": "FEciW", "race_eth": "aBeEd"} - # population totals and transmission data missing. + chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd", "tests": '2Hgir'} + + # i don't think it looked at the chart for aBeEd + + + + + + # TO-DOs + # tests have a different csv, might not even have the one I need? + + # age now has zSHDs as the id associated with the href and VOeBm as the id associated with the csv name. + # race_eth chart id has also changed - 6RXFj + # Missing in mate data - just add the cases from the inmate data to the current scraper + driver = get_firefox() model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() model["meta_from_baypd"] = "There's no actual update time on their website. Not all charts are updated daily." model['source_url'] = url - #model['meta_from_source'] = get_metadata(url, chart_ids) model['meta_from_source'] = get_chart_meta(url, chart_ids) # model["series"]["cases"] = get_case_series(chart_ids["cases"], url) @@ -42,6 +60,7 @@ def chart_frame(driver, chart_id: str): yield frame finally: driver.switch_to.default_content() + driver.quit() def get_chart_data(url, driver, chart_id: str) -> List[str]: """This method extracts parsed csv data from the csv linked in the data wrapper charts.""" @@ -66,88 +85,45 @@ def get_chart_data(url, driver, chart_id: str) -> List[str]: def get_chart_meta(url, chart_ids: Dict[str, str]) -> List: """This method gets all the metadata underneath the data wrapper charts and the metadata.""" + metadata = set() + chart_metadata = set() + with get_firefox() as driver: driver.implicitly_wait(30) driver.get(url) soup = BeautifulSoup(driver.page_source, 'html5lib') - metadata = set() - - chart_metadata = set() - for soup_obj in soup.findAll('div', attrs={"class":"surveillance-data-text"}): + for soup_obj in soup.findAll('div', attrs={"class":"surveillance-data-text"}): if soup_obj.findAll('p'): metadata = set({paragraph.text.replace("\u2014","").replace("\u00a0", "").replace("\u2019","") for paragraph in soup_obj.findAll('p')}) else: raise ValueError('Metadata location has changed.') + with get_firefox() as driver: # I keep getting a connection error so maybe I need to do this again? seems weird. + driver.implicitly_wait(30) + driver.get(url) # Metadata for each chart visualizing the data of the csv file I'll pull. 
- - # new function - for chart_id in chart_ids.values(): - with chart_frame(driver, chart_id): - for soup_obj in soup.findAll('div', attrs={"class": 'notes-block'}): - #chart_metadata = soup_obj - if soup_obj.findAll('span'): - chart_metadata = set({obj.text for obj in soup_obj.findAll('span')}) - else: - raise ValueError('Metadata location has changed.') - - # Switch back to the parent frame to "reset" the context - #driver.switch_to.parent_frame() # I think this is handled by the context manager - - # old function - to be deleted - # for chart_id in chart_ids.values(): - # with chart_frame(driver, chart_id): - # notes = driver.find_elements_by_class_name('dw-chart-notes') - # chart_metadata = list({note.text for note in notes}) - - # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. - return list(metadata), list(chart_metadata) - -def extract_csvs(chart_id: str, url: str) -> str: - """This method extracts the csv string from the data wrapper charts.""" - driver = get_firefox() - driver.implicitly_wait(30) - driver.get(url) - - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - - driver.switch_to.frame(frame) - # Grab the raw data out of the link's href attribute - csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href') - - # Deal with the data - if csv_data.startswith('data:'): - media, data = csv_data[5:].split(',', 1) - # Will likely always have this kind of data type - if media != 'application/octet-stream;charset=utf-8': - raise ValueError(f'Cannot handle media type "{media}"') - csv_string = unquote_plus(data) - else: - raise ValueError('Cannot handle this csv_data href') + # I had to change my metadata function b/c for whatever reason, my usual code didn't pick up on the class notes block. + # There's something weird with the website that Ricardo and I couldn't quite pinpoint. + source_list = set() + for chart_id in chart_ids.values(): + driver.implicitly_wait(30) + source = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]').get_attribute('src') + source_list.add(source) - # Then leave the iframe - driver.switch_to.default_content() - - return csv_string - -def get_metadata(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: - - # Metadata for each chart visualizing the data of the csv file I'll pull. - for chart_id in chart_ids.values(): - frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]') - driver.switch_to.frame(frame) - # The metadata for the charts is located in elements with the class `dw-chart-notes' - for c in driver.find_elements_by_class_name('dw-chart-notes'): - chart_metadata.add(c.text) - - # Switch back to the parent frame to "reset" the context - driver.switch_to.parent_frame() - - driver.quit() + with get_firefox() as driver: + for source in source_list: + driver.get(source) + #breakpoint() + import time; time.sleep(5) # this ensures there's enough time for the soup to find the elements and for the chart_metadata to populate. + # From the source code it seems that .get() should be synchronous but it's not working like that :( + soup = BeautifulSoup(driver.page_source, 'html5lib') + for data in soup.findAll('div', attrs = {'class': 'notes-block'}): + #breakpoint() + chart_metadata.add(data.text.strip()) # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. 
- return metadata, list(chart_metadata) + return list(metadata), list(chart_metadata) def get_case_series(chart_id: str, url: str, driver) -> List: """This method extracts the date, number of cumulative cases, and new cases.""" @@ -341,5 +317,3 @@ def get_test_series(chart_id: str, url: str) -> List: break return test_series - -get_county() From 04e62e1b52455c29d02af7e1fbb01d172a605c30 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Sat, 22 Aug 2020 10:48:12 -0700 Subject: [PATCH 30/39] fixed linter issue --- covid19_sfbayarea/data/marin.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 29c99902..e5afd447 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -118,8 +118,6 @@ def get_inmate_totals(chart_id: str, url: str) -> Tuple: keys = csv_reader.fieldnames - series: list = list() - if keys != ['Updated', 'Total Confirmed Cases', 'Total Resolved Cases', 'COVID-19 Deaths']: raise ValueError('The headers have changed') From ef46d85162e84ddac2e4499bf0762145e64fdde0 Mon Sep 17 00:00:00 2001 From: kwonangela7 Date: Fri, 28 Aug 2020 19:28:28 -0700 Subject: [PATCH 31/39] Update covid19_sfbayarea/data/marin.py "raise ValueError" to "return Value Error" Co-authored-by: Elaine Laguerta <12928553+elaguerta@users.noreply.github.com> --- covid19_sfbayarea/data/marin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index e5afd447..9a15e549 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -213,7 +213,7 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple[Dict, Dict]: # Each new row has data for a different gender. gender = row["Gender"].lower() if gender not in genders: - return ValueError("The genders have changed.") # type: ignore + raise ValueError("The genders have changed.") # type: ignore # is doing this bad practice? mypy doesn't have an issue with the error on line 244 so not sure why this one causes an error c_gender[gender] = int(row["Cases"]) d_gender[gender] = int(row["Deaths"]) From b4826ccfa06296636da69aa123fdf69bf34d54de Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Fri, 28 Aug 2020 19:39:25 -0700 Subject: [PATCH 32/39] removed instances of inmate as that data is not collected by marin county moving forward --- covid19_sfbayarea/data/marin.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index e5afd447..5d6ef4c1 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -17,21 +17,19 @@ def get_county() -> Dict: url = 'https://coronavirus.marinhhs.org/surveillance' model = get_data_model() - chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "inmates": "KCNZn", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd"} + chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd"} # I removed "tests": '2Hgir' from chart_ids b/c it seems to have disappeared from the website? model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() - model["meta_from_baypd"] = ["There's no actual update time on their website. 
Not all charts are updated daily.", "The cases and deaths total include inmate numbers, but the cases and deaths series, the testing data and data broken down by race/ethnicity, gender and age do not."] + model["meta_from_baypd"] = ["There's no actual update time on their website. Not all charts are updated daily."] model['source_url'] = url model['meta_from_source'] = get_chart_meta(url, chart_ids) model["series"]["cases"] = get_series_data(chart_ids["cases"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_cases", 'Total Cases', 'cases') model["series"]["deaths"] = get_series_data(chart_ids["deaths"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_deaths", 'Total Deaths', 'deaths') - model["inmates"]["cases"] = get_inmate_totals(chart_ids["inmates"], url)[0] - model["inmates"]["deaths"] = get_inmate_totals(chart_ids["inmates"], url)[1] - #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) @@ -111,22 +109,6 @@ def get_chart_meta(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. return list(metadata), list(chart_metadata) -def get_inmate_totals(chart_id: str, url: str) -> Tuple: - """This method extracts the number of cases and deaths for San Quentin inmates.""" - csv_data = get_chart_data(url, chart_id) - csv_reader = csv.DictReader(csv_data) - - keys = csv_reader.fieldnames - - if keys != ['Updated', 'Total Confirmed Cases', 'Total Resolved Cases', 'COVID-19 Deaths']: - raise ValueError('The headers have changed') - - for row in csv_reader: - cases = row['Total Confirmed Cases'] - deaths = row['COVID-19 Deaths'] - - return (cases, deaths) - def get_series_data(chart_id: str, url: str, headers: list, model_typ: str, typ: str, new_count: str) -> List: """This method extracts the date, number of cases/deaths, and new cases/deaths.""" From e4f586f5093e0ac44763cf25f100af36b3ed280c Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Fri, 28 Aug 2020 19:39:49 -0700 Subject: [PATCH 33/39] updated README - inmate section --- data_models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_models/README.md b/data_models/README.md index bb5dcc4f..3b556380 100644 --- a/data_models/README.md +++ b/data_models/README.md @@ -140,7 +140,7 @@ The fields will be used for normalizing the county case and death tabulations, a 6. __Inmate Data__ -This part of the data model currently only applies to Marin County, which reports the case and death count separately from the case and death count in the Marin County community. Note that the case and death data available for inmates is not in series form; there are only aggregated totals. +Data collection is pending resolution of #108. 
``` "inmates": { From e4b185bb1209856b67d1750b6de6d36ed3b88c5b Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Fri, 28 Aug 2020 20:30:24 -0700 Subject: [PATCH 34/39] updated Race and Ethnicity README --- covid19_sfbayarea/data/marin.py | 4 ++-- data_models/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 3634b084..62bfb922 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -18,7 +18,7 @@ def get_county() -> Dict: model = get_data_model() chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd"} - # I removed "tests": '2Hgir' from chart_ids b/c it seems to have disappeared from the website? + # I removed "tests": '2Hgir' from chart_ids b/c the breakdown of negative and positive tests has disappeared from the website model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() @@ -29,7 +29,7 @@ def get_county() -> Dict: model["series"]["cases"] = get_series_data(chart_ids["cases"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_cases", 'Total Cases', 'cases') model["series"]["deaths"] = get_series_data(chart_ids["deaths"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_deaths", 'Total Deaths', 'deaths') - model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) diff --git a/data_models/README.md b/data_models/README.md index 3b556380..46b8aa5e 100644 --- a/data_models/README.md +++ b/data_models/README.md @@ -217,7 +217,7 @@ Scraper authors, please keep an eye out for amendments to the data model. # Race and Ethnicity We need to collapse counties that report race and ethnicity into one race/ethnicity dimension. This section will be updated pending information about San Francisco County's methods for reporting race and ethnicity. -The category "Multi_or_Other" was included because Marin rolls up the numbers from "Multi" and "Other" into one. +The category "Multi_or_Other" was included because Marin rolls up the numbers from "Multi" and "Other" into one. Please note that this category is not relevant for counties that report 'Multiple Race' and 'Other Race' separately. # Gender One future potential issue is that some counties still lump non-binary and cis-gender people under "Other", and other counties have started to differentiate. Our data model would ideally match the most detailed county's gender categories. A county with only the "Other" county would have the value of -1 for the non male/female categories, indicating that they are not collecting that information. However, this means that our `"Other"` category would not be internally comparable or consistent. The `"Other"` category for a county that has "Male, Female, Other, MTF, FTM" as separate datapoints should really be called `"Other - not MTF, not FTM"` and is not comparable to the `"Other"` category for a county that only has "Male, Female, Other". 
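The next patch replaces the positional row-splitting in get_test_series with the same DictReader pattern the rest of the file already uses: validate the header row, then walk the rows while keeping a running total. A self-contained sketch of that pattern, with made-up sample rows standing in for the extracted chart CSV:

```python
# Sketch of the validate-then-accumulate pattern; the sample rows are invented.
import csv
from datetime import datetime

sample_rows = ['Test Date,Positive Tests', '05/01/2020,3', '05/02/2020,5']
reader = csv.DictReader(sample_rows)
if reader.fieldnames != ['Test Date', 'Positive Tests']:
    raise ValueError('The headers have changed.')

test_series: list = []
cumul_pos = 0
for row in reader:
    positive = int(row['Positive Tests'])
    cumul_pos += positive
    test_series.append({
        'date': datetime.strptime(row['Test Date'], '%m/%d/%Y').strftime('%Y-%m-%d'),
        'positive': positive,
        'cumul_positive': cumul_pos,
    })
# test_series == [{'date': '2020-05-01', 'positive': 3, 'cumul_positive': 3},
#                 {'date': '2020-05-02', 'positive': 5, 'cumul_positive': 8}]
```

Failing fast on an unexpected header is what turns a silent scraper break into a visible one when the county reshuffles its chart columns.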
From d35f453e040b66fbcd3dc21d05b8f0d9643d0d49 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Sat, 29 Aug 2020 13:01:23 -0700 Subject: [PATCH 35/39] made sure to use 4-space indentation, fixed test series function to only scrape positive test data --- covid19_sfbayarea/data/marin.py | 43 ++++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 62bfb922..39818091 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -17,19 +17,19 @@ def get_county() -> Dict: url = 'https://coronavirus.marinhhs.org/surveillance' model = get_data_model() - chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd"} - # I removed "tests": '2Hgir' from chart_ids b/c the breakdown of negative and positive tests has disappeared from the website + chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd", "tests": '7sHQq'} + # The time series data for negative tests is gone, so I've just scraped positive test data using the new chart referenced above. model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() - model["meta_from_baypd"] = ["There's no actual update time on their website. Not all charts are updated daily."] + model["meta_from_baypd"] = [] model['source_url'] = url model['meta_from_source'] = get_chart_meta(url, chart_ids) model["series"]["cases"] = get_series_data(chart_ids["cases"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_cases", 'Total Cases', 'cases') model["series"]["deaths"] = get_series_data(chart_ids["deaths"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_deaths", 'Total Deaths', 'deaths') - #model["series"]["tests"] = get_test_series(chart_ids["tests"], url) + model["series"]["tests"] = get_test_series(chart_ids["tests"], url) model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url) model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url) model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url) @@ -195,8 +195,7 @@ def get_breakdown_gender(chart_id: str, url: str) -> Tuple[Dict, Dict]: # Each new row has data for a different gender. gender = row["Gender"].lower() if gender not in genders: - raise ValueError("The genders have changed.") # type: ignore - # is doing this bad practice? 
mypy doesn't have an issue with the error on line 244 so not sure why this one causes an error + raise ValueError("The genders have changed.") c_gender[gender] = int(row["Cases"]) d_gender[gender] = int(row["Deaths"]) @@ -229,31 +228,25 @@ def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple[Dict, Dict]: return c_race_eth, d_race_eth def get_test_series(chart_id: str, url: str) -> List: - """This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests.""" + """This method gets the date, the number of new positive tests on that date, and the number of cumulative positive tests.""" csv_data = get_chart_data(url, chart_id) + csv_reader = csv.DictReader(csv_data) - dates, positives, negatives = [row.split(',')[1:] for row in csv_data] - series = zip(dates, positives, negatives) + keys = csv_reader.fieldnames + + if keys != ['Test Date', 'Positive Tests']: + raise ValueError("The headers have changed.") test_series: list = list() cumul_pos = 0 - cumul_neg = 0 - for entry in series: + for row in csv_reader: daily: dict = dict() - # I'm not sure why, but I just found out that some of the test series have a 'null' value (in the spot where the number of positive tests is), so I needed to account for that here. - # At least for now, it's only present at the end, so I just break out of the loop and return the test series. - if entry[1] != 'null': - date_time_obj = datetime.strptime(entry[0], '%m/%d/%Y') - daily["date"] = date_time_obj.strftime('%Y-%m-%d') - daily["positive"] = int(entry[1]) - cumul_pos += daily["positive"] - daily["negative"] = int(entry[2]) - cumul_neg += daily["negative"] - daily["cumul_pos"] = cumul_pos - daily["cumul_neg"] = cumul_neg - test_series.append(daily) - else: - break + date_time_obj = datetime.strptime(row['Test Date'], '%m/%d/%Y') + daily["date"] = date_time_obj.strftime('%Y-%m-%d') + daily["positive"] = int(row["Positive Tests"]) + cumul_pos += daily["positive"] + daily["cumul_positive"] = cumul_pos + test_series.append(daily) return test_series From eb7c0c19d289d46a0b7e8b18287bec52e01119f8 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Sat, 29 Aug 2020 13:05:44 -0700 Subject: [PATCH 36/39] updated meta_from_baypd --- covid19_sfbayarea/data/marin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 39818091..efae9e50 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -22,7 +22,7 @@ def get_county() -> Dict: model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() - model["meta_from_baypd"] = [] + model["meta_from_baypd"] = ["Negative and pending tests are excluded from the Marin County test data."] model['source_url'] = url model['meta_from_source'] = get_chart_meta(url, chart_ids) From f5d8ae103650c25fe8dfaa39f8d71222023d8472 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Sat, 29 Aug 2020 13:08:07 -0700 Subject: [PATCH 37/39] updated meta_from_source --- covid19_sfbayarea/data/marin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index efae9e50..66fbcb58 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -22,7 +22,7 @@ def get_county() -> Dict: model['name'] = "Marin County" model['update_time'] = datetime.today().isoformat() - model["meta_from_baypd"] = ["Negative and pending tests are excluded from the 
Marin County test data."] + model["meta_from_baypd"] = "" model['source_url'] = url model['meta_from_source'] = get_chart_meta(url, chart_ids) @@ -106,6 +106,9 @@ def get_chart_meta(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: #breakpoint() chart_metadata.add(data.text.strip()) + #Manually adding in metadata about testing data + chart_metadata.add("Negative and pending tests are excluded from the Marin County test data.") + # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. return list(metadata), list(chart_metadata) From 3a031805dea8723bd62adcbd383773f21af0ae22 Mon Sep 17 00:00:00 2001 From: Angela Kwon Date: Sat, 29 Aug 2020 13:50:43 -0700 Subject: [PATCH 38/39] updated meta_from_source about testing data nuances --- covid19_sfbayarea/data/marin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/marin.py b/covid19_sfbayarea/data/marin.py index 66fbcb58..0a23b614 100644 --- a/covid19_sfbayarea/data/marin.py +++ b/covid19_sfbayarea/data/marin.py @@ -106,8 +106,9 @@ def get_chart_meta(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]: #breakpoint() chart_metadata.add(data.text.strip()) - #Manually adding in metadata about testing data + # Manually adding in metadata about testing data chart_metadata.add("Negative and pending tests are excluded from the Marin County test data.") + chart_metadata.add("Note that this test data is about tests done by Marin County residents, not about all tests done in Marin County (includes residents and non-residents).") # Return the metadata. I take the set of the chart_metadata since there are repeating metadata strings. return list(metadata), list(chart_metadata) From 6d552842ec6144404bcc34cf72cd59d5306884e7 Mon Sep 17 00:00:00 2001 From: Elaine Laguerta <12928553+elaguerta@users.noreply.github.com> Date: Wed, 2 Sep 2020 20:11:16 -0700 Subject: [PATCH 39/39] Delete inmates from population_totals --- data_models/data_model.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/data_models/data_model.json b/data_models/data_model.json index cac0b108..5be90b33 100644 --- a/data_models/data_model.json +++ b/data_models/data_model.json @@ -130,9 +130,5 @@ "White":-1, "Unknown":-1 } - }, - "inmates": { - "cases": -1, - "deaths": -1 } }
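With the "inmates" block gone from the template, any scraper still emitting that key would quietly diverge from data_model.json. One possible guard (hypothetical, not part of the repo) is to diff a scraper's top-level keys against the template after each run:

```python
# Hypothetical drift check; check_model_keys is not an existing helper in this repo.
import json

def check_model_keys(scraped: dict, template_path: str = 'data_models/data_model.json') -> None:
    with open(template_path) as f:
        template = json.load(f)
    extra = set(scraped) - set(template)
    missing = set(template) - set(scraped)
    if extra or missing:
        raise ValueError(f'Data model drift: extra={extra or "none"}, missing={missing or "none"}')
```

Run against the Marin model, a leftover model["inmates"] key would surface in `extra` the moment this template change landed.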