From 10f0dfec91aecbb45a055455c5381fa62caca73f Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 2 May 2020 13:41:08 -0700 Subject: [PATCH 01/62] organization Merge CDM readme into readme --- CDM_README.md | 152 ------------------------------------------------ README.md | 156 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 155 insertions(+), 153 deletions(-) delete mode 100644 CDM_README.md diff --git a/CDM_README.md b/CDM_README.md deleted file mode 100644 index e68a82bf..00000000 --- a/CDM_README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Ages - -Please make sure to use the following age brackets for the different counties. Note that the brackets may also vary by whether you are scraping cases or deaths data: - - -## San Francisco -### Cases - "age": [ - {"group": "18_and_under", "raw_count": -1 }, - {"group": "18_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_and_older", "raw_count": -1} - ] -### Deaths -Data broken down by gender is not available on the json files, only on the dashboard. - - -## Alameda -### Cases - "age": [ - {"group": "18_and_under", "raw_count": -1 }, - {"group": "18_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_and_older", "raw_count": -1 }, - {"group": "Unknown", "raw_count": -1 } - ] -### Deaths -Data broken down by gender is not available. - - -## Sonoma -### Cases - "age": [ - {"group": "0_to_17", "raw_count": -1 }, - {"group": "18_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, - {"group": "65_and_older", "raw_count": -1 }, - {"group": "Unknown", "raw_count": -1 } - ] -### Deaths -Data broken down by gender is not available. - - -## Santa Clara -### Cases - "age": [ - {"group": "20_and_under", "raw_count": -1 }, - {"group": "21_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_to_90", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 }, - {"group": "Unknown", "raw_count": -1 } - ] -### Deaths - "age": [ - {"group": "20_and_under", "raw_count": -1 }, - {"group": "21_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_to_90", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 } - ] - - -## San Mateo -### Cases - "age": [ - {"group": "0_to_19", "raw_count": -1 }, - {"group": "20_to_29", "raw_count": -1 }, - {"group": "30_to_39", "raw_count": -1 }, - {"group": "40_to_49", "raw_count": -1 }, - {"group": "50_to_59", "raw_count": -1 }, - {"group": "60_to_69", "raw_count": -1 }, - {"group": "70_to_79", "raw_count": -1 }, - {"group": "80_to_89", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 } - ] -### Deaths - age": [ - {"group": "0_to_19", "raw_count": -1 }, - {"group": "20_to_29", "raw_count": -1 }, - {"group": "30_to_39", "raw_count": -1 }, - {"group": "40_to_49", "raw_count": -1 }, - {"group": "50_to_59", "raw_count": -1 }, - {"group": "60_to_69", "raw_count": -1 }, - {"group": "70_to_79", "raw_count": -1 }, - {"group": "80_to_89", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 } - ] - - -## Contra Costa -### Cases - age": [ - {"group": "0_to_20", "raw_count": -1 }, - {"group": "21_to_40", "raw_count": -1 }, - {"group": "41_to_60", "raw_count": -1 }, - {"group": "61_to_80", "raw_count": -1 }, - {"group": "81_to_100", "raw_count": -1 } - ] -### Deaths -Data broken down by gender is not available. - - -## Marin -### Cases and Deaths - age": [ - {"group": "0_to_18", "raw_count": -1 }, - {"group": "19_to_34", "raw_count": -1 }, - {"group": "35_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, - {"group": "65_and_older", "raw_count": -1 } - ] - - - -## Solano -### Cases and Deaths - age": [ - {"group": "0_to_18", "raw_count": -1 }, - {"group": "19_to_64", "raw_count": -1 }, - {"group": "65_and_older", "raw_count": -1 } - ] - - -## Napa -### Cases - age": [ - {"group": "0_to_17", "raw_count": -1 }, - {"group": "18_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, - {"group": "Over_64", "raw_count": -1 } - ] -### Deaths -Data broken down by gender is not available. - - diff --git a/README.md b/README.md index fe3c6de5..2c5cf024 100644 --- a/README.md +++ b/README.md @@ -9,4 +9,158 @@ To install this project, you can simply run `sh install.sh` in your terminal. Th To run the scraper, you can use the run script by typing `sh run_scraper.sh` into your terminal. This will enable the virtual environment and run `scraper.py`. Once again, the virtual environment will not stay active after the script finishes running. If you want to run the scraper without the run script, enable the virtual environment, then run `python3 scraper.py`. ## Running the API -The best way to run the API right now is to run the command `FLASK_APP="app.py" FLASK_ENV=development flask run;`. +The best way to run the API right now is to run the command `FLASK_APP="app.py" FLASK_ENV=development flask run;`. Note that this is not the best way to run the scraper at this time. + +## Data Model +The following sections document the differences between the counties in the common data model (see `data-model.json` and `sf_generic_cdm.js`) which we will see as we begin to get data from them. + +### Ages + +Please make sure to use the following age brackets for the different counties. Note that the brackets may also vary by whether you are scraping cases or deaths data: + + +#### San Francisco +##### Cases + "age": [ + {"group": "18_and_under", "raw_count": -1 }, + {"group": "18_to_30", "raw_count": -1 }, + {"group": "31_to_40", "raw_count": -1 }, + {"group": "41_to_50", "raw_count": -1 }, + {"group": "51_to_60", "raw_count": -1 }, + {"group": "61_to_70", "raw_count": -1 }, + {"group": "71_to_80", "raw_count": -1 }, + {"group": "81_and_older", "raw_count": -1} + ] +##### Deaths +Data broken down by gender is not available on the json files, only on the dashboard. + + +#### Alameda +##### Cases + "age": [ + {"group": "18_and_under", "raw_count": -1 }, + {"group": "18_to_30", "raw_count": -1 }, + {"group": "31_to_40", "raw_count": -1 }, + {"group": "41_to_50", "raw_count": -1 }, + {"group": "51_to_60", "raw_count": -1 }, + {"group": "61_to_70", "raw_count": -1 }, + {"group": "71_to_80", "raw_count": -1 }, + {"group": "81_and_older", "raw_count": -1 }, + {"group": "Unknown", "raw_count": -1 } + ] +##### Deaths +Data broken down by gender is not available. + + +#### Sonoma +##### Cases + "age": [ + {"group": "0_to_17", "raw_count": -1 }, + {"group": "18_to_49", "raw_count": -1 }, + {"group": "50_to_64", "raw_count": -1 }, + {"group": "65_and_older", "raw_count": -1 }, + {"group": "Unknown", "raw_count": -1 } + ] +##### Deaths +Data broken down by gender is not available. + + +#### Santa Clara +##### Cases + "age": [ + {"group": "20_and_under", "raw_count": -1 }, + {"group": "21_to_30", "raw_count": -1 }, + {"group": "31_to_40", "raw_count": -1 }, + {"group": "41_to_50", "raw_count": -1 }, + {"group": "51_to_60", "raw_count": -1 }, + {"group": "61_to_70", "raw_count": -1 }, + {"group": "71_to_80", "raw_count": -1 }, + {"group": "81_to_90", "raw_count": -1 }, + {"group": "90_and_older", "raw_count": -1 }, + {"group": "Unknown", "raw_count": -1 } + ] +##### Deaths + "age": [ + {"group": "20_and_under", "raw_count": -1 }, + {"group": "21_to_30", "raw_count": -1 }, + {"group": "31_to_40", "raw_count": -1 }, + {"group": "41_to_50", "raw_count": -1 }, + {"group": "51_to_60", "raw_count": -1 }, + {"group": "61_to_70", "raw_count": -1 }, + {"group": "71_to_80", "raw_count": -1 }, + {"group": "81_to_90", "raw_count": -1 }, + {"group": "90_and_older", "raw_count": -1 } + ] + + +#### San Mateo +##### Cases + "age": [ + {"group": "0_to_19", "raw_count": -1 }, + {"group": "20_to_29", "raw_count": -1 }, + {"group": "30_to_39", "raw_count": -1 }, + {"group": "40_to_49", "raw_count": -1 }, + {"group": "50_to_59", "raw_count": -1 }, + {"group": "60_to_69", "raw_count": -1 }, + {"group": "70_to_79", "raw_count": -1 }, + {"group": "80_to_89", "raw_count": -1 }, + {"group": "90_and_older", "raw_count": -1 } + ] +##### Deaths + age": [ + {"group": "0_to_19", "raw_count": -1 }, + {"group": "20_to_29", "raw_count": -1 }, + {"group": "30_to_39", "raw_count": -1 }, + {"group": "40_to_49", "raw_count": -1 }, + {"group": "50_to_59", "raw_count": -1 }, + {"group": "60_to_69", "raw_count": -1 }, + {"group": "70_to_79", "raw_count": -1 }, + {"group": "80_to_89", "raw_count": -1 }, + {"group": "90_and_older", "raw_count": -1 } + ] + + +#### Contra Costa +##### Cases + age": [ + {"group": "0_to_20", "raw_count": -1 }, + {"group": "21_to_40", "raw_count": -1 }, + {"group": "41_to_60", "raw_count": -1 }, + {"group": "61_to_80", "raw_count": -1 }, + {"group": "81_to_100", "raw_count": -1 } + ] +##### Deaths +Data broken down by gender is not available. + + +#### Marin +##### Cases and Deaths + age": [ + {"group": "0_to_18", "raw_count": -1 }, + {"group": "19_to_34", "raw_count": -1 }, + {"group": "35_to_49", "raw_count": -1 }, + {"group": "50_to_64", "raw_count": -1 }, + {"group": "65_and_older", "raw_count": -1 } + ] + + + +#### Solano +##### Cases and Deaths + age": [ + {"group": "0_to_18", "raw_count": -1 }, + {"group": "19_to_64", "raw_count": -1 }, + {"group": "65_and_older", "raw_count": -1 } + ] + + +#### Napa +##### Cases + age": [ + {"group": "0_to_17", "raw_count": -1 }, + {"group": "18_to_49", "raw_count": -1 }, + {"group": "50_to_64", "raw_count": -1 }, + {"group": "Over_64", "raw_count": -1 } + ] +##### Deaths +Data broken down by gender is not available. From 7db78f7fc3ef13a17ec4421909d08eef811cc704 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 2 May 2020 13:44:34 -0700 Subject: [PATCH 02/62] organization Move data models to own folder --- README.md | 2 +- .../all_age_brackets_template.js | 0 data-model.json => data_models/data_model.json | 0 sf_generic_cdm.js => data_models/sf_generic_cdm.js | 0 sf_generic_cdm.json => data_models/sf_generic_cdm.json | 0 5 files changed, 1 insertion(+), 1 deletion(-) rename all_age_brackets_template.js => data_models/all_age_brackets_template.js (100%) rename data-model.json => data_models/data_model.json (100%) rename sf_generic_cdm.js => data_models/sf_generic_cdm.js (100%) rename sf_generic_cdm.json => data_models/sf_generic_cdm.json (100%) diff --git a/README.md b/README.md index dbd45c4e..3ae35ebf 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ To run the scraper, you can use the run script by typing `sh run_scraper.sh` int The best way to run the API right now is to run the command `FLASK_APP="app.py" FLASK_ENV=development flask run;`. Note that this is not the best way to run the scraper at this time. ## Data Model -The following sections document the differences between the counties in the common data model (see `data-model.json` and `sf_generic_cdm.js`) which we will see as we begin to get data from them. +The following sections document the differences between the counties in the common data model (see `data_models` directory) which we will see as we begin to get data from them. ### Ages diff --git a/all_age_brackets_template.js b/data_models/all_age_brackets_template.js similarity index 100% rename from all_age_brackets_template.js rename to data_models/all_age_brackets_template.js diff --git a/data-model.json b/data_models/data_model.json similarity index 100% rename from data-model.json rename to data_models/data_model.json diff --git a/sf_generic_cdm.js b/data_models/sf_generic_cdm.js similarity index 100% rename from sf_generic_cdm.js rename to data_models/sf_generic_cdm.js diff --git a/sf_generic_cdm.json b/data_models/sf_generic_cdm.json similarity index 100% rename from sf_generic_cdm.json rename to data_models/sf_generic_cdm.json From 05dbee78b33c1de651a1862ceb66a340f63b2532 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 6 May 2020 18:43:27 -0700 Subject: [PATCH 03/62] organization Replace tabs with spaces --- README.md | 86 +++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 3ae35ebf..530f29be 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Please make sure to use the following age brackets for the different counties. N #### San Francisco ##### Cases - "age": [ - {"group": "18_and_under", "raw_count": -1 }, + "age": [ + {"group": "18_and_under", "raw_count": -1 }, {"group": "18_to_30", "raw_count": -1 }, {"group": "31_to_40", "raw_count": -1 }, {"group": "41_to_50", "raw_count": -1 }, @@ -37,9 +37,9 @@ Data broken down by gender is not available on the json files, only on the dashb #### Alameda ##### Cases - "age": [ - {"group": "18_and_under", "raw_count": -1 }, - {"group": "18_to_30", "raw_count": -1 }, + "age": [ + {"group": "18_and_under", "raw_count": -1 }, + {"group": "18_to_30", "raw_count": -1 }, {"group": "31_to_40", "raw_count": -1 }, {"group": "41_to_50", "raw_count": -1 }, {"group": "51_to_60", "raw_count": -1 }, @@ -54,9 +54,9 @@ Data broken down by gender is not available. #### Sonoma ##### Cases - "age": [ - {"group": "0_to_17", "raw_count": -1 }, - {"group": "18_to_49", "raw_count": -1 }, + "age": [ + {"group": "0_to_17", "raw_count": -1 }, + {"group": "18_to_49", "raw_count": -1 }, {"group": "50_to_64", "raw_count": -1 }, {"group": "65_and_older", "raw_count": -1 }, {"group": "Unknown", "raw_count": -1 } @@ -67,10 +67,10 @@ Data broken down by gender is not available. #### Santa Clara ##### Cases - "age": [ - {"group": "20_and_under", "raw_count": -1 }, - {"group": "21_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, + "age": [ + {"group": "20_and_under", "raw_count": -1 }, + {"group": "21_to_30", "raw_count": -1 }, + {"group": "31_to_40", "raw_count": -1 }, {"group": "41_to_50", "raw_count": -1 }, {"group": "51_to_60", "raw_count": -1 }, {"group": "61_to_70", "raw_count": -1 }, @@ -80,10 +80,10 @@ Data broken down by gender is not available. {"group": "Unknown", "raw_count": -1 } ] ##### Deaths - "age": [ - {"group": "20_and_under", "raw_count": -1 }, - {"group": "21_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, + "age": [ + {"group": "20_and_under", "raw_count": -1 }, + {"group": "21_to_30", "raw_count": -1 }, + {"group": "31_to_40", "raw_count": -1 }, {"group": "41_to_50", "raw_count": -1 }, {"group": "51_to_60", "raw_count": -1 }, {"group": "61_to_70", "raw_count": -1 }, @@ -95,10 +95,10 @@ Data broken down by gender is not available. #### San Mateo ##### Cases - "age": [ - {"group": "0_to_19", "raw_count": -1 }, - {"group": "20_to_29", "raw_count": -1 }, - {"group": "30_to_39", "raw_count": -1 }, + "age": [ + {"group": "0_to_19", "raw_count": -1 }, + {"group": "20_to_29", "raw_count": -1 }, + {"group": "30_to_39", "raw_count": -1 }, {"group": "40_to_49", "raw_count": -1 }, {"group": "50_to_59", "raw_count": -1 }, {"group": "60_to_69", "raw_count": -1 }, @@ -107,10 +107,10 @@ Data broken down by gender is not available. {"group": "90_and_older", "raw_count": -1 } ] ##### Deaths - age": [ - {"group": "0_to_19", "raw_count": -1 }, - {"group": "20_to_29", "raw_count": -1 }, - {"group": "30_to_39", "raw_count": -1 }, + age": [ + {"group": "0_to_19", "raw_count": -1 }, + {"group": "20_to_29", "raw_count": -1 }, + {"group": "30_to_39", "raw_count": -1 }, {"group": "40_to_49", "raw_count": -1 }, {"group": "50_to_59", "raw_count": -1 }, {"group": "60_to_69", "raw_count": -1 }, @@ -122,45 +122,45 @@ Data broken down by gender is not available. #### Contra Costa ##### Cases - age": [ - {"group": "0_to_20", "raw_count": -1 }, - {"group": "21_to_40", "raw_count": -1 }, - {"group": "41_to_60", "raw_count": -1 }, + age": [ + {"group": "0_to_20", "raw_count": -1 }, + {"group": "21_to_40", "raw_count": -1 }, + {"group": "41_to_60", "raw_count": -1 }, {"group": "61_to_80", "raw_count": -1 }, {"group": "81_to_100", "raw_count": -1 } - ] + ] ##### Deaths Data broken down by gender is not available. #### Marin ##### Cases and Deaths - age": [ - {"group": "0_to_18", "raw_count": -1 }, - {"group": "19_to_34", "raw_count": -1 }, - {"group": "35_to_49", "raw_count": -1 }, + age": [ + {"group": "0_to_18", "raw_count": -1 }, + {"group": "19_to_34", "raw_count": -1 }, + {"group": "35_to_49", "raw_count": -1 }, {"group": "50_to_64", "raw_count": -1 }, {"group": "65_and_older", "raw_count": -1 } - ] + ] #### Solano ##### Cases and Deaths - age": [ - {"group": "0_to_18", "raw_count": -1 }, - {"group": "19_to_64", "raw_count": -1 }, + age": [ + {"group": "0_to_18", "raw_count": -1 }, + {"group": "19_to_64", "raw_count": -1 }, {"group": "65_and_older", "raw_count": -1 } - ] + ] #### Napa ##### Cases - age": [ - {"group": "0_to_17", "raw_count": -1 }, - {"group": "18_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, + age": [ + {"group": "0_to_17", "raw_count": -1 }, + {"group": "18_to_49", "raw_count": -1 }, + {"group": "50_to_64", "raw_count": -1 }, {"group": "Over_64", "raw_count": -1 } - ] + ] ##### Deaths Data broken down by gender is not available. From 998800bb7e6b5013b9bb159d2abda9a50359d8ff Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 6 May 2020 19:50:16 -0700 Subject: [PATCH 04/62] sonoma Get top level metadata --- sonoma_scraper.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 sonoma_scraper.py diff --git a/sonoma_scraper.py b/sonoma_scraper.py new file mode 100644 index 00000000..89630d3e --- /dev/null +++ b/sonoma_scraper.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +import requests +import json +from datetime import datetime +from typing import Dict +from bs4 import BeautifulSoup + +url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' +page = requests.get(url) +soup = BeautifulSoup(page.content, 'html.parser') +tables = soup.findAll('table') + +def generate_update_time(soup): + update_time_text = soup.find('time').text.strip() + # format is May 6, 2020 10:00 AM + update_datetime = datetime.strptime(update_time_text, '%B %d, %Y %I:%M %p') + return update_datetime.isoformat() + + +model = { + 'name': 'Sonoma County', + 'update_time': generate_update_time(soup), + 'source': url, + +} + +# for i in range(len(tables)): +# if i >= 4: # we don't need the first three tables +# table = tables[i] +# print('\n\n') +# print(table.findAll('tr')) From ed855bd7f8ec77f57b73aa1c6f99d57ef4f654d9 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 10 May 2020 13:58:32 -0700 Subject: [PATCH 05/62] sonoma Move scraper and collect metadata --- data_scrapers/sonoma_county.py | 39 ++++++++++++++++++++++++++++++++++ sonoma_scraper.py | 31 --------------------------- 2 files changed, 39 insertions(+), 31 deletions(-) create mode 100644 data_scrapers/sonoma_county.py delete mode 100644 sonoma_scraper.py diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py new file mode 100644 index 00000000..8f0c3344 --- /dev/null +++ b/data_scrapers/sonoma_county.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +import requests +import json +from datetime import datetime +from typing import List, Dict +from bs4 import BeautifulSoup, element + +url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' +page = requests.get(url) +sonoma_soup = BeautifulSoup(page.content, 'html.parser') +tables = sonoma_soup.findAll('table')[4:] # we don't need the first three tables + +def generate_update_time(soup: BeautifulSoup) -> str: + update_time_text = soup.find('time').text.strip() + # format is May 6, 2020 10:00 AM + update_datetime = datetime.strptime(update_time_text, '%B %d, %Y %I:%M %p') + return update_datetime.isoformat() + +def get_source_meta(soup: BeautifulSoup) -> str: + h3_tags = soup.findAll('h3') + definitions_header = None + for el in h3_tags: + if el.text == 'Definitions': + definitions_header = el + if definitions_header == None: + raise FutureWarning('The webpage has changed and the source metadata has moved -- please look at the Sonoma County webpage and locate it, then update the scraper with this information') + definitions_text = definitions_header.find_parent().text + return definitions_text + +model = { + 'name': 'Sonoma County', + 'update_time': generate_update_time(sonoma_soup), + 'source': url, + 'meta_from_source': get_source_meta(sonoma_soup) +} + +# cases, source, tests, age, sex, region, hospitalized = tables +# transform_cases(cases) +print(get_source_meta(sonoma_soup)) diff --git a/sonoma_scraper.py b/sonoma_scraper.py deleted file mode 100644 index 89630d3e..00000000 --- a/sonoma_scraper.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 -import requests -import json -from datetime import datetime -from typing import Dict -from bs4 import BeautifulSoup - -url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' -page = requests.get(url) -soup = BeautifulSoup(page.content, 'html.parser') -tables = soup.findAll('table') - -def generate_update_time(soup): - update_time_text = soup.find('time').text.strip() - # format is May 6, 2020 10:00 AM - update_datetime = datetime.strptime(update_time_text, '%B %d, %Y %I:%M %p') - return update_datetime.isoformat() - - -model = { - 'name': 'Sonoma County', - 'update_time': generate_update_time(soup), - 'source': url, - -} - -# for i in range(len(tables)): -# if i >= 4: # we don't need the first three tables -# table = tables[i] -# print('\n\n') -# print(table.findAll('tr')) From fdab8a43566f0c02d0dc3733f7120902078554d7 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 10 May 2020 15:44:18 -0700 Subject: [PATCH 06/62] sonoma Add transmission types --- data_scrapers/sonoma_county.py | 47 ++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 8f0c3344..b9944197 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -23,17 +23,54 @@ def get_source_meta(soup: BeautifulSoup) -> str: if el.text == 'Definitions': definitions_header = el if definitions_header == None: - raise FutureWarning('The webpage has changed and the source metadata has moved -- please look at the Sonoma County webpage and locate it, then update the scraper with this information') + raise FutureWarning('The source metadata has moved -- please look at the Sonoma County webpage and locate it, then update the scraper with this information') definitions_text = definitions_header.find_parent().text return definitions_text -model = { +# def transform_cases(cases_tag: element.Tag) -> List[Dict]: +# cases = [] +# cumul_cases = 0 +# deaths = [] +# cumul_deaths = 0 +# recovered = [] +# cumul_recovered = 0 +# rows = cases_tag.findAll('tr')[1:] +# for row in rows: +# row_cells = row.findAll(['th', 'td']) +# date = row_cells[0].text.replace('/', '-') +# infected, new_infected, dead, recoveries = [int(el.text) for el in row_cells[1:]] +# print(infected) +# cumul_cases += new_infected +# cases.append({ 'date': date, 'cases': infected, 'cumul_cases': cumul_cases}) + +def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: + transmissions = {} + rows = transmission_tag.findAll('tr')[1:] + # turns the transmission categories on the page into the ones we're using + transmission_type_conversion = {'Community': 'community', 'Close Contact': 'from_contact', 'Travel': 'travel', 'Under Investigation': 'unknown'} + for row in rows: + row_cells = row.findAll(['th', 'td']) + type, number, _pct = [el.text for el in row_cells] + if type not in transmission_type_conversion: + raise FutureWarning('The transmission type {0} was not found in transmission_type_conversion'.format(type)) + type = transmission_type_conversion[type] + transmissions[type] = int(number) + return transmissions + + model = { 'name': 'Sonoma County', 'update_time': generate_update_time(sonoma_soup), 'source': url, - 'meta_from_source': get_source_meta(sonoma_soup) + 'meta_from_source': get_source_meta(sonoma_soup), + 'meta_from_baypd': '', + 'series': {}, + 'case_totals': { + 'transmission_cat': transform_transmission(source) + } } -# cases, source, tests, age, sex, region, hospitalized = tables +try: + cases, source, tests, age, sex, region, regions, hospitalized, underlying, symptoms = tables +except ValueError as e: + raise FutureWarning('The number of values on the page has changed -- please ') # transform_cases(cases) -print(get_source_meta(sonoma_soup)) From 8bd208121932684f054cea8a02ceff855305df1c Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Mon, 11 May 2020 21:29:46 -0700 Subject: [PATCH 07/62] sonoma Get cases, active, recovered, and death series --- data_scrapers/sonoma_county.py | 64 ++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index b9944197..68aa3d81 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -27,21 +27,36 @@ def get_source_meta(soup: BeautifulSoup) -> str: definitions_text = definitions_header.find_parent().text return definitions_text -# def transform_cases(cases_tag: element.Tag) -> List[Dict]: -# cases = [] -# cumul_cases = 0 -# deaths = [] -# cumul_deaths = 0 -# recovered = [] -# cumul_recovered = 0 -# rows = cases_tag.findAll('tr')[1:] -# for row in rows: -# row_cells = row.findAll(['th', 'td']) -# date = row_cells[0].text.replace('/', '-') -# infected, new_infected, dead, recoveries = [int(el.text) for el in row_cells[1:]] -# print(infected) -# cumul_cases += new_infected -# cases.append({ 'date': date, 'cases': infected, 'cumul_cases': cumul_cases}) +def transform_cases(cases_tag: element.Tag) -> List[Dict]: + cases = [] + cumul_cases = 0 + deaths = [] + cumul_deaths = 0 + recovered = [] + cumul_recovered = 0 + active = [] + cumul_active = 0 + rows = cases_tag.findAll('tr')[1:] + for row in rows: + row_cells = row.findAll(['th', 'td']) + date = row_cells[0].text.replace('/', '-') + + # instead of 0, this dashboard reports the string '-' + active_cases, new_infected, dead, recoveries = [0 if el.text == '–' else int(el.text) for el in row_cells[1:]] + + cumul_cases += new_infected + cases.append({ 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases }) + + new_deaths = dead - cumul_deaths + deaths.append({ 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead }) + + new_recovered = recoveries - cumul_recovered + recovered.append({ 'date': date, 'recovered': new_recovered, 'cumul_recovered': recoveries }) + + new_active = active_cases - cumul_active + active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) + + return { 'cases': cases, 'deaths': deaths, 'recovered': recovered, 'active': active } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: transmissions = {} @@ -57,7 +72,16 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: transmissions[type] = int(number) return transmissions - model = { + + +try: + cases, source, tests, age, sex, region, regions, hospitalized, underlying, symptoms = tables +except ValueError as e: + raise FutureWarning('The number of values on the page has changed -- please adjust the page') + +base_series = transform_cases(cases) + +model = { 'name': 'Sonoma County', 'update_time': generate_update_time(sonoma_soup), 'source': url, @@ -65,12 +89,8 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: 'meta_from_baypd': '', 'series': {}, 'case_totals': { - 'transmission_cat': transform_transmission(source) + 'transmission_cat': transform_transmission(source) } } -try: - cases, source, tests, age, sex, region, regions, hospitalized, underlying, symptoms = tables -except ValueError as e: - raise FutureWarning('The number of values on the page has changed -- please ') -# transform_cases(cases) +print(base_series) From bd72db81718d3c77cc5ca80ed59567a970f4279f Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Mon, 11 May 2020 22:03:41 -0700 Subject: [PATCH 08/62] sonoma Get case data by age --- data_scrapers/sonoma_county.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 68aa3d81..2d6513ff 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -72,25 +72,31 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: transmissions[type] = int(number) return transmissions - +def transform_age(age_tag: element.Tag) -> Dict[str, int]: + age_brackets = {} + rows = age_tag.findAll('tr')[1:] + for row in rows: + row_cells = row.findAll(['th', 'td']) + bracket, cases, _pct = [el.text for el in row_cells] + age_brackets[bracket] = int(cases) + return age_brackets try: cases, source, tests, age, sex, region, regions, hospitalized, underlying, symptoms = tables except ValueError as e: raise FutureWarning('The number of values on the page has changed -- please adjust the page') -base_series = transform_cases(cases) - model = { 'name': 'Sonoma County', 'update_time': generate_update_time(sonoma_soup), 'source': url, 'meta_from_source': get_source_meta(sonoma_soup), 'meta_from_baypd': '', - 'series': {}, + 'series': transform_cases(cases), 'case_totals': { - 'transmission_cat': transform_transmission(source) + 'transmission_cat': transform_transmission(source), + 'age_group': transform_age(age) } } -print(base_series) +print(model) From ee5a8b7ce8c53e5dea92d4c6024e0cb47358f5f0 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 10:26:05 -0700 Subject: [PATCH 09/62] sonoma Fix table numbers --- data_scrapers/sonoma_county.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 2d6513ff..482256fa 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -2,7 +2,7 @@ import requests import json from datetime import datetime -from typing import List, Dict +from typing import List, Dict, Union from bs4 import BeautifulSoup, element url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' @@ -27,7 +27,8 @@ def get_source_meta(soup: BeautifulSoup) -> str: definitions_text = definitions_header.find_parent().text return definitions_text -def transform_cases(cases_tag: element.Tag) -> List[Dict]: +# apologies for this horror of a output type +def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: cases = [] cumul_cases = 0 deaths = [] @@ -56,6 +57,7 @@ def transform_cases(cases_tag: element.Tag) -> List[Dict]: new_active = active_cases - cumul_active active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) + # print(deaths) return { 'cases': cases, 'deaths': deaths, 'recovered': recovered, 'active': active } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: @@ -82,7 +84,7 @@ def transform_age(age_tag: element.Tag) -> Dict[str, int]: return age_brackets try: - cases, source, tests, age, sex, region, regions, hospitalized, underlying, symptoms = tables + hist_cases, cases_by_source, cases_by_race, tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables except ValueError as e: raise FutureWarning('The number of values on the page has changed -- please adjust the page') @@ -92,11 +94,11 @@ def transform_age(age_tag: element.Tag) -> Dict[str, int]: 'source': url, 'meta_from_source': get_source_meta(sonoma_soup), 'meta_from_baypd': '', - 'series': transform_cases(cases), + 'series': transform_cases(hist_cases), 'case_totals': { - 'transmission_cat': transform_transmission(source), - 'age_group': transform_age(age) + 'transmission_cat': transform_transmission(cases_by_source), + 'age_group': transform_age(cases_by_age) } } -print(model) +# print(model) From 7745e5b3a34238bc2c66fa2a837e1a0b61b37c27 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 10:39:48 -0700 Subject: [PATCH 10/62] sonoma Add test getter --- data_scrapers/sonoma_county.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 482256fa..fba6d5d3 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -74,6 +74,17 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: transmissions[type] = int(number) return transmissions +def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: + tests = {} + rows = tests_tag.findAll('tr')[1:] + for row in rows: + row_cells = row.findAll(['th', 'td']) + result, number, _pct = [el.text for el in row_cells] + lower_res = result.lower() + tests[lower_res] = int(number.replace(',', '')) + print(tests) + return tests; + def transform_age(age_tag: element.Tag) -> Dict[str, int]: age_brackets = {} rows = age_tag.findAll('tr')[1:] @@ -84,7 +95,7 @@ def transform_age(age_tag: element.Tag) -> Dict[str, int]: return age_brackets try: - hist_cases, cases_by_source, cases_by_race, tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables + hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables except ValueError as e: raise FutureWarning('The number of values on the page has changed -- please adjust the page') @@ -98,6 +109,9 @@ def transform_age(age_tag: element.Tag) -> Dict[str, int]: 'case_totals': { 'transmission_cat': transform_transmission(cases_by_source), 'age_group': transform_age(cases_by_age) + }, + 'tests_totals': { + 'tests': transform_tests(total_tests), } } From e7ab26f28b56dd6b5359e93a425c0aaa1c8c874f Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 10:52:25 -0700 Subject: [PATCH 11/62] sonoma Factor out some common code --- data_scrapers/sonoma_county.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index fba6d5d3..315f0bd4 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -10,6 +10,18 @@ sonoma_soup = BeautifulSoup(page.content, 'html.parser') tables = sonoma_soup.findAll('table')[4:] # we don't need the first three tables +def get_rows(tag: element.Tag) -> List[element.ResultSet]: + ''' + Gets all tr elements in a tag but the first, which is the header + ''' + return tag.findAll('tr')[1:] + +def get_cells(row: List[element.ResultSet]) -> List[str]: + ''' + Gets all th and tr elements within a single tr element + ''' + return [el.text for el in row.findAll(['th', 'td'])] + def generate_update_time(soup: BeautifulSoup) -> str: update_time_text = soup.find('time').text.strip() # format is May 6, 2020 10:00 AM @@ -37,9 +49,10 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st cumul_recovered = 0 active = [] cumul_active = 0 - rows = cases_tag.findAll('tr')[1:] + rows = get_rows(cases_tag) for row in rows: row_cells = row.findAll(['th', 'td']) + # print(type(row_cells)) date = row_cells[0].text.replace('/', '-') # instead of 0, this dashboard reports the string '-' @@ -62,12 +75,11 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: transmissions = {} - rows = transmission_tag.findAll('tr')[1:] + rows = get_rows(transmission_tag) # turns the transmission categories on the page into the ones we're using transmission_type_conversion = {'Community': 'community', 'Close Contact': 'from_contact', 'Travel': 'travel', 'Under Investigation': 'unknown'} for row in rows: - row_cells = row.findAll(['th', 'td']) - type, number, _pct = [el.text for el in row_cells] + type, number, _pct = get_cells(row) if type not in transmission_type_conversion: raise FutureWarning('The transmission type {0} was not found in transmission_type_conversion'.format(type)) type = transmission_type_conversion[type] @@ -76,21 +88,18 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: tests = {} - rows = tests_tag.findAll('tr')[1:] + rows = get_rows(tests_tag) for row in rows: - row_cells = row.findAll(['th', 'td']) - result, number, _pct = [el.text for el in row_cells] + result, number, _pct = get_cells(row) lower_res = result.lower() tests[lower_res] = int(number.replace(',', '')) - print(tests) return tests; def transform_age(age_tag: element.Tag) -> Dict[str, int]: age_brackets = {} - rows = age_tag.findAll('tr')[1:] + rows = get_rows(age_tag) for row in rows: - row_cells = row.findAll(['th', 'td']) - bracket, cases, _pct = [el.text for el in row_cells] + bracket, cases, _pct = get_cells(row) age_brackets[bracket] = int(cases) return age_brackets From dc9b9fec8f38e833cc22884e4b924670ffc8aa73 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 11:36:21 -0700 Subject: [PATCH 12/62] sonoma Add cases by race --- data_scrapers/sonoma_county.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 315f0bd4..4f9ed6bb 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import requests import json +import re from datetime import datetime from typing import List, Dict, Union from bs4 import BeautifulSoup, element @@ -103,21 +104,44 @@ def transform_age(age_tag: element.Tag) -> Dict[str, int]: age_brackets[bracket] = int(cases) return age_brackets +def get_unknown_race(race_eth_tag: element.Tag) -> int: + parent = race_eth_tag.parent + note = parent.find('p').text + matches = re.search('(\d+) \(\d{1,3}%\) missing race/ethnicity', note) + if not matches: + raise FutureWarning('The format of the note with unknown race data has changed') + return(int(matches.groups()[0])) + +def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: + race_cases = {} + race_transform = {'Asian/Pacific Islander, non-Hispanic': 'Asian', 'Hispanic/Latino': 'Latinx_or_Hispanic', 'Other*, non-Hispanic': 'Other', 'White, non-Hispanic': 'White'} + rows = get_rows(race_eth_tag) + for row in rows: + group_name, cases, _pct = get_cells(row) + if group_name not in race_transform: + raise FutureWarning('The racial group {0} is new in the data -- please adjust the scraper accordingly') + internal_name = race_transform[group_name] + race_cases[internal_name] = int(cases) + race_cases['Unknown'] = get_unknown_race(race_eth_tag) + return race_cases + try: + # we have a lot more data here than we are using hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables except ValueError as e: - raise FutureWarning('The number of values on the page has changed -- please adjust the page') + raise FutureWarning('The number of values on the page has changed -- please adjust the scraper') model = { 'name': 'Sonoma County', 'update_time': generate_update_time(sonoma_soup), 'source': url, 'meta_from_source': get_source_meta(sonoma_soup), - 'meta_from_baypd': '', + 'meta_from_baypd': 'Racial "Other" category includes "Black/African American, American Indian/Alaska Native, and Other"', 'series': transform_cases(hist_cases), 'case_totals': { 'transmission_cat': transform_transmission(cases_by_source), - 'age_group': transform_age(cases_by_age) + 'age_group': transform_age(cases_by_age), + 'race_eth': transform_race_eth(cases_by_race) }, 'tests_totals': { 'tests': transform_tests(total_tests), From af8bfe2ac87d436bd4e5cee6151c6a176be1bd28 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 18:16:10 -0700 Subject: [PATCH 13/62] sonoma Add hospitalizations --- data_scrapers/sonoma_county.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 4f9ed6bb..f685753f 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -4,7 +4,7 @@ import re from datetime import datetime from typing import List, Dict, Union -from bs4 import BeautifulSoup, element +from bs4 import BeautifulSoup, element # type: ignore url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' page = requests.get(url) @@ -17,7 +17,7 @@ def get_rows(tag: element.Tag) -> List[element.ResultSet]: ''' return tag.findAll('tr')[1:] -def get_cells(row: List[element.ResultSet]) -> List[str]: +def get_cells(row: element.ResultSet) -> List[str]: ''' Gets all th and tr elements within a single tr element ''' @@ -32,6 +32,7 @@ def generate_update_time(soup: BeautifulSoup) -> str: def get_source_meta(soup: BeautifulSoup) -> str: h3_tags = soup.findAll('h3') definitions_header = None + # can't use for el in h3_tags: if el.text == 'Definitions': definitions_header = el @@ -125,6 +126,17 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: race_cases['Unknown'] = get_unknown_race(race_eth_tag) return race_cases +def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: + hospitalizations = {} + rows = get_rows(hospital_tag) + for row in rows: + hospitalized, number, _pct = get_cells(row) + if hospitalized == 'Yes': + hospitalizations['hospitalized'] = int(number) + else: + hospitalizations['not_hospitalized'] = int(number) + return hospitalizations + try: # we have a lot more data here than we are using hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables @@ -145,7 +157,11 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: }, 'tests_totals': { 'tests': transform_tests(total_tests), + }, + 'hospitalizations': { + 'hospitalized_cases': transform_total_hospitalizations(hospitalized) } } -# print(model) +if __name__ == '__main__': + print(json.dumps(model, indent=4)) From adbe41995a9e63bb9b4a54bfff5793e4692a0627 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 18:38:49 -0700 Subject: [PATCH 14/62] sonoma Add hospitalizations by gender --- data_scrapers/sonoma_county.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index f685753f..120e2239 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -97,13 +97,16 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: tests[lower_res] = int(number.replace(',', '')) return tests; -def transform_age(age_tag: element.Tag) -> Dict[str, int]: - age_brackets = {} - rows = get_rows(age_tag) +def generic_transform(tag: element.Tag) -> Dict[str, int]: + ''' + Transform function for tables which don't require any special processing + ''' + categories = {} + rows = get_rows(tag) for row in rows: - bracket, cases, _pct = get_cells(row) - age_brackets[bracket] = int(cases) - return age_brackets + cat, cases, _pct = get_cells(row) + categories[cat] = int(cases) + return categories def get_unknown_race(race_eth_tag: element.Tag) -> int: parent = race_eth_tag.parent @@ -137,6 +140,14 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int hospitalizations['not_hospitalized'] = int(number) return hospitalizations +def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: + hospitalized = {} + rows = get_rows(hospital_tag) + for row in rows: + gender, yes, no = get_cells(row) + hospitalized[gender] = yes + return hospitalized + try: # we have a lot more data here than we are using hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables @@ -152,14 +163,16 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int 'series': transform_cases(hist_cases), 'case_totals': { 'transmission_cat': transform_transmission(cases_by_source), - 'age_group': transform_age(cases_by_age), - 'race_eth': transform_race_eth(cases_by_race) + 'age_group': generic_transform(cases_by_age), + 'race_eth': transform_race_eth(cases_by_race), + 'gender': generic_transform(cases_by_gender) }, 'tests_totals': { 'tests': transform_tests(total_tests), }, 'hospitalizations': { - 'hospitalized_cases': transform_total_hospitalizations(hospitalized) + 'hospitalized_cases': transform_total_hospitalizations(hospitalized), + 'gender': transform_gender_hospitalizations(hospitalized_by_gender) } } From 6b71193620b1155533d5b87fabe7d84300146146 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 18:44:16 -0700 Subject: [PATCH 15/62] sonoma Fix type error --- data_scrapers/sonoma_county.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 120e2239..47b6b048 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -32,7 +32,6 @@ def generate_update_time(soup: BeautifulSoup) -> str: def get_source_meta(soup: BeautifulSoup) -> str: h3_tags = soup.findAll('h3') definitions_header = None - # can't use for el in h3_tags: if el.text == 'Definitions': definitions_header = el @@ -140,7 +139,7 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int hospitalizations['not_hospitalized'] = int(number) return hospitalizations -def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: +def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, str]: hospitalized = {} rows = get_rows(hospital_tag) for row in rows: From 627e82a76c3995c7d0d5865b4f2ea8eaf254f09b Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 18:51:55 -0700 Subject: [PATCH 16/62] sonoma Redo definitions getter --- data_scrapers/sonoma_county.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 47b6b048..f1b83883 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -30,15 +30,9 @@ def generate_update_time(soup: BeautifulSoup) -> str: return update_datetime.isoformat() def get_source_meta(soup: BeautifulSoup) -> str: - h3_tags = soup.findAll('h3') - definitions_header = None - for el in h3_tags: - if el.text == 'Definitions': - definitions_header = el - if definitions_header == None: - raise FutureWarning('The source metadata has moved -- please look at the Sonoma County webpage and locate it, then update the scraper with this information') + definitions_header = soup.find('h3', string='Definitions') definitions_text = definitions_header.find_parent().text - return definitions_text + return definitions_text.replace('\n', ' ') # apologies for this horror of a output type def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: From a565a8398f248f64e557b7d1a4c5a8632cb75ee7 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 18:56:38 -0700 Subject: [PATCH 17/62] sonoma Add get_county function --- data_scrapers/sonoma_county.py | 66 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index f1b83883..8c46d08f 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -6,11 +6,6 @@ from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore -url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' -page = requests.get(url) -sonoma_soup = BeautifulSoup(page.content, 'html.parser') -tables = sonoma_soup.findAll('table')[4:] # we don't need the first three tables - def get_rows(tag: element.Tag) -> List[element.ResultSet]: ''' Gets all tr elements in a tag but the first, which is the header @@ -141,33 +136,40 @@ def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, st hospitalized[gender] = yes return hospitalized -try: - # we have a lot more data here than we are using - hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables -except ValueError as e: - raise FutureWarning('The number of values on the page has changed -- please adjust the scraper') - -model = { - 'name': 'Sonoma County', - 'update_time': generate_update_time(sonoma_soup), - 'source': url, - 'meta_from_source': get_source_meta(sonoma_soup), - 'meta_from_baypd': 'Racial "Other" category includes "Black/African American, American Indian/Alaska Native, and Other"', - 'series': transform_cases(hist_cases), - 'case_totals': { - 'transmission_cat': transform_transmission(cases_by_source), - 'age_group': generic_transform(cases_by_age), - 'race_eth': transform_race_eth(cases_by_race), - 'gender': generic_transform(cases_by_gender) - }, - 'tests_totals': { - 'tests': transform_tests(total_tests), - }, - 'hospitalizations': { - 'hospitalized_cases': transform_total_hospitalizations(hospitalized), - 'gender': transform_gender_hospitalizations(hospitalized_by_gender) +def get_county(): + url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' + page = requests.get(url) + sonoma_soup = BeautifulSoup(page.content, 'html.parser') + tables = sonoma_soup.findAll('table')[4:] # we don't need the first three tables + + try: + # we have a lot more data here than we are using + hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables + except ValueError as e: + raise FutureWarning('The number of values on the page has changed -- please adjust the scraper') + + model = { + 'name': 'Sonoma County', + 'update_time': generate_update_time(sonoma_soup), + 'source': url, + 'meta_from_source': get_source_meta(sonoma_soup), + 'meta_from_baypd': 'Racial "Other" category includes "Black/African American, American Indian/Alaska Native, and Other"', + 'series': transform_cases(hist_cases), + 'case_totals': { + 'transmission_cat': transform_transmission(cases_by_source), + 'age_group': generic_transform(cases_by_age), + 'race_eth': transform_race_eth(cases_by_race), + 'gender': generic_transform(cases_by_gender) + }, + 'tests_totals': { + 'tests': transform_tests(total_tests), + }, + 'hospitalizations': { + 'hospitalized_cases': transform_total_hospitalizations(hospitalized), + 'gender': transform_gender_hospitalizations(hospitalized_by_gender) + } } -} + return model if __name__ == '__main__': - print(json.dumps(model, indent=4)) + print(json.dumps(get_county(), indent=4)) From 358a4419cab5b3a708b9294a92e36b3fa4770c82 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 19:30:11 -0700 Subject: [PATCH 18/62] sonoma Add docstrings --- data_scrapers/sonoma_county.py | 58 ++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 8c46d08f..e11cf854 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -7,30 +7,43 @@ from bs4 import BeautifulSoup, element # type: ignore def get_rows(tag: element.Tag) -> List[element.ResultSet]: - ''' + """ Gets all tr elements in a tag but the first, which is the header - ''' + """ return tag.findAll('tr')[1:] def get_cells(row: element.ResultSet) -> List[str]: - ''' + """ Gets all th and tr elements within a single tr element - ''' + """ return [el.text for el in row.findAll(['th', 'td'])] def generate_update_time(soup: BeautifulSoup) -> str: + """ + Generates a timestamp string (e.g. May 6, 2020 10:00 AM) for when the scraper is run + """ update_time_text = soup.find('time').text.strip() - # format is May 6, 2020 10:00 AM update_datetime = datetime.strptime(update_time_text, '%B %d, %Y %I:%M %p') return update_datetime.isoformat() def get_source_meta(soup: BeautifulSoup) -> str: + """ + Finds the 'Definitions' header on the page and gets all of the text in it + """ definitions_header = soup.find('h3', string='Definitions') definitions_text = definitions_header.find_parent().text return definitions_text.replace('\n', ' ') # apologies for this horror of a output type def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: + """ + Takes in a BeautifulSoup tag for the cases table and returns all cases + (historic and active), deaths, and recoveries in the form: + { 'cases': [], 'deaths': [], 'recovered': [], 'active': [] } + Where each list contains dictionaries (representing each day's data) + of form (example for cases): + { 'date': '', 'cases': -1, 'cumul_cases': -1 } + """ cases = [] cumul_cases = 0 deaths = [] @@ -60,10 +73,14 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st new_active = active_cases - cumul_active active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) - # print(deaths) return { 'cases': cases, 'deaths': deaths, 'recovered': recovered, 'active': active } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: + """ + Takes in a BeautifulSoup tag for the transmissions table and breaks it into + a dictionary of type: + {'community': -1, 'from_contact': -1, 'travel': -1, 'unknown': -1} + """ transmissions = {} rows = get_rows(transmission_tag) # turns the transmission categories on the page into the ones we're using @@ -86,9 +103,11 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: return tests; def generic_transform(tag: element.Tag) -> Dict[str, int]: - ''' - Transform function for tables which don't require any special processing - ''' + """ + Transform function for tables which don't require any special processing. + Takes in a BeautifulSoup tag for a table and returns a dictionary + in which the keys are strings and the values integers + """ categories = {} rows = get_rows(tag) for row in rows: @@ -97,6 +116,10 @@ def generic_transform(tag: element.Tag) -> Dict[str, int]: return categories def get_unknown_race(race_eth_tag: element.Tag) -> int: + """ + Gets the notes under the 'Cases by race and ethnicity' table to find the + number of cases where the person's race is unknown + """ parent = race_eth_tag.parent note = parent.find('p').text matches = re.search('(\d+) \(\d{1,3}%\) missing race/ethnicity', note) @@ -105,6 +128,12 @@ def get_unknown_race(race_eth_tag: element.Tag) -> int: return(int(matches.groups()[0])) def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: + """ + Takes in the BeautifulSoup tag for the cases by race/ethnicity table and + transforms it into an object of form: + 'race_eth': {'Asian': -1, 'Latinx_or_Hispanic': -1, 'Other': -1, 'White':-1, 'Unknown': -1} + NB: These are the only races reported seperatley by Sonoma county at this time + """ race_cases = {} race_transform = {'Asian/Pacific Islander, non-Hispanic': 'Asian', 'Hispanic/Latino': 'Latinx_or_Hispanic', 'Other*, non-Hispanic': 'Other', 'White, non-Hispanic': 'White'} rows = get_rows(race_eth_tag) @@ -118,6 +147,11 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: return race_cases def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: + """ + Takes in a BeautifulSoup tag of the cases by hospitalization table and + returns a dictionary with the numbers of hospitalized and non-hospitalized + cases + """ hospitalizations = {} rows = get_rows(hospital_tag) for row in rows: @@ -129,6 +163,9 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int return hospitalizations def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, str]: + """ + + """ hospitalized = {} rows = get_rows(hospital_tag) for row in rows: @@ -136,7 +173,8 @@ def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, st hospitalized[gender] = yes return hospitalized -def get_county(): +def get_county() -> Dict: + """Main method for populating county data .json""" url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' page = requests.get(url) sonoma_soup = BeautifulSoup(page.content, 'html.parser') From 7dc3beb59c0a9345e268d116916ed6f2a8a9785a Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 19:32:57 -0700 Subject: [PATCH 19/62] sonoma Comment out hospitalizations by gender --- data_scrapers/sonoma_county.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index e11cf854..0004e5a2 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -162,16 +162,17 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int hospitalizations['not_hospitalized'] = int(number) return hospitalizations -def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, str]: - """ - - """ - hospitalized = {} - rows = get_rows(hospital_tag) - for row in rows: - gender, yes, no = get_cells(row) - hospitalized[gender] = yes - return hospitalized +# def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: +# """ +# +# """ +# hospitalized = {} +# rows = get_rows(hospital_tag) +# for row in rows: +# gender, no, yes = get_cells(row) +# yes_int = int(yes.replace('%', '')) +# hospitalized[gender] = (yes_int / 100) +# return hospitalized def get_county() -> Dict: """Main method for populating county data .json""" @@ -204,7 +205,7 @@ def get_county() -> Dict: }, 'hospitalizations': { 'hospitalized_cases': transform_total_hospitalizations(hospitalized), - 'gender': transform_gender_hospitalizations(hospitalized_by_gender) + # 'gender': transform_gender_hospitalizations(hospitalized_by_gender) } } return model From 6a4ead9c2aeb9a480001633c0ff210e02fc2996d Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 19:37:39 -0700 Subject: [PATCH 20/62] sonoma Add docstring for gender hospitalization --- data_scrapers/sonoma_county.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 0004e5a2..f2d193e9 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -162,17 +162,19 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int hospitalizations['not_hospitalized'] = int(number) return hospitalizations -# def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: -# """ -# -# """ -# hospitalized = {} -# rows = get_rows(hospital_tag) -# for row in rows: -# gender, no, yes = get_cells(row) -# yes_int = int(yes.replace('%', '')) -# hospitalized[gender] = (yes_int / 100) -# return hospitalized +def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: + """ + Takes in a BeautifulSoup tag representing the percent of cases hospitalized + by gender and returns a dictionary of those percentages in float form + e.g. 9% is 0.09 + """ + hospitalized = {} + rows = get_rows(hospital_tag) + for row in rows: + gender, no, yes = get_cells(row) + yes_int = int(yes.replace('%', '')) + hospitalized[gender] = (yes_int / 100) + return hospitalized def get_county() -> Dict: """Main method for populating county data .json""" @@ -205,7 +207,7 @@ def get_county() -> Dict: }, 'hospitalizations': { 'hospitalized_cases': transform_total_hospitalizations(hospitalized), - # 'gender': transform_gender_hospitalizations(hospitalized_by_gender) + 'gender': transform_gender_hospitalizations(hospitalized_by_gender) } } return model From 336e5ac68e492dfe897e73baf13ac7b9c5faef1b Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 16 May 2020 19:48:00 -0700 Subject: [PATCH 21/62] sonoma Remove unused variable --- data_scrapers/sonoma_county.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index f2d193e9..71f7fa53 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -186,7 +186,7 @@ def get_county() -> Dict: try: # we have a lot more data here than we are using hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables - except ValueError as e: + except ValueError: raise FutureWarning('The number of values on the page has changed -- please adjust the scraper') model = { From 5297eeb22f0790d13c33703cb616eab4d556b44e Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Tue, 19 May 2020 16:23:23 -0700 Subject: [PATCH 22/62] sonoma Replace findAll with find_all --- data_scrapers/sonoma_county.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 71f7fa53..f39affa2 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -10,13 +10,13 @@ def get_rows(tag: element.Tag) -> List[element.ResultSet]: """ Gets all tr elements in a tag but the first, which is the header """ - return tag.findAll('tr')[1:] + return tag.find_all('tr')[1:] def get_cells(row: element.ResultSet) -> List[str]: """ Gets all th and tr elements within a single tr element """ - return [el.text for el in row.findAll(['th', 'td'])] + return [el.text for el in row.find_all(['th', 'td'])] def generate_update_time(soup: BeautifulSoup) -> str: """ @@ -54,7 +54,7 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st cumul_active = 0 rows = get_rows(cases_tag) for row in rows: - row_cells = row.findAll(['th', 'td']) + row_cells = row.find_all(['th', 'td']) # print(type(row_cells)) date = row_cells[0].text.replace('/', '-') @@ -181,7 +181,7 @@ def get_county() -> Dict: url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' page = requests.get(url) sonoma_soup = BeautifulSoup(page.content, 'html.parser') - tables = sonoma_soup.findAll('table')[4:] # we don't need the first three tables + tables = sonoma_soup.find_all('table')[4:] # we don't need the first three tables try: # we have a lot more data here than we are using From 5093fe30ddb5b0834f2bff65b0d83491f1c85837 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Tue, 19 May 2020 16:24:45 -0700 Subject: [PATCH 23/62] sonoma Make newlines clearer --- data_scrapers/sonoma_county.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index f39affa2..89094f88 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -32,7 +32,7 @@ def get_source_meta(soup: BeautifulSoup) -> str: """ definitions_header = soup.find('h3', string='Definitions') definitions_text = definitions_header.find_parent().text - return definitions_text.replace('\n', ' ') + return definitions_text.replace('\n', '/').strip() # apologies for this horror of a output type def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: From 48dd3c1f2aef5ec4acef107259528d3af8fd5231 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 20 May 2020 17:40:51 -0700 Subject: [PATCH 24/62] sonoma Comment out hospitalizations --- data_scrapers/sonoma_county.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 89094f88..cf107351 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -156,7 +156,7 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int rows = get_rows(hospital_tag) for row in rows: hospitalized, number, _pct = get_cells(row) - if hospitalized == 'Yes': + if hospitalized.lower() == 'yes': hospitalizations['hospitalized'] = int(number) else: hospitalizations['not_hospitalized'] = int(number) @@ -180,7 +180,7 @@ def get_county() -> Dict: """Main method for populating county data .json""" url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' page = requests.get(url) - sonoma_soup = BeautifulSoup(page.content, 'html.parser') + sonoma_soup = BeautifulSoup(page.content, 'html5lib') tables = sonoma_soup.find_all('table')[4:] # we don't need the first three tables try: @@ -205,10 +205,10 @@ def get_county() -> Dict: 'tests_totals': { 'tests': transform_tests(total_tests), }, - 'hospitalizations': { - 'hospitalized_cases': transform_total_hospitalizations(hospitalized), - 'gender': transform_gender_hospitalizations(hospitalized_by_gender) - } + # 'hospitalizations': { + # 'hospitalized_cases': transform_total_hospitalizations(hospitalized), + # 'gender': transform_gender_hospitalizations(hospitalized_by_gender) + # } } return model From a8ce7423c680049b77dc7199fc936100c649aa5d Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 20 May 2020 17:58:12 -0700 Subject: [PATCH 25/62] sonoma Use better date parser --- data_scrapers/sonoma_county.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index cf107351..83a5e4a9 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -2,7 +2,7 @@ import requests import json import re -from datetime import datetime +import dateutil.parser from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore @@ -22,9 +22,14 @@ def generate_update_time(soup: BeautifulSoup) -> str: """ Generates a timestamp string (e.g. May 6, 2020 10:00 AM) for when the scraper is run """ - update_time_text = soup.find('time').text.strip() - update_datetime = datetime.strptime(update_time_text, '%B %d, %Y %I:%M %p') - return update_datetime.isoformat() + update_time_text = soup.find('time', {'class': 'updated'}).text.strip() + try: + date = dateutil.parser.parse(update_time_text) + except ValueError: + raise ValueError(f'Article {index} date is not in ISO 8601' + f'format: "{date_string}"') + print(date) + return date def get_source_meta(soup: BeautifulSoup) -> str: """ From c9f35007245e1ffbd324a5d2bdb64dff8fbc16f8 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 20 May 2020 18:44:12 -0700 Subject: [PATCH 26/62] sonoma Improve transform cases function --- data_scrapers/sonoma_county.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 83a5e4a9..9f977f68 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -60,8 +60,7 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st rows = get_rows(cases_tag) for row in rows: row_cells = row.find_all(['th', 'td']) - # print(type(row_cells)) - date = row_cells[0].text.replace('/', '-') + date = dateutil.parser.parse(row_cells[0]).date().isoformat() # instead of 0, this dashboard reports the string '-' active_cases, new_infected, dead, recoveries = [0 if el.text == '–' else int(el.text) for el in row_cells[1:]] @@ -72,13 +71,16 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st new_deaths = dead - cumul_deaths deaths.append({ 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead }) - new_recovered = recoveries - cumul_recovered - recovered.append({ 'date': date, 'recovered': new_recovered, 'cumul_recovered': recoveries }) + # new_recovered = recoveries - cumul_recovered + # recovered.append({ 'date': date, 'recovered': new_recovered, 'cumul_recovered': recoveries }) + # + # new_active = active_cases - cumul_active + # active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) - new_active = active_cases - cumul_active - active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) + cases.reverse() + deaths.reverse() - return { 'cases': cases, 'deaths': deaths, 'recovered': recovered, 'active': active } + return { 'cases': cases, 'deaths': deaths } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: """ From 058a5552a35892b2545bd613ace50f32e22ce455 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 20 May 2020 20:10:50 -0700 Subject: [PATCH 27/62] sonoma Fix date formats, table selection, and number parsing --- data_scrapers/sonoma_county.py | 46 ++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 9f977f68..83ef3c53 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -6,6 +6,16 @@ from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore +def get_table(header: str, soup: BeautifulSoup) -> element.Tag: + """ + Takes in a header and a BeautifulSoup object and returns the table under + that header + """ + header = soup.find(lambda tag: tag.name == 'h3' and header in tag.get_text()) + tables = header.find_parent().find_all('table') + # this lets us get the second cases table + return tables[-1] + def get_rows(tag: element.Tag) -> List[element.ResultSet]: """ Gets all tr elements in a tag but the first, which is the header @@ -18,6 +28,13 @@ def get_cells(row: element.ResultSet) -> List[str]: """ return [el.text for el in row.find_all(['th', 'td'])] +def parse_int(text: str) -> int: + text = text.strip() + if text == '-': + return 0 + else: + return int(text.replace(',', '')) + def generate_update_time(soup: BeautifulSoup) -> str: """ Generates a timestamp string (e.g. May 6, 2020 10:00 AM) for when the scraper is run @@ -28,8 +45,7 @@ def generate_update_time(soup: BeautifulSoup) -> str: except ValueError: raise ValueError(f'Article {index} date is not in ISO 8601' f'format: "{date_string}"') - print(date) - return date + return date.isoformat() def get_source_meta(soup: BeautifulSoup) -> str: """ @@ -60,10 +76,10 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st rows = get_rows(cases_tag) for row in rows: row_cells = row.find_all(['th', 'td']) - date = dateutil.parser.parse(row_cells[0]).date().isoformat() + date = dateutil.parser.parse(row_cells[0].text).date().isoformat() # instead of 0, this dashboard reports the string '-' - active_cases, new_infected, dead, recoveries = [0 if el.text == '–' else int(el.text) for el in row_cells[1:]] + active_cases, new_infected, dead, recoveries = [parse_int(el.text) for el in row_cells[1:]] cumul_cases += new_infected cases.append({ 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases }) @@ -77,9 +93,8 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st # new_active = active_cases - cumul_active # active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) - cases.reverse() - deaths.reverse() - + cases.reverse() + deaths.reverse() return { 'cases': cases, 'deaths': deaths } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: @@ -183,18 +198,23 @@ def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, fl hospitalized[gender] = (yes_int / 100) return hospitalized +def get_table_tags(soup: BeautifulSoup) -> List[element.Tag]: + """ + Takes in a BeautifulSoup object and returns an array of the tables we need + """ + headers = ['Cases by Date', 'Test Results', 'Cases by Source', 'Cases by Age Group', 'Cases by Gender', 'Cases by Race'] + return [get_table(header, soup) for header in headers] + def get_county() -> Dict: - """Main method for populating county data .json""" + """ + Main method for populating county data .json + """ url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' page = requests.get(url) sonoma_soup = BeautifulSoup(page.content, 'html5lib') tables = sonoma_soup.find_all('table')[4:] # we don't need the first three tables - try: - # we have a lot more data here than we are using - hist_cases, cases_by_source, cases_by_race, total_tests, cases_by_region, region_guide, hospitalized, underlying_cond, symptoms, cases_by_gender, underlying_cond_by_gender, hospitalized_by_gender, symptoms_female, symptoms_male, symptoms_desc, cases_by_age, symptoms_by_age, underlying_cond_by_age = tables - except ValueError: - raise FutureWarning('The number of values on the page has changed -- please adjust the scraper') + hist_cases, total_tests, cases_by_source, cases_by_age, cases_by_gender, cases_by_race = get_table_tags(sonoma_soup) model = { 'name': 'Sonoma County', From fd4e1359ea0a755d45be65a5db8b0fbfcefdabfa Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 20 May 2020 20:17:11 -0700 Subject: [PATCH 28/62] sonoma Use custom int parse function --- data_scrapers/sonoma_county.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 83ef3c53..c40eb485 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -112,7 +112,7 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: if type not in transmission_type_conversion: raise FutureWarning('The transmission type {0} was not found in transmission_type_conversion'.format(type)) type = transmission_type_conversion[type] - transmissions[type] = int(number) + transmissions[type] = parse_int(number) return transmissions def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: @@ -121,7 +121,7 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: for row in rows: result, number, _pct = get_cells(row) lower_res = result.lower() - tests[lower_res] = int(number.replace(',', '')) + tests[lower_res] = parse_int(number) return tests; def generic_transform(tag: element.Tag) -> Dict[str, int]: @@ -134,7 +134,7 @@ def generic_transform(tag: element.Tag) -> Dict[str, int]: rows = get_rows(tag) for row in rows: cat, cases, _pct = get_cells(row) - categories[cat] = int(cases) + categories[cat] = parse_int(cases) return categories def get_unknown_race(race_eth_tag: element.Tag) -> int: @@ -147,7 +147,7 @@ def get_unknown_race(race_eth_tag: element.Tag) -> int: matches = re.search('(\d+) \(\d{1,3}%\) missing race/ethnicity', note) if not matches: raise FutureWarning('The format of the note with unknown race data has changed') - return(int(matches.groups()[0])) + return(parse_int(matches.groups()[0])) def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: """ @@ -164,7 +164,7 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: if group_name not in race_transform: raise FutureWarning('The racial group {0} is new in the data -- please adjust the scraper accordingly') internal_name = race_transform[group_name] - race_cases[internal_name] = int(cases) + race_cases[internal_name] = parse_int(cases) race_cases['Unknown'] = get_unknown_race(race_eth_tag) return race_cases @@ -179,9 +179,9 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int for row in rows: hospitalized, number, _pct = get_cells(row) if hospitalized.lower() == 'yes': - hospitalizations['hospitalized'] = int(number) + hospitalizations['hospitalized'] = parse_int(number) else: - hospitalizations['not_hospitalized'] = int(number) + hospitalizations['not_hospitalized'] = parse_int(number) return hospitalizations def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: @@ -194,7 +194,7 @@ def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, fl rows = get_rows(hospital_tag) for row in rows: gender, no, yes = get_cells(row) - yes_int = int(yes.replace('%', '')) + yes_int = parse_int(yes.replace('%', '')) hospitalized[gender] = (yes_int / 100) return hospitalized From 8310ca0f16736a54e88f810eb4d82d068d7e9623 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 20 May 2020 20:25:51 -0700 Subject: [PATCH 29/62] sonoma Create custom FormatError exception --- data_scrapers/format_error.py | 7 +++++++ data_scrapers/sonoma_county.py | 9 +++++---- 2 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 data_scrapers/format_error.py diff --git a/data_scrapers/format_error.py b/data_scrapers/format_error.py new file mode 100644 index 00000000..96ac077f --- /dev/null +++ b/data_scrapers/format_error.py @@ -0,0 +1,7 @@ +class FormatError(Exception): + """ + A custom error to raise whenever a scraper runs into something in an + unexpected format. This usually means that the website the scraper is + accessing has changed + """ + pass diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index c40eb485..2ad13ffa 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -5,6 +5,7 @@ import dateutil.parser from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore +from format_error import FormatError def get_table(header: str, soup: BeautifulSoup) -> element.Tag: """ @@ -110,7 +111,7 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: for row in rows: type, number, _pct = get_cells(row) if type not in transmission_type_conversion: - raise FutureWarning('The transmission type {0} was not found in transmission_type_conversion'.format(type)) + raise FormatError('The transmission type {0} was not found in transmission_type_conversion'.format(type)) type = transmission_type_conversion[type] transmissions[type] = parse_int(number) return transmissions @@ -145,8 +146,8 @@ def get_unknown_race(race_eth_tag: element.Tag) -> int: parent = race_eth_tag.parent note = parent.find('p').text matches = re.search('(\d+) \(\d{1,3}%\) missing race/ethnicity', note) - if not matches: - raise FutureWarning('The format of the note with unknown race data has changed') + if matches: + raise FormatError('The format of the note with unknown race data has changed') return(parse_int(matches.groups()[0])) def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: @@ -162,7 +163,7 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: for row in rows: group_name, cases, _pct = get_cells(row) if group_name not in race_transform: - raise FutureWarning('The racial group {0} is new in the data -- please adjust the scraper accordingly') + raise FormatError('The racial group {0} is new in the data -- please adjust the scraper accordingly') internal_name = race_transform[group_name] race_cases[internal_name] = parse_int(cases) race_cases['Unknown'] = get_unknown_race(race_eth_tag) From ada9b2aafe49af5658b053f1add935886dbaff03 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 09:46:14 -0700 Subject: [PATCH 30/62] sonoma use template defaults for race --- data_scrapers/sonoma_county.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 2ad13ffa..bc166a8e 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -157,7 +157,17 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: 'race_eth': {'Asian': -1, 'Latinx_or_Hispanic': -1, 'Other': -1, 'White':-1, 'Unknown': -1} NB: These are the only races reported seperatley by Sonoma county at this time """ - race_cases = {} + race_cases = { + 'African_Amer': 0, + 'Asian': 0, + 'Latinx_or_Hispanic': 0, + 'Native_Amer':0, + 'Multiple_Race':0, + 'Other': 0, + 'Pacific_Islander': 0, + 'White': 0, + 'Unknown': 0 + } race_transform = {'Asian/Pacific Islander, non-Hispanic': 'Asian', 'Hispanic/Latino': 'Latinx_or_Hispanic', 'Other*, non-Hispanic': 'Other', 'White, non-Hispanic': 'White'} rows = get_rows(race_eth_tag) for row in rows: From 40b84e0811d08e2fa7c187a793c64bb888f8c098 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 09:49:04 -0700 Subject: [PATCH 31/62] sonoma Fix test breakage --- data_scrapers/sonoma_county.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index bc166a8e..3c090311 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -146,7 +146,7 @@ def get_unknown_race(race_eth_tag: element.Tag) -> int: parent = race_eth_tag.parent note = parent.find('p').text matches = re.search('(\d+) \(\d{1,3}%\) missing race/ethnicity', note) - if matches: + if not matches: raise FormatError('The format of the note with unknown race data has changed') return(parse_int(matches.groups()[0])) From eb1a4893341fbe908b9a58cb6317495ae7147ab0 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 10:46:55 -0700 Subject: [PATCH 32/62] sonoma Use unique functions for age and gender --- data_scrapers/sonoma_county.py | 39 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index 3c090311..a3074936 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -125,17 +125,46 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: tests[lower_res] = parse_int(number) return tests; -def generic_transform(tag: element.Tag) -> Dict[str, int]: +# def generic_transform(tag: element.Tag) -> Dict[str, int]: +# """ +# Transform function for tables which don't require any special processing. +# Takes in a BeautifulSoup tag for a table and returns a dictionary +# in which the keys are strings and the values integers +# """ +# categories = {} +# rows = get_rows(tag) +# for row in rows: +# cat, cases, _pct = get_cells(row) +# categories[cat] = parse_int(cases) +# return categories + +def gender_transform(tag: element.Tag) -> Dict[str, int]: """ - Transform function for tables which don't require any special processing. + Transform function for the cases by gender table. Takes in a BeautifulSoup tag for a table and returns a dictionary in which the keys are strings and the values integers """ categories = {} rows = get_rows(tag) + string_conversions = {'Males': 'male', 'Females': 'female'} for row in rows: cat, cases, _pct = get_cells(row) - categories[cat] = parse_int(cases) + categories[string_conversions[cat]] = parse_int(cases) + return categories + +def age_transform(tag: element.Tag) -> List[Dict[str, int]]: + """ + Transform function for the cases by age group table. + Takes in a BeautifulSoup tag for a table and returns a list of + dictionaries in which the keys are strings and the values integers + """ + categories = [] + rows = get_rows(tag) + for row in rows: + group, cases, _pct = get_cells(row) + raw_cases = parse_int(cases) + element = {'group': group, 'raw_cases': raw_cases} + categories.append(element) return categories def get_unknown_race(race_eth_tag: element.Tag) -> int: @@ -236,9 +265,9 @@ def get_county() -> Dict: 'series': transform_cases(hist_cases), 'case_totals': { 'transmission_cat': transform_transmission(cases_by_source), - 'age_group': generic_transform(cases_by_age), + 'age_group': age_transform(cases_by_age), 'race_eth': transform_race_eth(cases_by_race), - 'gender': generic_transform(cases_by_gender) + 'gender': gender_transform(cases_by_gender) }, 'tests_totals': { 'tests': transform_tests(total_tests), From fdb20454f645893006a7216d2077b5a9e9a41780 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 10:52:03 -0700 Subject: [PATCH 33/62] sonoma Transform age group names --- data_scrapers/sonoma_county.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index a3074936..f648fac4 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -163,7 +163,15 @@ def age_transform(tag: element.Tag) -> List[Dict[str, int]]: for row in rows: group, cases, _pct = get_cells(row) raw_cases = parse_int(cases) - element = {'group': group, 'raw_cases': raw_cases} + age_string_transform = { + '0-17': '0_to_17', + '18-49': '18_to_49', + '50-64': '50_to_64', + '65 and Above': '65_and_older', + 'Under Investigation': 'Unknown' + } + + element = {'group': age_string_transform[group], 'raw_cases': raw_cases} categories.append(element) return categories From 1e1b0a82847906a94183b3b09361410e904d4aa1 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 11:02:26 -0700 Subject: [PATCH 34/62] sonoma Add error handling for gender and age transformations --- data_scrapers/sonoma_county.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma_county.py index f648fac4..afdc74b5 100644 --- a/data_scrapers/sonoma_county.py +++ b/data_scrapers/sonoma_county.py @@ -146,10 +146,12 @@ def gender_transform(tag: element.Tag) -> Dict[str, int]: """ categories = {} rows = get_rows(tag) - string_conversions = {'Males': 'male', 'Females': 'female'} + gender_string_conversions = {'Males': 'male', 'Females': 'female'} for row in rows: - cat, cases, _pct = get_cells(row) - categories[string_conversions[cat]] = parse_int(cases) + gender, cases, _pct = get_cells(row) + if gender not in gender_string_conversions: + raise FormatError('An unrecognized gender has been added to the gender table') + categories[gender_string_conversions[gender]] = parse_int(cases) return categories def age_transform(tag: element.Tag) -> List[Dict[str, int]]: @@ -171,6 +173,9 @@ def age_transform(tag: element.Tag) -> List[Dict[str, int]]: 'Under Investigation': 'Unknown' } + if cases not in age_string_transform: + raise FormatError('A new race group has been added to the cases by race table') + element = {'group': age_string_transform[group], 'raw_cases': raw_cases} categories.append(element) return categories From 1f3755abf7f323bfb11c0570894409f5d21af806 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 11:03:07 -0700 Subject: [PATCH 35/62] sonoma Rename scraper file --- data_scrapers/{sonoma_county.py => sonoma.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data_scrapers/{sonoma_county.py => sonoma.py} (100%) diff --git a/data_scrapers/sonoma_county.py b/data_scrapers/sonoma.py similarity index 100% rename from data_scrapers/sonoma_county.py rename to data_scrapers/sonoma.py From 96b81b5d9bf82337f6c04c5f124f03db19b06698 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 11:06:56 -0700 Subject: [PATCH 36/62] sonoma Fix error handling for age --- data_scrapers/sonoma.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index afdc74b5..841bbaab 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -173,8 +173,8 @@ def age_transform(tag: element.Tag) -> List[Dict[str, int]]: 'Under Investigation': 'Unknown' } - if cases not in age_string_transform: - raise FormatError('A new race group has been added to the cases by race table') + if group not in age_string_transform: + raise FormatError('A new age group has been added to the cases by race table') element = {'group': age_string_transform[group], 'raw_cases': raw_cases} categories.append(element) @@ -264,6 +264,7 @@ def get_county() -> Dict: """ url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' page = requests.get(url) + page.raise_for_status() sonoma_soup = BeautifulSoup(page.content, 'html5lib') tables = sonoma_soup.find_all('table')[4:] # we don't need the first three tables From fb339b4bc770f6984b5f645269af8b6676d8a1c1 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sat, 23 May 2020 11:20:37 -0700 Subject: [PATCH 37/62] sonoma Fix typing errors --- data_scrapers/sonoma.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index 841bbaab..dcc7c21b 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -5,15 +5,15 @@ import dateutil.parser from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore -from format_error import FormatError +from format_error import FormatError # type: ignore def get_table(header: str, soup: BeautifulSoup) -> element.Tag: """ Takes in a header and a BeautifulSoup object and returns the table under that header """ - header = soup.find(lambda tag: tag.name == 'h3' and header in tag.get_text()) - tables = header.find_parent().find_all('table') + header_tag = soup.find(lambda tag: tag.name == 'h3' and header in tag.get_text()) + tables = header_tag.find_parent().find_all('table') # this lets us get the second cases table return tables[-1] @@ -44,8 +44,8 @@ def generate_update_time(soup: BeautifulSoup) -> str: try: date = dateutil.parser.parse(update_time_text) except ValueError: - raise ValueError(f'Article {index} date is not in ISO 8601' - f'format: "{date_string}"') + raise ValueError(f'Date is not in ISO 8601' + f'format: "{update_time_text}"') return date.isoformat() def get_source_meta(soup: BeautifulSoup) -> str: @@ -57,7 +57,7 @@ def get_source_meta(soup: BeautifulSoup) -> str: return definitions_text.replace('\n', '/').strip() # apologies for this horror of a output type -def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: +def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int, object]]]]: """ Takes in a BeautifulSoup tag for the cases table and returns all cases (historic and active), deaths, and recoveries in the form: @@ -70,10 +70,10 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st cumul_cases = 0 deaths = [] cumul_deaths = 0 - recovered = [] - cumul_recovered = 0 - active = [] - cumul_active = 0 + # recovered = [] + # cumul_recovered = 0 + # active = [] + # cumul_active = 0 rows = get_rows(cases_tag) for row in rows: row_cells = row.find_all(['th', 'td']) @@ -154,7 +154,9 @@ def gender_transform(tag: element.Tag) -> Dict[str, int]: categories[gender_string_conversions[gender]] = parse_int(cases) return categories -def age_transform(tag: element.Tag) -> List[Dict[str, int]]: +# not sure why I need object in here: I've checked the output value types and +# they are strings and ints, but mypy is convinced there is an object in there +def age_transform(tag: element.Tag) -> List[Dict[str, Union[str, int, object]]]: """ Transform function for the cases by age group table. Takes in a BeautifulSoup tag for a table and returns a list of From 5d96031c875f780025ac2edda82e1610266219b4 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 27 May 2020 19:36:16 -0700 Subject: [PATCH 38/62] sonoma Factor out getting section by title --- data_scrapers/sonoma.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index dcc7c21b..e1c72b74 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -7,13 +7,19 @@ from bs4 import BeautifulSoup, element # type: ignore from format_error import FormatError # type: ignore +def get_section_by_title(header: str, soup: BeautifulSoup) -> element.Tag: + """ + Takes in a header string and returns the parent element of that header + """ + header_tag = soup.find(lambda tag: tag.name == 'h3' and header in tag.get_text()) + return header_tag.find_parent() + def get_table(header: str, soup: BeautifulSoup) -> element.Tag: """ Takes in a header and a BeautifulSoup object and returns the table under that header """ - header_tag = soup.find(lambda tag: tag.name == 'h3' and header in tag.get_text()) - tables = header_tag.find_parent().find_all('table') + tables = get_section_by_title(header, soup).find_all('table') # this lets us get the second cases table return tables[-1] @@ -50,10 +56,10 @@ def generate_update_time(soup: BeautifulSoup) -> str: def get_source_meta(soup: BeautifulSoup) -> str: """ - Finds the 'Definitions' header on the page and gets all of the text in it + Finds the 'Definitions' header on the page and gets all of the text' in it """ - definitions_header = soup.find('h3', string='Definitions') - definitions_text = definitions_header.find_parent().text + definitions_section = get_section_by_title('Definitions', soup) + definitions_text = definitions_section.text return definitions_text.replace('\n', '/').strip() # apologies for this horror of a output type From fd09e5e0e3c3cc93a8ce3925a671f9ceaae38d52 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 27 May 2020 19:48:33 -0700 Subject: [PATCH 39/62] sonoma Correct deaths and cases aggregation --- data_scrapers/sonoma.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index e1c72b74..028d355f 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -80,7 +80,7 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st # cumul_recovered = 0 # active = [] # cumul_active = 0 - rows = get_rows(cases_tag) + rows = reversed(get_rows(cases_tag)) for row in rows: row_cells = row.find_all(['th', 'td']) date = dateutil.parser.parse(row_cells[0].text).date().isoformat() @@ -92,6 +92,7 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st cases.append({ 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases }) new_deaths = dead - cumul_deaths + cumul_deaths = dead deaths.append({ 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead }) # new_recovered = recoveries - cumul_recovered @@ -100,8 +101,6 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st # new_active = active_cases - cumul_active # active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) - cases.reverse() - deaths.reverse() return { 'cases': cases, 'deaths': deaths } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: @@ -144,7 +143,7 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: # categories[cat] = parse_int(cases) # return categories -def gender_transform(tag: element.Tag) -> Dict[str, int]: +def transform_gender(tag: element.Tag) -> Dict[str, int]: """ Transform function for the cases by gender table. Takes in a BeautifulSoup tag for a table and returns a dictionary @@ -289,7 +288,7 @@ def get_county() -> Dict: 'transmission_cat': transform_transmission(cases_by_source), 'age_group': age_transform(cases_by_age), 'race_eth': transform_race_eth(cases_by_race), - 'gender': gender_transform(cases_by_gender) + 'gender': transform_gender(cases_by_gender) }, 'tests_totals': { 'tests': transform_tests(total_tests), From 7770c89e5a25c0b16a0b9fd0862b7833bd016b51 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 27 May 2020 19:55:44 -0700 Subject: [PATCH 40/62] sonoma Raise error for hospitalization change --- data_scrapers/sonoma.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index 028d355f..76e9de41 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -238,6 +238,9 @@ def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int rows = get_rows(hospital_tag) for row in rows: hospitalized, number, _pct = get_cells(row) + lowercase_hospitalized = hospitalized.lower() + if lowercase_hospitalized != 'yes' and lowercase_hospitalized != 'no': + raise FormatError('The format of the hospitalization table has changed') if hospitalized.lower() == 'yes': hospitalizations['hospitalized'] = parse_int(number) else: From 6b4b69ba2222df068167dcf11524ba97559dcd1b Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 27 May 2020 19:57:41 -0700 Subject: [PATCH 41/62] sonoma Add error for getting section by title --- data_scrapers/sonoma.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index 76e9de41..58eadefd 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -12,6 +12,9 @@ def get_section_by_title(header: str, soup: BeautifulSoup) -> element.Tag: Takes in a header string and returns the parent element of that header """ header_tag = soup.find(lambda tag: tag.name == 'h3' and header in tag.get_text()) + if not header_tag: + raise FormatError('The header "{0}" no longer corresponds to a section'.format(header)) + return header_tag.find_parent() def get_table(header: str, soup: BeautifulSoup) -> element.Tag: From f1c7f0568f3379145977d1662cd9e7ac7760024f Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Wed, 27 May 2020 20:07:18 -0700 Subject: [PATCH 42/62] sonoma Fix typing issue for age --- data_scrapers/sonoma.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index 58eadefd..5c5aa894 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -162,15 +162,13 @@ def transform_gender(tag: element.Tag) -> Dict[str, int]: categories[gender_string_conversions[gender]] = parse_int(cases) return categories -# not sure why I need object in here: I've checked the output value types and -# they are strings and ints, but mypy is convinced there is an object in there -def age_transform(tag: element.Tag) -> List[Dict[str, Union[str, int, object]]]: +def transform_age(tag: element.Tag) -> List[Dict[str, Union[str, int]]]: """ Transform function for the cases by age group table. Takes in a BeautifulSoup tag for a table and returns a list of dictionaries in which the keys are strings and the values integers """ - categories = [] + categories: List[Dict[str, Union[str, int]]] = [] rows = get_rows(tag) for row in rows: group, cases, _pct = get_cells(row) @@ -186,7 +184,7 @@ def age_transform(tag: element.Tag) -> List[Dict[str, Union[str, int, object]]]: if group not in age_string_transform: raise FormatError('A new age group has been added to the cases by race table') - element = {'group': age_string_transform[group], 'raw_cases': raw_cases} + element: Dict[str, Union[str, int]] = {'group': age_string_transform[group], 'raw_cases': raw_cases} categories.append(element) return categories @@ -292,7 +290,7 @@ def get_county() -> Dict: 'series': transform_cases(hist_cases), 'case_totals': { 'transmission_cat': transform_transmission(cases_by_source), - 'age_group': age_transform(cases_by_age), + 'age_group': transform_age(cases_by_age), 'race_eth': transform_race_eth(cases_by_race), 'gender': transform_gender(cases_by_gender) }, From 5c9a9ed6ea744f4aa5f6dce133cfd489f4d213ae Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 31 May 2020 13:45:13 -0700 Subject: [PATCH 43/62] sonoma Write parse table function --- data_scrapers/sonoma.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index 5c5aa894..bef05e01 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -38,6 +38,22 @@ def get_cells(row: element.ResultSet) -> List[str]: """ return [el.text for el in row.find_all(['th', 'td'])] +def row_list_to_dict(row_list: List[str], headers: List[str]) -> Dict[str, str]: + output = {} + for i in range(len(row_list)): + val = row_list[i] + header = headers[i] + output[header] = val + return output + +def parse_table(tag: element.Tag) -> List[Dict[str, str]]: + rows = tag.find_all('tr') + header = rows[0] + body = rows[1:] + header_cells = get_cells(header) + body_cells = [get_cells(row) for row in body] + return [row_list_to_dict(row, header_cells) for row in body_cells] + def parse_int(text: str) -> int: text = text.strip() if text == '-': From 41d61c46c29ba171003e76e41d0e441f2ebd6d7b Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 7 Jun 2020 12:23:50 -0700 Subject: [PATCH 44/62] Fix typo Co-authored-by: Rob Brackett --- data_scrapers/sonoma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index bef05e01..e2cdcaa1 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -75,7 +75,7 @@ def generate_update_time(soup: BeautifulSoup) -> str: def get_source_meta(soup: BeautifulSoup) -> str: """ - Finds the 'Definitions' header on the page and gets all of the text' in it + Finds the 'Definitions' header on the page and gets all of the text in it. """ definitions_section = get_section_by_title('Definitions', soup) definitions_text = definitions_section.text From 06163e213e0c60f214d588b7ba8c1be0005e981b Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 7 Jun 2020 13:27:53 -0700 Subject: [PATCH 45/62] sonoma Comment and typing fixes --- data_scrapers/sonoma.py | 76 ++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index bef05e01..48cb6757 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -82,11 +82,11 @@ def get_source_meta(soup: BeautifulSoup) -> str: return definitions_text.replace('\n', '/').strip() # apologies for this horror of a output type -def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int, object]]]]: +def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: """ Takes in a BeautifulSoup tag for the cases table and returns all cases (historic and active), deaths, and recoveries in the form: - { 'cases': [], 'deaths': [], 'recovered': [], 'active': [] } + { 'cases': [], 'deaths': [] } Where each list contains dictionaries (representing each day's data) of form (example for cases): { 'date': '', 'cases': -1, 'cumul_cases': -1 } @@ -103,16 +103,16 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st for row in rows: row_cells = row.find_all(['th', 'td']) date = dateutil.parser.parse(row_cells[0].text).date().isoformat() - - # instead of 0, this dashboard reports the string '-' active_cases, new_infected, dead, recoveries = [parse_int(el.text) for el in row_cells[1:]] cumul_cases += new_infected - cases.append({ 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases }) + case_dict: Dict[str, Union[str, int]] = { 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases } + cases.append(case_dict) new_deaths = dead - cumul_deaths cumul_deaths = dead - deaths.append({ 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead }) + death_dict: Dict[str, Union[str, int]] = { 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead } + deaths.append(death_dict) # new_recovered = recoveries - cumul_recovered # recovered.append({ 'date': date, 'recovered': new_recovered, 'cumul_recovered': recoveries }) @@ -245,38 +245,38 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: race_cases['Unknown'] = get_unknown_race(race_eth_tag) return race_cases -def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: - """ - Takes in a BeautifulSoup tag of the cases by hospitalization table and - returns a dictionary with the numbers of hospitalized and non-hospitalized - cases - """ - hospitalizations = {} - rows = get_rows(hospital_tag) - for row in rows: - hospitalized, number, _pct = get_cells(row) - lowercase_hospitalized = hospitalized.lower() - if lowercase_hospitalized != 'yes' and lowercase_hospitalized != 'no': - raise FormatError('The format of the hospitalization table has changed') - if hospitalized.lower() == 'yes': - hospitalizations['hospitalized'] = parse_int(number) - else: - hospitalizations['not_hospitalized'] = parse_int(number) - return hospitalizations - -def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: - """ - Takes in a BeautifulSoup tag representing the percent of cases hospitalized - by gender and returns a dictionary of those percentages in float form - e.g. 9% is 0.09 - """ - hospitalized = {} - rows = get_rows(hospital_tag) - for row in rows: - gender, no, yes = get_cells(row) - yes_int = parse_int(yes.replace('%', '')) - hospitalized[gender] = (yes_int / 100) - return hospitalized +# def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: +# """ +# Takes in a BeautifulSoup tag of the cases by hospitalization table and +# returns a dictionary with the numbers of hospitalized and non-hospitalized +# cases +# """ +# hospitalizations = {} +# rows = get_rows(hospital_tag) +# for row in rows: +# hospitalized, number, _pct = get_cells(row) +# lowercase_hospitalized = hospitalized.lower() +# if lowercase_hospitalized != 'yes' and lowercase_hospitalized != 'no': +# raise FormatError('The format of the hospitalization table has changed') +# if hospitalized.lower() == 'yes': +# hospitalizations['hospitalized'] = parse_int(number) +# else: +# hospitalizations['not_hospitalized'] = parse_int(number) +# return hospitalizations +# +# def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: +# """ +# Takes in a BeautifulSoup tag representing the percent of cases hospitalized +# by gender and returns a dictionary of those percentages in float form +# e.g. 9% is 0.09 +# """ +# hospitalized = {} +# rows = get_rows(hospital_tag) +# for row in rows: +# gender, no, yes = get_cells(row) +# yes_int = parse_int(yes.replace('%', '')) +# hospitalized[gender] = (yes_int / 100) +# return hospitalized def get_table_tags(soup: BeautifulSoup) -> List[element.Tag]: """ From ba6df28c2808ac535198ade47b4d9357791eabba Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 7 Jun 2020 13:44:05 -0700 Subject: [PATCH 46/62] Use raw string for regex Co-authored-by: Rob Brackett --- data_scrapers/sonoma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index e2cdcaa1..37966df2 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -211,7 +211,7 @@ def get_unknown_race(race_eth_tag: element.Tag) -> int: """ parent = race_eth_tag.parent note = parent.find('p').text - matches = re.search('(\d+) \(\d{1,3}%\) missing race/ethnicity', note) + matches = re.search(r'(\d+) \(\d{1,3}%\) missing race/ethnicity', note) if not matches: raise FormatError('The format of the note with unknown race data has changed') return(parse_int(matches.groups()[0])) From 2bf3faf108143a6c7ffea62fa829cb3396adf1fc Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 7 Jun 2020 13:45:31 -0700 Subject: [PATCH 47/62] sonoma Remove commented out code --- data_scrapers/sonoma.py | 50 ----------------------------------------- 1 file changed, 50 deletions(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index d10a3a4d..281bec66 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -149,19 +149,6 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: tests[lower_res] = parse_int(number) return tests; -# def generic_transform(tag: element.Tag) -> Dict[str, int]: -# """ -# Transform function for tables which don't require any special processing. -# Takes in a BeautifulSoup tag for a table and returns a dictionary -# in which the keys are strings and the values integers -# """ -# categories = {} -# rows = get_rows(tag) -# for row in rows: -# cat, cases, _pct = get_cells(row) -# categories[cat] = parse_int(cases) -# return categories - def transform_gender(tag: element.Tag) -> Dict[str, int]: """ Transform function for the cases by gender table. @@ -245,39 +232,6 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: race_cases['Unknown'] = get_unknown_race(race_eth_tag) return race_cases -# def transform_total_hospitalizations(hospital_tag: element.Tag) -> Dict[str, int]: -# """ -# Takes in a BeautifulSoup tag of the cases by hospitalization table and -# returns a dictionary with the numbers of hospitalized and non-hospitalized -# cases -# """ -# hospitalizations = {} -# rows = get_rows(hospital_tag) -# for row in rows: -# hospitalized, number, _pct = get_cells(row) -# lowercase_hospitalized = hospitalized.lower() -# if lowercase_hospitalized != 'yes' and lowercase_hospitalized != 'no': -# raise FormatError('The format of the hospitalization table has changed') -# if hospitalized.lower() == 'yes': -# hospitalizations['hospitalized'] = parse_int(number) -# else: -# hospitalizations['not_hospitalized'] = parse_int(number) -# return hospitalizations -# -# def transform_gender_hospitalizations(hospital_tag: element.Tag) -> Dict[str, float]: -# """ -# Takes in a BeautifulSoup tag representing the percent of cases hospitalized -# by gender and returns a dictionary of those percentages in float form -# e.g. 9% is 0.09 -# """ -# hospitalized = {} -# rows = get_rows(hospital_tag) -# for row in rows: -# gender, no, yes = get_cells(row) -# yes_int = parse_int(yes.replace('%', '')) -# hospitalized[gender] = (yes_int / 100) -# return hospitalized - def get_table_tags(soup: BeautifulSoup) -> List[element.Tag]: """ Takes in a BeautifulSoup object and returns an array of the tables we need @@ -313,10 +267,6 @@ def get_county() -> Dict: 'tests_totals': { 'tests': transform_tests(total_tests), }, - # 'hospitalizations': { - # 'hospitalized_cases': transform_total_hospitalizations(hospitalized), - # 'gender': transform_gender_hospitalizations(hospitalized_by_gender) - # } } return model From bac1b5b1a3ee6d524655c1c64d212b0ead998ea9 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 7 Jun 2020 13:50:23 -0700 Subject: [PATCH 48/62] sonoma Remove unused variable --- data_scrapers/sonoma.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data_scrapers/sonoma.py b/data_scrapers/sonoma.py index 281bec66..e64ae6e9 100644 --- a/data_scrapers/sonoma.py +++ b/data_scrapers/sonoma.py @@ -247,7 +247,6 @@ def get_county() -> Dict: page = requests.get(url) page.raise_for_status() sonoma_soup = BeautifulSoup(page.content, 'html5lib') - tables = sonoma_soup.find_all('table')[4:] # we don't need the first three tables hist_cases, total_tests, cases_by_source, cases_by_age, cases_by_gender, cases_by_race = get_table_tags(sonoma_soup) From 6a0ef8c141934c88126ea590b8c187273d180961 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Sun, 7 Jun 2020 19:56:02 -0700 Subject: [PATCH 49/62] sonoma Add sonoma to init.py --- data_scrapers/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data_scrapers/__init__.py b/data_scrapers/__init__.py index cd096613..ad646ed8 100644 --- a/data_scrapers/__init__.py +++ b/data_scrapers/__init__.py @@ -1,8 +1,9 @@ from typing import Dict, Any import data_scrapers.alameda_county as alameda_county +import data_scrapers.sonoma_county as sonoma_county scrapers: Dict[str, Any] = { - 'alameda': alameda_county + 'alameda': alameda_county, # 'contra_costa': None, # 'marin': None, # 'napa': None, @@ -10,5 +11,5 @@ # 'san_mateo': None, # 'santa_clara': None, # 'solano': None, - # 'sonoma': None, + 'sonoma': sonoma_county, } From 329f92d203afb4ed1cfda7240afb37466c9330f4 Mon Sep 17 00:00:00 2001 From: Logan Cooper Date: Tue, 16 Jun 2020 17:59:06 -0700 Subject: [PATCH 50/62] sonoma Correct conventions for sonoma --- covid19_sfbayarea/data/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/covid19_sfbayarea/data/__init__.py b/covid19_sfbayarea/data/__init__.py index 00d99984..ee2a2412 100644 --- a/covid19_sfbayarea/data/__init__.py +++ b/covid19_sfbayarea/data/__init__.py @@ -1,16 +1,16 @@ from typing import Dict, Any from . import alameda from . import san_francisco -import data_scrapers.sonoma_county as sonoma_county +from . import sonoma scrapers: Dict[str, Any] = { 'alameda': alameda, # 'contra_costa': None, # 'marin': None, # 'napa': None, - 'san_francisco': san_francisco + 'san_francisco': san_francisco, # 'san_mateo': None, # 'santa_clara': None, # 'solano': None, - 'sonoma': sonoma_county, + 'sonoma': sonoma, } From f0701252337e338f5565ea5a05b169a4ab797ff3 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Jul 2020 19:24:15 -0700 Subject: [PATCH 51/62] Fix conflicts --- covid19_sfbayarea/data/__init__.py | 6 ------ covid19_sfbayarea/data/sonoma.py | 27 ++++++++++----------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/covid19_sfbayarea/data/__init__.py b/covid19_sfbayarea/data/__init__.py index 207561d7..1349a469 100644 --- a/covid19_sfbayarea/data/__init__.py +++ b/covid19_sfbayarea/data/__init__.py @@ -1,11 +1,8 @@ from typing import Dict, Any from . import alameda from . import san_francisco -<<<<<<< HEAD from . import sonoma -======= from . import solano ->>>>>>> 5ba6dbb55c75455b154e42832651e6a3837fc805 scrapers: Dict[str, Any] = { 'alameda': alameda, @@ -15,11 +12,8 @@ 'san_francisco': san_francisco, # 'san_mateo': None, # 'santa_clara': None, -<<<<<<< HEAD # 'solano': None, 'sonoma': sonoma, -======= 'solano': solano, # 'sonoma': None, ->>>>>>> 5ba6dbb55c75455b154e42832651e6a3837fc805 } diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index e64ae6e9..15685dde 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -133,7 +133,7 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: # turns the transmission categories on the page into the ones we're using transmission_type_conversion = {'Community': 'community', 'Close Contact': 'from_contact', 'Travel': 'travel', 'Under Investigation': 'unknown'} for row in rows: - type, number, _pct = get_cells(row) + type, number, *rest = get_cells(row) if type not in transmission_type_conversion: raise FormatError('The transmission type {0} was not found in transmission_type_conversion'.format(type)) type = transmission_type_conversion[type] @@ -144,7 +144,7 @@ def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: tests = {} rows = get_rows(tests_tag) for row in rows: - result, number, _pct = get_cells(row) + result, number, *rest = get_cells(row) lower_res = result.lower() tests[lower_res] = parse_int(number) return tests; @@ -159,7 +159,7 @@ def transform_gender(tag: element.Tag) -> Dict[str, int]: rows = get_rows(tag) gender_string_conversions = {'Males': 'male', 'Females': 'female'} for row in rows: - gender, cases, _pct = get_cells(row) + gender, cases, *rest = get_cells(row) if gender not in gender_string_conversions: raise FormatError('An unrecognized gender has been added to the gender table') categories[gender_string_conversions[gender]] = parse_int(cases) @@ -174,20 +174,10 @@ def transform_age(tag: element.Tag) -> List[Dict[str, Union[str, int]]]: categories: List[Dict[str, Union[str, int]]] = [] rows = get_rows(tag) for row in rows: - group, cases, _pct = get_cells(row) + group, cases, *rest = get_cells(row) raw_cases = parse_int(cases) - age_string_transform = { - '0-17': '0_to_17', - '18-49': '18_to_49', - '50-64': '50_to_64', - '65 and Above': '65_and_older', - 'Under Investigation': 'Unknown' - } - if group not in age_string_transform: - raise FormatError('A new age group has been added to the cases by race table') - - element: Dict[str, Union[str, int]] = {'group': age_string_transform[group], 'raw_cases': raw_cases} + element: Dict[str, Union[str, int]] = {'group': group, 'raw_cases': raw_cases} categories.append(element) return categories @@ -224,7 +214,8 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: race_transform = {'Asian/Pacific Islander, non-Hispanic': 'Asian', 'Hispanic/Latino': 'Latinx_or_Hispanic', 'Other*, non-Hispanic': 'Other', 'White, non-Hispanic': 'White'} rows = get_rows(race_eth_tag) for row in rows: - group_name, cases, _pct = get_cells(row) + print(get_cells(row)) + group_name, cases, *rest = get_cells(row) if group_name not in race_transform: raise FormatError('The racial group {0} is new in the data -- please adjust the scraper accordingly') internal_name = race_transform[group_name] @@ -244,7 +235,9 @@ def get_county() -> Dict: Main method for populating county data .json """ url = 'https://socoemergency.org/emergency/novel-coronavirus/coronavirus-cases/' - page = requests.get(url) + # need this to avoid 403 error ¯\_(ツ)_/¯ + headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} + page = requests.get(url, headers=headers) page.raise_for_status() sonoma_soup = BeautifulSoup(page.content, 'html5lib') From d1aec84a85ddd4b396c70d3858fd36fc4035c2a1 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Jul 2020 19:50:24 -0700 Subject: [PATCH 52/62] Fix error import --- covid19_sfbayarea/data/sonoma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 15685dde..97e0dc97 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -5,7 +5,7 @@ import dateutil.parser from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore -from format_error import FormatError # type: ignore +from ..errors import FormatError # type: ignore def get_section_by_title(header: str, soup: BeautifulSoup) -> element.Tag: """ From 869418a050c9ba735e784b26f67f5aa023e4a7f4 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 5 Aug 2020 19:56:03 -0700 Subject: [PATCH 53/62] Fix linter errors and import --- covid19_sfbayarea/data/sonoma.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 97e0dc97..d096e516 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -5,7 +5,7 @@ import dateutil.parser from typing import List, Dict, Union from bs4 import BeautifulSoup, element # type: ignore -from ..errors import FormatError # type: ignore +from ..errors import FormatError def get_section_by_title(header: str, soup: BeautifulSoup) -> element.Tag: """ @@ -38,20 +38,15 @@ def get_cells(row: element.ResultSet) -> List[str]: """ return [el.text for el in row.find_all(['th', 'td'])] -def row_list_to_dict(row_list: List[str], headers: List[str]) -> Dict[str, str]: - output = {} - for i in range(len(row_list)): - val = row_list[i] - header = headers[i] - output[header] = val - return output +def row_list_to_dict(row: List[str], headers: List[str]) -> Dict[str, str]: + return dict(zip(headers, row)) def parse_table(tag: element.Tag) -> List[Dict[str, str]]: rows = tag.find_all('tr') header = rows[0] body = rows[1:] header_cells = get_cells(header) - body_cells = [get_cells(row) for row in body] + body_cells = (get_cells(row) for row in body) return [row_list_to_dict(row, header_cells) for row in body_cells] def parse_int(text: str) -> int: From 6ef13b459b662c6f32f147ea357734cb2f64083e Mon Sep 17 00:00:00 2001 From: root Date: Sat, 8 Aug 2020 11:49:22 -0700 Subject: [PATCH 54/62] Add type aliases --- README.md | 3 --- covid19_sfbayarea/data/__init__.py | 2 -- covid19_sfbayarea/data/format_error.py | 7 ------- covid19_sfbayarea/data/sonoma.py | 18 +++++++++--------- 4 files changed, 9 insertions(+), 21 deletions(-) delete mode 100644 covid19_sfbayarea/data/format_error.py diff --git a/README.md b/README.md index a4c12550..c0988bff 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,6 @@ To install this project, you can simply run `./install.sh` in your terminal. Thi ## Running the scraper To run the scraper, you can use the run script by typing `sh run_scraper.sh` into your terminal. This will enable the virtual environment and run `scraper.py`. Once again, the virtual environment will not stay active after the script finishes running. If you want to run the scraper without the run script, enable the virtual environment, then run `python3 scraper.py`. -## Running the API -The best way to run the API right now is to run the command `FLASK_APP="app.py" FLASK_ENV=development flask run;`. Note that this is not the best way to run the scraper at this time. - ## Data Model The following sections document the differences between the counties in the common data model (see `data_models` directory) which we will see as we begin to get data from them. diff --git a/covid19_sfbayarea/data/__init__.py b/covid19_sfbayarea/data/__init__.py index 1349a469..498242db 100644 --- a/covid19_sfbayarea/data/__init__.py +++ b/covid19_sfbayarea/data/__init__.py @@ -12,8 +12,6 @@ 'san_francisco': san_francisco, # 'san_mateo': None, # 'santa_clara': None, - # 'solano': None, 'sonoma': sonoma, 'solano': solano, - # 'sonoma': None, } diff --git a/covid19_sfbayarea/data/format_error.py b/covid19_sfbayarea/data/format_error.py deleted file mode 100644 index 96ac077f..00000000 --- a/covid19_sfbayarea/data/format_error.py +++ /dev/null @@ -1,7 +0,0 @@ -class FormatError(Exception): - """ - A custom error to raise whenever a scraper runs into something in an - unexpected format. This usually means that the website the scraper is - accessing has changed - """ - pass diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index d096e516..286f4417 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 import requests import json import re @@ -7,6 +6,9 @@ from bs4 import BeautifulSoup, element # type: ignore from ..errors import FormatError +TimeSeriesItem = Dict[str, Union[str, int]] +TimeSeries = List[TimeSeriesItem] + def get_section_by_title(header: str, soup: BeautifulSoup) -> element.Tag: """ Takes in a header string and returns the parent element of that header @@ -76,8 +78,7 @@ def get_source_meta(soup: BeautifulSoup) -> str: definitions_text = definitions_section.text return definitions_text.replace('\n', '/').strip() -# apologies for this horror of a output type -def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[str, int]]]]: +def transform_cases(cases_tag: element.Tag) -> Dict[str, TimeSeries]: """ Takes in a BeautifulSoup tag for the cases table and returns all cases (historic and active), deaths, and recoveries in the form: @@ -101,12 +102,12 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, List[Dict[str, Union[st active_cases, new_infected, dead, recoveries = [parse_int(el.text) for el in row_cells[1:]] cumul_cases += new_infected - case_dict: Dict[str, Union[str, int]] = { 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases } + case_dict: TimeSeriesItem = { 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases } cases.append(case_dict) new_deaths = dead - cumul_deaths cumul_deaths = dead - death_dict: Dict[str, Union[str, int]] = { 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead } + death_dict: TimeSeriesItem = { 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead } deaths.append(death_dict) # new_recovered = recoveries - cumul_recovered @@ -160,19 +161,19 @@ def transform_gender(tag: element.Tag) -> Dict[str, int]: categories[gender_string_conversions[gender]] = parse_int(cases) return categories -def transform_age(tag: element.Tag) -> List[Dict[str, Union[str, int]]]: +def transform_age(tag: element.Tag) -> TimeSeries: """ Transform function for the cases by age group table. Takes in a BeautifulSoup tag for a table and returns a list of dictionaries in which the keys are strings and the values integers """ - categories: List[Dict[str, Union[str, int]]] = [] + categories: TimeSeries = [] rows = get_rows(tag) for row in rows: group, cases, *rest = get_cells(row) raw_cases = parse_int(cases) - element: Dict[str, Union[str, int]] = {'group': group, 'raw_cases': raw_cases} + element: TimeSeriesItem = {'group': group, 'raw_cases': raw_cases} categories.append(element) return categories @@ -209,7 +210,6 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: race_transform = {'Asian/Pacific Islander, non-Hispanic': 'Asian', 'Hispanic/Latino': 'Latinx_or_Hispanic', 'Other*, non-Hispanic': 'Other', 'White, non-Hispanic': 'White'} rows = get_rows(race_eth_tag) for row in rows: - print(get_cells(row)) group_name, cases, *rest = get_cells(row) if group_name not in race_transform: raise FormatError('The racial group {0} is new in the data -- please adjust the scraper accordingly') From 5fdc2aa2fb95cddc25030b10c1d0ce4466abb0f8 Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Sat, 8 Aug 2020 14:03:01 -0700 Subject: [PATCH 55/62] Use get cell function for cases --- covid19_sfbayarea/data/sonoma.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 286f4417..7f3fa711 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -97,9 +97,9 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, TimeSeries]: # cumul_active = 0 rows = reversed(get_rows(cases_tag)) for row in rows: - row_cells = row.find_all(['th', 'td']) - date = dateutil.parser.parse(row_cells[0].text).date().isoformat() - active_cases, new_infected, dead, recoveries = [parse_int(el.text) for el in row_cells[1:]] + row_cells = get_cells(row) + date = dateutil.parser.parse(row_cells[0]).date().isoformat() + active_cases, new_infected, dead, recoveries = [parse_int(el) for el in row_cells[1:]] cumul_cases += new_infected case_dict: TimeSeriesItem = { 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases } @@ -131,7 +131,7 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: for row in rows: type, number, *rest = get_cells(row) if type not in transmission_type_conversion: - raise FormatError('The transmission type {0} was not found in transmission_type_conversion'.format(type)) + raise FormatError(f'The transmission type {type} was not found in transmission_type_conversion') type = transmission_type_conversion[type] transmissions[type] = parse_int(number) return transmissions @@ -171,9 +171,9 @@ def transform_age(tag: element.Tag) -> TimeSeries: rows = get_rows(tag) for row in rows: group, cases, *rest = get_cells(row) - raw_cases = parse_int(cases) + raw_count = parse_int(cases) - element: TimeSeriesItem = {'group': group, 'raw_cases': raw_cases} + element: TimeSeriesItem = {'group': group, 'raw_count': raw_count} categories.append(element) return categories @@ -197,13 +197,9 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: NB: These are the only races reported seperatley by Sonoma county at this time """ race_cases = { - 'African_Amer': 0, 'Asian': 0, 'Latinx_or_Hispanic': 0, - 'Native_Amer':0, - 'Multiple_Race':0, 'Other': 0, - 'Pacific_Islander': 0, 'White': 0, 'Unknown': 0 } From aed862f7719566729fd6c0b0cac78196b5aa79f0 Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Mon, 10 Aug 2020 17:30:14 -0700 Subject: [PATCH 56/62] Remove data model readme from main readme --- README.md | 153 ------------------------------------------------------ 1 file changed, 153 deletions(-) diff --git a/README.md b/README.md index c0988bff..d347a00d 100644 --- a/README.md +++ b/README.md @@ -9,159 +9,6 @@ To install this project, you can simply run `./install.sh` in your terminal. Thi ## Running the scraper To run the scraper, you can use the run script by typing `sh run_scraper.sh` into your terminal. This will enable the virtual environment and run `scraper.py`. Once again, the virtual environment will not stay active after the script finishes running. If you want to run the scraper without the run script, enable the virtual environment, then run `python3 scraper.py`. -## Data Model -The following sections document the differences between the counties in the common data model (see `data_models` directory) which we will see as we begin to get data from them. - -### Ages - -Please make sure to use the following age brackets for the different counties. Note that the brackets may also vary by whether you are scraping cases or deaths data: - - -#### San Francisco -##### Cases - "age": [ - {"group": "18_and_under", "raw_count": -1 }, - {"group": "18_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_and_older", "raw_count": -1} - ] -##### Deaths -Data broken down by gender is not available on the json files, only on the dashboard. - - -#### Alameda -##### Cases - "age": [ - {"group": "18_and_under", "raw_count": -1 }, - {"group": "18_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_and_older", "raw_count": -1 }, - {"group": "Unknown", "raw_count": -1 } - ] -##### Deaths -Data broken down by gender is not available. - - -#### Sonoma -##### Cases - "age": [ - {"group": "0_to_17", "raw_count": -1 }, - {"group": "18_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, - {"group": "65_and_older", "raw_count": -1 }, - {"group": "Unknown", "raw_count": -1 } - ] -##### Deaths -Data broken down by gender is not available. - - -#### Santa Clara -##### Cases - "age": [ - {"group": "20_and_under", "raw_count": -1 }, - {"group": "21_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_to_90", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 }, - {"group": "Unknown", "raw_count": -1 } - ] -##### Deaths - "age": [ - {"group": "20_and_under", "raw_count": -1 }, - {"group": "21_to_30", "raw_count": -1 }, - {"group": "31_to_40", "raw_count": -1 }, - {"group": "41_to_50", "raw_count": -1 }, - {"group": "51_to_60", "raw_count": -1 }, - {"group": "61_to_70", "raw_count": -1 }, - {"group": "71_to_80", "raw_count": -1 }, - {"group": "81_to_90", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 } - ] - - -#### San Mateo -##### Cases - "age": [ - {"group": "0_to_19", "raw_count": -1 }, - {"group": "20_to_29", "raw_count": -1 }, - {"group": "30_to_39", "raw_count": -1 }, - {"group": "40_to_49", "raw_count": -1 }, - {"group": "50_to_59", "raw_count": -1 }, - {"group": "60_to_69", "raw_count": -1 }, - {"group": "70_to_79", "raw_count": -1 }, - {"group": "80_to_89", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 } - ] -##### Deaths - age": [ - {"group": "0_to_19", "raw_count": -1 }, - {"group": "20_to_29", "raw_count": -1 }, - {"group": "30_to_39", "raw_count": -1 }, - {"group": "40_to_49", "raw_count": -1 }, - {"group": "50_to_59", "raw_count": -1 }, - {"group": "60_to_69", "raw_count": -1 }, - {"group": "70_to_79", "raw_count": -1 }, - {"group": "80_to_89", "raw_count": -1 }, - {"group": "90_and_older", "raw_count": -1 } - ] - - -#### Contra Costa -##### Cases - age": [ - {"group": "0_to_20", "raw_count": -1 }, - {"group": "21_to_40", "raw_count": -1 }, - {"group": "41_to_60", "raw_count": -1 }, - {"group": "61_to_80", "raw_count": -1 }, - {"group": "81_to_100", "raw_count": -1 } - ] -##### Deaths -Data broken down by gender is not available. - - -#### Marin -##### Cases and Deaths - age": [ - {"group": "0_to_18", "raw_count": -1 }, - {"group": "19_to_34", "raw_count": -1 }, - {"group": "35_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, - {"group": "65_and_older", "raw_count": -1 } - ] - - - -#### Solano -##### Cases and Deaths - age": [ - {"group": "0_to_18", "raw_count": -1 }, - {"group": "19_to_64", "raw_count": -1 }, - {"group": "65_and_older", "raw_count": -1 } - ] - - -#### Napa -##### Cases - age": [ - {"group": "0_to_17", "raw_count": -1 }, - {"group": "18_to_49", "raw_count": -1 }, - {"group": "50_to_64", "raw_count": -1 }, - {"group": "Over_64", "raw_count": -1 } - ] -##### Deaths -Data broken down by gender is not available. ## Development We use CircleCI to lint the code and run tests in this repository, but you can (and should!) also run tests locally. From 898672d7a5e56db2898096d4dffb3b1067b3f9e2 Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Mon, 10 Aug 2020 17:35:49 -0700 Subject: [PATCH 57/62] Add readme link --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index d347a00d..8a85271b 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ To install this project, you can simply run `./install.sh` in your terminal. Thi ## Running the scraper To run the scraper, you can use the run script by typing `sh run_scraper.sh` into your terminal. This will enable the virtual environment and run `scraper.py`. Once again, the virtual environment will not stay active after the script finishes running. If you want to run the scraper without the run script, enable the virtual environment, then run `python3 scraper.py`. +## Data Models +The data models are in JSON format and are located in the `data_models` directory. For more information, see the [data model readme](./data_models/README.md). + ## Development We use CircleCI to lint the code and run tests in this repository, but you can (and should!) also run tests locally. From a549ea475fc95c91f49bf3414b256231f59022f6 Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Wed, 12 Aug 2020 19:15:18 -0700 Subject: [PATCH 58/62] Refactor test and gender functions --- covid19_sfbayarea/data/sonoma.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 7f3fa711..681f5bcf 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -40,10 +40,10 @@ def get_cells(row: element.ResultSet) -> List[str]: """ return [el.text for el in row.find_all(['th', 'td'])] -def row_list_to_dict(row: List[str], headers: List[str]) -> Dict[str, str]: +def row_list_to_dict(row: List[str], headers: List[str]) -> TimeSeriesItem: return dict(zip(headers, row)) -def parse_table(tag: element.Tag) -> List[Dict[str, str]]: +def parse_table(tag: element.Tag) -> TimeSeries: rows = tag.find_all('tr') header = rows[0] body = rows[1:] @@ -52,6 +52,10 @@ def parse_table(tag: element.Tag) -> List[Dict[str, str]]: return [row_list_to_dict(row, header_cells) for row in body_cells] def parse_int(text: str) -> int: + """ + Takes in a number in string form and returns that string in integer form + and handles zeroes represented as dashes + """ text = text.strip() if text == '-': return 0 @@ -137,12 +141,15 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: return transmissions def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: + """ + Transform function for the tests table. + Takes in a BeautifulSoup tag for a table and returns a dictionary + """ tests = {} - rows = get_rows(tests_tag) + rows = parse_table(tests_tag) for row in rows: - result, number, *rest = get_cells(row) - lower_res = result.lower() - tests[lower_res] = parse_int(number) + lower_res = row['Results'].lower() + tests[lower_res] = parse_int(row['Number']) return tests; def transform_gender(tag: element.Tag) -> Dict[str, int]: @@ -151,15 +158,16 @@ def transform_gender(tag: element.Tag) -> Dict[str, int]: Takes in a BeautifulSoup tag for a table and returns a dictionary in which the keys are strings and the values integers """ - categories = {} - rows = get_rows(tag) + genders = {} + rows = parse_table(tag) gender_string_conversions = {'Males': 'male', 'Females': 'female'} for row in rows: - gender, cases, *rest = get_cells(row) + gender = row['Gender'] + cases = parse_int(row['Cases']) if gender not in gender_string_conversions: raise FormatError('An unrecognized gender has been added to the gender table') - categories[gender_string_conversions[gender]] = parse_int(cases) - return categories + genders[gender_string_conversions[gender]] = cases + return genders def transform_age(tag: element.Tag) -> TimeSeries: """ @@ -251,7 +259,7 @@ def get_county() -> Dict: 'tests': transform_tests(total_tests), }, } - return model + # return model if __name__ == '__main__': print(json.dumps(get_county(), indent=4)) From 97b72c1f1e0c17ed94faa2f6ad40841e29012cf1 Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Wed, 12 Aug 2020 19:30:20 -0700 Subject: [PATCH 59/62] Refactor all transforn functions but cases --- covid19_sfbayarea/data/sonoma.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 681f5bcf..3a3695b6 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -129,15 +129,16 @@ def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: {'community': -1, 'from_contact': -1, 'travel': -1, 'unknown': -1} """ transmissions = {} - rows = get_rows(transmission_tag) + rows = parse_table(transmission_tag) # turns the transmission categories on the page into the ones we're using transmission_type_conversion = {'Community': 'community', 'Close Contact': 'from_contact', 'Travel': 'travel', 'Under Investigation': 'unknown'} for row in rows: - type, number, *rest = get_cells(row) + type = row['Source'] + number = parse_int(row['Cases']) if type not in transmission_type_conversion: raise FormatError(f'The transmission type {type} was not found in transmission_type_conversion') type = transmission_type_conversion[type] - transmissions[type] = parse_int(number) + transmissions[type] = number return transmissions def transform_tests(tests_tag: element.Tag) -> Dict[str, int]: @@ -176,11 +177,10 @@ def transform_age(tag: element.Tag) -> TimeSeries: dictionaries in which the keys are strings and the values integers """ categories: TimeSeries = [] - rows = get_rows(tag) + rows = parse_table(tag) for row in rows: - group, cases, *rest = get_cells(row) - raw_count = parse_int(cases) - + raw_count = parse_int(row['Cases']) + group = row['Age Group'] element: TimeSeriesItem = {'group': group, 'raw_count': raw_count} categories.append(element) return categories @@ -212,14 +212,16 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: 'Unknown': 0 } race_transform = {'Asian/Pacific Islander, non-Hispanic': 'Asian', 'Hispanic/Latino': 'Latinx_or_Hispanic', 'Other*, non-Hispanic': 'Other', 'White, non-Hispanic': 'White'} - rows = get_rows(race_eth_tag) + rows = parse_table(race_eth_tag) for row in rows: - group_name, cases, *rest = get_cells(row) + group_name = row['Race/Ethnicity'] + cases = parse_int(row['Cases']) if group_name not in race_transform: raise FormatError('The racial group {0} is new in the data -- please adjust the scraper accordingly') internal_name = race_transform[group_name] - race_cases[internal_name] = parse_int(cases) + race_cases[internal_name] = cases race_cases['Unknown'] = get_unknown_race(race_eth_tag) + print(race_cases) return race_cases def get_table_tags(soup: BeautifulSoup) -> List[element.Tag]: From 28df7be772107b330f39b00d10542a5d30cb721d Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Wed, 12 Aug 2020 19:59:35 -0700 Subject: [PATCH 60/62] Fix types --- covid19_sfbayarea/data/sonoma.py | 34 ++++++++++---------------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 3a3695b6..cd07e088 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -8,6 +8,8 @@ TimeSeriesItem = Dict[str, Union[str, int]] TimeSeries = List[TimeSeriesItem] +UnformattedSeriesItem = Dict[str, str] +UnformattedSeries = List[UnformattedSeriesItem] def get_section_by_title(header: str, soup: BeautifulSoup) -> element.Tag: """ @@ -28,22 +30,16 @@ def get_table(header: str, soup: BeautifulSoup) -> element.Tag: # this lets us get the second cases table return tables[-1] -def get_rows(tag: element.Tag) -> List[element.ResultSet]: - """ - Gets all tr elements in a tag but the first, which is the header - """ - return tag.find_all('tr')[1:] - def get_cells(row: element.ResultSet) -> List[str]: """ Gets all th and tr elements within a single tr element """ return [el.text for el in row.find_all(['th', 'td'])] -def row_list_to_dict(row: List[str], headers: List[str]) -> TimeSeriesItem: +def row_list_to_dict(row: List[str], headers: List[str]) -> UnformattedSeriesItem: return dict(zip(headers, row)) -def parse_table(tag: element.Tag) -> TimeSeries: +def parse_table(tag: element.Tag) -> UnformattedSeries: rows = tag.find_all('tr') header = rows[0] body = rows[1:] @@ -95,15 +91,12 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, TimeSeries]: cumul_cases = 0 deaths = [] cumul_deaths = 0 - # recovered = [] - # cumul_recovered = 0 - # active = [] - # cumul_active = 0 - rows = reversed(get_rows(cases_tag)) + + rows = list(reversed(parse_table(cases_tag))) for row in rows: - row_cells = get_cells(row) - date = dateutil.parser.parse(row_cells[0]).date().isoformat() - active_cases, new_infected, dead, recoveries = [parse_int(el) for el in row_cells[1:]] + date = dateutil.parser.parse(row['Date']).date().isoformat() + new_infected = parse_int(row['New']) + dead = parse_int(row['Deaths']) cumul_cases += new_infected case_dict: TimeSeriesItem = { 'date': date, 'cases': new_infected, 'cumul_cases': cumul_cases } @@ -114,12 +107,6 @@ def transform_cases(cases_tag: element.Tag) -> Dict[str, TimeSeries]: death_dict: TimeSeriesItem = { 'date': date, 'deaths': new_deaths, 'cumul_deaths': dead } deaths.append(death_dict) - # new_recovered = recoveries - cumul_recovered - # recovered.append({ 'date': date, 'recovered': new_recovered, 'cumul_recovered': recoveries }) - # - # new_active = active_cases - cumul_active - # active.append({ 'date': date, 'active': new_active, 'cumul_active': active_cases }) - return { 'cases': cases, 'deaths': deaths } def transform_transmission(transmission_tag: element.Tag) -> Dict[str, int]: @@ -221,7 +208,6 @@ def transform_race_eth(race_eth_tag: element.Tag) -> Dict[str, int]: internal_name = race_transform[group_name] race_cases[internal_name] = cases race_cases['Unknown'] = get_unknown_race(race_eth_tag) - print(race_cases) return race_cases def get_table_tags(soup: BeautifulSoup) -> List[element.Tag]: @@ -261,7 +247,7 @@ def get_county() -> Dict: 'tests': transform_tests(total_tests), }, } - # return model + return model if __name__ == '__main__': print(json.dumps(get_county(), indent=4)) From 6ddf682204a5b58ad338e5498a80ed63664997cf Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Wed, 12 Aug 2020 20:02:36 -0700 Subject: [PATCH 61/62] Add docstrings --- covid19_sfbayarea/data/sonoma.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index cd07e088..9d5fd8fc 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -37,9 +37,17 @@ def get_cells(row: element.ResultSet) -> List[str]: return [el.text for el in row.find_all(['th', 'td'])] def row_list_to_dict(row: List[str], headers: List[str]) -> UnformattedSeriesItem: + """ + Takes in a list of headers and a corresponding list of cells + and returns a dictionary associating the headers with the cells + """ return dict(zip(headers, row)) def parse_table(tag: element.Tag) -> UnformattedSeries: + """ + Takes in a BeautifulSoup table tag and returns a list of dictionaries + where the keys correspond to header names and the values to corresponding cell values + """ rows = tag.find_all('tr') header = rows[0] body = rows[1:] From 4a92856ac6aa3c7d56a58d48bd2b9a2902800a16 Mon Sep 17 00:00:00 2001 From: ldtcoop Date: Wed, 12 Aug 2020 20:09:07 -0700 Subject: [PATCH 62/62] Use datetime attribute --- covid19_sfbayarea/data/sonoma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covid19_sfbayarea/data/sonoma.py b/covid19_sfbayarea/data/sonoma.py index 9d5fd8fc..37ab0759 100644 --- a/covid19_sfbayarea/data/sonoma.py +++ b/covid19_sfbayarea/data/sonoma.py @@ -70,7 +70,7 @@ def generate_update_time(soup: BeautifulSoup) -> str: """ Generates a timestamp string (e.g. May 6, 2020 10:00 AM) for when the scraper is run """ - update_time_text = soup.find('time', {'class': 'updated'}).text.strip() + update_time_text = soup.find('time', {'class': 'updated'})['datetime'] try: date = dateutil.parser.parse(update_time_text) except ValueError: