Skip to content

Commit

Permalink
Merge pull request #51 from elaguerta/data-cli
Browse files Browse the repository at this point in the history
#46 Add CLI to run data scrapers
  • Loading branch information
elaguerta authored May 17, 2020
2 parents 40f7ac4 + 9ca5cd5 commit 575f138
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 23 deletions.
14 changes: 14 additions & 0 deletions data_scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import Dict, Any
import data_scrapers.alameda_county as alameda_county

# Registry mapping a county key (the name accepted by the CLI in
# scraper_data.py) to the scraper module for that county.
# Commented-out entries are Bay Area counties whose scrapers have not
# been implemented yet.
# NOTE(review): values are imported modules exposing get_county();
# typed Any here — types.ModuleType may be more precise, TODO confirm.
scrapers: Dict[str, Any] = {
    'alameda': alameda_county
    # 'contra_costa': None,
    # 'marin': None,
    # 'napa': None,
    # 'san_francisco': san_francisco_county,
    # 'san_mateo': None,
    # 'santa_clara': None,
    # 'solano': None,
    # 'sonoma': None,
}
46 changes: 23 additions & 23 deletions data_scrapers/alameda_county.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup # type: ignore
import json
from typing import List, Dict
from typing import List, Dict, Tuple
from datetime import datetime, timezone
from selenium import webdriver
from selenium import webdriver # type: ignore

# Note that we are using numbers for all of Alameda County, including Berkeley
# Running this scraper requires a Firefox webdriver. The macos Firefox driver, geckodriver, is stored in ./env/bin
Expand All @@ -30,7 +30,7 @@ def get_county() -> Dict:
# Load data model template into a local dictionary called 'out'.
with open('./data_models/data_model.json') as template:
out = json.load(template)

# populate dataset headers
out["name"] = "Alameda County"
out["source_url"] = landing_page
Expand All @@ -41,7 +41,7 @@ def get_county() -> Dict:
response.raise_for_status()
cases_header = response.json()
timestamp = cases_header["editingInfo"]["lastEditDate"]
# Raise an exception if a timezone is specified. If "dateFieldsTimeReference" is present, we need to edit this scraper to handle it.
# Raise an exception if a timezone is specified. If "dateFieldsTimeReference" is present, we need to edit this scraper to handle it.
# See: https://developers.arcgis.com/rest/services-reference/layer-feature-service-.htm#GUID-20D36DF4-F13A-4B01-AA05-D642FA455EB6
if "dateFieldsTimeReference" in cases_header["editingInfo"] or "editFieldsInfo" in cases_header:
raise FutureWarning("A timezone may now be specified in the metadata.")
Expand All @@ -61,14 +61,14 @@ def get_county() -> Dict:


# Confirmed Cases and Deaths
def get_timeseries() -> Dict:
def get_timeseries() -> Dict:
"""Fetch daily and cumulative cases and deaths by day
Returns the dictionary value for "series": {"cases":[], "deaths":[]}.
To create a DataFrame from this dictionary, run
'pd.DataFrame(get_timeseries())'
"""

series = {"cases":[], "deaths":[]} # dictionary holding the timeseries for cases and deaths
series: Dict[str, List] = {"cases":[], "deaths":[]} # dictionary holding the timeseries for cases and deaths
# Dictionary of 'source_label': 'target_label' for re-keying
TIMESERIES_KEYS = {
'Date': 'date',
Expand All @@ -79,12 +79,12 @@ def get_timeseries() -> Dict:
}

# query API
param_list = {'where':'0=0', 'resultType': 'none', 'outFields': 'Date,AC_Cases,AC_CumulCases,AC_Deaths,AC_CumulDeaths', 'outSR': 4326,'orderByField': 'Date', 'f': 'json'}
param_list = {'where':'0=0', 'resultType': 'none', 'outFields': 'Date,AC_Cases,AC_CumulCases,AC_Deaths,AC_CumulDeaths', 'outSR': '4326','orderByField': 'Date', 'f': 'json'}
response = requests.get(cases_deaths, params=param_list)
response.raise_for_status()
parsed = response.json()
features = [obj["attributes"] for obj in parsed['features']]

# convert dates
for obj in features:
month, day, year = obj['Date'].split('/')
Expand All @@ -94,7 +94,7 @@ def get_timeseries() -> Dict:
day = '0' + day
obj['Date'] = "{}-{}-{}".format(year, month, day)



re_keyed = [{TIMESERIES_KEYS[key]: value for key, value in entry.items()}
for entry in features]
Expand Down Expand Up @@ -125,24 +125,24 @@ def get_notes() -> str:
driver.quit()
return '\n\n'.join(notes)

def get_demographics(out:Dict) -> (Dict, List):
def get_demographics(out: Dict) -> Tuple[Dict, List]:
"""Fetch cases and deaths by age, gender, race, ethnicity
Returns the dictionary value for {"cases_totals": {}, "death_totals":{}}, as well as a list of
strings describing datapoints that have a value of "<10".
To create a DataFrame from the dictionary, run 'pd.DataFrame(get_demographics()[0])'
Returns the dictionary value for {"cases_totals": {}, "death_totals":{}}, as well as a list of
strings describing datapoints that have a value of "<10".
To create a DataFrame from the dictionary, run 'pd.DataFrame(get_demographics()[0])'
Note that the DataFrame will convert the "<10" strings to NaN.
"""
# Dicts of target_label : source_label for re-keying.
# Note that the cases table includes MTF and FTM, but the deaths table does not.
# Dicts of target_label : source_label for re-keying.
# Note that the cases table includes MTF and FTM, but the deaths table does not.
GENDER_KEYS = {"female": "Female", "male": "Male",
"unknown": "Unknown_Sex", "mtf": "MTF", "ftm": "FTM"}
"unknown": "Unknown_Sex", "mtf": "MTF", "ftm": "FTM"}
RACE_KEYS = {"Latinx_or_Hispanic": "Hispanic_Latino", "Asian": "Asian", "African_Amer": "African_American_Black",
"White": "White", "Pacific_Islander": "Pacific_Islander", "Native_Amer": "Native_American", "Multiple_Race": "Multirace",
"Other": "Other_Race", "Unknown": "Unknown_Race"}


# format query to get entry for Alameda County
param_list = {'where': "Geography='Alameda County'", 'outFields': '*', 'outSR':4326, 'f':'json'}
param_list = {'where': "Geography='Alameda County'", 'outFields': '*', 'outSR':'4326', 'f':'json'}
# get cases data
response = requests.get(demographics_cases, params=param_list)
response.raise_for_status()
Expand All @@ -153,9 +153,9 @@ def get_demographics(out:Dict) -> (Dict, List):
response.raise_for_status()
parsed = response.json()
deaths_data = parsed['features'][0]['attributes']

# copy dictionary structure of 'out' dictionary to local variable
demo_totals = { "case_totals": out["case_totals"], "death_totals": out["death_totals"]}
demo_totals = { "case_totals": out["case_totals"], "death_totals": out["death_totals"]}

# Parse and re-key
# gender cases and deaths
Expand All @@ -171,7 +171,7 @@ def get_demographics(out:Dict) -> (Dict, List):
demo_totals["case_totals"]["age_group"] = { k: v for k, v in cases_data.items() if 'Age' in k }
demo_totals["death_totals"]["age_group"] = {k: v for k, v in deaths_data.items() if 'Age' in k}

# Handle values equal to '<10', if any. Note that some data points are entered as `null`, which
# Handle values equal to '<10', if any. Note that some data points are entered as `null`, which
# will be decoded as Python's `None`
counts_lt_10 = []
for cat, cat_dict in demo_totals.items(): # cases, deaths
Expand All @@ -181,7 +181,7 @@ def get_demographics(out:Dict) -> (Dict, List):
counts_lt_10.append(f"{cat}.{group}.{key}")
elif val is None: # proactively set None values to our default value of -1
group_dict[key] = - 1
else: # if else, this value should be a number. check that val can be cast to an int.
else: # if else, this value should be a number. check that val can be cast to an int.
try:
int(val)
except ValueError:
Expand All @@ -190,4 +190,4 @@ def get_demographics(out:Dict) -> (Dict, List):

if __name__ == '__main__':
""" When run as a script, prints the data to stdout"""
print(json.dumps(get_county(), indent=4))
print(json.dumps(get_county(), indent=4))
2 changes: 2 additions & 0 deletions run_scraper_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Activate the project's virtualenv, then run the data-scraper CLI.
source env/bin/activate;
# "$@" (quoted) forwards every CLI argument verbatim; the unquoted $@
# would re-split arguments that contain spaces.
python3 scraper_data.py "$@";
37 changes: 37 additions & 0 deletions scraper_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
import click
import json
import data_scrapers
from typing import Tuple
from pathlib import Path


COUNTY_NAMES : Tuple[str,...]= tuple(data_scrapers.scrapers.keys())


@click.command(help='Create a .json with data for one or more counties. Supported '
               f'counties: {", ".join(COUNTY_NAMES)}.')
@click.argument('counties', metavar='[COUNTY]...', nargs=-1,
                type=click.Choice(COUNTY_NAMES, case_sensitive=False))
@click.option('--output', help='write output file to this directory')
def main(counties: Tuple[str, ...], output: str) -> None:
    """Scrape data for the requested counties and emit it as JSON.

    With no COUNTY arguments, every county in COUNTY_NAMES is scraped.
    If --output is given, writes <output>/data.json (creating the
    directory if needed); otherwise prints the JSON document to stdout.
    """
    # Default to all supported counties when none are named.
    if not counties:
        counties = COUNTY_NAMES

    # Run each scraper's get_county() and collect results keyed by county.
    out = {county: data_scrapers.scrapers[county].get_county()
           for county in counties}

    if output:
        parent = Path(output)
        # parents=True also creates missing intermediate directories;
        # exist_ok=True makes re-runs into the same directory succeed.
        parent.mkdir(parents=True, exist_ok=True)
        with parent.joinpath('data.json').open('w', encoding='utf-8') as f:
            json.dump(out, f, ensure_ascii=False, indent=2)
    else:
        print(json.dumps(out, indent=2))


if __name__ == '__main__':
    main()

0 comments on commit 575f138

Please sign in to comment.