Skip to content

Commit

Permalink
Merge pull request #51 from elaguerta/data-cli
Browse files Browse the repository at this point in the history
#46 Add CLI to run data scrapers
  • Loading branch information
elaguerta authored May 17, 2020
2 parents 40f7ac4 + 9ca5cd5 commit 575f138
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 23 deletions.
14 changes: 14 additions & 0 deletions data_scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import Dict, Any
import data_scrapers.alameda_county as alameda_county

# Registry mapping a county key (the name accepted by the CLI in
# scraper_data.py) to the scraper module for that county.
# Commented-out entries are Bay Area counties whose scrapers have not
# been implemented yet.
# NOTE(review): values are imported modules exposing get_county();
# typed Any here — types.ModuleType may be more precise, TODO confirm.
scrapers: Dict[str, Any] = {
    'alameda': alameda_county
    # 'contra_costa': None,
    # 'marin': None,
    # 'napa': None,
    # 'san_francisco': san_francisco_county,
    # 'san_mateo': None,
    # 'santa_clara': None,
    # 'solano': None,
    # 'sonoma': None,
}
46 changes: 23 additions & 23 deletions data_scrapers/alameda_county.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup # type: ignore
import json
from typing import List, Dict
from typing import List, Dict, Tuple
from datetime import datetime, timezone
from selenium import webdriver
from selenium import webdriver # type: ignore

# Note that we are using numbers for all of Alameda County, including Berkeley
# Running this scraper requires a Firefox webdriver. The macos Firefox driver, geckodriver, is stored in ./env/bin
Expand All @@ -30,7 +30,7 @@ def get_county() -> Dict:
# Load data model template into a local dictionary called 'out'.
with open('./data_models/data_model.json') as template:
out = json.load(template)

# populate dataset headers
out["name"] = "Alameda County"
out["source_url"] = landing_page
Expand All @@ -41,7 +41,7 @@ def get_county() -> Dict:
response.raise_for_status()
cases_header = response.json()
timestamp = cases_header["editingInfo"]["lastEditDate"]
# Raise an exception if a timezone is specified. If "dateFieldsTimeReference" is present, we need to edit this scraper to handle it.
# Raise an exception if a timezone is specified. If "dateFieldsTimeReference" is present, we need to edit this scraper to handle it.
# See: https://developers.arcgis.com/rest/services-reference/layer-feature-service-.htm#GUID-20D36DF4-F13A-4B01-AA05-D642FA455EB6
if "dateFieldsTimeReference" in cases_header["editingInfo"] or "editFieldsInfo" in cases_header:
raise FutureWarning("A timezone may now be specified in the metadata.")
Expand All @@ -61,14 +61,14 @@ def get_county() -> Dict:


# Confirmed Cases and Deaths
def get_timeseries() -> Dict:
def get_timeseries() -> Dict:
"""Fetch daily and cumulative cases and deaths by day
Returns the dictionary value for "series": {"cases":[], "deaths":[]}.
To create a DataFrame from this dictionary, run
'pd.DataFrame(get_timeseries())'
"""

series = {"cases":[], "deaths":[]} # dictionary holding the timeseries for cases and deaths
series: Dict[str, List] = {"cases":[], "deaths":[]} # dictionary holding the timeseries for cases and deaths
# Dictionary of 'source_label': 'target_label' for re-keying
TIMESERIES_KEYS = {
'Date': 'date',
Expand All @@ -79,12 +79,12 @@ def get_timeseries() -> Dict:
}

# query API
param_list = {'where':'0=0', 'resultType': 'none', 'outFields': 'Date,AC_Cases,AC_CumulCases,AC_Deaths,AC_CumulDeaths', 'outSR': 4326,'orderByField': 'Date', 'f': 'json'}
param_list = {'where':'0=0', 'resultType': 'none', 'outFields': 'Date,AC_Cases,AC_CumulCases,AC_Deaths,AC_CumulDeaths', 'outSR': '4326','orderByField': 'Date', 'f': 'json'}
response = requests.get(cases_deaths, params=param_list)
response.raise_for_status()
parsed = response.json()
features = [obj["attributes"] for obj in parsed['features']]

# convert dates
for obj in features:
month, day, year = obj['Date'].split('/')
Expand All @@ -94,7 +94,7 @@ def get_timeseries() -> Dict:
day = '0' + day
obj['Date'] = "{}-{}-{}".format(year, month, day)



re_keyed = [{TIMESERIES_KEYS[key]: value for key, value in entry.items()}
for entry in features]
Expand Down Expand Up @@ -125,24 +125,24 @@ def get_notes() -> str:
driver.quit()
return '\n\n'.join(notes)

def get_demographics(out:Dict) -> (Dict, List):
def get_demographics(out: Dict) -> Tuple[Dict, List]:
"""Fetch cases and deaths by age, gender, race, ethnicity
Returns the dictionary value for {"cases_totals": {}, "death_totals":{}}, as well as a list of
strings describing datapoints that have a value of "<10".
To create a DataFrame from the dictionary, run 'pd.DataFrame(get_demographics()[0])'
Returns the dictionary value for {"cases_totals": {}, "death_totals":{}}, as well as a list of
strings describing datapoints that have a value of "<10".
To create a DataFrame from the dictionary, run 'pd.DataFrame(get_demographics()[0])'
Note that the DataFrame will convert the "<10" strings to NaN.
"""
# Dicts of target_label : source_label for re-keying.
# Note that the cases table includes MTF and FTM, but the deaths table does not.
# Dicts of target_label : source_label for re-keying.
# Note that the cases table includes MTF and FTM, but the deaths table does not.
GENDER_KEYS = {"female": "Female", "male": "Male",
"unknown": "Unknown_Sex", "mtf": "MTF", "ftm": "FTM"}
"unknown": "Unknown_Sex", "mtf": "MTF", "ftm": "FTM"}
RACE_KEYS = {"Latinx_or_Hispanic": "Hispanic_Latino", "Asian": "Asian", "African_Amer": "African_American_Black",
"White": "White", "Pacific_Islander": "Pacific_Islander", "Native_Amer": "Native_American", "Multiple_Race": "Multirace",
"Other": "Other_Race", "Unknown": "Unknown_Race"}


# format query to get entry for Alameda County
param_list = {'where': "Geography='Alameda County'", 'outFields': '*', 'outSR':4326, 'f':'json'}
param_list = {'where': "Geography='Alameda County'", 'outFields': '*', 'outSR':'4326', 'f':'json'}
# get cases data
response = requests.get(demographics_cases, params=param_list)
response.raise_for_status()
Expand All @@ -153,9 +153,9 @@ def get_demographics(out:Dict) -> (Dict, List):
response.raise_for_status()
parsed = response.json()
deaths_data = parsed['features'][0]['attributes']

# copy dictionary structure of 'out' dictionary to local variable
demo_totals = { "case_totals": out["case_totals"], "death_totals": out["death_totals"]}
demo_totals = { "case_totals": out["case_totals"], "death_totals": out["death_totals"]}

# Parse and re-key
# gender cases and deaths
Expand All @@ -171,7 +171,7 @@ def get_demographics(out:Dict) -> (Dict, List):
demo_totals["case_totals"]["age_group"] = { k: v for k, v in cases_data.items() if 'Age' in k }
demo_totals["death_totals"]["age_group"] = {k: v for k, v in deaths_data.items() if 'Age' in k}

# Handle values equal to '<10', if any. Note that some data points are entered as `null`, which
# Handle values equal to '<10', if any. Note that some data points are entered as `null`, which
# will be decoded as Python's `None`
counts_lt_10 = []
for cat, cat_dict in demo_totals.items(): # cases, deaths
Expand All @@ -181,7 +181,7 @@ def get_demographics(out:Dict) -> (Dict, List):
counts_lt_10.append(f"{cat}.{group}.{key}")
elif val is None: # proactively set None values to our default value of -1
group_dict[key] = - 1
else: # if else, this value should be a number. check that val can be cast to an int.
else: # if else, this value should be a number. check that val can be cast to an int.
try:
int(val)
except ValueError:
Expand All @@ -190,4 +190,4 @@ def get_demographics(out:Dict) -> (Dict, List):

if __name__ == '__main__':
""" When run as a script, prints the data to stdout"""
print(json.dumps(get_county(), indent=4))
print(json.dumps(get_county(), indent=4))
2 changes: 2 additions & 0 deletions run_scraper_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Activate the project's virtualenv, then run the data-scraper CLI.
source env/bin/activate;
# "$@" (quoted) forwards every CLI argument verbatim; the unquoted $@
# would re-split arguments that contain spaces.
python3 scraper_data.py "$@";
37 changes: 37 additions & 0 deletions scraper_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
import click
import json
import data_scrapers
from typing import Tuple
from pathlib import Path


COUNTY_NAMES : Tuple[str,...]= tuple(data_scrapers.scrapers.keys())


@click.command(help='Create a .json with data for one or more counties. Supported '
               f'counties: {", ".join(COUNTY_NAMES)}.')
@click.argument('counties', metavar='[COUNTY]...', nargs=-1,
                type=click.Choice(COUNTY_NAMES, case_sensitive=False))
@click.option('--output', help='write output file to this directory')
def main(counties: Tuple[str, ...], output: str) -> None:
    """Scrape data for the requested counties and emit it as JSON.

    With no COUNTY arguments, every county in COUNTY_NAMES is scraped.
    If --output is given, writes <output>/data.json (creating the
    directory if needed); otherwise prints the JSON document to stdout.
    """
    # Default to all supported counties when none are named.
    if not counties:
        counties = COUNTY_NAMES

    # Run each scraper's get_county() and collect results keyed by county.
    out = {county: data_scrapers.scrapers[county].get_county()
           for county in counties}

    if output:
        parent = Path(output)
        # parents=True also creates missing intermediate directories;
        # exist_ok=True makes re-runs into the same directory succeed.
        parent.mkdir(parents=True, exist_ok=True)
        with parent.joinpath('data.json').open('w', encoding='utf-8') as f:
            json.dump(out, f, ensure_ascii=False, indent=2)
    else:
        print(json.dumps(out, indent=2))


if __name__ == '__main__':
    main()

0 comments on commit 575f138

Please sign in to comment.