Add Marin County Scraper #80

Merged: 50 commits, merged on Sep 3, 2020

Commits (50)
95f93c2
i think I got series data for the cases
kwonangela7 May 19, 2020
46d23cf
tried a variety of things to download csvs, eventually selected the r…
kwonangela7 May 26, 2020
e3e2357
added in Rob's suggested code
kwonangela7 May 26, 2020
9893f63
revised csv parsing logic now that I'm working with a csv_string
kwonangela7 May 27, 2020
a5a66d8
finished breakdown parsings
kwonangela7 Jun 2, 2020
a27073f
finalized series and test scraping methods with function annotations …
kwonangela7 Jun 10, 2020
30ccf08
fixed the bug so that only chart notes from the charts I'm looking at…
kwonangela7 Jun 10, 2020
cffeaea
Merge branch 'master' of https://github.com/sfbrigade/data-covid19-sf…
kwonangela7 Jun 10, 2020
cb9947c
moved marin scraper to folder
kwonangela7 Jun 10, 2020
b53c84e
deleted extra copy of marin_scraper.py
kwonangela7 Jun 11, 2020
ac14dfb
pulled new files
kwonangela7 Jun 20, 2020
c108ac1
converted tab to 4 spaces
kwonangela7 Jun 20, 2020
c6a969f
raised error for wrong kind of href
kwonangela7 Jun 20, 2020
c3b6f1a
Merge branch 'master' into marin-county
kwonangela7 Jun 24, 2020
0156f85
Update covid19_sfbayarea/data/marin_scraper.py
kwonangela7 Jun 28, 2020
029f367
changed module name
kwonangela7 Jun 28, 2020
cf1ddb3
deleted marin_scraper.py
kwonangela7 Jun 28, 2020
07760b2
Merge branch 'master' into marin-county
kwonangela7 Jun 28, 2020
a36b2c0
renamed file, will rename at the end lol
kwonangela7 Jun 28, 2020
0aadb08
renamed county function, added scraper to init file
kwonangela7 Jun 28, 2020
37880a6
pls ignore previous renaming commits, this is the actual commit to pr…
kwonangela7 Jun 28, 2020
9b90510
removing file with the wrong name
kwonangela7 Jun 28, 2020
8b2f8b9
added import to init statement
kwonangela7 Jun 29, 2020
39ce2bb
used soup.select('h4+p') instead of find_next_sibling + threw error
kwonangela7 Jul 1, 2020
7521dfb
fixed get_case_series to use csv modeul, not use numpy, and use the p…
kwonangela7 Jul 7, 2020
1e3fcbc
fixed case and deaths series data + breakdown functions to use csv mo…
kwonangela7 Jul 8, 2020
5b04be9
testing to get the most recent commits on this branch
Jul 11, 2020
850650e
Merge branch 'marin-county' of https://github.com/sfbrigade/data-covi…
Jul 11, 2020
2ba5273
simplified test logic
kwonangela7 Jul 15, 2020
d574680
fixed testing data logic, fixed age mappings. The raw counts for age …
kwonangela7 Jul 16, 2020
0b94cc4
fixed linter errors
kwonangela7 Jul 17, 2020
f7f532b
Merge branch 'master' into marin-county
kwonangela7 Jul 17, 2020
eb079be
ready to write up code in context managers tomorrow
kwonangela7 Jul 17, 2020
0fc2903
Merge branch 'marin-county' of https://github.com/sfbrigade/data-covi…
kwonangela7 Jul 17, 2020
862a240
rewrote metadata and extract csv functions using context managers
kwonangela7 Jul 18, 2020
153d379
fixed half of metadata function, not sure what's wrong with the other…
kwonangela7 Jul 23, 2020
90e75ee
fixed metadata function - finallygit add covid19_sfbayarea/data/marin…
kwonangela7 Aug 18, 2020
176cbd7
Merge remote-tracking branch 'origin/master' into marin-county
kwonangela7 Aug 18, 2020
6d12de7
added data points to data model needed for marin, updated README, and…
kwonangela7 Aug 22, 2020
04e62e1
fixed linter issue
kwonangela7 Aug 22, 2020
ef46d85
Update covid19_sfbayarea/data/marin.py
kwonangela7 Aug 29, 2020
b4826cc
removed instances of inmate as that data is not collected by marin co…
kwonangela7 Aug 29, 2020
e4f586f
updated README - inmate section
kwonangela7 Aug 29, 2020
c1e3be8
Merge branch 'marin-county' of https://github.com/sfbrigade/data-covi…
kwonangela7 Aug 29, 2020
e4b185b
updated Race and Ethnicity README
kwonangela7 Aug 29, 2020
d35f453
made sure to use 4-space indentation, fixed test series function to o…
kwonangela7 Aug 29, 2020
eb7c0c1
updated meta_from_baypd
kwonangela7 Aug 29, 2020
f5d8ae1
updated meta_from_source
kwonangela7 Aug 29, 2020
3a03180
updated meta_from_source about testing data nuances
kwonangela7 Aug 29, 2020
6d55284
Delete inmates from population_totals
elaguerta Sep 3, 2020
3 changes: 2 additions & 1 deletion covid19_sfbayarea/data/__init__.py
@@ -1,13 +1,14 @@
from typing import Dict, Any
from . import alameda
from . import san_francisco
from . import marin
from . import sonoma
from . import solano

scrapers: Dict[str, Any] = {
'alameda': alameda,
# 'contra_costa': None,
# 'marin': None,
'marin': marin,
# 'napa': None,
'san_francisco': san_francisco,
# 'san_mateo': None,
277 changes: 277 additions & 0 deletions covid19_sfbayarea/data/marin.py
@@ -0,0 +1,277 @@
#!/usr/bin/env python3
import csv
from typing import List, Dict, Tuple
from bs4 import BeautifulSoup # type: ignore
from urllib.parse import unquote_plus
from datetime import datetime
from contextlib import contextmanager
import time


from ..webdriver import get_firefox
from .utils import get_data_model

def get_county() -> Dict:
"""Main method for populating county data"""

url = 'https://coronavirus.marinhhs.org/surveillance'
model = get_data_model()

chart_ids = {"cases": "Eq6Es", "deaths": "Eq6Es", "inmates": "KCNZn", "age": "zSHDs", "gender": "FEciW", "race_eth": "aBeEd"}
    # The "tests" chart ('2Hgir') was dropped from chart_ids because it seems to have disappeared from the website.

model['name'] = "Marin County"
model['update_time'] = datetime.today().isoformat()
model["meta_from_baypd"] = ["There's no actual update time on their website. Not all charts are updated daily.", "The cases and deaths total include inmate numbers, but the cases and deaths series, the testing data and data broken down by race/ethnicity, gender and age do not."]
model['source_url'] = url
model['meta_from_source'] = get_chart_meta(url, chart_ids)

model["series"]["cases"] = get_series_data(chart_ids["cases"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_cases", 'Total Cases', 'cases')
model["series"]["deaths"] = get_series_data(chart_ids["deaths"], url, ['Date', 'Total Cases', 'Total Recovered*', 'Total Hospitalized', 'Total Deaths'], "cumul_deaths", 'Total Deaths', 'deaths')
    model["inmates"]["cases"], model["inmates"]["deaths"] = get_inmate_totals(chart_ids["inmates"], url)

#model["series"]["tests"] = get_test_series(chart_ids["tests"], url)
model["case_totals"]["age_group"], model["death_totals"]["age_group"] = get_breakdown_age(chart_ids["age"], url)
model["case_totals"]["gender"], model["death_totals"]["gender"] = get_breakdown_gender(chart_ids["gender"], url)
model["case_totals"]["race_eth"], model["death_totals"]["race_eth"] = get_breakdown_race_eth(chart_ids["race_eth"], url)
return model

@contextmanager
def chart_frame(driver, chart_id: str): # type: ignore
    """Switch the driver into the chart's iframe; on exit, restore the default content and quit the driver."""
frame = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]')
driver.switch_to.frame(frame)
try:
yield frame
finally:
driver.switch_to.default_content()
driver.quit()

def get_chart_data(url: str, chart_id: str) -> List[str]:
"""This method extracts parsed csv data from the csv linked in the data wrapper charts."""
with get_firefox() as driver:
driver.implicitly_wait(30)
driver.get(url)

with chart_frame(driver, chart_id):
csv_data = driver.find_element_by_class_name('dw-data-link').get_attribute('href')
            # The href is a data: URI; decode its payload into CSV lines.
if csv_data.startswith('data:'):
media, data = csv_data[5:].split(',', 1)
# Will likely always have this kind of data type
if media != 'application/octet-stream;charset=utf-8':
raise ValueError(f'Cannot handle media type "{media}"')
csv_string = unquote_plus(data)
csv_data = csv_string.splitlines()
else:
raise ValueError('Cannot handle this csv_data href')

return csv_data

def get_chart_meta(url: str, chart_ids: Dict[str, str]) -> Tuple[List, List]:
"""This method gets all the metadata underneath the data wrapper charts and the metadata at the top of the county dashboard."""
metadata: set = set()
chart_metadata: set = set()

with get_firefox() as driver:
driver.implicitly_wait(30)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html5lib')

for soup_obj in soup.findAll('div', attrs={"class":"surveillance-data-text"}):
if soup_obj.findAll('p'):
metadata = set({paragraph.text.replace("\u2014","").replace("\u00a0", "").replace("\u2019","") for paragraph in soup_obj.findAll('p')})
else:
raise ValueError('Metadata location has changed.')

    with get_firefox() as driver:  # A fresh driver session is used here because reusing the first one intermittently raised connection errors.
driver.implicitly_wait(30)
driver.get(url)
        # Collect the iframe source URL for each chart so its notes block can be scraped directly.
        # Scraping the notes blocks from the embedding page proved unreliable, so each chart's own page is loaded below instead.
source_list: set = set()
for chart_id in chart_ids.values():
driver.implicitly_wait(30)
source = driver.find_element_by_css_selector(f'iframe[src*="//datawrapper.dwcdn.net/{chart_id}/"]').get_attribute('src')
source_list.add(source)

with get_firefox() as driver:
for source in source_list:
driver.get(source)
            time.sleep(5)  # Give the chart page time to finish rendering; driver.get() returns before the notes block is fully populated.
soup = BeautifulSoup(driver.page_source, 'html5lib')
for data in soup.findAll('div', attrs = {'class': 'notes-block'}):
chart_metadata.add(data.text.strip())

    # chart_metadata is a set because several charts share identical notes text; both sets are returned as lists.
return list(metadata), list(chart_metadata)

def get_inmate_totals(chart_id: str, url: str) -> Tuple:
"""This method extracts the number of cases and deaths for San Quentin inmates."""
csv_data = get_chart_data(url, chart_id)
csv_reader = csv.DictReader(csv_data)

keys = csv_reader.fieldnames

if keys != ['Updated', 'Total Confirmed Cases', 'Total Resolved Cases', 'COVID-19 Deaths']:
raise ValueError('The headers have changed')

for row in csv_reader:
cases = row['Total Confirmed Cases']
deaths = row['COVID-19 Deaths']

return (cases, deaths)

def get_series_data(chart_id: str, url: str, headers: list, model_typ: str, typ: str, new_count: str) -> List:
"""This method extracts the date, number of cases/deaths, and new cases/deaths."""

csv_data = get_chart_data(url, chart_id)
csv_reader = csv.DictReader(csv_data)

keys = csv_reader.fieldnames

series: list = list()

if keys != headers:
raise ValueError('The headers have changed')

history: list = list()

for row in csv_reader:
daily: dict = dict()
date_time_obj = datetime.strptime(row['Date'], '%m/%d/%Y')
daily["date"] = date_time_obj.strftime('%Y-%m-%d')
# Collect the case totals in order to compute the change in cases per day
history.append(int(row[typ]))
daily[model_typ] = int(row[typ])
series.append(daily)

history_diff: list = list()
    # Subtracting pairwise elements, so the range stops one short to avoid an off-by-one error.
for i in range(0, len(history)-1):
history_diff.append((int(history[i+1]) - int(history[i])) + int(series[0][model_typ]))
    # series[0][model_typ] has always been observed to be 0, but that is not assumed here.
history_diff.insert(0, int(series[0][model_typ]))

for val, num in enumerate(history_diff):
series[val][new_count] = num
return series

def get_breakdown_age(chart_id: str, url: str) -> Tuple[List, List]:
"""This method gets the breakdown of cases and deaths by age."""
csv_data = get_chart_data(url, chart_id)
csv_reader = csv.DictReader(csv_data)

keys = csv_reader.fieldnames

c_brkdown: list = list()
d_brkdown: list = list()

if keys != ['Age Category', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']:
raise ValueError('The headers have changed')

key_mapping = {"0-9": "0_to_9", "10-18": "10_to_18", "19-34": "19_to_34", "35-49": "35_to_49", "50-64": "50_to_64", "65-79": "65_to_79", "80-94": "80_to_94", "95+": "95_and_older"}

for row in csv_reader:
c_age: dict = dict()
d_age: dict = dict()
# Extracting the age group and the raw count for both cases and deaths.
c_age["group"], d_age["group"] = row['Age Category'], row['Age Category']
if c_age["group"] not in key_mapping:
raise ValueError(str(c_age["group"]) + ' is not in the list of age groups. The age groups have changed.')
else:
c_age["group"] = key_mapping[c_age["group"]]
c_age["raw_count"] = int(row["Cases"])
d_age["group"] = key_mapping[d_age["group"]]
d_age["raw_count"] = int(row["Deaths"])
c_brkdown.append(c_age)
d_brkdown.append(d_age)

return c_brkdown, d_brkdown

def get_breakdown_gender(chart_id: str, url: str) -> Tuple[Dict, Dict]:
"""This method gets the breakdown of cases and deaths by gender."""
csv_data = get_chart_data(url, chart_id)
csv_reader = csv.DictReader(csv_data)

keys = csv_reader.fieldnames

if keys != ['Gender', 'POPULATION', 'Cases', 'Hospitalizations', 'Deaths']:
raise ValueError('The headers have changed.')

genders = ['male', 'female']
c_gender: dict = dict()
d_gender: dict = dict()

for row in csv_reader:
# Extracting the gender and the raw count (the 3rd and 5th columns, respectively) for both cases and deaths.
# Each new row has data for a different gender.
gender = row["Gender"].lower()
if gender not in genders:
            raise ValueError("The genders have changed.")
c_gender[gender] = int(row["Cases"])
d_gender[gender] = int(row["Deaths"])

return c_gender, d_gender

def get_breakdown_race_eth(chart_id: str, url: str) -> Tuple[Dict, Dict]:
"""This method gets the breakdown of cases and deaths by race/ethnicity."""

csv_data = get_chart_data(url, chart_id)
csv_reader = csv.DictReader(csv_data)

keys = csv_reader.fieldnames

if keys != ['Race/Ethnicity', 'COUNTY POPULATION', 'Cases', 'Case Percent', 'Hospitalizations', 'Hospitalizations Percent', 'Deaths', 'Deaths Percent']:
raise ValueError("The headers have changed.")

key_mapping = {"Black/African American":"African_Amer", "Hispanic/Latino": "Latinx_or_Hispanic", "White": "White", "Asian": "Asian", "Native Hawaiian/Pacific Islander": "Pacific_Islander", "American Indian/Alaska Native": "Native_Amer", "Multi or Other Race": "Multi_or_Other"}

c_race_eth: dict = dict()
d_race_eth: dict = dict()

for row in csv_reader:
race_eth = row["Race/Ethnicity"]
if race_eth not in key_mapping:
raise ValueError("The race_eth groups have changed.")
else:
c_race_eth[key_mapping[race_eth]] = int(row["Cases"])
d_race_eth[key_mapping[race_eth]] = int(row["Deaths"])

return c_race_eth, d_race_eth

def get_test_series(chart_id: str, url: str) -> List:
"""This method gets the date, the number of positive and negative tests on that date, and the number of cumulative positive and negative tests."""
csv_data = get_chart_data(url, chart_id)

    # The tests CSV arrives transposed: one row each for dates, positive counts, and negative counts;
    # the [1:] slice drops each row's leading label column.
    dates, positives, negatives = [row.split(',')[1:] for row in csv_data]
series = zip(dates, positives, negatives)

test_series: list = list()

cumul_pos = 0
cumul_neg = 0
for entry in series:
daily: dict = dict()
        # Some rows at the end of the series report 'null' for the positive-test count.
        # Once a 'null' row is encountered, stop and return the series collected so far.
if entry[1] != 'null':
date_time_obj = datetime.strptime(entry[0], '%m/%d/%Y')
daily["date"] = date_time_obj.strftime('%Y-%m-%d')
daily["positive"] = int(entry[1])
cumul_pos += daily["positive"]
daily["negative"] = int(entry[2])
cumul_neg += daily["negative"]
daily["cumul_pos"] = cumul_pos
daily["cumul_neg"] = cumul_neg
test_series.append(daily)
else:
break

return test_series
16 changes: 15 additions & 1 deletion data_models/README.md
@@ -86,6 +86,7 @@ Below are the tabulations we are making by gender, age group, race/ethnicity, an
"Pacific_Islander":-1,
"White":-1,
"Unknown":-1
"Multi_or_Other": -1
},
"underlying_cond": {
"none":-1,
@@ -137,7 +138,18 @@ The fields will be used for normalizing the county case and death tabulations, a
}
```

5. __Hospitalization Data__
5. __Inmate Data__

This part of the data model currently applies only to Marin County, which reports inmate case and death counts separately from the counts for the rest of the Marin County community. Note that the inmate case and death data is not available in series form; only aggregated totals are reported.

```
"inmates": {
"cases": -1,
"deaths": -1
}
```

6. __Hospitalization Data__

California COVID-19 hospitalization data is retrieved separately from the
[California Health and Human Services Open Data Portal
@@ -205,6 +217,8 @@ Scraper authors, please keep an eye out for amendments to the data model.
# Race and Ethnicity
We need to collapse counties that report race and ethnicity into one race/ethnicity dimension. This section will be updated pending information about San Francisco County's methods for reporting race and ethnicity.

The category "Multi_or_Other" was included because Marin rolls up the numbers from "Multi" and "Other" into one.
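For illustration, a minimal sketch of collapsing a county's reported race/ethnicity labels onto the shared keys (the helper name is hypothetical; the mapping mirrors the `key_mapping` used in `marin.py` above):

```
# Hypothetical helper; the mapping mirrors key_mapping in marin.py above.
MARIN_RACE_ETH_MAP = {
    "Black/African American": "African_Amer",
    "Hispanic/Latino": "Latinx_or_Hispanic",
    "White": "White",
    "Asian": "Asian",
    "Native Hawaiian/Pacific Islander": "Pacific_Islander",
    "American Indian/Alaska Native": "Native_Amer",
    "Multi or Other Race": "Multi_or_Other",
}

def collapse_race_eth(county_counts: dict) -> dict:
    """Map county-reported race/ethnicity labels onto the shared data-model keys."""
    return {MARIN_RACE_ETH_MAP[label]: count for label, count in county_counts.items()}
```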

# Gender
One potential future issue is that some counties still lump non-binary and transgender people under "Other", while other counties have started to differentiate. Our data model would ideally match the most detailed county's gender categories. A county that reports only an "Other" category would have the value -1 for the non male/female categories, indicating that it does not collect that information. However, this means that our `"Other"` category would not be internally comparable or consistent. The `"Other"` category for a county that reports "Male, Female, Other, MTF, FTM" as separate datapoints should really be called `"Other - not MTF, not FTM"` and is not comparable to the `"Other"` category for a county that only reports "Male, Female, Other".
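For illustration, a minimal sketch of how this convention might represent a county that reports only "Male, Female, Other" (the key names and counts below are hypothetical, not the exact data-model keys):

```
# Hypothetical sketch: gender tabulation for a county that reports only
# "Male", "Female", and "Other". Key names and counts are illustrative.
gender_totals = {
    "male": 210,    # illustrative count
    "female": 195,  # illustrative count
    "other": 4,     # illustrative count
    "mtf": -1,      # -1: this county does not collect the category
    "ftm": -1,      # -1: this county does not collect the category
}
```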

14 changes: 10 additions & 4 deletions data_models/data_model.json
@@ -6,12 +6,12 @@
"meta_from_baypd": "STORE IMPORTANT NOTES ABOUT OUR METHODS HERE",
"series": {
"cases": [
{ "date": "yyyy-mm-dd", "cases": -1, "cumul_cases": -1},
{ "date": "yyyy-mm-dd", "cases": -1, "cumul_cases": -1 },
{ "date": "yyyy-mm-dd", "cases": -1, "cumul_cases": -1 }
],
"deaths": [
{ "date": "yyyy-mm-dd", "deaths": -1, "cumul_deaths": -1 },
{ "date": "yyyy-mm-dd", "deaths": -1, "cumul_deaths": -1}
{ "date": "yyyy-mm-dd", "deaths": -1, "cumul_deaths": -1 }
],
"tests": [
{
@@ -57,7 +57,8 @@
"Other": -1,
"Pacific_Islander":-1,
"White":-1,
"Unknown":-1
"Unknown":-1,
"Multi_or_Other": -1
},
"transmission_cat": {
"community": -1,
@@ -84,7 +85,8 @@
"Other": -1,
"Pacific_Islander":-1,
"White":-1,
"Unknown":-1
"Unknown":-1,
"Multi_or_Other": -1
},
"underlying_cond": {
"none":-1,
@@ -128,5 +130,9 @@
"White":-1,
"Unknown":-1
}
},
"inmates": {
"cases": -1,
"deaths": -1
}
}