CI: test Python download code on PR #337

Open. Wants to merge 35 commits into base branch feat/pull-v2-api.
Commits (35)
c00ba75
wip: copying files over from queries repo
ckingbailey Oct 30, 2023
5e15b3c
set data_dir to .local in tests
ckingbailey Oct 30, 2023
11fdf8e
cp files from latest commit to ckingbailey v2 repo
ckingbailey Nov 1, 2023
503fe04
100 random transactions for test data
ckingbailey Nov 13, 2023
a75d958
wip: add schedule d expenditures
ckingbailey Nov 13, 2023
4f82a7b
gitignore vscode
ckingbailey Nov 13, 2023
7310f65
add polars to requirements
ckingbailey Nov 13, 2023
091a346
rm 1 debug line from a-contribs
ckingbailey Nov 13, 2023
865a7c5
d_expends works
ckingbailey Nov 15, 2023
0382353
wip: reusable schedule model
ckingbailey Nov 15, 2023
e292ee0
wip: polars no like filer_nid as int
ckingbailey Nov 15, 2023
62b19b7
use schedule model for a-contribs
ckingbailey Nov 16, 2023
9f49321
filings works with polars
ckingbailey Nov 17, 2023
1d2a97a
extremely simple test on filings
ckingbailey Nov 17, 2023
c594109
no more from_filers, init committees in __init__
ckingbailey Nov 22, 2023
ab03a37
leave a nice comment about dupe influences
ckingbailey Nov 22, 2023
7b120c0
simple test on committees
ckingbailey Nov 22, 2023
19c83a8
simple test on elections
ckingbailey Nov 22, 2023
90ac741
fix test on d-expends for simplified committees
ckingbailey Nov 25, 2023
4697968
a simple test on main
ckingbailey Nov 26, 2023
8ee168b
rm pandas dependency in comittees
ckingbailey Nov 27, 2023
ca149f9
forgot to commit test on main
ckingbailey Nov 27, 2023
dc2efa8
some reusable fixtures
ckingbailey Nov 27, 2023
a6d18e7
use reusable fixtures in schedules tests
ckingbailey Nov 27, 2023
6a25f6f
a simple test on transactions
ckingbailey Nov 29, 2023
903cddd
specify dtypes in transaction
ckingbailey Nov 29, 2023
9eab669
clean test_a_contribs
ckingbailey Nov 29, 2023
eac060e
add trans json fixture
ckingbailey Nov 29, 2023
41e13d7
use reusable json fixtures everywhere
ckingbailey Nov 29, 2023
2ff3e74
rm most pandas imports
ckingbailey Nov 29, 2023
9bc4eab
fix a-contribs type hint
ckingbailey Dec 5, 2023
08fe8af
add workflow to test python downloader
ckingbailey Dec 6, 2023
dac00c4
name workflow file
ckingbailey Dec 6, 2023
f81b905
debug: try other events to trigger
ckingbailey Dec 6, 2023
94874a0
restore branch filters
ckingbailey Dec 6, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -4,3 +4,4 @@ downloads/cached-db
inputs
.local
**/__pycache__
.vscode/
21 changes: 21 additions & 0 deletions download/.github/workflows/main.yml
@@ -0,0 +1,21 @@
name: "Check Google Drive Access"
on:
workflow_dispatch:
jobs:
check:
runs-on: ubuntu-latest
env:
REPO_OWNER: ${{ github.repository_owner}}
REPO_BRANCH: ${{ github.ref_name }}
SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }}
GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }}
steps:
- uses: actions/checkout@v3
- run: "pip install -r gdrive_requirements.txt"
- run: "python test_pull_from_gdrive.py"
- name: Archive pulled files
uses: actions/upload-artifact@v2
with:
name: redacted-netfile-files
path: .local/downloads

32 changes: 32 additions & 0 deletions download/.github/workflows/pr_check.yaml
@@ -0,0 +1,32 @@
name: Python tests

on:
  pull_request:
    branches:
      - feat/pull-v2-api
  push:
    branches:
      - ci/test-downloader
  workflow_dispatch:

env:
  working_dir: download

jobs:
  run_tests:
    name: Run tests on Python download code
    runs-on: ubuntu-22.04
    defaults:
      run:
        working-directory: ${{ env.working_dir }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version-file: ${{ env.working_dir }}/.python-version
          cache: pip
          cache-dependency-path: ${{ env.working_dir }}/requirements.txt
      - run: pip install -r requirements.txt
      - name: Run tests
        run: pytest tests/test_*.py

7 changes: 7 additions & 0 deletions download/.gitignore
@@ -0,0 +1,7 @@
.venv/
__pycache__
.env
.idea
.vscode
SERVICE_ACCOUNT_KEY_JSON.json
.local
1 change: 1 addition & 0 deletions download/.python-version
@@ -0,0 +1 @@
3.11
3 changes: 3 additions & 0 deletions download/README.md
@@ -0,0 +1,3 @@
# Query NetFile V2 API to Load Disclosure-Backend DB

Run `main.py` to download the raw JSON files and create CSV files. Before downloading, place the key for the service account used to access Google Drive at `.local/SERVICE_ACCOUNT_KEY_JSON.json`.
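
A minimal usage sketch (assuming the dependencies in `requirements.txt` are installed, the snippet is run from the `download/` directory, and the service account key is already saved at `.local/SERVICE_ACCOUNT_KEY_JSON.json`):

```python
# Minimal usage sketch: run the downloader end to end.
# Assumes requirements.txt is installed and the Google Drive service account key
# is saved at .local/SERVICE_ACCOUNT_KEY_JSON.json, as described above.
from main import main

main()  # pulls raw JSON into .local/downloads and writes CSV files to .local/
```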
5 changes: 5 additions & 0 deletions download/conftest.py
@@ -0,0 +1,5 @@
import pytest

pytest_plugins = [
"tests.fixtures.data_fixtures"
]
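
A hypothetical sketch of what a shared fixture module registered via `pytest_plugins` might look like; the fixture name and data below are illustrative only, not the repo's actual fixtures:

```python
# tests/fixtures/data_fixtures.py (hypothetical illustration)
import pytest

@pytest.fixture
def filings_json():
    """ Toy stand-in for shared JSON fixture data. """
    return [{'filingNid': '1', 'filerNid': '10'}]

# Any test can then request the fixture by name, with no explicit import:
def test_filings_fixture(filings_json):
    assert filings_json[0]['filerNid'] == '10'
```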
79 changes: 79 additions & 0 deletions download/main.py
@@ -0,0 +1,79 @@
""" main, to run everything """
import json
from model.a_contributions import A_Contributions
from model.committee import Committees
from model.election import Elections
from model.filing import Filings
from model.transaction import Transactions

from gdrive_datastore.gdrive import pull_data

DATA_DIR_PATH = '.local/downloads'
OUTPUT_DIR = '.local'

def unique_statuses(filers):
""" What are the unique values for status? """
return set(
s['status'] for f in filers
for s in f['statusList']
)

def main():
""" Do everyting """
# pull data from gdrive and put it in .local/downloads
pull_data(subfolder='main', default_folder='OpenDisclosure')

with open(f'{DATA_DIR_PATH}/elections.json', encoding='utf8') as f:
elections_json = json.loads(f.read())

elections = Elections(elections_json)

with open(f'{DATA_DIR_PATH}/filers.json', encoding='utf8') as f:
filers = json.loads(f.read())

committees = Committees(filers, elections.pl)

# A-Contribs:
# join filers + filings + elections + transactions
# transactions.filing_nid -> filings.filing_nid
# filings.filer_nid -> committees.filer_nid
# committees.Ballot_Measure_Election -> elections.Ballot_Measure_Election
# where trans['transaction']['calTransactionType'] == 'F460A'
with open(f'{DATA_DIR_PATH}/filings.json', encoding='utf8') as f:
filings = Filings(json.loads(f.read())).pl

with open(f'{DATA_DIR_PATH}/transactions.json', encoding='utf8') as f:
records = json.loads(f.read())
transactions = Transactions(records).pl

a_contributions = A_Contributions(transactions, filings, committees.pl)
a_contribs_df = a_contributions.df
if not a_contribs_df.is_empty:
print(a_contribs_df.drop(columns=[
'BakRef_TID',
'Bal_Name',
'Bal_Juris',
'Bal_Num',
'Dist_No',
'Form_Type',
'Int_CmteId',
'Juris_Cd',
'Juris_Dscr',
'Loan_Rate',
'Memo_Code',
'Memo_RefNo',
'Off_S_H_Cd',
'tblCover_Offic_Dscr',
'tblCover_Office_Cd',
'tblDetlTran_Office_Cd',
'tblDetlTran_Offic_Dscr',
'XRef_SchNm',
'XRef_Match',
]).sample(n=20))

elections.pl.write_csv(f'{OUTPUT_DIR}/elections.csv')
committees.pl.write_csv(f'{OUTPUT_DIR}/committees.csv')
a_contributions.df.write_csv(f'{OUTPUT_DIR}/a_contributions.csv')

if __name__ == '__main__':
main()
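
A small self-contained polars sketch of the join chain described in the A-Contribs comment inside `main()`. The toy rows and flat column names are illustrative assumptions only; the actual join is implemented in `model/schedule.py`, which is not part of this diff:

```python
# Illustrative sketch of the A-Contribs join chain (toy data, flat column names).
import polars as pl

transactions = pl.DataFrame({
    'filing_nid': [1, 2],
    'calTransactionType': ['F460A', 'F460D'],
    'amount': [100.0, 250.0],
})
filings = pl.DataFrame({'filing_nid': [1, 2], 'filer_nid': [10, 20]})
committees = pl.DataFrame({'filer_nid': [10, 20], 'Ballot_Measure_Election': ['oakland-march-2020', None]})
elections = pl.DataFrame({'Ballot_Measure_Election': ['oakland-march-2020'], 'date': ['2020-03-03']})

a_contribs = (
    transactions
    .filter(pl.col('calTransactionType') == 'F460A')  # keep Schedule A transactions only
    .join(filings, on='filing_nid')                   # transactions.filing_nid -> filings.filing_nid
    .join(committees, on='filer_nid')                 # filings.filer_nid -> committees.filer_nid
    .join(elections, on='Ballot_Measure_Election')    # committees.BME -> elections.BME
)
print(a_contribs)
```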
Empty file added download/model/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions download/model/a_contributions.py
@@ -0,0 +1,24 @@
"""
Schedule A, Contributions
Hopefully this can be joined with other Schedule classes into a single Transaction class
"""
import polars as pl
from .schedule import ScheduleBase

class A_Contributions(ScheduleBase):
"""
Each record represents Schedule A - Contributions from form 460
"""
def __init__(
self,
transactions:pl.DataFrame,
filings:pl.DataFrame,
committees:pl.DataFrame
):
self._form_id = 'F460A'
super().__init__(
self._form_id,
transactions,
filings,
committees
)
51 changes: 51 additions & 0 deletions download/model/base.py
@@ -0,0 +1,51 @@
""" This is the base model, upon all others shall be based """
import pandas as pd
import polars as pl

class BaseModel:
""" Base model other models inherit from """
def __init__(self, data):
self._data = data
self._df = None
self._pl = None
self._dtypes = []
self._pl_dtypes = []
self._sql_dtypes = []
self._sql_cols = []
self._sql_table_name = ''

@property
def data(self):
""" Just return the data """
return self._data

@property
def pl(self):
''' Return a Polars dataframe '''
if self._pl is None or self._pl.is_empty():
self._pl = pl.DataFrame(self._data, schema=self._pl_dtypes)

return self._pl

@property
def df(self):
""" Get a dataframe of the data """
if self._df is None or self._df.empty:
self._df = pd.DataFrame(self._data).astype(self._dtypes)

return self._df

def to_sql(self, connection, **kwargs):
""" Write to a postgresql table """
options = {
'index_label': 'id',
'if_exists': 'replace'
}
options.update(kwargs)

self.df[self._sql_cols].to_sql(
self._sql_table_name,
connection,
dtype=self._sql_dtypes,
**options
)
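
A short usage sketch of the lazy, cached DataFrame properties on `BaseModel`. `Widgets` is a hypothetical subclass used only for illustration, and the import assumes the snippet runs from the `download/` directory:

```python
# Hypothetical subclass showing how the cached `pl` property behaves.
import polars as pl
from model.base import BaseModel

class Widgets(BaseModel):
    def __init__(self, data):
        super().__init__(data)
        self._pl_dtypes = {'widget_id': pl.UInt64, 'name': pl.Utf8}

widgets = Widgets([{'widget_id': 1, 'name': 'gear'}])
print(widgets.pl)                # DataFrame is built on first access
print(widgets.pl is widgets.pl)  # True: later accesses reuse the cached frame
```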
123 changes: 123 additions & 0 deletions download/model/committee.py
@@ -0,0 +1,123 @@
""" This is the Committee model """
from typing import List
import polars as pl
from sqlalchemy.types import String
from . import base

class Committees(base.BaseModel):
""" A collection of committees """
def __init__(self, filers:List[dict], elections:pl.DataFrame):
empty_election_influence = {
'electionDate': None,
'measure': None,
'candidate': None,
'doesSupport': None,
'startDate': None,
'endDate': None
}

super().__init__([
{
'filer_nid': int(f['filerNid']),
# 'Ballot_Measure_Election': [ *elections[elections['date'] == infl['electionDate']]['name'].array, None ][0],
'Ballot_Measure_Election': self._get_possibly_empty_ballot_measure_election(
elections,
infl
),
'Filer_ID': f['registrations'].get('CA SOS'),
'Filer_NamL': infl.get('committeeName', f['filerName']),
'_Status': 'INACTIVE' if f['isTerminated'] else 'ACTIVE',
'_Committee_Type': (f['committeeTypes'][0]
if len(f['committeeTypes']) == 1
else 'Multiple Types'),
'Ballot_Measure': infl['measure'].get('measureNumber') if infl['measure'] else None,
'Support_Or_Oppose': self.support_or_oppose(infl),
'candidate_controlled_id': None, # TODO: link to candidates if candidate committee
'Start_Date': infl['startDate'],
'End_Date': infl['endDate'],
'data_warning': None,
'Make_Active': None
} for f in filers
for infl in (
# TODO: This is slightly effed because some filers have duplicate electionInfluences
# See: filer with filerName "Families in Action For Justice Fund"
# I guess we have to dedupe electionInfluences blurg
f['electionInfluences']
if f['electionInfluences']
else [ empty_election_influence ]
)
if f['registrations'].get('CA SOS')
])
self._dtypes = {
'filer_nid': int,
'Ballot_Measure_Election': 'string',
'Filer_ID': 'string',
'Filer_NamL': 'string',
'_Status': 'string',
'_Committee_Type': 'string',
'Ballot_Measure': 'string',
'Support_Or_Oppose': 'string',
'candidate_controlled_id': 'string',
'Start_Date': 'string',
'End_Date': 'string',
'data_warning': 'string',
'Make_Active': 'string'
}
self._pl_dtypes = {
'filer_nid': pl.UInt64,
'Ballot_Measure_Election': pl.Utf8,
'Filer_ID': pl.Utf8,
'Filer_NamL': pl.Utf8,
'_Status': pl.Utf8,
'_Committee_Type': pl.Utf8,
'Ballot_Measure': pl.Utf8,
'Support_Or_Oppose': pl.Utf8,
'candidate_controlled_id': pl.Utf8,
'Start_Date': pl.Utf8,
'End_Date': pl.Utf8,
'data_warning': pl.Utf8,
'Make_Active': pl.Utf8
}
self._sql_dtypes = {
'Ballot_Measure_Election': String,
'Filer_ID': String,
'Filer_NamL': String,
'_Status': String,
'_Committee_Type': String,
'Ballot_Measure': String,
'Support_Or_Oppose': String,
'candidate_controlled_id': String,
'Start_Date': String,
'End_Date': String,
'data_warning': String,
'Make_Active': String
}
self._sql_cols = self._sql_dtypes.keys()
self._sql_table_name = 'committees'

@staticmethod
def support_or_oppose(influence):
"""
Return 'S' or 'O' code only for committees that support or oppose measures,
or committees that oppose candidates
"""
sup_opp_cd = 'S' if influence['doesSupport'] else 'O'

if (influence['measure'] is not None or influence['candidate'] and sup_opp_cd == 'O'):
return sup_opp_cd

@staticmethod
def _get_possibly_empty_ballot_measure_election(elections: pl.DataFrame, influence: dict):
'''
The Ballot Measure Election is the election's slugified `name` like "oakland-march-2020".
To get the BME for a committee, we match the `electionDate` of an `influence` object
of the committee against election `date`. Then we unpack the results into a list,
appending None in case no matches were found. Finally we return the first index of the
list, which will contain either the matched election slug or None.
'''
return [
*elections.lazy().filter(
pl.col('date') == influence['electionDate']
).first().collect().get_column('name'),
None
][0]
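
A tiny standalone illustration of the "unpack plus trailing None" pattern that `_get_possibly_empty_ballot_measure_election` uses, with toy data only:

```python
# Returns the first matching election name, or None when the filter matches nothing.
import polars as pl

elections = pl.DataFrame({'name': ['oakland-march-2020'], 'date': ['2020-03-03']})

def first_match_or_none(elections: pl.DataFrame, election_date: str):
    return [
        *elections.lazy().filter(pl.col('date') == election_date).first().collect().get_column('name'),
        None,
    ][0]

print(first_match_or_none(elections, '2020-03-03'))  # 'oakland-march-2020'
print(first_match_or_none(elections, '1999-01-01'))  # None
```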
23 changes: 23 additions & 0 deletions download/model/d_expenditures.py
@@ -0,0 +1,23 @@
'''
FPPC Form 460, Schedule D, Expenditures
'''
import polars as pl
from .schedule import ScheduleBase

class DExpenditures(ScheduleBase):
    '''
    Schedule D - Expenditures from FPPC Form 460
    '''
    def __init__(
        self,
        transactions: pl.DataFrame,
        filings: pl.DataFrame,
        committees: pl.DataFrame
    ):
        self._form_id = 'F460D'
        super().__init__(
            self._form_id,
            transactions,
            filings,
            committees
        )