diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 7ef8f731f..1401b3194 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -8,3 +8,9 @@ ENV PYTHONUNBUFFERED 1 RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ && apt-get -y install --no-install-recommends postgresql-client ruby-full +COPY ./.devcontainer/post-create-command.sh /scripts/ +COPY ./requirements.txt requirements.txt +COPY ./Gemfile Gemfile +COPY ./Gemfile.lock Gemfile.lock +RUN bash /scripts/post-create-command.sh + diff --git a/.devcontainer/README b/.devcontainer/README index 3a1c348ce..5e9685b2c 100644 --- a/.devcontainer/README +++ b/.devcontainer/README @@ -2,4 +2,3 @@ The dev container can be used to work on the project in a consistent environment independent of what machine you are working on. When working in the dev container, you will have a postgres instance running. You can access the postgres instance just by running psql. - diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c880fcea8..87eb3c386 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,7 +4,6 @@ "service": "app", "workspaceFolder": "/workspace", "remoteUser": "vscode", - "postCreateCommand": "bash ./.devcontainer/post-create-command.sh", "postStartCommand": "git config --global --add safe.directory ${containerWorkspaceFolder}", "forwardPorts": [4567, 5432], "extensions": [ diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index a90bde562..d84f95db6 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -26,12 +26,10 @@ services: network_mode: service:db db: - #image: postgres:latest - image: postgres:15.4 + image: postgres:15.6-bullseye restart: unless-stopped volumes: - postgres-data:/var/lib/postgresql/data - #- ../pgdata:/var/lib/postgresql/data environment: POSTGRES_USER: app_user POSTGRES_DB: "disclosure-backend" diff --git a/.devcontainer/enable-ssh-agent.ps1 b/.devcontainer/enable-ssh-agent.ps1 deleted file mode 100644 index 310d46384..000000000 --- a/.devcontainer/enable-ssh-agent.ps1 +++ /dev/null @@ -1,5 +0,0 @@ -# To use git in the dev container, you have to enable ssh agent on Windows. Try running the following in a PowerShell as Administrator. -# Make sure you're running as an Administrator -Set-Service ssh-agent -StartupType Automatic -Start-Service ssh-agent -Get-Service ssh-agent diff --git a/.devcontainer/post-create-command.sh b/.devcontainer/post-create-command.sh index a6636fdb4..50e3d85ac 100644 --- a/.devcontainer/post-create-command.sh +++ b/.devcontainer/post-create-command.sh @@ -1,8 +1,10 @@ #!/bin/bash +set -e + pip install --upgrade pip #pip install 'urllib3[secure]' pip install -r requirements.txt -pip install -r gdrive_requirements.txt +#pip install -r download/requirements.txt sudo gem install pg bundler sudo bundle install diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 091c40daf..60c734a6b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,25 +1,112 @@ -# This workflow will later be replaced with logic to "Generate Website Data" -# The verify-gdrive.yml workflow file will be renamed to this one -# We have to introduce this change in steps because GitHub gets confused until -# we add the new workflow file to the master branch name: "Generate Website Data" on: workflow_dispatch: + push: +env: + POSTGRES_USER: app_user + POSTGRES_DB: disclosure-backend + POSTGRES_PASSWORD: app_password jobs: - generate: + build: runs-on: ubuntu-latest - env: - REPO_OWNER: ${{ github.repository_owner}} - REPO_BRANCH: ${{ github.ref_name }} - SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }} - GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }} + outputs: + devcontainer: ${{ steps.filter.outputs.devcontainer }} + noncontainer: ${{ steps.filter.outputs.noncontainer }} steps: - - uses: actions/checkout@v3 - - run: pip install -r gdrive_requirements.txt - - run: python test_pull_from_gdrive.py - - name: Archive pulled files - uses: actions/upload-artifact@v3 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 with: - name: redacted-netfile-files - path: .local/downloads + registry: ghcr.io + username: ${{github.actor}} + password: ${{secrets.GITHUB_TOKEN}} + - uses: actions/checkout@v3 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v40 + - name: List all changed files + id: filter + run: | + echo ${{github.event_name}} + noncontainer=true + if docker pull ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest; then + devcontainer=false + else + devcontainer=true + fi + for file in ${{ steps.changed-files.outputs.all_changed_files }}; do + echo "$file was changed" + if [[ ${{github.event_name}} = push ]]; then + if [[ $file = .devcontainer* ]]; then + devcontainer=true + elif [[ $file = *requirements.txt* ]]; then + devcontainer=true + elif [[ $file = Gemfile* ]]; then + devcontainer=true + fi + fi + done + + echo "devcontainer=$devcontainer" >> $GITHUB_OUTPUT + echo "noncontainer=$noncontainer" >> $GITHUB_OUTPUT + - name: Build dev container + if: steps.filter.outputs.devcontainer == 'true' + run: | + docker build --no-cache --tag ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest -f ./.devcontainer/Dockerfile . + docker push ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest + - name: Check code changes + if: steps.filter.outputs.noncontainer == 'true' + run: | + echo "TODO: run test to verify that code changes are good" + generate: + needs: build + if: needs.build.outputs.noncontainer == 'true' + runs-on: ubuntu-latest + container: + image: ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.github_token }} + env: + REPO_OWNER: ${{ github.repository_owner}} + REPO_BRANCH: ${{ github.ref_name }} + SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }} + GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }} + PGHOST: postgres + PGDATABASE: ${{ env.POSTGRES_DB }} + PGUSER: ${{ env.POSTGRES_USER }} + PGPASSWORD: ${{ env.POSTGRES_PASSWORD }} + services: + postgres: + #image: postgres:9.6-bullseye + image: postgres:15.6-bullseye + env: + POSTGRES_USER: ${{ env.POSTGRES_USER }} + POSTGRES_DB: ${{ env.POSTGRES_DB }} + POSTGRES_PASSWORD: ${{ env.POSTGRES_PASSWORD }} + steps: + - uses: actions/checkout@v4 + - name: Check setup + run: | + git -v + # This keeps git from thinking that the current dir is not a repo even though a .git dir exists + git config --global --add safe.directory "$GITHUB_WORKSPACE" + psql -l + echo "c1,c2" > test.csv + echo "a,b" >> test.csv + cat test.csv + csvsql -v --db postgresql:///disclosure-backend --insert test.csv + echo "List tables" + psql -c "SELECT * FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';" + pip show sqlalchemy + - name: Create csv files + run: | + make clean + make download + make import + make process + - name: Summarize results + run: | + echo "List tables" + psql -c "SELECT * FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';" + diff --git a/.github/workflows/verify-gdrive.yml b/.github/workflows/verify-gdrive.yml index 8cb029157..fcb748dac 100644 --- a/.github/workflows/verify-gdrive.yml +++ b/.github/workflows/verify-gdrive.yml @@ -4,6 +4,12 @@ on: jobs: check: runs-on: ubuntu-latest + container: + image: ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.github_token }} + env: REPO_OWNER: ${{ github.repository_owner}} REPO_BRANCH: ${{ github.ref_name }} @@ -11,8 +17,8 @@ jobs: GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }} steps: - uses: actions/checkout@v3 - - run: pip install -r gdrive_requirements.txt - - run: python test_pull_from_gdrive.py + - name: Test pull from gdrive + run: python test_pull_from_gdrive.py - name: Archive pulled files uses: actions/upload-artifact@v3 with: diff --git a/Makefile b/Makefile index bf7a9a39b..d3c749c08 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,11 @@ clean-spreadsheets: rm -rf downloads/csv/*.csv downloads/csv/office_elections.csv downloads/csv/measure_committees.csv downloads/csv/elections.csv clean: - rm -rf downloads/raw downloads/csv + rm -rf downloads/raw downloads/csv .local/downloads .local/csv + git --version + python --version + ruby --version + psql --version process: process.rb # todo: remove RUBYOPT variable when activerecord fixes deprecation warnings @@ -21,6 +25,9 @@ process: process.rb bin/report-candidates git --no-pager diff build/digests.json +download-netfile-v2: + python download/main.py + download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ downloads/csv/referendums.csv downloads/csv/name_to_number.csv \ downloads/csv/office_elections.csv downloads/csv/elections.csv @@ -36,7 +43,8 @@ upload-cache: tar czf - downloads/csv downloads/static downloads/cached-db \ | aws s3 cp - s3://odca-data-cache/$(shell date +%Y-%m-%d).tar.gz --acl public-read -download: download-spreadsheets \ +download: download-netfile-v2 \ + download-spreadsheets \ download-COAK-2014 download-COAK-2015 download-COAK-2016 \ download-COAK-2017 download-COAK-2018 \ download-COAK-2019 download-COAK-2020 \ @@ -110,9 +118,7 @@ do-import-spreadsheets: csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference downloads/csv/elections.csv echo 'ALTER TABLE "elections" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME) -import-data: 496 497 A-Contributions B1-Loans B2-Loans C-Contributions \ - D-Expenditure E-Expenditure F-Expenses F461P5-Expenditure F465P3-Expenditure \ - F496P3-Contributions G-Expenditure H-Loans I-Contributions Summary +import-data: import-old-data import-new-data echo 'CREATE TABLE IF NOT EXISTS "calculations" (id SERIAL PRIMARY KEY, subject_id integer, subject_type varchar(30), name varchar(40), value jsonb);' | psql $(DATABASE_NAME) ./bin/remove_duplicate_transactions ./bin/make_view @@ -124,9 +130,19 @@ recreatedb: reindex: ruby search_index.rb +import-new-data: elections_v2 committees_v2 a_contributions_v2 + echo 'TODO: add new data to import' + +import-old-data: 496 497 A-Contributions B1-Loans B2-Loans C-Contributions \ + D-Expenditure E-Expenditure F-Expenses F461P5-Expenditure F465P3-Expenditure \ + F496P3-Contributions G-Expenditure H-Loans I-Contributions Summary + 496 497 A-Contributions B1-Loans B2-Loans C-Contributions D-Expenditure E-Expenditure F-Expenses F461P5-Expenditure F465P3-Expenditure F496P3-Contributions G-Expenditure H-Loans I-Contributions Summary: DATABASE_NAME=$(DATABASE_NAME) ./bin/import-file $(CSV_PATH) $@ +elections_v2 committees_v2 a_contributions_v2: + DATABASE_NAME=$(DATABASE_NAME) ./bin/import-file $(CSV_PATH) $@ 0 + downloads/csv/candidates.csv: mkdir -p downloads/csv downloads/raw $(WGET) -O- \ diff --git a/bin/clean b/bin/clean index e9ac2487d..dafae6847 100755 --- a/bin/clean +++ b/bin/clean @@ -18,4 +18,6 @@ cat <<-QUERY | psql ${database_name} DELETE FROM "$table_name" WHERE "Tran_Date" is NULL; QUERY +else + echo fi diff --git a/bin/import-file b/bin/import-file index 635bfe786..843afe532 100755 --- a/bin/import-file +++ b/bin/import-file @@ -2,10 +2,12 @@ # Contains logic to import files regardless of how many there are. # If there's no file, don't do anything with the database. # If the table already exists in the database, don't try to re-create it. +# The fix_pending parameter defaults to 1 if not set. Set it to 0 +# to skip the section that fixes the pending Filer_ID # # Usage: -# bin/import-file [csv_path] [table] -# bin/import-file downloads/csv A-Contributions +# bin/import-file [csv_path] [table] [fix_pending] +# bin/import-file downloads/csv A-Contributions 1 set -euo pipefail if [ -z "${DATABASE_NAME:-""}" ]; then @@ -14,12 +16,13 @@ if [ -z "${DATABASE_NAME:-""}" ]; then fi if [ $# -eq 0 ]; then - echo "Usage: bin/import-file [csv_path] [table]" + echo "Usage: bin/import-file [csv_path] [table] [fix_pending]" exit 1 fi csv_path=$1 table_name=$2 +fix_pending=${3:-1} filename_glob=$csv_path'/*'${table_name}'.csv' table_exists= if psql disclosure-backend -c '\d "'${table_name}'"' >/dev/null 2>&1; then @@ -34,9 +37,10 @@ if ls $filename_glob 2>/dev/null >/dev/null; then csvsql --db postgresql:///$DATABASE_NAME --tables $table_name --insert --no-inference --no-create echo -n ' Removing empty Tran_Date... ' ./bin/clean "$DATABASE_NAME" "$table_name" - echo - echo -n ' Fixing pending Filer_IDs... ' - ./bin/fix-pending "$DATABASE_NAME" "$table_name" + if [ "$fix_pending" = "1" ]; then + echo -n ' Fixing pending Filer_IDs... ' + ./bin/fix-pending "$DATABASE_NAME" "$table_name" + fi else echo 'Found no files to import' fi diff --git a/build/_data/stats.json b/build/_data/stats.json index 4a04467b4..702469e97 100644 --- a/build/_data/stats.json +++ b/build/_data/stats.json @@ -1,3 +1,3 @@ { - "date_processed": "2024-06-28 00:08:43 -0700" + "date_processed": "2024-07-02 00:08:35 -0700" } diff --git a/build/schema.sql b/build/schema.sql index 0e830b976..3345c740b 100644 --- a/build/schema.sql +++ b/build/schema.sql @@ -966,7 +966,7 @@ ALTER TABLE public."G-Expenditure" OWNER TO travis; CREATE TABLE public."H-Loans" ( "Filer_ID" integer NOT NULL, - "Filer_NamL" character varying(47) NOT NULL, + "Filer_NamL" character varying(60) NOT NULL, "Report_Num" character varying(3) NOT NULL, "Committee_Type" character varying(3) NOT NULL, "Rpt_Date" date NOT NULL, @@ -998,7 +998,7 @@ CREATE TABLE public."H-Loans" ( "Loan_EMP" character varying(32), "Loan_OCC" character varying(32), "Loan_Self" boolean NOT NULL, - "Cmte_ID" integer NOT NULL, + "Cmte_ID" integer, "Tres_NamL" character varying(32), "Tres_NamF" character varying(32), "Tres_NamT" character varying(32), diff --git a/dbschema/H-Loans.sql b/dbschema/H-Loans.sql index 9627025cc..265b66d1a 100644 --- a/dbschema/H-Loans.sql +++ b/dbschema/H-Loans.sql @@ -1,6 +1,6 @@ CREATE TABLE "H-Loans" ( "Filer_ID" INTEGER NOT NULL, - "Filer_NamL" VARCHAR(47) NOT NULL, + "Filer_NamL" VARCHAR(60) NOT NULL, "Report_Num" VARCHAR(3) NOT NULL, "Committee_Type" VARCHAR(3) NOT NULL, "Rpt_Date" DATE NOT NULL, @@ -32,7 +32,8 @@ CREATE TABLE "H-Loans" ( "Loan_EMP" VARCHAR(32), "Loan_OCC" VARCHAR(32), "Loan_Self" BOOLEAN NOT NULL, - "Cmte_ID" INTEGER NOT NULL, + --"Cmte_ID" INTEGER NOT NULL, + "Cmte_ID" INTEGER, "Tres_NamL" VARCHAR(32), "Tres_NamF" VARCHAR(32), "Tres_NamT" VARCHAR(32), diff --git a/dbschema/a_contributions_v2.sql b/dbschema/a_contributions_v2.sql new file mode 100644 index 000000000..e69de29bb diff --git a/dbschema/committees_v2.sql b/dbschema/committees_v2.sql new file mode 100644 index 000000000..273d9e569 --- /dev/null +++ b/dbschema/committees_v2.sql @@ -0,0 +1,15 @@ +CREATE TABLE committees_v2 ( + filer_nid DECIMAL NOT NULL, + "Ballot_Measure_Election" VARCHAR, + "Filer_ID" VARCHAR, + "Filer_NamL" VARCHAR NOT NULL, + "_Status" VARCHAR NOT NULL, + "_Committee_Type" VARCHAR NOT NULL, + "Ballot_Measure" VARCHAR, + "Support_Or_Oppose" VARCHAR, + candidate_controlled_id BOOLEAN, + "Start_Date" DATE, + "End_Date" BOOLEAN, + data_warning BOOLEAN, + "Make_Active" BOOLEAN +); diff --git a/dbschema/elections_v2.sql b/dbschema/elections_v2.sql new file mode 100644 index 000000000..1f4ab2fd6 --- /dev/null +++ b/dbschema/elections_v2.sql @@ -0,0 +1,6 @@ +CREATE TABLE elections_v2 ( + location VARCHAR NOT NULL, + date DATE NOT NULL, + name VARCHAR NOT NULL, + title VARCHAR NOT NULL +); diff --git a/download/main.py b/download/main.py index 3658b2476..95730b21e 100644 --- a/download/main.py +++ b/download/main.py @@ -1,4 +1,7 @@ """ main, to run everything """ +from collections import Counter +from datetime import datetime +import os import json from model.a_contributions import A_Contributions from model.committee import Committees @@ -21,7 +24,12 @@ def unique_statuses(filers): def main(): """ Do everyting """ - # pull data from gdrive and put it in .local/downloads + data_dir_path = '.local/downloads' + csv_data_dir_path = '.local/csv' + os.makedirs(data_dir_path, exist_ok=True) + os.makedirs(csv_data_dir_path, exist_ok=True) + + # pull data from gdrive and put it in .local/downloads/raw pull_data(subfolder='main', default_folder='OpenDisclosure') with open(f'{DATA_DIR_PATH}/elections.json', encoding='utf8') as f: @@ -72,9 +80,9 @@ def main(): 'XRef_Match', ]).sample(n=20)) - elections.df.write_csv(f'{OUTPUT_DIR}/elections.csv') - committees.df.write_csv(f'{OUTPUT_DIR}/committees.csv') - a_contributions.df.write_csv(f'{OUTPUT_DIR}/a_contributions.csv') + elections.df.write_csv(f'{csv_data_dir_path}/elections.csv') + committees.df.write_csv(f'{csv_data_dir_path}/committees.csv') + a_contributions.df.write_csv(f'{csv_data_dir_path}/a_contributions.csv') if __name__ == '__main__': main() diff --git a/download/requirements.txt b/download/requirements.txt deleted file mode 100644 index 453185900..000000000 --- a/download/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -pandas~=2.0.3 -SQLAlchemy~=2.0.20 -psycopg2~=2.9.7 -gdrive-datastore==0.0.1.5 -pyarrow==14.0.1 -polars==0.19.12 diff --git a/gdrive_requirements.txt b/gdrive_requirements.txt deleted file mode 100644 index fad47bef5..000000000 --- a/gdrive_requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gdrive-datastore==0.0.1.5 diff --git a/requirements.txt b/requirements.txt index 74c4a8ac2..e24fe46ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,18 +4,21 @@ Babel==2.9.1 csvkit==1.4.0 dbfread==2.0.7 et-xmlfile==1.0.1 +gdrive-datastore==0.0.1.5 isodate==0.5.4; python_version < '3.0' isodate==0.6.1; python_version >= '3.0' jdcal==1.3 leather==0.3.3 -openpyxl==2.4.2 +openpyxl==3.1.5 +pandas~=2.0.3 parsedatetime==2.1 polars==0.20.1 psycopg2-binary==2.7.4; python_version < '3.0' -psycopg2-binary==2.8.6; python_version >= '3.0' -python-dateutil==2.2 +psycopg2-binary==2.9.7; python_version >= '3.0' +pyarrow==14.0.1 +python-dateutil==2.8.2 pytimeparse==1.1.5 -pytz==2016.10 +pytz==2020.1 regex==2016.12.27 six==1.10.0 SQLAlchemy~=2.0.23