-
Notifications
You must be signed in to change notification settings - Fork 13
122 lines (101 loc) · 4.3 KB
/
data_update_v2.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Daily workflow that runs the sfbrigade/data-covid19-sfbayarea scraper, merges
# its output into data/data.v2.json in this repo, and commits/pushes the result.
# Scraper errors are posted to Slack and fail the workflow *after* any partial
# results have been committed.
name: data.v2.json update

on:
  schedule:
    # GitHub evaluates cron schedules in UTC: 10:00 UTC is 3 AM PDT
    # (2 AM PST when daylight saving time is not in effect).
    - cron: '0 10 * * *'
  # Also allow the update to be started manually.
  workflow_dispatch: {}

jobs:
  # This workflow contains a single job called "build"
  build:
    runs-on: ubuntu-latest
    steps:
      # Check out this repo into ./site. The SSH key is persisted by the
      # checkout action, so the later `git push` from ./site uses it.
      - name: Checkout Main repo
        uses: actions/checkout@v4
        with:
          ssh-key: ${{ secrets.data_daily_update }}
          path: site

      # Check out the data scraper repo into ./scraper
      - name: Checkout Data Scraper
        uses: actions/checkout@v4
        with:
          ssh-key: ${{ secrets.data_daily_update }}
          repository: sfbrigade/data-covid19-sfbayarea
          path: scraper

      # The scraper uses Python 3.8+, so make sure we've got the latest 3.x
      - name: Set up Python 3.x
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Cache Python Dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          # NOTE: we can hash these requirements files together, but keeping
          # them separate allows us to load a partial cache if only the dev
          # requirements have changed.
          key: ${{ runner.os }}-pip-${{ hashFiles('scraper/requirements.txt') }}-${{ hashFiles('scraper/requirements-dev.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-${{ hashFiles('scraper/requirements.txt') }}-
            ${{ runner.os }}-pip-

      # System libraries installed ahead of the scraper's Python requirements.
      - name: Install libxml2-dev libxslt-dev
        run: |
          # Refresh the package index first so the install does not hit stale
          # mirrors, and pass -y so it can never wait for confirmation.
          sudo apt-get update
          sudo apt-get install -y libxml2-dev libxslt-dev

      # Installs the scraper's requirements. The commit that was checked out
      # will be available to later steps as $SCRAPER_COMMIT.
      - name: Install Data Scraper & Dependencies
        run: |
          cd "${GITHUB_WORKSPACE}/scraper"
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          # Keep track of the version used so we can use it in commit messages.
          # GITHUB_ENV is a plain NAME=value file: the value must NOT be wrapped
          # in quotes, or the quotes become part of the variable.
          echo "SCRAPER_COMMIT=$(git rev-parse HEAD)" >> "${GITHUB_ENV}"

      - name: Scrape Data
        id: scrape_data
        # The scraper may error on some counties but succeed on others. We want
        # to continue so that we still generate commits for partial successes.
        # We'll handle failures in a later step.
        continue-on-error: true
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        run: |
          # As above, no quotes around the value written to GITHUB_ENV.
          echo "SCRAPER_TIME=$(date)" >> "${GITHUB_ENV}"
          cd "${GITHUB_WORKSPACE}/scraper"
          OUT_PATH="${GITHUB_WORKSPACE}/site/data/data.v2.json"

          # Never abort here: partial output is still merged below, and
          # anything written to stderr is reported afterwards.
          python scraper_data.py > today.json 2> errors.txt || true

          # Merge new data into previous data. (Note this has to be two steps;
          # if we read $OUT_PATH as input for jq and write stdout to it at the
          # same time, we get conflicts and bad output.)
          jq -s '.[0] + .[1]' "${OUT_PATH}" today.json > merged.json
          mv merged.json "${OUT_PATH}"

          ERRORS="$(cat errors.txt)"
          if [ -n "${ERRORS}" ]; then
            echo "Encountered the following errors while scraping:"
            echo "------------------------------------------------"
            echo "${ERRORS}"
            # The error text can contain unescaped quotes, newlines, etc.
            # Use jq to make sure we are composing correctly formatted JSON.
            # `--raw-input` treats the input as strings instead of JSON.
            # `--slurp` causes all lines to be combined into one string.
            ERRORS_JSON="$(jq --slurp --raw-input '{"text": .}' errors.txt)"
            curl -X POST -H 'Content-type: application/json' --data "${ERRORS_JSON}" "${SLACK_WEBHOOK_URL}"
            # Exit non-zero so this step's outcome is recorded as a failure.
            exit 1
          fi

      - name: Commit Changes
        # Pass secrets through the environment instead of splicing them into
        # the script text, so values containing spaces or shell metacharacters
        # are handled as a single, literal argument.
        env:
          GIT_USER_NAME: ${{ secrets.githubaction_config_user_name }}
          GIT_USER_EMAIL: ${{ secrets.githubaction_config_user_email }}
        run: |
          cd "${GITHUB_WORKSPACE}/site"
          git config user.name "${GIT_USER_NAME}"
          git config user.email "${GIT_USER_EMAIL}"
          git add data/data.v2.json

          # `git commit` exits non-zero when nothing is staged; skip the commit
          # and push instead of failing the job on a no-change run.
          if git diff --cached --quiet; then
            echo 'No changes to data/data.v2.json; nothing to commit.'
            exit 0
          fi

          # The heredoc delimiter is unquoted so ${SCRAPER_COMMIT} (exported
          # via GITHUB_ENV by the install step) is expanded by the shell.
          git commit -F - << EOF
          GitHubAction: data v2 update

          Created with commit ${SCRAPER_COMMIT} from sfbrigade/data-covid19-sfbayarea
          https://github.com/sfbrigade/data-covid19-sfbayarea/commit/${SCRAPER_COMMIT}
          EOF
          git push
          echo 'Git commit and push completed for the daily data update.'

      # The scrape step uses continue-on-error, so the job would otherwise be
      # reported as successful even when the scraper had errors.
      - name: Fail Workflow if the Scraper Had Errors
        if: ${{ steps.scrape_data.outcome == 'failure' }}
        run: |
          echo "Marking workflow as failed."
          exit 1