-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
Copy pathconvert_bigquery_to_postgres.sh
executable file
·182 lines (151 loc) · 9.36 KB
/
convert_bigquery_to_postgres.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/bin/bash
# This shell script converts BigQuery .sql files into PostgreSQL .sql files.
# path in which we create the postgres concepts
TARGET_PATH='../concepts_postgres'
mkdir -p $TARGET_PATH
# String replacements are necessary for some queries.
# Note that these queries are very senstive to changes, e.g. adding whitespaces after comma can already change the behavior.
# Schema replacement: change `physionet-data.<dataset>.<table>` to just <table> (with no backticks)
export REGEX_SCHEMA='s/`physionet-data.(mimiciv_hosp|mimiciv_icu|mimiciv_derived).([A-Za-z0-9_-]+)`/\1.\2/g'
# Add single quotes around the date part
export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\(([^,]+), ?(.*), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1, \2, '\3')/g"
export REGEX_DATETIME_TRUNC="s/DATETIME_TRUNC\(([^,]+), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATE_TRUNC('\2', \1)/g"
# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour"
export REGEX_INTERVAL="s/interval ([[:digit:]]+) (hour|day|month|year)/INTERVAL '\1' \2/gI"
# Specific queries for some problems that arose with some files.
export REGEX_INT="s/CAST\(hr AS INT64\)/CAST\(hr AS bigint\)/g"
export REGEX_ARRAY="s/GENERATE_ARRAY\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)/ARRAY\(SELECT \* FROM generate\_series\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)\)/g"
export REGEX_HOUR_INTERVAL="s/INTERVAL CAST\(hr AS INT64\) HOUR/interval \'1\' hour * CAST\(hr AS bigint\)/g"
export REGEX_SECONDS="s/SECOND\)/\'SECOND\'\)/g"
#=== Decide on the order queries are executed
# Normally, queries will be executed in alphabetical order.
# However, some tables depend on each other. We specify
# which queries to run first, and in what order.
# tables we want to run before all other concepts
# usually because they are used as dependencies
DIR_AND_TABLES_TO_PREBUILD='demographics.icustay_times demographics.icustay_hourly demographics.weight_durations measurement.urine_output organfailure.kdigo_uo'
# tables which are written directly in postgresql and source code controlled
# this is usually because there is no trivial conversion between bq/psql syntax
DIR_AND_TABLES_ALREADY_IN_PSQL=''
# tables which we want to run after all other concepts
# usually because they depend on one or more other queries
# these will be generated in the order specified
DIR_AND_TABLES_TO_SKIP='organfailure.kdigo_stages firstday.first_day_sofa sepsis.sepsis3 medication.vasoactive_agent medication.norepinephrine_equivalent_dose'
# First, we re-create the postgres-make-concepts.sql file.
echo "\echo ''" > $TARGET_PATH/postgres-make-concepts.sql
# Now we add some preamble for the user running the script.
echo "\echo '==='" >> $TARGET_PATH/postgres-make-concepts.sql
echo "\echo 'Beginning to create materialized views for MIMIC database.'" >> $TARGET_PATH/postgres-make-concepts.sql
echo "\echo '"'Any notices of the form "NOTICE: materialized view "XXXXXX" does not exist" can be ignored.'"'" >> $TARGET_PATH/postgres-make-concepts.sql
echo "\echo 'The scripts drop views before creating them, and these notices indicate nothing existed prior to creating the view.'" >> $TARGET_PATH/postgres-make-concepts.sql
echo "\echo '==='" >> $TARGET_PATH/postgres-make-concepts.sql
echo "\echo ''" >> $TARGET_PATH/postgres-make-concepts.sql
echo >> $TARGET_PATH/postgres-make-concepts.sql
echo "-- Set the search_path, i.e. the location at which we generate tables." >> $TARGET_PATH/postgres-make-concepts.sql
echo "-- postgres looks at schemas sequentially, so this will generate tables on the mimiciv_derived schema" >> $TARGET_PATH/postgres-make-concepts.sql
echo >> $TARGET_PATH/postgres-make-concepts.sql
echo "-- NOTE: many scripts *require* you to use mimiciv_derived as the schema for outputting concepts" >> $TARGET_PATH/postgres-make-concepts.sql
echo "-- change the search path at your peril!" >> $TARGET_PATH/postgres-make-concepts.sql
echo "SET search_path TO mimiciv_derived, mimiciv_hosp, mimiciv_icu, mimiciv_ed;" >> $TARGET_PATH/postgres-make-concepts.sql
# ======================================== #
# === CONCEPTS WHICH WE MUST RUN FIRST === #
# ======================================== #
echo -n "Dependencies:"
# output table creation calls to the make-concepts script
echo "" >> $TARGET_PATH/postgres-make-concepts.sql
echo "-- dependencies" >> $TARGET_PATH/postgres-make-concepts.sql
for dir_and_table in $DIR_AND_TABLES_TO_PREBUILD;
do
d=`echo ${dir_and_table} | cut -d. -f1`
tbl=`echo ${dir_and_table} | cut -d. -f2`
# catch special case when file is in current directory
if [[ $d == '' ]]; then
d='.'
fi
# make the sub-folder for postgres if it does not exist
mkdir -p "$TARGET_PATH/${d}"
# convert the bigquery script to psql and output it to the appropriate subfolder
echo -n " ${d}.${tbl} .."
# re-write the script into psql using regex
# the if statement ensures we do not overwrite tables
# for which we already have psql code
if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "$TARGET_PATH/${d}/${tbl}.sql"
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS" >> "${TARGET_PATH}/${d}/${tbl}.sql"
# apply regex to map bigquery syntax to postgres syntax
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_SECONDS}" >> "$TARGET_PATH/${d}/${tbl}.sql"
else
echo -n "(psql!) .."
fi
# write out a call to this script in the make concepts file
echo "\i ${d}/${tbl}.sql" >> $TARGET_PATH/postgres-make-concepts.sql
done
echo " done!"
# ================================== #
# === MAIN LOOP FOR ALL CONCEPTS === #
# ================================== #
# Iterate through each concept subfolder, and:
# (1) apply the above regular expressions to update the script
# (2) output to the postgres subfolder
# (3) add a line to the postgres-make-concepts.sql script to generate this table
# the order *only* matters during the conversion step because our loop is
# inserting table build commands into the postgres-make-concepts.sql file
for d in demographics measurement comorbidity medication treatment firstday organfailure score sepsis;
do
mkdir -p "$TARGET_PATH/${d}"
echo -n "${d}:"
echo "" >> $TARGET_PATH/postgres-make-concepts.sql
echo "-- ${d}" >> $TARGET_PATH/postgres-make-concepts.sql
for fn in `ls $d`;
do
# only run SQL queries
if [[ "${fn: -4}" == ".sql" ]]; then
# table name is file name minus extension
tbl="${fn%????}"
echo -n " ${tbl} "
# check if var2 is in var1
if echo "$DIR_AND_TABLES_TO_PREBUILD" | grep -wq "$d.$tbl"; then
echo -n "(prebuilt!) .."
continue
elif echo "$DIR_AND_TABLES_TO_SKIP" | grep -wq "$d.$tbl"; then
echo -n "(skipping!) .."
continue
fi
# re-write the script into psql using regex
# the if statement ensures we do not overwrite tables which are already written in psql
if ! echo "$DIR_AND_TABLES_ALREADY_IN_PSQL" | grep -wq "$d.$tbl"; then
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql"
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS" >> "${TARGET_PATH}/${d}/${tbl}.sql"
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${fn}"
echo -n ".."
else
echo -n "(psql!) .."
fi
# add statement to generate this table
# in the make concepts script
echo "\i ${d}/${fn}" >> ${TARGET_PATH}/postgres-make-concepts.sql
fi
done
echo " done!"
done
# generate remaining concepts
echo "" >> ${TARGET_PATH}/postgres-make-concepts.sql
echo "-- final tables which were dependent on one or more prior tables" >> ${TARGET_PATH}/postgres-make-concepts.sql
echo -n "final:"
for dir_and_table in $DIR_AND_TABLES_TO_SKIP
do
d=`echo ${dir_and_table} | cut -d. -f1`
tbl=`echo ${dir_and_table} | cut -d. -f2`
# make the sub-folder for postgres if it does not exist
mkdir -p "$TARGET_PATH/${d}"
# convert the bigquery script to psql and output it to the appropriate subfolder
echo -n " ${d}.${tbl} .."
if ! echo "$DIR_AND_TABLES_ALREADY_IN_PSQL" | grep -wq "$d.$tbl"; then
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "$TARGET_PATH/${d}/${tbl}.sql"
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS" >> "$TARGET_PATH/${d}/${tbl}.sql"
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_SECONDS}" >> "$TARGET_PATH/${d}/${tbl}.sql"
fi
# write out a call to this script in the make concepts file
echo "\i ${d}/${tbl}.sql" >> $TARGET_PATH/postgres-make-concepts.sql
done
echo " done!"