#!/bin/bash
# Run the NMA API ETL end-to-end (SPARQL store load, extraction, Solr load)
# Orchestrates the entire ETL process, archiving loaded files and log files
#
# Intended to be run periodically via cron as root user
# But can be run manually: sudo ./etl-run-all.sh [full|incr]
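#
# A hypothetical crontab entry for the root user (the schedule and output
# redirection are illustrative assumptions, not taken from this repo):
#   30 1 * * * /usr/local/NMA-API-ETL/etl-run-all.sh full >> /var/log/NMA-API-ETL/etl-run-cron.log 2>&1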
#
# EMu exports a new set of files into the full or incr directory daily
# EMu files should be moved after ingest (the ETL loads all *.xml files found)
# Piction exports a single file that is over-written daily
# The Piction file must not be moved after ingest (so it may be stale if an export fails)
# We mainly work/archive in the EMu dir (but pull in Piction files too)
DATA_DIR=/mnt/emu_data
PICTION_DATA_DIR=/mnt/dams_data
# where the main ETL step scripts live
SCRIPT_DIR=/usr/local/NMA-API-ETL
# where to find ETL logs (to copy/archive after each step)
LOGS_DIR=/var/log/NMA-API-ETL
# default to full load
MODE=full
# Optional command-line arg to override mode
if (( $# > 0 ))
then
    MODE="$1"
fi
# Create output directory, eg: /mnt/emu_data/etl/yyyy-mm/job_yyyymmdd_hhmm_full
JOB_ID=job_$(date +"%Y%m%d_%H%M")_$MODE
JOB_DIR=$(date +"%Y-%m")
OUT_DIR="$DATA_DIR/etl/$JOB_DIR/$JOB_ID"
mkdir -p "$OUT_DIR"
# Init logfile
LOGFILE="$OUT_DIR/etl-run-all.log"
to_log() {
    echo "$(date +"%Y-%m-%d %H:%M:%S") $1" >> "$LOGFILE"
}
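# to_log appends timestamped lines to the job log, e.g. (illustrative output):
#   2024-01-15 01:30:00 BEGIN ETL - mode=full, job=job_20240115_0130_full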
to_log "BEGIN ETL - mode=$MODE, job=$JOB_ID"
# ETL step 1 - load to SPARQL store
cd $SCRIPT_DIR
# Fuseki build is now always "full"
#IN_DIR="$DATA_DIR/$MODE"
IN_DIR="$DATA_DIR/full"
PICTION_IN_DIR="$PICTION_DATA_DIR"
# check for existence of data files; if any are missing, abort the ETL
echo "Checking for existence of source data files ..." >> "$LOGFILE"
# Fuseki build is now always "full", so both modes require the complete set
# of EMu export files. (A Piction file check and an incremental-only check
# for at least one EMu XML file used to live here but are disabled.)
case "$MODE" in
    full|incremental)
        # Quote each glob so compgen -G tests the pattern itself; unquoted,
        # the shell expands it first and multiple matches break the test
        for EXPORT in object narratives accessionlots sites parties ; do
            if compgen -G "$IN_DIR/*${EXPORT}*.xml" > /dev/null ; then
                echo "${EXPORT} file exists" >> "$LOGFILE"
            else
                echo "${EXPORT} file missing! ETL aborting." >> "$LOGFILE"
                exit 1
            fi
        done
        ;;
    *)
        to_log "Unknown mode: $MODE"
        echo "Usage: $0 [full|incremental]"
        exit 1
        ;;
esac
to_log "START ETL STEP 1 - full load to Fuseki SPARQL store"
to_log "Source files: $(ls $IN_DIR/*.xml 2>/dev/null) $(ls $PICTION_IN_DIR/*.xml 2>/dev/null)"
to_log "Loading files to Fuseki public dataset"
$SCRIPT_DIR/etl-to-fuseki-full.sh public
to_log "Loading files to Fuseki internal dataset"
$SCRIPT_DIR/etl-to-fuseki-full.sh internal
to_log "FINISH ETL STEP 1 - load to Fuseki SPARQL store"
# move/copy loaded files and log
# (we're not allowed to move piction files)
cp $LOGS_DIR/etl-to-fuseki*.log $OUT_DIR/
mv $IN_DIR/*.xml $OUT_DIR 2>/dev/null
cp $PICTION_IN_DIR/*.xml $OUT_DIR 2>/dev/null
to_log "Moved/copied ingested files to archive: $OUT_DIR"
# ETL step 2 - extract from SPARQL store and load to Solr
to_log "START ETL STEP 2 - load to Solr"
to_log "Loading files to Solr public core"
cd $SCRIPT_DIR
$SCRIPT_DIR/etl-to-solr.sh public $MODE
cp $LOGS_DIR/etl-to-solr-public.log $OUT_DIR/
to_log "Copied Solr public load log files to archive: $OUT_DIR"
to_log "Loading files to Solr internal core"
$SCRIPT_DIR/etl-to-solr.sh internal $MODE
cp $LOGS_DIR/etl-to-solr-internal.log $OUT_DIR/
to_log "Copied Solr internal load log files to archive: $OUT_DIR"
to_log "FINISH ETL STEP 2 - load to Solr"
# delete stale archives
to_log "Removing old data files (14 days):"
find $DATA_DIR/etl -name '*.xml' -mtime +14 -print >> "$LOGFILE"
find $DATA_DIR/etl -name '*.xml' -mtime +14 -exec rm '{}' \;
to_log "Removing old etl job logs (6 months):"
# NB: uses 7 months (214 days), as a 6-month cutoff could delete a whole
# month directory that still contains jobs within the retention window
find $DATA_DIR/etl -mindepth 1 -type d -ctime +214 -print >> $LOGFILE
find $DATA_DIR/etl -mindepth 1 -type d -ctime +214 -exec rm -rf '{}' \;
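# e.g. an illustrative month directory such as /mnt/emu_data/etl/2023-01
# (and every job dir beneath it) is removed once it is older than 214 days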
# end of run
cp $LOGS_DIR/etl-run-cron.log $OUT_DIR/
mkdir -p "$OUT_DIR/failed-solr-deposits"
# -mindepth 1 stops find from trying to move the holding directory itself
find /data/failed-solr-deposits/ -mindepth 1 -maxdepth 1 -exec mv '{}' "$OUT_DIR/failed-solr-deposits/" \;
# NB: no need to copy etl-run-all.log as it is already in the output dir
to_log "END ETL - mode=$MODE, job=$JOB_ID"
HOSTNAME=$(hostname)
sendemail -f "$HOSTNAME ETL <no_reply@$HOSTNAME>" -t [email protected] [email protected] -u 'NMA ETL results' -m "See attached log file." -a "$OUT_DIR/etl-run-all.log"
exit 0