-
Notifications
You must be signed in to change notification settings - Fork 0
/
.gitlab-ci.yml
243 lines (223 loc) · 14.2 KB
/
.gitlab-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
default:
image: python:3.9-slim
variables:
OS_AUTH_TYPE: v3applicationcredential
OS_AUTH_URL: https://hdf-cloud.fz-juelich.de:5000
OS_IDENTITY_API_VERSION: 3
OS_REGION_NAME: "HDFCloud"
OS_INTERFACE: public
PRODUCTION_IP: 134.94.199.220
OLD_PROD_NAME: old-airflow-production
PRODUCTION_NAME: airflow-production
PRODUCTION_URL: https://datalogistics.eflows4hpc.eu
PRODUCTION_DOMAIN: datalogistics.eflows4hpc.eu
AIRFLOW_TESTUSER: "airflow"
AIRFLOW__SECRETS__BACKEND: datacat_integration.secrets.DatacatSecretsBackend
DAG_GIT_URL: https://github.com/eflows4hpc/dls-dags
VOLUME_ID: 6b58c3a6-691b-496a-8afd-153637c2de48
DOCKER_TLS_CERTDIR: ""
TESTING_IP: 134.94.199.115
OLD_TEST_NAME: old-airflow-testing
TESTING_NAME: airflow-testing
TESTING_URL: https://zam10115.zam.kfa-juelich.de
TESTING_DOMAIN: zam10115.zam.kfa-juelich.de
METADATA_URL: https://zam10045.zam.kfa-juelich.de:7000/oauth2/.well-known/openid-configuration
TESTING_OAUTH_ID: ff3f5a29-d210-4be2-a6d4-93c4fc755bfe
PRODUCTION_OAUTH_ID: affcd446-cab1-4751-8d76-e792f830d415
# before script copied from gitlab docs
.before_script_template: &ssh_setup
before_script:
- 'command -v ssh-agent >/dev/null || ( apt-get update -y && apt-get install openssh-client gcc libxslt-dev libffi-dev libssl-dev build-essential python3-dev -y )'
- eval $(ssh-agent -s)
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
- mkdir -p ~/.ssh
- chmod 700 ~/.ssh
stages:
- build
- publish
- deploy
- test-deployment
- cleanup
build-testing-image:
stage: build
image: docker:latest
services:
- docker:dind
when: manual
tags:
- laptop
variables:
IMAGE_COMMIT_TAG: $CI_REGISTRY_IMAGE/eflows-airflow:testing
script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- docker build --no-cache=true --pull -t $IMAGE_COMMIT_TAG -f dockers/eflows-airflow.docker .
- docker push $IMAGE_COMMIT_TAG
build-production-image:
stage: build
image: docker:latest
services:
- docker:dind
when: manual
tags:
- laptop
variables:
IMAGE_COMMIT_TAG: $CI_REGISTRY_IMAGE/eflows-airflow:$CI_COMMIT_SHORT_SHA
IMAGE_LATEST_TAG: $CI_REGISTRY_IMAGE/eflows-airflow:latest
script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- docker build --no-cache=true --pull -t $IMAGE_COMMIT_TAG -f dockers/eflows-airflow.docker .
- docker push $IMAGE_COMMIT_TAG
- docker tag $IMAGE_COMMIT_TAG $IMAGE_LATEST_TAG
- docker push $IMAGE_LATEST_TAG
full-deploy-testing:
stage: deploy
environment: Testing
rules:
- if: ($CI_COMMIT_BRANCH == "main" && $MANUAL_FULL_DEPLOY == "true")
<<: *ssh_setup
script:
- echo "Starting the full testing deployment of airflows."
- pip install python-openstackclient
- OLD_ID=`openstack server show $TESTING_NAME -f value -c id` && server_exists=true || echo "No testing server found. It might be a first time deployment"
- openstack server set --name $OLD_TEST_NAME $OLD_ID;
# - if [ "$server_exists" = true ] ; then
# ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "sudo docker-compose -f /home/airflow/data-logistics-service/dockers/docker-compose.yaml --project-directory /home/airflow/eflows-airflow down"
# openstack server set --name $OLD_TEST_NAME $OLD_ID;
# fi
- INSTANCE_ID=`openstack server create -f value -c id --prefix IMAGE_ --flavor l2 --image 149a65b5-aeb8-499f-aaa6-ec966bd28dd6 --user-data scripts/cloudinit.yml --security-group ssh --security-group www --security-group https $TESTING_NAME`
- while [ "`openstack server show $INSTANCE_ID -c addresses -f value`" = "{}" ]; do sleep 5; done # wait until an address is available to attach the floating ip
- openstack server add floating ip $INSTANCE_ID $TESTING_IP
- sleep 10 # ensure that next command reaches the new server, prevents host key problems
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "sudo mkdir -p /persistent_data/docker_volumes"
- until ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP ls /finished_cloudinit >/dev/null 2>&1; do sleep 30; done # wait until cloudinit script is complete
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "sudo service docker restart" # to use the configured docker data path
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "docker volume create --name=persistent_postgres-db-volume"
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "docker volume create --name=persistent_certs"
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP 'sed -i "s_eflows-airflow:latest_eflows-airflow:testing_g" /home/airflow/data-logistics-service/dockers/docker-compose.yaml'
- ssh -oStrictHostKeyChecking=accept-new airflow@$PTESTING_IP "sudo /home/airflow/data-logistics-service/scripts/deployment.sh /home/airflow /home/airflow/data-logistics-service $TESTING_DOMAIN $AIRFLOW__SECRETS__BACKEND $AIRFLOW__SECRETS__BACKEND_KWARGS $AIRFLOW_FERNET_KEY $DAG_GIT_URL $TESTING_OAUTH_ID $SSO_CLIENT_SECRET $METADATA_URL"
- echo "Done"
light-deploy-testing:
stage: deploy
rules:
- if: ($CI_COMMIT_BRANCH == "main" && $MANUAL_FULL_DEPLOY !~ /true/ )
<<: *ssh_setup
environment: Testing
script:
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "cd /home/airflow/data-logistics-service && git stash && git stash clear && git checkout main && git checkout -f $CI_COMMIT_TAG && git pull --all"
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP 'sed -i "s_eflows-airflow:latest_eflows-airflow:testing_g" /home/airflow/data-logistics-service/dockers/docker-compose.yaml'
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "sudo /home/airflow/data-logistics-service/scripts/deployment.sh /home/airflow /home/airflow/data-logistics-service $TESTING_DOMAIN $AIRFLOW__SECRETS__BACKEND $AIRFLOW__SECRETS__BACKEND_KWARGS $AIRFLOW_FERNET_KEY $DAG_GIT_URL $TESTING_OAUTH_ID $SSO_CLIENT_SECRET $METADATA_URL"
force-light-deploy-testing: # for deploying images generated on other branches to testing - can only be done by logged in and authorized users
stage: deploy
when: manual
<<: *ssh_setup
environment: Testing
script:
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "cd /home/airflow/data-logistics-service && git stash && git stash clear && git checkout main && git checkout -f $CI_COMMIT_TAG && git pull --all"
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP 'sed -i "s_eflows-airflow:latest_eflows-airflow:testing_g" /home/airflow/data-logistics-service/dockers/docker-compose.yaml'
- ssh -oStrictHostKeyChecking=accept-new airflow@$TESTING_IP "sudo /home/airflow/data-logistics-service/scripts/deployment.sh /home/airflow /home/airflow/data-logistics-service $TESTING_DOMAIN $AIRFLOW__SECRETS__BACKEND $AIRFLOW__SECRETS__BACKEND_KWARGS $AIRFLOW_FERNET_KEY $DAG_GIT_URL $TESTING_OAUTH_ID $SSO_CLIENT_SECRET $METADATA_URL"
full-deploy-production:
stage: deploy
environment: Production
rules:
- if: ($CI_COMMIT_TAG =~ /stable/ && $MANUAL_FULL_DEPLOY == "true")
<<: *ssh_setup
script:
- echo "Starting the full production deployment of airflows."
- pip install python-openstackclient
- OLD_ID=`openstack server show $PRODUCTION_NAME -f value -c id` && server_exists=true || echo "No production server found. It might be a first time deployment"
- openstack server set --name $OLD_PROD_NAME $OLD_ID
# - if [ "$server_exists" = true ] ; then
# ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP "sudo docker-compose -f /home/airflow/data-logistics-service/dockers/docker-compose.yaml --project-directory /home/airflow/eflows-airflow down"
# openstack server set --name $OLD_PROD_NAME $OLD_ID;
# fi
- openstack server remove volume $OLD_ID $VOLUME_ID
- INSTANCE_ID=`openstack server create -f value -c id --prefix IMAGE_ --flavor m4 --image 149a65b5-aeb8-499f-aaa6-ec966bd28dd6 --user-data scripts/cloudinit.yml --security-group ssh --security-group www --security-group https $PRODUCTION_NAME`
- while [ "`openstack server show $INSTANCE_ID -c addresses -f value`" = "{}" ]; do sleep 5; done # wait until an address is available to attach the floating ip
- openstack server add floating ip $INSTANCE_ID $PRODUCTION_IP
- sleep 10 # ensure that next command reaches the new server, prevents host key problems
# do the mount /dev/vdb1 stuff
- openstack server add volume $INSTANCE_ID $VOLUME_ID
- sleep 20 # apparently it may take some time until the volume is available to the OS
- while [ "`ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP ls `" ]; do sleep 5; done
- until ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP ls /finished_cloudinit >/dev/null 2>&1; do sleep 30; done # wait until cloudinit script is complete
- ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP "sudo mkdir -p /persistent_data && sudo mount /dev/vdb1 /persistent_data"
- ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP "sudo service docker restart" # to use the configured docker data path
- ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP "sudo /home/airflow/data-logistics-service/scripts/deployment.sh /home/airflow /home/airflow/data-logistics-service $PRODUCTION_DOMAIN $AIRFLOW__SECRETS__BACKEND $AIRFLOW__SECRETS__BACKEND_KWARGS $AIRFLOW_FERNET_KEY $DAG_GIT_URL $PRODUCTION_OAUTH_ID $SSO_CLIENT_SECRET $METADATA_URL"
- echo "Done"
# NOTE Light deployment did not perform well when the template/main.html file was changed (in case of the official airflow image being updated)
# TODO Add proper tests
light-deploy-production:
stage: deploy
rules:
- if: ($CI_COMMIT_TAG =~ /stable/ && $MANUAL_FULL_DEPLOY !~ /true/)
<<: *ssh_setup
environment: Production
script:
- ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP "cd /home/airflow/data-logistics-service && git stash && git stash clear && git checkout main && git fetch --all && git checkout -f $CI_COMMIT_TAG && rm -rf dags && git clone https://github.com/eflows4hpc/dls-dags.git dags"
- ssh -oStrictHostKeyChecking=accept-new airflow@$PRODUCTION_IP "sudo /home/airflow/data-logistics-service/scripts/deployment.sh /home/airflow /home/airflow/data-logistics-service $PRODUCTION_DOMAIN $AIRFLOW__SECRETS__BACKEND $AIRFLOW__SECRETS__BACKEND_KWARGS $AIRFLOW_FERNET_KEY $DAG_GIT_URL $PRODUCTION_OAUTH_ID $SSO_CLIENT_SECRET $METADATA_URL"
test-production-webserver:
cache: {}
stage: test-deployment
rules:
- if: $CI_COMMIT_TAG =~ /stable/
script:
- apt update && apt -y install curl
- echo "This is a simple check if the deployment was successful and dags get executed"
# ensure that the docker containers are up and running before testing the airflow deployment; timeout in 16 to 17 minutes
- SECONDS=0
- 'while [ $SECONDS -le 1000 ] ; do if output=$(curl --insecure --max-time 10 -I -H "Accept: application/json" $PRODUCTION_URL/home) ; then break; else sleep 30; fi ; done'
- 'curl --insecure -I -H "Accept: application/json" $PRODUCTION_URL/home'
- 'curl -X GET -u $AIRFLOW_TESTUSER:$AIRFLOW_TESTUSER_PASS -H "Content-Type: application/json" $PRODUCTION_URL/api/v1/dags'
- 'curl -X GET -u $AIRFLOW_TESTUSER:$AIRFLOW_TESTUSER_PASS -H "Content-Type: application/json" $PRODUCTION_URL/api/v1/connections'
- 'curl -X POST -u $AIRFLOW_TESTUSER:$AIRFLOW_TESTUSER_PASS -H "Content-Type: application/json" --data {} $PRODUCTION_URL/api/v1/dags/testdag/dagRuns'
cleanup-successful-full-deployment:
# check if there is an old prod or test instance, and delete it if present
stage: cleanup
when: on_success
rules:
- if: $MANUAL_FULL_DEPLOY == "true"
script:
- echo "This is the cleanup for the full-redeployment of the testing or production servers"
- echo "if this job is reached, all earlier jobs were successful, and any lingering old instances need to be removed"
- pip install python-openstackclient
- openstack server delete $OLD_TEST_NAME && echo "Deleted old testing server." || echo "No old testing server found."
- openstack server delete $OLD_PROD_NAME && echo "Deleted old production server." || echo "No old production server found."
cleanup-failed-full-deployment:
# check if there is an old prod or test instance, assign respective ip to it, re-attach volume, delete new instance, rename old instance
# if there is none, this is a failed light-deployment, which is handled by another job
# this does not guarantee a successful rollback, but unless the old instance was faulty, this should work
stage: cleanup
when: on_failure
rules:
- if: ($CI_COMMIT_TAG =~ /stable/ && $MANUAL_FULL_DEPLOY == "true")
<<: *ssh_setup
script:
- echo "This is the cleanup for the full-redeployment of the testing or production servers"
- echo "if this job is reached, some earlier job had to have failed, this will return to the previous instance (if available)"
- echo "A successfull cleanup can not be guaranteed, depending on the failure reason"
- pip install python-openstackclient
# check which old instance is present. (either old test or old production); store instance id in a var
- OLD_TEST_ID=`openstack server show $OLD_TEST_NAME -f value -c id` && rollback_test=true || echo "No old testing server found."
# if applicable: rollback test server
- if [ "$rollback_test" = true ] ; then
REMOVE_ID=`openstack server show $TESTING_NAME -f value -c id` && new_deployment_exists=true|| echo "No new testing server has been created.";
openstack server set --name $TESTING_NAME $OLD_TEST_ID;
openstack server add floating ip $OLD_TEST_ID $TESTING_IP;
if [ "$new_deployment_exists" = true ] ; then
openstack server delete $REMOVE_ID && echo "Deleted faulty testing server.";
fi
fi
# gitlab should automatically alert the devs about this failure
publishgit-do:
stage: publish
only:
- tags
tags: [stable]
script:
- apt-get update
- apt-get install -y git
- (git remote rm gith) || echo "Not found"
- (git remote -v | grep gith) || git remote add gith "https://${GITHUB_USER}:${GITHUB_TOKEN}@github.com/eflows4hpc/data-logistics-service.git"
- git remote -v
- git fetch --unshallow origin
- git push gith +HEAD:refs/heads/main