-
Notifications
You must be signed in to change notification settings - Fork 1
/
embedding_creation_script.wdl
224 lines (191 loc) · 8.2 KB
/
embedding_creation_script.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# Copyright 2022 Verily Life Sciences LLC
# Use of this source code is governed by a BSD-style license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
#
# Optimized version of the Embedding Creation workflow.
#
# Coding standard https://biowdl.github.io/styleGuidelines.html is used with newer command body
# style https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#command-section.
version 1.0
import 'https://raw.githubusercontent.com/broadinstitute/cellprofiler-on-Terra/v0.2.0/utils/cellprofiler_distributed_utils.wdl' as cell_profiler_workflows
# Shards the wells listed in the load-data CSV, runs the embedding creation script once per shard,
# and copies each shard's tarball of outputs underneath embeddingOutputPath.
workflow EmbeddingCreation {
    input {
        # ---- Sharding parameters ----

        # Path to the input load_data_with_illum.csv, which contains paths to *.tiff and illum/*.npy.
        String loadDataWithIllum

        # To determine the shard for a well, perform the modulus operation over the last two digits
        # of the well name, or use the numeric value verbatim if the modulus value is zero.
        Int modulus = 24

        # ---- Embedding creation parameters ----

        # GCS or S3 path underneath which computed cell centers are stored.
        String cellCentersPathPrefix

        # Desired location of the computed embeddings.
        String embeddingOutputPath

        # Values forwarded unchanged to the embedding creation script.
        Int cellPatchDim
        Int modelBatchDim
        String tfHubModelPath
        Int tfHubModelInputImageHeight
        Int tfHubModelInputImageWidth
        Int tfHubModelOutputEmbSize

        # ---- Runtime configuration ----

        # Docker image used by both the sharding task and the embedding creation task.
        String embeddingCreationDockerImage = 'ghcr.io/jump-cellpainting/cell-painting-embedder:20240510'
        Int embeddingCreationCPU = 8
        Int embeddingCreationMemoryGB = 30
        Int embeddingCreationDiskGB = 10
        # Also used as the boot disk size of the sharding task.
        Int embeddingCreationBootDiskGB = 15
        Int embeddingCreationMaxRetries = 1
        Int embeddingCreationPreemptibleAttempts = 2
        String embeddingCreationGPUType = 'nvidia-tesla-t4'
        Int embeddingCreationGPUCount = 1
        String embeddingCreationNvidiaDriverVersion = '470.82.01'
        String embeddingCreationZone = 'us-central1-c'
    }

    # Drop any trailing slashes from the destination before handing it to the delocalization task.
    String embeddingOutputPathTrimmed = sub(embeddingOutputPath, '/+$', '')

    # Determine which wells should be processed within which shards.
    call determineShards {
        input:
            loadDataWithIllum = loadDataWithIllum,
            modulus = modulus,
            dockerImage = embeddingCreationDockerImage,
            bootDiskGB = embeddingCreationBootDiskGB
    }

    # Each line emitted by determineShards describes one shard of multiple wells; process the
    # shards in parallel.
    scatter (shardSpec in determineShards.value) {
        call runEmbeddingCreationScript {
            input:
                shardMetadata = shardSpec,
                loadDataWithIllum = loadDataWithIllum,
                cellCentersPathPrefix = cellCentersPathPrefix,
                cellPatchDim = cellPatchDim,
                modelBatchDim = modelBatchDim,
                tfHubModelPath = tfHubModelPath,
                tfHubModelInputImageHeight = tfHubModelInputImageHeight,
                tfHubModelInputImageWidth = tfHubModelInputImageWidth,
                tfHubModelOutputEmbSize = tfHubModelOutputEmbSize,
                dockerImage = embeddingCreationDockerImage,
                cpu = embeddingCreationCPU,
                memoryGB = embeddingCreationMemoryGB,
                diskGB = embeddingCreationDiskGB,
                bootDiskGB = embeddingCreationBootDiskGB,
                maxRetries = embeddingCreationMaxRetries,
                preemptibleAttempts = embeddingCreationPreemptibleAttempts,
                gpuType = embeddingCreationGPUType,
                gpuCount = embeddingCreationGPUCount,
                nvidiaDriverVersion = embeddingCreationNvidiaDriverVersion,
                zone = embeddingCreationZone
        }

        # Unpack this shard's tarball and copy its contents to the requested destination.
        call cell_profiler_workflows.extract_and_gsutil_rsync as delocalizeEmbeddingOutputs {
            input:
                tarball = runEmbeddingCreationScript.tarOutputs,
                destination_gsurl = embeddingOutputPathTrimmed
        }
    }

    output {
        # Every shard is delocalized to the same destination, so the first shard's value is reported.
        String outputDirectory = delocalizeEmbeddingOutputs.output_directory[0]
        # One data-warnings log per shard.
        Array[File] dataWarningsLog = runEmbeddingCreationScript.dataWarningsLog
    }
}
# Runs /opt/scatter_wells_s3.py from the docker image to decide which wells belong to which shard.
# Each line of the file named by outputFilename becomes one element of the `value` output.
task determineShards {
    input {
        # Specify path to input load_data_with_illum.csv, which contains GCS paths to *.tiff, and illum/*.npy.
        String loadDataWithIllum

        # To determine the shard for a well, perform the modulus operation over the last two digits
        # of the well name, or use the numeric value verbatim if the modulus value is zero.
        Int modulus = 24

        # Docker image providing python3 and /opt/scatter_wells_s3.py.
        String dockerImage = 'ghcr.io/jump-cellpainting/cell-painting-embedder:20240510'
        Int bootDiskGB = 15
    }

    # Name of the file the script is asked to write; it is read back in the output section.
    String outputFilename = 'shards_metadata.txt'

    command <<<
        # Errors should cause the task to fail, not produce an empty output.
        set -o errexit
        set -o pipefail
        set -o nounset

        # Send a trace of all fully resolved executed commands to stderr.
        set -o xtrace

        # String values are single-quoted so that paths containing spaces or shell
        # metacharacters are passed to the script as a single, unexpanded argument.
        python3 /opt/scatter_wells_s3.py \
            --load_data_with_illum_csv_file='~{loadDataWithIllum}' \
            --modulus=~{modulus} \
            --output_filename='~{outputFilename}'
    >>>

    output {
        # One entry per line of the shard metadata file.
        Array[String] value = read_lines(outputFilename)
        # The shard metadata file itself.
        File outputText = outputFilename
    }

    runtime {
        docker: dockerImage
        bootDiskSizeGb: bootDiskGB
        maxRetries: 1
        preemptible: 2
    }
}
# Runs /opt/embedding_creation.py from the docker image for a single shard inside a scratch
# directory, then packs that directory into a tarball.
task runEmbeddingCreationScript {
    input {
        # Description of the shard to process: one element of determineShards.value, passed to
        # the script verbatim.
        String shardMetadata

        # Specify path to input load_data_with_illum.csv, which contains GCS paths to *.tiff, and illum/*.npy.
        String loadDataWithIllum

        # GCS or S3 path underneath which computed cell centers are stored.
        String cellCentersPathPrefix

        # Values forwarded unchanged to the embedding creation script.
        Int cellPatchDim
        Int modelBatchDim
        String tfHubModelPath
        Int tfHubModelInputImageHeight
        Int tfHubModelInputImageWidth
        Int tfHubModelOutputEmbSize

        # Docker image providing python3 and /opt/embedding_creation.py.
        String dockerImage = 'ghcr.io/jump-cellpainting/cell-painting-embedder:20240510'
        Int cpu = 8
        Int memoryGB = 30
        Int diskGB = 10
        Int bootDiskGB = 15
        Int maxRetries = 1
        Int preemptibleAttempts = 2
        String gpuType = 'nvidia-tesla-t4'
        Int gpuCount = 1
        String nvidiaDriverVersion = '470.82.01'
        String zone = 'us-central1-c'
    }

    # Scratch directory the script runs in; its entire contents end up in the tarball.
    String workDir = 'workdir'
    String tarOutputsFile = 'outputs.tar.gz'

    command <<<
        # Errors should cause the task to fail, not produce an empty output.
        set -o errexit
        set -o pipefail
        set -o nounset

        # Send a trace of all fully resolved executed commands to stderr.
        set -o xtrace

        mkdir -p ~{workDir}
        cd ~{workDir}

        # String values are single-quoted so that paths containing spaces or shell
        # metacharacters are passed to the script as a single, unexpanded argument.
        # NOTE(review): the --tf_hub_model_output_emb_height/width flags are fed the model's
        # *input image* height/width. The flag names are left untouched because the script's
        # interface is not visible here; confirm against /opt/embedding_creation.py.
        python3 /opt/embedding_creation.py \
            --shard_metadata='~{shardMetadata}' \
            --cell_center_path_prefix='~{cellCentersPathPrefix}' \
            --load_data='~{loadDataWithIllum}' \
            --cell_patch_dim=~{cellPatchDim} \
            --model_batch_dim=~{modelBatchDim} \
            --tf_hub_model_path='~{tfHubModelPath}' \
            --tf_hub_model_output_emb_height=~{tfHubModelInputImageHeight} \
            --tf_hub_model_output_emb_width=~{tfHubModelInputImageWidth} \
            --tf_hub_model_output_emb_size=~{tfHubModelOutputEmbSize}

        # Create a tar to also capture any outputs written to subdirectories, in addition to the current working directory.
        cd ..
        tar -zcvf ~{tarOutputsFile} --directory ~{workDir} .
    >>>

    output {
        File tarOutputs = tarOutputsFile
        # NOTE(review): this glob is resolved outside ~{workDir}, while the script runs inside
        # it; presumably the script writes the warnings log one level up or to an absolute
        # path. Confirm, since an empty glob makes the [0] index fail the task.
        File dataWarningsLog = glob('*data_warnings.log')[0]
    }

    # See also https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#recognized-runtime-attributes-and-backends
    # How to configure GPUs https://support.terra.bio/hc/en-us/articles/360055066731
    runtime {
        docker: dockerImage
        memory: memoryGB + ' GB'
        disks: 'local-disk ' + diskGB + ' SSD'
        bootDiskSizeGb: bootDiskGB
        maxRetries: maxRetries
        preemptible: preemptibleAttempts
        cpu: cpu
        gpuType: gpuType
        gpuCount: gpuCount
        nvidiaDriverVersion: nvidiaDriverVersion
        zones: [zone]
    }
}