-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Update update_etags.sh * Update upload_index.sh * Update upload_index.sh * Update upload_index.sh * Update upload_index.sh * use standard form, check token sooner * quote variables, drop csvkit dep * quote variable * drop csvkit dep * cleanup * shebang, formatting * bug fix * simplify regex * deps: add shekllcheck and shfmt to flake.nix * chore: quote to avoid potential globbing * format: apply shfmt to scripts * fix(upload_index.sh): use /usr/bin/env bash shebang * fix: quote 'etag' header; refactor get_column * refactor: replace column in place; support etag in any col --------- Co-authored-by: Alán F. Muñoz <[email protected]>
- Loading branch information
Showing
3 changed files
with
120 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,8 @@ | |
curl | ||
gawk | ||
moreutils | ||
shellcheck | ||
shfmt | ||
]; | ||
}; | ||
}; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,38 @@ | ||
#!/usr/bin/env bash | ||
# Returns the updated ETag for elements in the second column of $1 alongside the first two columns. | ||
cat $1 | | ||
tail -n +2 | # Remove headers | ||
cut -f2 -d',' | # Select url column | ||
xargs -I {} -- curl -I --silent "{}" | # Fetch remote metadata | ||
grep "ETag" | # Select etag field from resulting html | ||
awk '{print $2}' | # Remove prefix | ||
sed 's/\r$//' | # Remove carriage | ||
sed 1i'"etag"' | # add header | ||
paste - $1 -d',' | # Merge with original file | ||
awk -F ',' '{print $2","$3","$1}' # Print in the right order | ||
#!/usr/bin/env bash
# Fetch updated ETag values for URLs in a CSV file.
#
# Usage: update_etags.sh <input_file>
#
# Reads the CSV given as $1, issues a HEAD request for every entry in its
# "url" column, and prints the CSV with its "etag" column replaced in place
# (or appended as a new last column when absent) by the fetched ETags.
# Note that quotes are expected in the csv headers but omitted when
# extracting URL values.
set -euo pipefail

input_file="${1:-}"
url_header="url"
etag_header="etag"

get_column() {
  # Print the 1-based index of the column whose quoted header equals $1.
  # Stops at the first match so duplicate headers cannot yield several ids.
  awk -F',' -v col="\"$1\"" \
    'NR==1 { for (i = 1; i <= NF; ++i) if ($i == col) { print i; exit } }' \
    "${input_file}"
}

# Check if input file is provided
if [ -z "${input_file}" ]; then
  echo "Usage: $0 <input_file>" >&2
  exit 1
fi

url_column=$(get_column "${url_header}")
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${input_file}")

# Fetch ETags for each URL in a loop.
# Match the header case-insensitively (servers may send "etag:") and strip
# the trailing carriage return that HTTP header lines carry; the surrounding
# double quotes are kept because the CSV stores the value quoted.
etag_values='"etag"'
while IFS= read -r url; do
  etag=$(curl -I --silent "${url}" | awk 'tolower($1) == "etag:" {print $2}' | tr -d '\r')
  etag_values+=$'\n'"${etag}"
done <<<"$urls"

# Locate an existing ETag column if present
etag_column=$(get_column "${etag_header}")

# Combine original data (without ETag) with new ETag values
if [[ -n "${etag_column}" ]]; then # Replace $etag_column in $input_file with $etag_values
  # Two-file awk: first pass (NR==FNR) caches the new values, second pass
  # overwrites column $col of each row of the original file.
  awk -F',' -v OFS=',' -v col="${etag_column}" \
    'NR==FNR{a[NR]=$1;next}{$col=a[FNR]}1' \
    <(printf '%s\n' "${etag_values}") "${input_file}"
else # Append $etag_values as a new column on the right
  paste -d',' "${input_file}" <(printf '%s\n' "${etag_values}")
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,101 +1,120 @@ | ||
#!/usr/bin/env bash
# Verify the ETags recorded in the profile index against S3, then publish
# the index as a (new version of a) Zenodo deposition.
#
# Required environment: ZENODO_TOKEN — Zenodo API access token.
# Exits non-zero when an ETag mismatch is found or the upload fails;
# exits zero early when the remote copy already matches the local file.
set -euo pipefail

ZENODO_ENDPOINT="https://zenodo.org"
DEPOSITION_PREFIX="${ZENODO_ENDPOINT}/api/deposit/depositions"
ORIGINAL_ID="13892061"
FILE_TO_VERSION="manifests/profile_index.csv"
METADATA_JSON='{
  "metadata": {
      "title": "The Joint Undertaking for Morphological Profiling (JUMP) Consortium Datasets Index",
       "creators": [
           {
               "name": "The JUMP Cell Painting Consortium"
           }
       ],
      "upload_type": "dataset",
      "access_right": "open"
  }
}'

FILENAME=${FILE_TO_VERSION##*/}

echo "Checking that S3 ETags match their local counterpart"
# Extract URLs and ETags: locate columns by header name, then pull the
# (de-quoted) values of each data row.
url_column=$(head -n1 "${FILE_TO_VERSION}" | tr ',' '\n' | grep -n "url" | cut -d':' -f1)
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${FILE_TO_VERSION}")

etag_column=$(head -n1 "${FILE_TO_VERSION}" | tr ',' '\n' | grep -n "etag" | cut -d':' -f1)
local_etags=$(awk -F',' -v col="${etag_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${FILE_TO_VERSION}")

# HEAD each URL; match the ETag header case-insensitively and drop the
# carriage return and quotes so the value is comparable to the CSV column.
s3_etags=""
while IFS= read -r url; do
  etag=$(curl -I --silent "$url" | awk '/[eE][tT]ag:/ {print $2}' | tr -d '\r"')
  s3_etags+="${etag}"$'\n'
done <<<"${urls}"

# Drop blank lines (including the trailing one) from s3_etags
s3_etags=$(printf '%s' "${s3_etags}" | sed '/^$/d')

# Calculate checksums for comparison
s3_etags_hash=$(printf '%s\n' "${s3_etags}" | md5sum | cut -f1 -d" ")
local_etags_hash=$(printf '%s\n' "${local_etags}" | md5sum | cut -f1 -d" ")

echo "Remote ${s3_etags_hash} vs Local ${local_etags_hash} values"
if [ "${s3_etags_hash}" != "${local_etags_hash}" ]; then
  echo "At least one ETag does not match their url."
  exit 1
fi

if [ -z "${ZENODO_TOKEN:-}" ]; then # Check Zenodo Token before any API call
  echo "Access token not available"
  exit 1
else
  echo "Access token found."
fi

if [ -z "${ORIGINAL_ID}" ]; then # Only get latest id when provided an original one
  echo "Creating new deposition"
  DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}"
else # Update existing dataset
  echo "Previous ID Exists"
  # Resolve the latest record id by following the "latest" redirect page.
  LATEST_ID=$(curl "${ZENODO_ENDPOINT}/records/${ORIGINAL_ID}/latest" |
    grep records | sed 's/.*href=".*\.org\/records\/\(.*\)".*/\1/')
  REMOTE_HASH=$(curl -H "Content-Type: application/json" -X GET --data "{}" \
    "${DEPOSITION_PREFIX}/${LATEST_ID}/files?access_token=${ZENODO_TOKEN}" |
    jq ".[] .links .download" | xargs curl | md5sum | cut -f1 -d" ")
  LOCAL_HASH=$(md5sum "${FILE_TO_VERSION}" | cut -f1 -d" ")

  echo "Checking for changes in file contents: Remote ${REMOTE_HASH} vs Local ${LOCAL_HASH}"
  if [ "${REMOTE_HASH}" == "${LOCAL_HASH}" ]; then
    echo "The urls and md5sums have not changed"
    exit 0
  fi

  echo "Creating new version"
  DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${LATEST_ID}/actions/newversion"
fi

# Create new deposition
DEPOSITION=$(curl -H "Content-Type: application/json" \
  -X POST --data "{}" \
  "${DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}" |
  jq .id)
echo "New deposition ID is ${DEPOSITION}"

# Resolve the upload bucket for the new deposition
BUCKET_DATA=$(curl "${DEPOSITION_PREFIX}/${DEPOSITION}?access_token=${ZENODO_TOKEN}")
BUCKET=$(echo "${BUCKET_DATA}" | jq --raw-output .links.bucket)

if [ "${BUCKET}" = "null" ]; then
  echo "Could not find URL for upload. Response from server:"
  echo "${BUCKET_DATA}"
  exit 1
fi

# Upload file
echo "Uploading file ${FILE_TO_VERSION} to bucket ${BUCKET}"
cat "${FILE_TO_VERSION}"
curl -o /dev/null \
  --upload-file "${FILE_TO_VERSION}" \
  "${BUCKET}"/"${FILENAME}"?access_token="${ZENODO_TOKEN}"

# Upload Metadata
printf '%s\n' "${METADATA_JSON}" >metadata.json

NEW_DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${DEPOSITION}"
echo "Uploading file to ${NEW_DEPOSITION_ENDPOINT}"
curl -H "Content-Type: application/json" \
  -X PUT \
  --data @metadata.json \
  "${NEW_DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"

# Publish
echo "Publishing to ${NEW_DEPOSITION_ENDPOINT}"
curl -H "Content-Type: application/json" \
  -X POST \
  --data "{}" \
  "${NEW_DEPOSITION_ENDPOINT}/actions/publish?access_token=${ZENODO_TOKEN}" |
  jq .id