Skip to content

Commit

Permalink
Make zenodo scripts reusable (#129)
Browse files Browse the repository at this point in the history
* Update update_etags.sh

* Update upload_index.sh

* Update upload_index.sh

* Update upload_index.sh

* Update upload_index.sh

* use standard form, check token sooner

* quote variables, drop csvkit dep

* quote variable

* drop csvkit dep

* cleanup

* shebang, formatting

* bug fix

* simplify regex

* deps: add shellcheck and shfmt to flake.nix

* chore: quote to avoid potential globbing

* format: apply shfmt to scripts

* fix(upload_index.sh): use /usr/bin/env bash shebang

* fix: quote 'etag' header; refactor get_column

* refactor: replace column in place; support etag in any col

---------

Co-authored-by: Alán F. Muñoz <[email protected]>
  • Loading branch information
shntnu and afermg authored Oct 16, 2024
1 parent 074a8a8 commit 18e287c
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 73 deletions.
2 changes: 2 additions & 0 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
curl
gawk
moreutils
shellcheck
shfmt
];
};
};
Expand Down
48 changes: 37 additions & 11 deletions manifests/src/update_etags.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,38 @@
#!/usr/bin/env bash
# Returns the updated ETag for elements in the second column of $1 alongside the first two columns.
cat $1 |
tail -n +2 | # Remove headers
cut -f2 -d',' | # Select url column
xargs -I {} -- curl -I --silent "{}" | # Fetch remote metadata
grep "ETag" | # Select etag field from resulting html
awk '{print $2}' | # Remove prefix
sed 's/\r$//' | # Remove carriage
sed 1i'"etag"' | # add header
paste - $1 -d',' | # Merge with original file
awk -F ',' '{print $2","$3","$1}' # Print in the right order
# Fetch updated ETag values for URLs in a CSV file.

# Note that quotes are expected in the CSV but omitted when values are extracted.
input_file="$1"
url_header="url"
etag_header="etag"

get_column() {
# gets id of column $1 in ${input_file}.
awk -F',' -v col="\"$1\"" 'NR==1 { for (i=1; i<=NF; ++i) { if ($i==col) print i } }' "${input_file}"
}

# Check if input file is provided
if [ -z "${input_file}" ]; then
echo "Usage: $0 <input_file>"
exit 1
fi

url_column=$(get_column "${url_header}")
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${input_file}")

# Fetch ETags for each URL in a loop
etag_values='"etag"'
while IFS= read -r url; do
etag=$(curl -I --silent "${url}" | awk '/ETag:/ {print $2}')
etag_values+="\n${etag}"
done <<<"$urls"

# Remove existing ETag column if present
etag_column=$(get_column "${etag_header}")

# Combine original data (without ETag) with new ETag values
if [[ -n "${etag_column}" ]]; then # Replace $etag_column in $input_file with $etag_values
awk -F',' -v OFS=',' -v col="${etag_column}" 'NR==FNR{a[NR]=$1;next}{$col=a[FNR]}1' <(echo -e "${etag_values}") "${input_file}"
else # Append $etag_values as a new column on the right
paste -d',' "${input_file}" <(echo -e "${etag_values}")
fi
143 changes: 81 additions & 62 deletions manifests/src/upload_index.sh
Original file line number Diff line number Diff line change
@@ -1,101 +1,120 @@
#!/usr/bin/env bash
# Find the latest version of the dataset
ZENODO_ENDPOINT="https://zenodo.org"
DEPOSITION_PREFIX="${ZENODO_ENDPOINT}/api/deposit/depositions"
ORIGINAL_ID="13892061"
FILE_TO_VERSION="manifests/profile_index.csv"
FILENAME=$(echo ${FILE_TO_VERSION} | sed 's+.*/++g')
METADATA_JSON='{
"metadata": {
"title": "The Joint Undertaking for Morphological Profiling (JUMP) Consortium Datasets Index",
"creators": [
{
"name": "The JUMP Cell Painting Consortium"
}
],
"upload_type": "dataset",
"access_right": "open"
}
}'

ZENODO_ENDPOINT="https://zenodo.org"
DEPOSITION_PREFIX="${ZENODO_ENDPOINT}/api/deposit/depositions"

FILENAME=${FILE_TO_VERSION##*/}

echo "Checking that S3 ETags match their local counterpart"
S3_ETAGS=$(cat ${FILE_TO_VERSION} | tail -n +2 | cut -f2 -d',' | xargs -I {} -- curl -I --silent "{}" | grep ETag | awk '{print $2}' | sed 's/\r$//' | md5sum | cut -f1 -d" ")
LOCAL_ETAGS=$(cat ${FILE_TO_VERSION} | tail -n +2 | cut -f3 -d',' | md5sum | cut -f1 -d" ")

echo "Remote ${S3_ETAGS} vs Local ${LOCAL_ETAGS} values"
if [ "${S3_ETAGS}" != "${LOCAL_ETAGS}" ]; then
echo "At least one ETag does not match their url."
exit 1
fi
# Extract URLs and ETags
url_column=$(head -n1 "${FILE_TO_VERSION}" | tr ',' '\n' | grep -n "url" | cut -d':' -f1)
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${FILE_TO_VERSION}")

if [ -z "${ORIGINAL_ID}" ]; then # Only get latest id when provided an original one
echo "Creating new deposition"
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}"
else # Update existing dataset
echo "Previous ID Exists"
LATEST_ID=$(curl "${ZENODO_ENDPOINT}/records/${ORIGINAL_ID}/latest" |
grep records | sed 's/.*href=".*\.org\/records\/\(.*\)".*/\1/')
REMOTE_HASH=$(curl -H "Content-Type: application/json" -X GET --data "{}" \
"${DEPOSITION_PREFIX}/${LATEST_ID}/files?access_token=${ZENODO_TOKEN}" |
jq ".[] .links .download" | xargs curl | md5sum | cut -f1 -d" ")
LOCAL_HASH=$(md5sum ${FILE_TO_VERSION} | cut -f1 -d" ")

echo "Checking for changes in file contents: Remote ${REMOTE_HASH} vs Local ${LOCAL_HASH}"
if [ "${REMOTE_HASH}" == "${LOCAL_HASH}" ]; then
echo "The urls and md5sums have not changed"
exit 0
fi

echo "Creating new version"
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${LATEST_ID}/actions/newversion"
fi
etag_column=$(head -n1 "${FILE_TO_VERSION}" | tr ',' '\n' | grep -n "etag" | cut -d':' -f1)
local_etags=$(awk -F',' -v col="${etag_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${FILE_TO_VERSION}")

s3_etags=""
while IFS= read -r url; do
etag=$(curl -I --silent "$url" | awk '/[eE][tT]ag:/ {print $2}' | tr -d '\r"')
s3_etags+="${etag}\n"
done <<<"${urls}"

# Remove the trailing newline from s3_etags
s3_etags=$(echo -e "${s3_etags}" | sed '/^$/d')

# Calculate checksums for comparison
s3_etags_hash=$(echo -e "${s3_etags}" | md5sum | cut -f1 -d" ")
local_etags_hash=$(echo "${local_etags}" | md5sum | cut -f1 -d" ")

echo "Remote ${s3_etags_hash} vs Local ${local_etags_hash} values"
if [ "${s3_etags_hash}" != "${local_etags_hash}" ]; then
echo "At least one ETag does not match their url."
exit 1
fi

if [ -z "${ZENODO_TOKEN}" ]; then # Check Zenodo Token
echo "Access token not available"
exit 1
echo "Access token not available"
exit 1
else
echo "Access token found."
echo "Access token found."
fi

if [ -z "${ORIGINAL_ID}" ]; then # Only get latest id when provided an original one
echo "Creating new deposition"
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}"
else # Update existing dataset
echo "Previous ID Exists"
LATEST_ID=$(curl "${ZENODO_ENDPOINT}/records/${ORIGINAL_ID}/latest" |
grep records | sed 's/.*href=".*\.org\/records\/\(.*\)".*/\1/')
REMOTE_HASH=$(curl -H "Content-Type: application/json" -X GET --data "{}" \
"${DEPOSITION_PREFIX}/${LATEST_ID}/files?access_token=${ZENODO_TOKEN}" |
jq ".[] .links .download" | xargs curl | md5sum | cut -f1 -d" ")
LOCAL_HASH=$(md5sum ${FILE_TO_VERSION} | cut -f1 -d" ")

echo "Checking for changes in file contents: Remote ${REMOTE_HASH} vs Local ${LOCAL_HASH}"
if [ "${REMOTE_HASH}" == "${LOCAL_HASH}" ]; then
echo "The urls and md5sums have not changed"
exit 0
fi

echo "Creating new version"
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${LATEST_ID}/actions/newversion"
fi

# Create new deposition
DEPOSITION=$(curl -H "Content-Type: application/json" \
-X POST\
--data "{}" \
"${DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"\
| jq .id)
-X POST --data "{}" \
"${DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}" |
jq .id)
echo "New deposition ID is ${DEPOSITION}"

# Variables
BUCKET_DATA=$(curl "${DEPOSITION_PREFIX}/${DEPOSITION}?access_token=${ZENODO_TOKEN}")
BUCKET=$(echo "${BUCKET_DATA}" | jq --raw-output .links.bucket)

if [ "${BUCKET}" = "null" ]; then
echo "Could not find URL for upload. Response from server:"
echo "${BUCKET_DATA}"
exit 1
echo "Could not find URL for upload. Response from server:"
echo "${BUCKET_DATA}"
exit 1
fi

# Upload file
echo "Uploading file ${FILE_TO_VERSION} to bucket ${BUCKET}"
cat ${FILE_TO_VERSION}
curl -o /dev/null \
--upload-file ${FILE_TO_VERSION} \
${BUCKET}/${FILENAME}?access_token="${ZENODO_TOKEN}"

--upload-file ${FILE_TO_VERSION} \
"${BUCKET}"/"${FILENAME}"?access_token="${ZENODO_TOKEN}"

# Upload Metadata
echo -e '{"metadata": {
"title": "The Joint Undertaking for Morphological Profiling (JUMP) Consortium Datasets Index",
"creators": [
{
"name": "The JUMP Cell Painting Consortium"
}
],
"upload_type": "dataset",
"access_right": "open"
}}' > metadata.json
echo -e "${METADATA_JSON}" >metadata.json

NEW_DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${DEPOSITION}"
echo "Uploading file to ${NEW_DEPOSITION_ENDPOINT}"
curl -H "Content-Type: application/json" \
-X PUT\
--data @metadata.json \
"${NEW_DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"
-X PUT \
--data @metadata.json \
"${NEW_DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"

# Publish
echo "Publishing to ${NEW_DEPOSITION_ENDPOINT}"
curl -H "Content-Type: application/json" \
-X POST\
--data "{}"\
"${NEW_DEPOSITION_ENDPOINT}/actions/publish?access_token=${ZENODO_TOKEN}"\
| jq .id

-X POST \
--data "{}" \
"${NEW_DEPOSITION_ENDPOINT}/actions/publish?access_token=${ZENODO_TOKEN}" |
jq .id

0 comments on commit 18e287c

Please sign in to comment.