From 7a68c7a4c37a96d43e65b6f65bc2ec4f71a536b6 Mon Sep 17 00:00:00 2001
From: Alexandre Gattiker
Date: Wed, 21 Oct 2020 09:10:35 +0200
Subject: [PATCH] Use Spark 3 in Data Explorer Sample (#98)

Also:
- automatically generate PAT token for Databricks
- added README about continuous export for Data Explorer
---
 .../azure-databricks/create-databricks.sh     | 71 +++++++------------
 eventhubs-dataexplorer/README.md              | 33 +++++++++
 eventhubs-dataexplorer/create-solution.sh     |  9 ++-
 .../databricks/job/run-databricks-job.sh      |  1 +
 .../notebooks/verify-dataexplorer.scala       |  4 +-
 streaming/databricks/runners/verify-common.sh |  1 +
 .../databricks/runners/verify-dataexplorer.sh |  2 +-
 7 files changed, 69 insertions(+), 52 deletions(-)

diff --git a/components/azure-databricks/create-databricks.sh b/components/azure-databricks/create-databricks.sh
index 2df65199..db75edb4 100755
--- a/components/azure-databricks/create-databricks.sh
+++ b/components/azure-databricks/create-databricks.sh
@@ -28,6 +28,9 @@ fi
 
 databricks_metainfo=$(az resource show -g $RESOURCE_GROUP --resource-type Microsoft.Databricks/workspaces -n $ADB_WORKSPACE -o json)
 
+# Databricks CLI automatically picks up configuration from $DATABRICKS_HOST and $DATABRICKS_TOKEN.
+export DATABRICKS_HOST=$(jq -r '"https://" + .location + ".azuredatabricks.net"' <<<"$databricks_metainfo")
+
 echo 'creating Key Vault to store Databricks PAT token'
 az keyvault create -g $RESOURCE_GROUP -n $ADB_TOKEN_KEYVAULT -o tsv >>log.txt
 
@@ -35,54 +38,30 @@ echo 'checking PAT token secret presence in Key Vault'
 databricks_token_secret_name="DATABRICKS-TOKEN"
 pat_token_secret=$(az keyvault secret list --vault-name $ADB_TOKEN_KEYVAULT --query "[?ends_with(id, '/$databricks_token_secret_name')].id" -o tsv)
 if [[ -z "$pat_token_secret" ]]; then
-    echo 'PAT token secret not present. Creating dummy entry for user to fill in manually'
-    az keyvault secret set --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --file /dev/null -o tsv >>log.txt
+    echo 'generating PAT token'
+    wsId=$(jq -r .id <<<"$databricks_metainfo")
+
+    # Get a token for the global Databricks application.
+    # The resource name is fixed and never changes.
+    token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d)
+    token=$(jq .accessToken -r <<< "$token_response")
+
+    # Get a token for the Azure management API
+    token_response=$(az account get-access-token --resource https://management.core.windows.net/)
+    azToken=$(jq .accessToken -r <<< "$token_response")
+
+    api_response=$(curl -sf "$DATABRICKS_HOST/api/2.0/token/create" \
+      -H "Authorization: Bearer $token" \
+      -H "X-Databricks-Azure-SP-Management-Token:$azToken" \
+      -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \
+      -d '{ "lifetime_seconds": 864000, "comment": "streaming-at-scale generated token" }')
+    pat_token=$(jq .token_value -r <<< "$api_response")
+
+    az keyvault secret set --vault-name "$ADB_TOKEN_KEYVAULT" --name "$databricks_token_secret_name" --value "$pat_token"
 fi
 
-echo 'checking PAT token presence in Key Vault'
-pat_token=$(az keyvault secret show --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --query value -o tsv)
-
-if [[ -z "$pat_token" ]]; then
-    echo 'PAT token not present. Requesting user to fill in manually'
-    databricks_login_url=$(jq -r '"https://" + .location + ".azuredatabricks.net/aad/auth?has=&Workspace=" + .id + "&WorkspaceResourceGroupUri="+ .properties.managedResourceGroupId' <<<"$databricks_metainfo")
-
-    kv_info=$(az resource show -g $RESOURCE_GROUP --resource-type Microsoft.KeyVault/vaults -n $ADB_TOKEN_KEYVAULT -o json)
-    kv_secrets_url=$(jq -r '"https://portal.azure.com/#@" + .properties.tenantId + "/resource" + .id + "/secrets"' <<<$kv_info)
-
-    cat <
diff --git a/eventhubs-dataexplorer/README.md b/eventhubs-dataexplorer/README.md
 ```
+
+## Next steps
+
+Retaining long-term data in Azure Data Explorer can drive up costs. You can set up [continuous data export](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/management/data-export/continuous-data-export) to save data derived from the ingested events into external storage. In conjunction with a [retention policy](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/management/retentionpolicy), this enables data tiering: hot data is served from Data Explorer's own storage, while colder data is served through the external table.
+
+The sample statements below use CSV files in Azure Blob Storage for simplicity. Use Parquet instead to reduce file size and improve access performance, especially if you plan to query the data through the external table. Use Azure Data Lake Storage Gen2 instead of Blob Storage for better performance and to avoid hard-coding credentials.
+
+
+```kql
+.create external table SummarizedEvents (deviceId: string, type: string, count:long, from:datetime, to:datetime)
+kind=blob
+dataformat=csv
+(
+h@'https://storage.blob.core.windows.net/export;'
+)
+
+.create function
+EventSummary()
+{
+    EventTable
+    | summarize count=count(), from=min(createdAt), to=max(createdAt) by deviceId, type
+}
+
+// Create the target table (if it doesn't already exist)
+.set-or-append SummarizedEvents <| EventSummary() | limit 0
+
+.create-or-alter continuous-export SummarizedEventsExport
+to table SummarizedEvents
+with
+(intervalBetweenRuns=5m)
+<| EventSummary()
+
+```
diff --git a/eventhubs-dataexplorer/create-solution.sh b/eventhubs-dataexplorer/create-solution.sh
index 16854a4d..711738ad 100755
--- a/eventhubs-dataexplorer/create-solution.sh
+++ b/eventhubs-dataexplorer/create-solution.sh
@@ -47,11 +47,13 @@ if [[ -z "$PREFIX" ]]; then
     usage
 fi
 
+export DATABRICKS_SPARKVERSION=7.3.x-scala2.12
+
 # 10000 messages/sec
 if [ "$TESTTYPE" == "10" ]; then
     export EVENTHUB_PARTITIONS=12
     export EVENTHUB_CAPACITY=12
-    export DATAEXPLORER_SKU=D13_v2
+    export DATAEXPLORER_SKU=Standard_D13_v2
     export DATAEXPLORER_CAPACITY=3
     export SIMULATOR_INSTANCES=5
 fi
@@ -60,7 +62,7 @@ fi
 if [ "$TESTTYPE" == "5" ]; then
     export EVENTHUB_PARTITIONS=8
     export EVENTHUB_CAPACITY=6
-    export DATAEXPLORER_SKU=D12_v2
+    export DATAEXPLORER_SKU=Standard_D12_v2
     export DATAEXPLORER_CAPACITY=2
     export SIMULATOR_INSTANCES=3
 fi
@@ -69,7 +71,7 @@ fi
 if [ "$TESTTYPE" == "1" ]; then
     export EVENTHUB_PARTITIONS=2
     export EVENTHUB_CAPACITY=2
-    export DATAEXPLORER_SKU=D11_v2
+    export DATAEXPLORER_SKU=Standard_D11_v2
     export DATAEXPLORER_CAPACITY=2
     export SIMULATOR_INSTANCES=1
 fi
@@ -169,6 +171,7 @@ echo "***** [V] Starting deployment VERIFICATION"
 
 RUN=`echo $STEPS | grep V -o || true`
 if [ ! -z "$RUN" ]; then
+    source ../assert/has-local-databrickscli.sh
     source ../components/azure-databricks/create-databricks.sh
     source ../streaming/databricks/runners/verify-dataexplorer.sh
 fi
diff --git a/streaming/databricks/job/run-databricks-job.sh b/streaming/databricks/job/run-databricks-job.sh
index cffc77e6..f5646436 100755
--- a/streaming/databricks/job/run-databricks-job.sh
+++ b/streaming/databricks/job/run-databricks-job.sh
@@ -39,6 +39,7 @@ wait_for_run () {
 cluster_jq_command="$(cat <
diff --git a/streaming/databricks/notebooks/verify-dataexplorer.scala b/streaming/databricks/notebooks/verify-dataexplorer.scala
-  KustoSourceOptions.KUSTO_AAD_CLIENT_ID -> dbutils.widgets.get("dataexplorer-client-id"),
-  KustoSourceOptions.KUSTO_AAD_CLIENT_PASSWORD -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-client-password"),
+  KustoSourceOptions.KUSTO_AAD_APP_ID -> dbutils.widgets.get("dataexplorer-client-id"),
+  KustoSourceOptions.KUSTO_AAD_APP_SECRET -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-client-password"),
   KustoSourceOptions.KUSTO_BLOB_STORAGE_ACCOUNT_NAME -> dbutils.widgets.get("dataexplorer-storage-account"),
   KustoSourceOptions.KUSTO_BLOB_STORAGE_ACCOUNT_KEY -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-storage-key"),
   KustoSourceOptions.KUSTO_BLOB_CONTAINER -> dbutils.widgets.get("dataexplorer-storage-container")
diff --git a/streaming/databricks/runners/verify-common.sh b/streaming/databricks/runners/verify-common.sh
index 74047205..7fbff004 100644
--- a/streaming/databricks/runners/verify-common.sh
+++ b/streaming/databricks/runners/verify-common.sh
@@ -1,5 +1,6 @@
 export DATABRICKS_NODETYPE=Standard_F4s
 export DATABRICKS_WORKERS=2
+export DATABRICKS_SPARKVERSION=${DATABRICKS_SPARKVERSION:-5.5.x-scala2.11}
 export DATABRICKS_MAXEVENTSPERTRIGGER=10000
 
 export DATABRICKS_TESTOUTPUTPATH=dbfs:/test-output/$(uuidgen)
diff --git a/streaming/databricks/runners/verify-dataexplorer.sh b/streaming/databricks/runners/verify-dataexplorer.sh
index 1c0bf66b..4eb377a5 100755
--- a/streaming/databricks/runners/verify-dataexplorer.sh
+++ b/streaming/databricks/runners/verify-dataexplorer.sh
@@ -20,7 +20,7 @@ databricks secrets put --scope "MAIN" --key "dataexplorer-client-password" --str
 databricks secrets put --scope "MAIN" --key "dataexplorer-storage-key" --string-value "$AZURE_STORAGE_KEY"
 source ../streaming/databricks/job/run-databricks-job.sh verify-dataexplorer true "$(cat <
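
For reference, the PAT-generation flow that this patch adds to `create-databricks.sh` can also be exercised on its own. The sketch below is illustrative only: it mirrors the calls shown in the diff above, assumes `RESOURCE_GROUP` and `ADB_WORKSPACE` point at an existing workspace that the signed-in Azure CLI user can manage, and uses arbitrary placeholder values for the token lifetime and comment.

```bash
#!/usr/bin/env bash
# Illustrative sketch: mint a Databricks PAT for an Azure Databricks workspace
# using only AAD tokens, mirroring the approach added in create-databricks.sh.
# Assumes RESOURCE_GROUP and ADB_WORKSPACE are set by the caller.
set -euo pipefail

meta=$(az resource show -g "$RESOURCE_GROUP" \
  --resource-type Microsoft.Databricks/workspaces -n "$ADB_WORKSPACE" -o json)
host=$(jq -r '"https://" + .location + ".azuredatabricks.net"' <<<"$meta")
wsId=$(jq -r .id <<<"$meta")

# AAD token for the Azure Databricks first-party application (fixed app ID).
adbToken=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d --query accessToken -o tsv)
# AAD token for the Azure management endpoint.
mgmtToken=$(az account get-access-token --resource https://management.core.windows.net/ --query accessToken -o tsv)

# Ask the workspace to create a PAT on behalf of the signed-in user.
pat=$(curl -sf "$host/api/2.0/token/create" \
  -H "Authorization: Bearer $adbToken" \
  -H "X-Databricks-Azure-SP-Management-Token: $mgmtToken" \
  -H "X-Databricks-Azure-Workspace-Resource-Id: $wsId" \
  -d '{ "lifetime_seconds": 3600, "comment": "ad-hoc example token" }' | jq -r .token_value)

# Print only a prefix; the full value is a secret.
echo "Generated PAT starting with: ${pat:0:6}..."
```

Storing the resulting value in Key Vault, as the patch does, rather than echoing it keeps the secret out of shell history and logs.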