From 7a68c7a4c37a96d43e65b6f65bc2ec4f71a536b6 Mon Sep 17 00:00:00 2001
From: Alexandre Gattiker
Date: Wed, 21 Oct 2020 09:10:35 +0200
Subject: [PATCH] Use Spark 3 in Data Explorer Sample (#98)

Also:
- automatically generate PAT token for Databricks
- added README about continuous export for Data Explorer
---
 .../azure-databricks/create-databricks.sh     | 71 +++++++------------
 eventhubs-dataexplorer/README.md              | 33 +++++++++
 eventhubs-dataexplorer/create-solution.sh     |  9 ++-
 .../databricks/job/run-databricks-job.sh      |  1 +
 .../notebooks/verify-dataexplorer.scala       |  4 +-
 streaming/databricks/runners/verify-common.sh |  1 +
 .../databricks/runners/verify-dataexplorer.sh |  2 +-
 7 files changed, 69 insertions(+), 52 deletions(-)

diff --git a/components/azure-databricks/create-databricks.sh b/components/azure-databricks/create-databricks.sh
index 2df65199..db75edb4 100755
--- a/components/azure-databricks/create-databricks.sh
+++ b/components/azure-databricks/create-databricks.sh
@@ -28,6 +28,9 @@ fi
 
 databricks_metainfo=$(az resource show -g $RESOURCE_GROUP --resource-type Microsoft.Databricks/workspaces -n $ADB_WORKSPACE -o json)
 
+# Databricks CLI automatically picks up configuration from $DATABRICKS_HOST and $DATABRICKS_TOKEN.
+export DATABRICKS_HOST=$(jq -r '"https://" + .location + ".azuredatabricks.net"' <<<"$databricks_metainfo")
+
 echo 'creating Key Vault to store Databricks PAT token'
 az keyvault create -g $RESOURCE_GROUP -n $ADB_TOKEN_KEYVAULT -o tsv >>log.txt
 
@@ -35,54 +38,30 @@ echo 'checking PAT token secret presence in Key Vault'
 databricks_token_secret_name="DATABRICKS-TOKEN"
 pat_token_secret=$(az keyvault secret list --vault-name $ADB_TOKEN_KEYVAULT --query "[?ends_with(id, '/$databricks_token_secret_name')].id" -o tsv)
 if [[ -z "$pat_token_secret" ]]; then
-    echo 'PAT token secret not present. Creating dummy entry for user to fill in manually'
-    az keyvault secret set --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --file /dev/null -o tsv >>log.txt
+    echo 'generating PAT token'
+    wsId=$(jq -r .id <<<"$databricks_metainfo")
+
+    # Get a token for the global Databricks application.
+    # The resource name is fixed and never changes.
+    token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d)
+    token=$(jq .accessToken -r <<< "$token_response")
+
+    # Get a token for the Azure management API
+    token_response=$(az account get-access-token --resource https://management.core.windows.net/)
+    azToken=$(jq .accessToken -r <<< "$token_response")
+
+    api_response=$(curl -sf "$DATABRICKS_HOST/api/2.0/token/create" \
+      -H "Authorization: Bearer $token" \
+      -H "X-Databricks-Azure-SP-Management-Token:$azToken" \
+      -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \
+      -d '{ "lifetime_seconds": 864000, "comment": "streaming-at-scale generated token" }')
+    pat_token=$(jq .token_value -r <<< "$api_response")
+
+    az keyvault secret set --vault-name "$ADB_TOKEN_KEYVAULT" --name "$databricks_token_secret_name" --value "$pat_token"
 fi
 
-echo 'checking PAT token presence in Key Vault'
-pat_token=$(az keyvault secret show --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --query value -o tsv)
-
-if [[ -z "$pat_token" ]]; then
-    echo 'PAT token not present. Requesting user to fill in manually'
-    databricks_login_url=$(jq -r '"https://" + .location + ".azuredatabricks.net/aad/auth?has=&Workspace=" + .id + "&WorkspaceResourceGroupUri="+ .properties.managedResourceGroupId' <<<"$databricks_metainfo")
-
-    kv_info=$(az resource show -g $RESOURCE_GROUP --resource-type Microsoft.KeyVault/vaults -n $ADB_TOKEN_KEYVAULT -o json)
-    kv_secrets_url=$(jq -r '"https://portal.azure.com/#@" + .properties.tenantId + "/resource" + .id + "/secrets"' <<<$kv_info)
-
-    cat <
diff --git a/eventhubs-dataexplorer/README.md b/eventhubs-dataexplorer/README.md
 ```
+
+## Next steps
+
+Retaining long-term data in Azure Data Explorer can drive up costs. You can set up [continuous data export](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/management/data-export/continuous-data-export) to save data derived from the ingested events into external storage. In conjunction with a [retention policy](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/management/retentionpolicy), this enables data tiering: hot data is served from Data Explorer's own storage, while colder data is served through the external table.
+
+The sample statements below use CSV files in Azure Blob Storage for simplicity. Use Parquet instead to reduce file size and improve access performance, especially if you plan to query the data through the external table. Use Azure Data Lake Storage Gen2 instead of Blob Storage for better performance and to avoid hard-coding credentials.
+
+
+```kql
+.create external table SummarizedEvents (deviceId: string, type: string, count:long, from:datetime, to:datetime)
+kind=blob
+dataformat=csv
+(
+h@'https://storage.blob.core.windows.net/export;'
+)
+
+.create function
+EventSummary()
+{
+    EventTable
+    | summarize count=count(), from=min(createdAt), to=max(createdAt) by deviceId, type
+}
+
+// Create the target table (if it doesn't already exist)
+.set-or-append SummarizedEvents <| EventSummary() | limit 0
+
+.create-or-alter continuous-export SummarizedEventsExport
+to table SummarizedEvents
+with
+(intervalBetweenRuns=5m)
+<| EventSummary()
+
+```
diff --git a/eventhubs-dataexplorer/create-solution.sh b/eventhubs-dataexplorer/create-solution.sh
index 16854a4d..711738ad 100755
--- a/eventhubs-dataexplorer/create-solution.sh
+++ b/eventhubs-dataexplorer/create-solution.sh
@@ -47,11 +47,13 @@ if [[ -z "$PREFIX" ]]; then
     usage
 fi
 
+export DATABRICKS_SPARKVERSION=7.3.x-scala2.12
+
 # 10000 messages/sec
 if [ "$TESTTYPE" == "10" ]; then
     export EVENTHUB_PARTITIONS=12
     export EVENTHUB_CAPACITY=12
-    export DATAEXPLORER_SKU=D13_v2
+    export DATAEXPLORER_SKU=Standard_D13_v2
     export DATAEXPLORER_CAPACITY=3
     export SIMULATOR_INSTANCES=5
 fi
@@ -60,7 +62,7 @@ fi
 if [ "$TESTTYPE" == "5" ]; then
     export EVENTHUB_PARTITIONS=8
     export EVENTHUB_CAPACITY=6
-    export DATAEXPLORER_SKU=D12_v2
+    export DATAEXPLORER_SKU=Standard_D12_v2
     export DATAEXPLORER_CAPACITY=2
     export SIMULATOR_INSTANCES=3
 fi
@@ -69,7 +71,7 @@ fi
 if [ "$TESTTYPE" == "1" ]; then
     export EVENTHUB_PARTITIONS=2
     export EVENTHUB_CAPACITY=2
-    export DATAEXPLORER_SKU=D11_v2
+    export DATAEXPLORER_SKU=Standard_D11_v2
     export DATAEXPLORER_CAPACITY=2
     export SIMULATOR_INSTANCES=1
 fi
@@ -169,6 +171,7 @@ echo "***** [V] Starting deployment VERIFICATION"
 
 RUN=`echo $STEPS | grep V -o || true`
 if [ ! -z "$RUN" ]; then
+    source ../assert/has-local-databrickscli.sh
     source ../components/azure-databricks/create-databricks.sh
     source ../streaming/databricks/runners/verify-dataexplorer.sh
 fi
diff --git a/streaming/databricks/job/run-databricks-job.sh b/streaming/databricks/job/run-databricks-job.sh
index cffc77e6..f5646436 100755
--- a/streaming/databricks/job/run-databricks-job.sh
+++ b/streaming/databricks/job/run-databricks-job.sh
@@ -39,6 +39,7 @@ wait_for_run () {
 cluster_jq_command="$(cat <
diff --git a/streaming/databricks/notebooks/verify-dataexplorer.scala b/streaming/databricks/notebooks/verify-dataexplorer.scala
-  KustoSourceOptions.KUSTO_AAD_CLIENT_ID -> dbutils.widgets.get("dataexplorer-client-id"),
-  KustoSourceOptions.KUSTO_AAD_CLIENT_PASSWORD -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-client-password"),
+  KustoSourceOptions.KUSTO_AAD_APP_ID -> dbutils.widgets.get("dataexplorer-client-id"),
+  KustoSourceOptions.KUSTO_AAD_APP_SECRET -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-client-password"),
   KustoSourceOptions.KUSTO_BLOB_STORAGE_ACCOUNT_NAME -> dbutils.widgets.get("dataexplorer-storage-account"),
   KustoSourceOptions.KUSTO_BLOB_STORAGE_ACCOUNT_KEY -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-storage-key"),
   KustoSourceOptions.KUSTO_BLOB_CONTAINER -> dbutils.widgets.get("dataexplorer-storage-container")
diff --git a/streaming/databricks/runners/verify-common.sh b/streaming/databricks/runners/verify-common.sh
index 74047205..7fbff004 100644
--- a/streaming/databricks/runners/verify-common.sh
+++ b/streaming/databricks/runners/verify-common.sh
@@ -1,5 +1,6 @@
 export DATABRICKS_NODETYPE=Standard_F4s
 export DATABRICKS_WORKERS=2
+export DATABRICKS_SPARKVERSION=${DATABRICKS_SPARKVERSION:-5.5.x-scala2.11}
 export DATABRICKS_MAXEVENTSPERTRIGGER=10000
 
 export DATABRICKS_TESTOUTPUTPATH=dbfs:/test-output/$(uuidgen)
diff --git a/streaming/databricks/runners/verify-dataexplorer.sh b/streaming/databricks/runners/verify-dataexplorer.sh
index 1c0bf66b..4eb377a5 100755
--- a/streaming/databricks/runners/verify-dataexplorer.sh
+++ b/streaming/databricks/runners/verify-dataexplorer.sh
@@ -20,7 +20,7 @@ databricks secrets put --scope "MAIN" --key "dataexplorer-client-password" --str
 databricks secrets put --scope "MAIN" --key "dataexplorer-storage-key" --string-value "$AZURE_STORAGE_KEY"
 source ../streaming/databricks/job/run-databricks-job.sh verify-dataexplorer true "$(cat <
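
For reference, the PAT-generation flow that this patch adds to `create-databricks.sh` can also be exercised on its own. The sketch below is illustrative only: it mirrors the calls shown in the diff above, assumes `RESOURCE_GROUP` and `ADB_WORKSPACE` point at an existing workspace that the signed-in Azure CLI user can manage, and uses arbitrary placeholder values for the token lifetime and comment.

```bash
#!/usr/bin/env bash
# Illustrative sketch: mint a Databricks PAT for an Azure Databricks workspace
# using only AAD tokens, mirroring the approach added in create-databricks.sh.
# Assumes RESOURCE_GROUP and ADB_WORKSPACE are set by the caller.
set -euo pipefail

meta=$(az resource show -g "$RESOURCE_GROUP" \
  --resource-type Microsoft.Databricks/workspaces -n "$ADB_WORKSPACE" -o json)
host=$(jq -r '"https://" + .location + ".azuredatabricks.net"' <<<"$meta")
wsId=$(jq -r .id <<<"$meta")

# AAD token for the Azure Databricks first-party application (fixed app ID).
adbToken=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d --query accessToken -o tsv)
# AAD token for the Azure management endpoint.
mgmtToken=$(az account get-access-token --resource https://management.core.windows.net/ --query accessToken -o tsv)

# Ask the workspace to create a PAT on behalf of the signed-in user.
pat=$(curl -sf "$host/api/2.0/token/create" \
  -H "Authorization: Bearer $adbToken" \
  -H "X-Databricks-Azure-SP-Management-Token: $mgmtToken" \
  -H "X-Databricks-Azure-Workspace-Resource-Id: $wsId" \
  -d '{ "lifetime_seconds": 3600, "comment": "ad-hoc example token" }' | jq -r .token_value)

# Print only a prefix; the full value is a secret.
echo "Generated PAT starting with: ${pat:0:6}..."
```

Storing the resulting value in Key Vault, as the patch does, rather than echoing it keeps the secret out of shell history and logs.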