@@ -217,4 +217,56 @@ object QbeastUtils extends Logging {
217
217
218
218
}
219
219
220
+ /**
221
+ * Compute the histogram for a given column
222
+ *
223
+ * Since computing the histogram can be expensive, this method is used outside the indexing
224
+ * process.
225
+ *
226
+ * It outputs the histogram of the column as format [bin1, bin2, bin3, ...] Number of bins by
227
+ * default is 50
228
+ *
229
+ * For example:
230
+ *
231
+ * val qbeastTable = QbeastTable.forPath(spark, "path")
232
+ *
233
+ * val histogram =qbeastTable.computeHistogramForColumn(df, "column")
234
+ *
235
+ * df.write.format("qbeast").option("columnsToIndex",
236
+ * "column:histogram").option("columnStats",histogram).save()
237
+ *
238
+ * @param df
239
+ * @param columnName
240
+ */
241
+ @ deprecated(" Use computeQuantilesForColumn method instead" , " 0.8.0" )
242
+ def computeHistogramForColumn (df : DataFrame , columnName : String , numBins : Int = 50 ): String = {
243
+
244
+ import df .sparkSession .implicits ._
245
+ if (! df.columns.contains(columnName)) {
246
+ throw AnalysisExceptionFactory .create(s " Column $columnName does not exist in the dataframe " )
247
+ }
248
+
249
+ log.info(s " Computing histogram for column $columnName with number of bins $numBins" )
250
+ val binStarts = " __bin_starts"
251
+ val stringPartitionColumn =
252
+ MultiDimClusteringFunctions .range_partition_id(col(columnName), numBins)
253
+
254
+ val histogram = df
255
+ .select(columnName)
256
+ .distinct()
257
+ .na
258
+ .drop()
259
+ .groupBy(stringPartitionColumn)
260
+ .agg(min(columnName).alias(binStarts))
261
+ .select(binStarts)
262
+ .orderBy(binStarts)
263
+ .as[String ]
264
+ .collect()
265
+
266
+ log.info(s " Histogram for column $columnName: $histogram" )
267
+ histogram
268
+ .map(string => s " ' $string' " )
269
+ .mkString(" [" , " ," , " ]" )
270
+ }
271
+
220
272
}
0 commit comments