
Commit bbb13d5

[SW-2682] do constant check & row count in one iteration (#2730)
1 parent 233c41f commit bbb13d5
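
Summary of the change: the check that the selected feature columns are not all constant previously ran as its own Spark job (dataset.select(featureColumns: _*).distinct().count() in H2OAlgoCommonUtils), while the per-partition row counts needed by the writer were collected in a separate pass (getNonEmptyPartitionSizes in Writer). Both are now computed in a single iteration over the RDD partitions by the new PartitionStatsGenerator, and the input RDD is persisted around the conversion so the remaining passes can reuse it. Below is a minimal, self-contained sketch of that single-pass idea; the name countAndCheck and the simplifications (no nested-column flattening, a plain Boolean instead of an Option) are illustrative only, the actual implementation is PartitionStatsGenerator.getPartitionStats in the diff further down.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Sketch only: one pass per partition that counts rows and keeps at most two
// distinct value combinations of the checked columns. If a single combination
// survives the merge across partitions, the columns are constant.
def countAndCheck(rdd: RDD[Row], cols: Seq[String]): (Map[Int, Int], Boolean) = {
  val (sizes, seenValues) = rdd
    .mapPartitionsWithIndex { (idx, it) =>
      var rowCount = 0
      var seen = Set.empty[Seq[Any]]
      it.foreach { row =>
        rowCount += 1
        if (seen.size < 2) seen += cols.map(c => row.getAs[Any](c))
      }
      Iterator.single((Map(idx -> rowCount), seen))
    }
    .fold((Map.empty[Int, Int], Set.empty[Seq[Any]])) {
      case ((m1, s1), (m2, s2)) => (m1 ++ m2, s1 ++ s2)
    }
  // An empty RDD yields false here; the real code distinguishes that case with an Option.
  (sizes, seenValues.size == 1)
}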

File tree

10 files changed, +237 -47 lines changed

benchmarks/src/main/scala/ai/h2o/sparkling/benchmarks/DataFrameToH2OFrameConversionViaCsvFilesBenchmark.scala

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@
 
 package ai.h2o.sparkling.benchmarks
 
-import java.net.URI
 import ai.h2o.sparkling.H2OFrame
 import org.apache.spark.sql.{DataFrame, SaveMode}
 

benchmarks/src/main/scala/ai/h2o/sparkling/benchmarks/DataFrameToH2OFrameConversionViaCsvFilesIncludingS3LoadBenchmark.scala

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 
 package ai.h2o.sparkling.benchmarks
 
-import org.apache.spark.sql.{DataFrame, SaveMode}
+import org.apache.spark.sql.DataFrame
 
 class DataFrameToH2OFrameConversionViaCsvFilesIncludingS3LoadBenchmark(context: BenchmarkContext)
   extends DataFrameToH2OFrameConversionViaCsvFilesBenchmark(context) {

core/src/main/scala/ai/h2o/sparkling/H2OContext.scala

Lines changed: 12 additions & 3 deletions
@@ -159,12 +159,21 @@ class H2OContext private[sparkling] (private val conf: H2OConf) extends H2OConte
   }
 
   /** Transform DataFrame to H2OFrame */
-  def asH2OFrame(df: DataFrame): H2OFrame = asH2OFrame(df, None)
+  def asH2OFrame(df: DataFrame): H2OFrame = asH2OFrame(df, frameName = None)
+
+  def asH2OFrame(df: DataFrame, featureColumns: Seq[String]): H2OFrame =
+    asH2OFrame(df, frameName = None, Some(featureColumns))
 
   def asH2OFrame(df: DataFrame, frameName: String): H2OFrame = asH2OFrame(df, Option(frameName))
 
-  def asH2OFrame(df: DataFrame, frameName: Option[String]): H2OFrame = {
-    withConversionDebugPrints(sparkContext, "Dataframe", SparkDataFrameConverter.toH2OFrame(this, df, frameName))
+  def asH2OFrame(
+      df: DataFrame,
+      frameName: Option[String] = None,
+      featureColumns: Option[Seq[String]] = None): H2OFrame = {
+    withConversionDebugPrints(
+      sparkContext,
+      "Dataframe",
+      SparkDataFrameConverter.toH2OFrame(this, df, frameName, featureColumns))
   }
 
   /** Transforms Dataset[Supported type] to H2OFrame */
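
On the user-facing API, the conversion gains an overload that accepts the feature columns to be checked while the frame is being built. A usage sketch under assumed names (trainingDf and the column names "age" and "income" are made up for illustration):

// Assumed setup: an existing SparkSession and a DataFrame `trainingDf`.
val hc = H2OContext.getOrCreate()

// The conversion counts rows per partition and, in the same pass, verifies that
// the listed feature columns are not all constant; if they are, the writer
// throws an IllegalArgumentException before the chunks are uploaded.
val frame = hc.asH2OFrame(trainingDf, Seq("age", "income"))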

core/src/main/scala/ai/h2o/sparkling/backend/PartitionStats.scala

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.h2o.sparkling.backend
+
+case class PartitionStats(partitionSizes: Map[Int, Int], areFeatureColumnsConstant: Option[Boolean])

core/src/main/scala/ai/h2o/sparkling/backend/PartitionStatsGenerator.scala

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.h2o.sparkling.backend
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.Row
+
+/**
+ * Goes over RDD partitions counting records and checking if given set of columns has constant values
+ */
+private[backend] object PartitionStatsGenerator {
+
+  def getPartitionStats(rdd: RDD[Row], maybeColumnsForConstantCheck: Option[Seq[String]] = None): PartitionStats = {
+    val partitionStats = rdd
+      .mapPartitionsWithIndex {
+        case (partitionIdx, iterator) =>
+          maybeColumnsForConstantCheck
+            .map(rowCountWithColumnsConstantCheck(partitionIdx, iterator, _))
+            .getOrElse(rowCountWithoutColumnsConstantCheck(partitionIdx, iterator))
+      }
+      .fold((Map.empty, Set.empty))((a, b) => (a._1 ++ b._1, a._2 ++ b._2))
+
+    val areProvidedColumnsConstant = if (partitionStats._2.isEmpty || maybeColumnsForConstantCheck.isEmpty) {
+      None
+    } else {
+      Some(partitionStats._2.size < 2)
+    }
+    PartitionStats(partitionStats._1, areProvidedColumnsConstant)
+  }
+
+  private def rowCountWithoutColumnsConstantCheck(partitionIdx: Int, iterator: Iterator[Row]) =
+    Iterator.single(Map(partitionIdx -> iterator.size), Set.empty)
+
+  private def rowCountWithColumnsConstantCheck(
+      partitionIdx: Int,
+      iterator: Iterator[Row],
+      columnsForConstantCheck: Seq[String]) = {
+    var atMostTwoDistinctColumnSetValues = Set[Map[String, Any]]()
+    var recordCount = 0
+    var constantCheckColumnsFlattened: Option[Seq[String]] = None
+    while (iterator.hasNext) {
+      val row = iterator.next()
+      if (constantCheckColumnsFlattened.isEmpty) {
+        constantCheckColumnsFlattened = Some(
+          findFlattenedColumnNamesByPrefix(columnsForConstantCheck, row.schema.fieldNames))
+      }
+      if (atMostTwoDistinctColumnSetValues.size < 2) {
+        atMostTwoDistinctColumnSetValues += row.getValuesMap(constantCheckColumnsFlattened.get)
+      }
+      recordCount += 1
+    }
+    Iterator.single(Map(partitionIdx -> recordCount), atMostTwoDistinctColumnSetValues)
+  }
+
+  private def findFlattenedColumnNamesByPrefix(
+      columnPrefixes: Seq[String],
+      flattenedFields: Array[String]): Seq[String] =
+    columnPrefixes.flatMap(
+      colNameBeforeFlatten =>
+        flattenedFields
+          .filter(col => col == colNameBeforeFlatten || col.startsWith(colNameBeforeFlatten + ".")))
+
+}
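
Worth noting: the generator never collects all distinct values. Each partition remembers at most two distinct combinations of the checked columns, and after the fold a single surviving combination means the columns never varied, which is why the result is Some(partitionStats._2.size < 2); an empty set (empty RDD) maps to None. A tiny local illustration of that bounded-set trick, with made-up sample values:

// Made-up per-row values of the checked column, shaped like Row.getValuesMap output.
val rowValues = Seq(
  Map[String, Any]("surname" -> "Doe"),
  Map[String, Any]("surname" -> "Doe"),
  Map[String, Any]("surname" -> "Smith"))

// Remember at most two distinct combinations: that is enough to decide constant vs. not.
val seen = rowValues.foldLeft(Set.empty[Map[String, Any]]) { (acc, values) =>
  if (acc.size < 2) acc + values else acc
}

val isConstant = seen.size < 2 // false here; it would be true if every row carried "Doe"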

core/src/main/scala/ai/h2o/sparkling/backend/Writer.scala

Lines changed: 16 additions & 29 deletions
@@ -17,22 +17,21 @@
 
 package ai.h2o.sparkling.backend
 
-import java.io.Closeable
-
-import ai.h2o.sparkling.{H2OConf, H2OFrame}
 import ai.h2o.sparkling.H2OFrame.query
+import ai.h2o.sparkling.backend.converters.{CategoricalDomainBuilder, TimeZoneConverter}
 import ai.h2o.sparkling.backend.utils.RestApiUtils.getClusterEndpoint
 import ai.h2o.sparkling.extensions.rest.api.Paths
 import ai.h2o.sparkling.extensions.rest.api.schema.UploadPlanV3
-import ai.h2o.sparkling.backend.converters.{CategoricalDomainBuilder, TimeZoneConverter}
-import ai.h2o.sparkling.extensions.serde.{ChunkAutoBufferWriter, ExpectedTypes, SerdeUtils}
+import ai.h2o.sparkling.extensions.serde.{ChunkAutoBufferWriter, SerdeUtils}
 import ai.h2o.sparkling.utils.ScalaUtils.withResource
 import ai.h2o.sparkling.utils.SparkSessionUtils
-import org.apache.spark.rdd.RDD
+import ai.h2o.sparkling.{H2OConf, H2OFrame}
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.types._
 import org.apache.spark.{ExposeUtils, TaskContext, ml, mllib}
 
+import java.io.Closeable
+
 private[backend] class Writer(nodeDesc: NodeDesc, metadata: WriterMetadata, numRows: Int, chunkId: Int)
   extends Closeable {
 
@@ -91,16 +90,22 @@ private[backend] object Writer {
 
   def convert(rdd: H2OAwareRDD[Row], colNames: Array[String], metadata: WriterMetadata): H2OFrame = {
     H2OFrame.initializeFrame(metadata.conf, metadata.frameId, colNames)
-    val partitionSizes = getNonEmptyPartitionSizes(rdd)
-    val nonEmptyPartitions = getNonEmptyPartitions(partitionSizes)
+    val partitionStats = PartitionStatsGenerator.getPartitionStats(rdd, metadata.featureColsForConstCheck)
+    if (partitionStats.areFeatureColumnsConstant.getOrElse(false)) {
+      throw new IllegalArgumentException(s"H2O could not use any of the specified input" +
+        s" columns: '${metadata.featureColsForConstCheck.get.mkString(", ")}' because they are all constants. H2O requires at least one non-constant column.")
+    }
+
+    val partitionSizes = partitionStats.partitionSizes
+    val nonEmptyPartitions = partitionSizes.filter(_._2 > 0).keys.toSeq.sorted
 
     val uploadPlan = getUploadPlan(metadata.conf, nonEmptyPartitions.length)
     val operation: SparkJob = perDataFramePartition(metadata, uploadPlan, nonEmptyPartitions, partitionSizes)
     val rows = SparkSessionUtils.active.sparkContext.runJob(rdd, operation, nonEmptyPartitions)
-    val res = new Array[Long](nonEmptyPartitions.size)
-    rows.foreach { case (chunkIdx, numRows) => res(chunkIdx) = numRows }
+    val rowsPerChunk = new Array[Long](nonEmptyPartitions.size)
+    rows.foreach { case (chunkIdx, numRows) => rowsPerChunk(chunkIdx) = numRows }
     val types = SerdeUtils.expectedTypesToVecTypes(metadata.expectedTypes, metadata.maxVectorSizes)
-    H2OFrame.finalizeFrame(metadata.conf, metadata.frameId, res, types)
+    H2OFrame.finalizeFrame(metadata.conf, metadata.frameId, rowsPerChunk, types)
     H2OFrame(metadata.frameId)
   }
 
@@ -164,24 +169,6 @@
     }
   }
 
-  private def getNonEmptyPartitionSizes[T](rdd: RDD[T]): Map[Int, Int] = {
-    rdd
-      .mapPartitionsWithIndex {
-        case (idx, it) =>
-          if (it.nonEmpty) {
-            Iterator.single((idx, it.size))
-          } else {
-            Iterator.empty
-          }
-      }
-      .collect()
-      .toMap
-  }
-
-  private def getNonEmptyPartitions(partitionSizes: Map[Int, Int]): Seq[Int] = {
-    partitionSizes.keys.toSeq.sorted
-  }
-
   private def getUploadPlan(conf: H2OConf, numberOfPartitions: Int): UploadPlan = {
     val endpoint = getClusterEndpoint(conf)
     val parameters = Map("number_of_chunks" -> numberOfPartitions)

core/src/main/scala/ai/h2o/sparkling/backend/WriterMetadata.scala

Lines changed: 2 additions & 1 deletion
@@ -26,4 +26,5 @@ case class WriterMetadata(
     frameId: String,
     expectedTypes: Array[ExpectedType],
     maxVectorSizes: Array[Int],
-    timezone: TimeZone)
+    timezone: TimeZone,
+    featureColsForConstCheck: Option[Seq[String]])

core/src/main/scala/ai/h2o/sparkling/backend/converters/SparkDataFrameConverter.scala

Lines changed: 26 additions & 6 deletions
@@ -23,6 +23,7 @@ import ai.h2o.sparkling.utils.SparkSessionUtils
 import ai.h2o.sparkling.{H2OContext, H2OFrame, SparkTimeZone}
 import org.apache.spark.expose.Logging
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.storage.StorageLevel
 
 object SparkDataFrameConverter extends Logging {
 
@@ -40,23 +41,42 @@ object SparkDataFrameConverter extends Logging {
     spark.baseRelationToDataFrame(relation)
   }
 
-  def toH2OFrame(hc: H2OContext, dataFrame: DataFrame, frameKeyName: Option[String]): H2OFrame = {
+  def toH2OFrame(
+      hc: H2OContext,
+      dataFrame: DataFrame,
+      frameKeyName: Option[String] = None,
+      featureColsForConstCheck: Option[Seq[String]] = None): H2OFrame = {
     val df = dataFrame.toDF() // Because of PySparkling, we can receive Dataset[Primitive] in this method, ensure that
     // we are dealing with Dataset[Row]
     val flatDataFrame = flattenDataFrame(df)
     val schema = flatDataFrame.schema
-    val rdd = flatDataFrame.rdd // materialized the data frame
+    val rdd = flatDataFrame.rdd
+    if (hc.getConf.runsInInternalClusterMode) {
+      rdd.persist(StorageLevel.DISK_ONLY)
+    } else {
+      rdd.persist()
+    }
 
     val elemMaxSizes = collectMaxElementSizes(rdd, schema)
     val vecIndices = collectVectorLikeTypes(schema).toArray
-    val flattenSchema = expandedSchema(schema, elemMaxSizes)
-    val colNames = flattenSchema.map(field => "\"" + field.name + "\"").toArray
+    val flattenedSchema = expandedSchema(schema, elemMaxSizes)
+    val h2oColNames = flattenedSchema.map(field => "\"" + field.name + "\"").toArray
    val maxVecSizes = vecIndices.map(elemMaxSizes(_))
 
     val expectedTypes = DataTypeConverter.determineExpectedTypes(schema)
 
     val uniqueFrameId = frameKeyName.getOrElse("frame_rdd_" + rdd.id + scala.util.Random.nextInt())
-    val metadata = WriterMetadata(hc.getConf, uniqueFrameId, expectedTypes, maxVecSizes, SparkTimeZone.current())
-    Writer.convert(new H2OAwareRDD(hc.getH2ONodes(), rdd), colNames, metadata)
+    val metadata =
+      WriterMetadata(
+        hc.getConf,
+        uniqueFrameId,
+        expectedTypes,
+        maxVecSizes,
+        SparkTimeZone.current(),
+        featureColsForConstCheck)
+    val result = Writer.convert(new H2OAwareRDD(hc.getH2ONodes(), rdd), h2oColNames, metadata)
+    rdd.unpersist(blocking = false)
+    result
   }
+
 }

core/src/test/scala/ai/h2o/sparkling/backend/PartitionStatsGeneratorTestSuite.scala

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.h2o.sparkling.backend
+
+import ai.h2o.sparkling.SparkTestContext
+import org.apache.spark.sql.SparkSession
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+import org.scalatest.{FunSuite, Matchers, OptionValues}
+
+@RunWith(classOf[JUnitRunner])
+class PartitionStatsGeneratorTestSuite extends FunSuite with SparkTestContext with Matchers with OptionValues {
+
+  override def createSparkSession(): SparkSession = sparkSession("local[*]")
+
+  import spark.implicits._
+
+  private final val dataset =
+    Seq((1, "John", "Doe", 1999), (2, "John", "Doe", 1999), (3, "Jane", "Doe", 1999), (4, "Jane", "Doe", 1999))
+
+  private val datasetCols = Seq("id", "name", "surname", "birthYear")
+
+  test("should correctly detect constant columns") {
+    val input = dataset.toDF(datasetCols: _*).rdd
+
+    val resultOnConstantColumn = PartitionStatsGenerator.getPartitionStats(input, Some(Seq("surname")))
+    val resultOnConstantColumns = PartitionStatsGenerator.getPartitionStats(input, Some(Seq("surname", "birthYear")))
+    val resultOnNotConstantColumn = PartitionStatsGenerator.getPartitionStats(input, Some(Seq("name")))
+    val resultOnNotConstantColumns = PartitionStatsGenerator.getPartitionStats(input, Some(Seq("name", "id")))
+    val resultWhereOnlyOneColumnIsConstant =
+      PartitionStatsGenerator.getPartitionStats(input, Some(Seq("surname", "id")))
+
+    resultOnConstantColumn.areFeatureColumnsConstant.value shouldBe true
+    resultOnConstantColumns.areFeatureColumnsConstant.value shouldBe true
+    resultOnNotConstantColumn.areFeatureColumnsConstant.value shouldBe false
+    resultOnNotConstantColumns.areFeatureColumnsConstant.value shouldBe false
+    resultWhereOnlyOneColumnIsConstant.areFeatureColumnsConstant.value shouldBe false
+  }
+
+  test("should correctly count values") {
+    val inputWithTwoPartitions = dataset.toDF(datasetCols: _*).rdd.coalesce(numPartitions = 2)
+
+    val result = PartitionStatsGenerator.getPartitionStats(inputWithTwoPartitions, Some(Seq("id")))
+
+    result.areFeatureColumnsConstant.value shouldBe false
+    result.partitionSizes should have size 2
+    result.partitionSizes should contain theSameElementsAs Map(0 -> 2, 1 -> 2)
+  }
+
+  test("should not fail given an empty dataset") {
+    val emptyInput = Seq.empty[String].toDF.rdd
+
+    val result = PartitionStatsGenerator.getPartitionStats(emptyInput, Some(Seq("id")))
+
+    result.areFeatureColumnsConstant shouldBe None
+  }
+
+  test("should not fail given one element dataset") {
+    val oneElementInput = Seq(dataset.head).toDF(datasetCols: _*).rdd
+
+    val result = PartitionStatsGenerator.getPartitionStats(oneElementInput, Some(Seq("id")))
+
+    result.areFeatureColumnsConstant.value shouldBe true
+    result.partitionSizes should have size 1
+    result.partitionSizes shouldBe Map(0 -> 1)
+  }
+
+}

ml/src/main/scala/ai/h2o/sparkling/ml/algos/H2OAlgoCommonUtils.scala

Lines changed: 2 additions & 6 deletions
@@ -20,7 +20,7 @@ import ai.h2o.sparkling.backend.utils.H2OFrameLifecycle
 import ai.h2o.sparkling.ml.models.H2OBinaryModel
 import ai.h2o.sparkling.ml.utils.EstimatorCommonUtils
 import ai.h2o.sparkling.{H2OContext, H2OFrame}
-import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.sql.{Column, DataFrame, Dataset}
 import org.apache.spark.sql.functions.col
 
 trait H2OAlgoCommonUtils extends EstimatorCommonUtils with H2OFrameLifecycle {
@@ -76,16 +76,12 @@ trait H2OAlgoCommonUtils extends EstimatorCommonUtils with H2OFrameLifecycle {
 
     val featureColumns = getInputCols().map(sanitize).map(col)
 
-    if (dataset.select(featureColumns: _*).distinct().count() == 1) {
-      throw new IllegalArgumentException(s"H2O could not use any of the specified input" +
-        s" columns: '${getInputCols().mkString(", ")}' because they are all constants. H2O requires at least one non-constant column.")
-    }
     val excludedColumns = excludedCols.map(sanitize).map(col)
     val additionalColumns = getAdditionalCols().map(sanitize).map(col)
     val columns = (featureColumns ++ excludedColumns ++ additionalColumns).distinct
     val h2oContext = H2OContext.ensure(
       "H2OContext needs to be created in order to train the model. Please create one as H2OContext.getOrCreate().")
-    val trainFrame = h2oContext.asH2OFrame(dataset.select(columns: _*).toDF())
+    val trainFrame = h2oContext.asH2OFrame(dataset.select(columns: _*).toDF(), getInputCols())
 
     trainFrame.convertColumnsToStrings(getColumnsToString())
