Commit 73faba1

revise tests to include num_workers param and add logic around determining predict func
1 parent 986f276

3 files changed: +64 additions, -69 deletions


elephas/spark_model.py

Lines changed: 24 additions & 9 deletions
@@ -198,24 +198,39 @@ def _fit(self, rdd: RDD, **kwargs):
         self.stop_server()
 
     def _predict(self, rdd: RDD):
-        rdd = rdd.zipWithIndex()
-        if self.num_workers:
-            rdd = rdd.repartition(self.num_workers)
         json_model = self.master_network.to_json()
         weights = self.master_network.get_weights()
         weights = rdd.context.broadcast(weights)
-        custom_objects = self.custom_objects
+        custom_objs = self.custom_objects
 
-        def _predict(model, custom_objects, data):
-            model = model_from_json(model, custom_objects)
+        def _predict(model_as_json, custom_objects, data):
+            model = model_from_json(model_as_json, custom_objects)
+            model.set_weights(weights.value)
+            data = np.array([x for x in data])
+            return model.predict(data)
+
+        def _predict_with_indices(model_as_json, custom_objects, data):
+            model = model_from_json(model_as_json, custom_objects)
             model.set_weights(weights.value)
             data, indices = zip(*data)
             data = np.array(data)
             return zip(model.predict(data), indices)
 
-        predictions_and_indices = rdd.mapPartitions(partial(_predict, json_model, custom_objects))
-        predictions_sorted_by_index = predictions_and_indices.sortBy(lambda x: x[1])
-        predictions = predictions_sorted_by_index.map(lambda x: x[0]).collect()
+        if self.num_workers and self.num_workers > 1:
+            # if there are multiple workers, we need to retrieve element indices and preserve them throughout
+            # the inference process, since we'll need to sort by index before returning the result, as repartitioning
+            # does not preserve ordering, but the users will expect prediction results which correspond to the ordering
+            # of samples they supplied.
+            rdd = rdd.zipWithIndex()
+            rdd = rdd.repartition(self.num_workers)
+            predictions_and_indices = rdd.mapPartitions(partial(_predict_with_indices, json_model, custom_objs))
+            predictions_sorted_by_index = predictions_and_indices.sortBy(lambda x: x[1])
+            predictions = predictions_sorted_by_index.map(lambda x: x[0]).collect()
+        else:
+            # if there are no workers specified or only a single worker, we don't need to worry about handling index
+            # values, since there will be no shuffling
+            predictions = rdd.mapPartitions(partial(_predict, json_model, custom_objs)).collect()
+
         return predictions
 
     def _evaluate(self, rdd: RDD, **kwargs):
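
The heart of this change is an ordering guarantee: repartition() shuffles elements across partitions and does not preserve their order, so when num_workers > 1 the code attaches indices with zipWithIndex() before the shuffle and restores the caller's ordering with sortBy() afterwards. A minimal PySpark-only sketch of that technique follows; fake_inference is a hypothetical stand-in for the per-partition model.predict call, and the names here are illustrative, not from the commit:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Stand-in for the input samples, spread over four initial partitions.
data = sc.parallelize(range(100), numSlices=4)

# Attach a stable index to every element before the shuffle.
indexed = data.zipWithIndex()          # yields (element, index) pairs

# repartition() redistributes elements and loses their original order.
shuffled = indexed.repartition(2)

def fake_inference(partition):
    # Hypothetical stand-in for running model.predict on one partition;
    # the index is carried through alongside each result.
    for element, index in partition:
        yield element * 2, index

results = (shuffled.mapPartitions(fake_inference)
                   .sortBy(lambda pair: pair[1])   # sort by index to restore input order
                   .map(lambda pair: pair[0])
                   .collect())

assert results == [x * 2 for x in range(100)]

Keeping a separate _predict for the single-worker path is a deliberate trade-off: with no shuffle there is nothing to reorder, so that branch skips the per-element index bookkeeping and the extra sortBy pass entirely.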

examples/ml_mlp_classification.py

Lines changed: 7 additions & 2 deletions
@@ -15,7 +15,7 @@
 # Define basic parameters
 batch_size = 64
 nb_classes = 10
-epochs = 1
+epochs = 20
 
 # Load data
 (x_train, y_train), (x_test, y_test) = mnist.load_data()
@@ -26,6 +26,11 @@
 x_test = x_test.astype("float32")
 x_train /= 255
 x_test /= 255
+
+x_train = x_train[:5000]
+x_test = x_test[:1000]
+y_train = y_train[:5000]
+y_test = y_test[:1000]
 print(x_train.shape[0], 'train samples')
 print(x_test.shape[0], 'test samples')
 
@@ -74,7 +79,7 @@
 # Evaluate Spark model by evaluating the underlying model
 prediction = fitted_pipeline.transform(test_df)
 pnl = prediction.select("label", "prediction")
-pnl.show(100)
+pnl.show(100, truncate=False)
 
 prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
 metrics = MulticlassMetrics(prediction_and_label)

tests/integration/test_end_to_end.py

Lines changed: 33 additions & 58 deletions
@@ -9,14 +9,20 @@
 import pytest
 import numpy as np
 
-
-# enumerate possible combinations for training mode and parameter server for a classification model
-@pytest.mark.parametrize('mode,parameter_server_mode', [('synchronous', None),
-                                                        ('asynchronous', 'http'),
-                                                        ('asynchronous', 'socket'),
-                                                        ('hogwild', 'http'),
-                                                        ('hogwild', 'socket')])
-def test_training_classification(spark_context, mode, parameter_server_mode, mnist_data, classification_model):
+# enumerate possible combinations for training mode and parameter server for a classification model while also validating
+# multiple workers for repartitioning
+@pytest.mark.parametrize('mode,parameter_server_mode,num_workers',
+                         [('synchronous', None, None),
+                          ('synchronous', None, 2),
+                          ('asynchronous', 'http', None),
+                          ('asynchronous', 'http', 2),
+                          ('asynchronous', 'socket', None),
+                          ('asynchronous', 'socket', 2),
+                          ('hogwild', 'http', None),
+                          ('hogwild', 'http', 2),
+                          ('hogwild', 'socket', None),
+                          ('hogwild', 'socket', 2)])
+def test_training_classification(spark_context, mode, parameter_server_mode, num_workers, mnist_data, classification_model):
     # Define basic parameters
     batch_size = 64
     epochs = 10
@@ -33,7 +39,7 @@ def test_training_classification(spark_context, mode, parameter_server_mode, mni
     rdd = to_simple_rdd(spark_context, x_train, y_train)
 
     # Initialize SparkModel from keras model and Spark context
-    spark_model = SparkModel(classification_model, frequency='epoch',
+    spark_model = SparkModel(classification_model, frequency='epoch', num_workers=num_workers,
                              mode=mode, parameter_server_mode=parameter_server_mode, port=4000 + random.randint(0, 800))
 
     # Train Spark model
@@ -57,13 +63,21 @@
     assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
 
 
-# enumerate possible combinations for training mode and parameter server for a regression model
-@pytest.mark.parametrize('mode,parameter_server_mode', [('synchronous', None),
-                                                        ('asynchronous', 'http'),
-                                                        ('asynchronous', 'socket'),
-                                                        ('hogwild', 'http'),
-                                                        ('hogwild', 'socket')])
-def test_training_regression(spark_context, mode, parameter_server_mode, boston_housing_dataset, regression_model):
+# enumerate possible combinations for training mode and parameter server for a regression model while also validating
+# multiple workers for repartitioning
+@pytest.mark.parametrize('mode,parameter_server_mode,num_workers',
+                         [('synchronous', None, None),
+                          ('synchronous', None, 2),
+                          ('asynchronous', 'http', None),
+                          ('asynchronous', 'http', 2),
+                          ('asynchronous', 'socket', None),
+                          ('asynchronous', 'socket', 2),
+                          ('hogwild', 'http', None),
+                          ('hogwild', 'http', 2),
+                          ('hogwild', 'socket', None),
+                          ('hogwild', 'socket', 2)])
+def test_training_regression(spark_context, mode, parameter_server_mode, num_workers, boston_housing_dataset,
+                             regression_model):
     x_train, y_train, x_test, y_test = boston_housing_dataset
     rdd = to_simple_rdd(spark_context, x_train, y_train)
 
@@ -72,7 +86,7 @@ def test_training_regression(spark_context, mode, parameter_server_mode, boston_
     epochs = 10
     sgd = SGD(lr=0.0000001)
     regression_model.compile(sgd, 'mse', ['mae'])
-    spark_model = SparkModel(regression_model, frequency='epoch', mode=mode,
+    spark_model = SparkModel(regression_model, frequency='epoch', mode=mode, num_workers=num_workers,
                              parameter_server_mode=parameter_server_mode, port=4000 + random.randint(0, 800))
 
     # Train Spark model
@@ -92,44 +106,5 @@ def test_training_regression(spark_context, mode, parameter_server_mode, boston_
     assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model.master_network.predict(x_test)))
 
     # assert we get the same evaluation results when calling evaluate on keras model directly
-    assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=0.01)
-    assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=0.01)
-
-
-def test_bug203_using_multiple_workers(spark_context, boston_housing_dataset, regression_model):
-    x_train, y_train, x_test, y_test = boston_housing_dataset
-    rdd = to_simple_rdd(spark_context, x_train, y_train)
-
-    # Define basic parameters
-    batch_size = 32
-    epochs = 10
-    sgd = SGD(lr=0.0000001)
-    import tensorflow as tf
-    regression_model.compile(sgd, 'mse', ['mae'])
-
-    spark_model_multiple_workers = SparkModel(regression_model,
-                                              frequency="epoch",
-                                              port=4000 + random.randint(0, 800),
-                                              mode="synchronous",
-                                              num_workers=2)
-
-    # Train Spark model
-    spark_model_multiple_workers.fit(rdd, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.1)
-
-    # run inference on trained spark model
-    predictions = spark_model_multiple_workers.predict(x_test)
-    # run evaluation on trained spark model
-    evals = spark_model_multiple_workers.evaluate(x_test, y_test)
-
-    # assert we can supply rdd and get same prediction results when supplying numpy array
-    test_rdd = spark_context.parallelize(x_test)
-    assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model_multiple_workers.predict(test_rdd)))
-
-    # assert we get the same prediction result with calling predict on keras model directly
-    assert all(np.isclose(x, y, 0.01) for x, y in zip(predictions, spark_model_multiple_workers.master_network.predict(x_test))), (predictions, spark_model_multiple_workers.master_network.predict(x_test))
-
-    # assert we get the same evaluation results when calling evaluate on keras model directly
-    assert isclose(evals[0], spark_model_multiple_workers.master_network.evaluate(x_test, y_test)[0], abs_tol=1.0)
-    assert isclose(evals[1], spark_model_multiple_workers.master_network.evaluate(x_test, y_test)[1], abs_tol=1.0)
-
-
+    assert isclose(evals[0], spark_model.master_network.evaluate(x_test, y_test)[0], abs_tol=1.0)
+    assert isclose(evals[1], spark_model.master_network.evaluate(x_test, y_test)[1], abs_tol=1.0)
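
For context, here is a hedged usage sketch of the behavior these parametrized tests pin down: with num_workers=2, predict() must still return results in the same order as the input rows despite the repartitioning. The SparkModel arguments and the fit/predict calls mirror the tests above; the model, data shapes, and the to_simple_rdd import path are illustrative assumptions, not part of the commit:

import random
import numpy as np
from pyspark import SparkContext
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd  # assumed import path

spark_context = SparkContext.getOrCreate()

# Illustrative 13-feature regression data, shaped like the Boston housing tests above.
x_train, y_train = np.random.rand(64, 13), np.random.rand(64)
x_test = np.random.rand(16, 13)

model = Sequential([Dense(1, input_dim=13)])
model.compile(optimizer='sgd', loss='mse', metrics=['mae'])

# num_workers=2 exercises the repartitioning branch of _predict shown above.
spark_model = SparkModel(model, frequency='epoch', mode='synchronous',
                         num_workers=2, port=4000 + random.randint(0, 800))

rdd = to_simple_rdd(spark_context, x_train, y_train)
spark_model.fit(rdd, epochs=1, batch_size=32, verbose=0, validation_split=0.1)

# Despite the shuffle across two workers, row i of predictions corresponds to x_test[i].
predictions = spark_model.predict(x_test)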
