
Commit b712215

Merge pull request tensorflow#2042 from 18jeffreyma:master
PiperOrigin-RevId: 319106358
2 parents d16afaa + 8a87f4b commit b712215

18 files changed (+152, -23 lines)


RELEASE.md

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@
 * Added the ConcatPlaceholder to tfx.dsl.component.experimental.placeholders.
 * Changed Span information as a property of ExampleGen's output artifact.
   Deprecated ExampleGen input (external) artifact.
+* Added ModelRun artifact for Trainer for storing training related files,
+  e.g., Tensorboard logs.
 
 ## Bug fixes and other changes
 * Added Tuner component, which is still work in progress.

docs/tutorials/tfx/components_keras.ipynb

Lines changed: 1 addition & 1 deletion
@@ -1284,7 +1284,7 @@
 "        for i in range(num_dnn_layers)\n",
 "    ])\n",
 "\n",
-"  # This log path might change in the future.\n",
+"  # TODO(b/158106209): use ModelRun instead of Model artifact for logging.\n",
 "  log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')\n",
 "  tensorboard_callback = tf.keras.callbacks.TensorBoard(\n",
 "      log_dir=log_dir, update_freq='batch')\n",

tfx/components/testdata/module_file/trainer_module.py

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import os
 import absl
 import tensorflow as tf
 import tensorflow_model_analysis as tfma
@@ -340,4 +341,8 @@ def run_fn(fn_args: executor.TrainerFnArgs):
       export_dir_base=fn_args.eval_model_dir,
       eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
 
+  # Simulate writing a log to the path given by fn_args
+  io_utils.write_string_file(
+      os.path.join(fn_args.model_run_dir, 'fake_log.txt'), '')
+
   absl.logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)

tfx/components/trainer/component.py

Lines changed: 9 additions & 1 deletion
@@ -119,6 +119,7 @@ def __init__(
       custom_config: Optional[Dict[Text, Any]] = None,
       custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
       output: Optional[types.Channel] = None,
+      model_run: Optional[types.Channel] = None,
       transform_output: Optional[types.Channel] = None,
       instance_name: Optional[Text] = None):
     """Construct a Trainer component.
@@ -179,6 +180,8 @@ def trainer_fn(trainer.executor.TrainerFnArgs,
         that will be passed into user module.
       custom_executor_spec: Optional custom executor spec.
       output: Optional `Model` channel for result of exported models.
+      model_run: Optional `ModelRun` channel, as the working dir of models,
+        can be used to output non-model related output (e.g., TensorBoard logs).
       transform_output: Backwards compatibility alias for the 'transform_graph'
         argument.
       instance_name: Optional unique instance name. Necessary iff multiple
@@ -214,6 +217,9 @@ def trainer_fn(trainer.executor.TrainerFnArgs,
     examples = examples or transformed_examples
     output = output or types.Channel(
         type=standard_artifacts.Model, artifacts=[standard_artifacts.Model()])
+    model_run = model_run or types.Channel(
+        type=standard_artifacts.ModelRun,
+        artifacts=[standard_artifacts.ModelRun()])
     spec = TrainerSpec(
         examples=examples,
         transform_graph=transform_graph,
@@ -226,7 +232,9 @@ def trainer_fn(trainer.executor.TrainerFnArgs,
         run_fn=run_fn,
         trainer_fn=trainer_fn,
         custom_config=json_utils.dumps(custom_config),
-        model=output)
+        model=output,
+        # TODO(b/158106209): change the model_run as optional output artifact
+        model_run=model_run)
     super(Trainer, self).__init__(
         spec=spec,
         custom_executor_spec=custom_executor_spec,
tfx/components/trainer/component_test.py

Lines changed: 2 additions & 0 deletions
@@ -46,6 +46,8 @@ def setUp(self):
   def _verify_outputs(self, trainer):
     self.assertEqual(standard_artifacts.Model.TYPE_NAME,
                      trainer.outputs['model'].type_name)
+    self.assertEqual(standard_artifacts.ModelRun.TYPE_NAME,
+                     trainer.outputs['model_run'].type_name)
 
   def testConstructFromModuleFile(self):
     module_file = '/path/to/module/file'

tfx/components/trainer/constants.py

Lines changed: 3 additions & 1 deletion
@@ -37,7 +37,9 @@
 CUSTOM_CONFIG_KEY = 'custom_config'
 
 # Key for output model in executor output_dict.
-OUTPUT_MODEL_KEY = 'model'
+MODEL_KEY = 'model'
+# Key for log output in executor output_dict
+MODEL_RUN_KEY = 'model_run'
 
 # The name of environment variable to indicate distributed training cluster.
 TF_CONFIG_ENV = 'TF_CONFIG'

tfx/components/trainer/executor.py

Lines changed: 24 additions & 8 deletions
@@ -130,10 +130,13 @@ def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
       hyperparameters_config = None
 
     output_path = artifact_utils.get_single_uri(
-        output_dict[constants.OUTPUT_MODEL_KEY])
+        output_dict[constants.MODEL_KEY])
     serving_model_dir = path_utils.serving_model_dir(output_path)
     eval_model_dir = path_utils.eval_model_dir(output_path)
 
+    model_run_dir = artifact_utils.get_single_uri(
+        output_dict[constants.MODEL_RUN_KEY])
+
     # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
     return TrainerFnArgs(
         # A list of uris for train files.
@@ -148,6 +151,8 @@ def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
         eval_model_dir=eval_model_dir,
         # A list of uris for eval files.
         eval_files=fn_args.eval_files,
+        # A single uri for the output directory of model training related files.
+        model_run_dir=model_run_dir,
         # A single uri for schema file.
         schema_file=fn_args.schema_path,
         # Number of train steps.
@@ -168,7 +173,8 @@ def Do(self, input_dict: Dict[Text, List[types.Artifact]],
 
     The Trainer Executor invokes a run_fn callback function provided by
     the user via the module_file parameter. In this function, user defines the
-    model and train it, then save the model to the provided location.
+    model and trains it, then saves the model and training related files
+    (e.g, Tensorboard logs) to the provided locations.
 
     Args:
       input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
@@ -177,7 +183,8 @@ def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
-        - output: Exported model.
+        - model: Exported model.
+        - model_run: Model training related outputs (e.g., Tensorboard logs)
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
@@ -211,8 +218,10 @@ def Do(self, input_dict: Dict[Text, List[types.Artifact]],
     # module's responsibility to export the model only once.
     if not tf.io.gfile.exists(fn_args.serving_model_dir):
       raise RuntimeError('run_fn failed to generate model.')
-    absl.logging.info('Training complete. Model written to %s',
-                      fn_args.serving_model_dir)
+
+    absl.logging.info(
+        'Training complete. Model written to %s. ModelRun written to %s',
+        fn_args.serving_model_dir, fn_args.model_run_dir)
 
 
 class Executor(GenericExecutor):
@@ -244,7 +253,8 @@ def Do(self, input_dict: Dict[Text, List[types.Artifact]],
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
-        - output: Exported model.
+        - model: Exported model.
+        - model_run: Model training related outputs (e.g., Tensorboard logs)
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
@@ -278,8 +288,10 @@ def Do(self, input_dict: Dict[Text, List[types.Artifact]],
     tf.estimator.train_and_evaluate(training_spec['estimator'],
                                     training_spec['train_spec'],
                                     training_spec['eval_spec'])
-    absl.logging.info('Training complete. Model written to %s',
-                      fn_args.serving_model_dir)
+
+    absl.logging.info(
+        'Training complete. Model written to %s. ModelRun written to %s',
+        fn_args.serving_model_dir, fn_args.model_run_dir)
 
     # Export an eval savedmodel for TFMA. If distributed training, it must only
     # be written by the chief worker, as would be done for serving savedmodel.
@@ -290,6 +302,10 @@ def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           export_dir_base=fn_args.eval_model_dir,
           eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
 
+      # TODO(b/158106209): refactor serving_model_dir to only contain model.
+      # Copy model run information to ModelRun artifact
+      io_utils.copy_dir(fn_args.serving_model_dir, fn_args.model_run_dir)
+
       absl.logging.info('Exported eval_savedmodel to %s.',
                         fn_args.eval_model_dir)
     else:
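To summarize the executor contract introduced above: the orchestrator supplies a ModelRun artifact under constants.MODEL_RUN_KEY, the executor resolves its uri and hands it to user code as fn_args.model_run_dir, and the user module writes training-related files (e.g., TensorBoard logs) there. The sketch below is not part of the commit; it simply mirrors what _GetFnArgs does, and the uri is hypothetical.

import os

from tfx.components.trainer import constants
from tfx.types import artifact_utils, standard_artifacts

# Output artifact as the orchestrator would provide it (uri is hypothetical).
model_run = standard_artifacts.ModelRun()
model_run.uri = '/tmp/pipeline_root/Trainer/model_run/1'
output_dict = {constants.MODEL_RUN_KEY: [model_run]}

# Same resolution _GetFnArgs performs before building TrainerFnArgs.
model_run_dir = artifact_utils.get_single_uri(output_dict[constants.MODEL_RUN_KEY])

# A user run_fn would then write logs and similar files under this directory.
log_dir = os.path.join(model_run_dir, 'logs')  # layout is up to the user module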

tfx/components/trainer/executor_test.py

Lines changed: 17 additions & 1 deletion
@@ -70,7 +70,13 @@ def setUp(self):
     self._model_exports = standard_artifacts.Model()
     self._model_exports.uri = os.path.join(self._output_data_dir,
                                            'model_export_path')
-    self._output_dict = {constants.OUTPUT_MODEL_KEY: [self._model_exports]}
+    self._model_run_exports = standard_artifacts.ModelRun()
+    self._model_run_exports.uri = os.path.join(self._output_data_dir,
+                                               'model_run_path')
+    self._output_dict = {
+        constants.MODEL_KEY: [self._model_exports],
+        constants.MODEL_RUN_KEY: [self._model_run_exports]
+    }
 
     # Create exec properties skeleton.
     self._exec_properties = {
@@ -106,6 +112,10 @@ def _verify_no_eval_model_exports(self):
     self.assertFalse(
         tf.io.gfile.exists(path_utils.eval_model_dir(self._model_exports.uri)))
 
+  def _verify_model_run_exports(self):
+    self.assertTrue(
+        tf.io.gfile.exists(os.path.dirname(self._model_run_exports.uri)))
+
   def _do(self, test_executor):
     test_executor.Do(
         input_dict=self._input_dict,
@@ -116,30 +126,35 @@ def testGenericExecutor(self):
     self._exec_properties['module_file'] = self._module_file
     self._do(self._generic_trainer_executor)
     self._verify_model_exports()
+    self._verify_model_run_exports()
 
   @mock.patch('tfx.components.trainer.executor._is_chief')
   def testDoChief(self, mock_is_chief):
     mock_is_chief.return_value = True
     self._exec_properties['module_file'] = self._module_file
     self._do(self._trainer_executor)
     self._verify_model_exports()
+    self._verify_model_run_exports()
 
   @mock.patch('tfx.components.trainer.executor._is_chief')
   def testDoNonChief(self, mock_is_chief):
     mock_is_chief.return_value = False
     self._exec_properties['module_file'] = self._module_file
     self._do(self._trainer_executor)
     self._verify_no_eval_model_exports()
+    self._verify_model_run_exports()
 
   def testDoWithModuleFile(self):
     self._exec_properties['module_file'] = self._module_file
     self._do(self._trainer_executor)
     self._verify_model_exports()
+    self._verify_model_run_exports()
 
   def testDoWithTrainerFn(self):
     self._exec_properties['trainer_fn'] = self._trainer_fn
     self._do(self._trainer_executor)
     self._verify_model_exports()
+    self._verify_model_run_exports()
 
   def testDoWithNoTrainerFn(self):
     with self.assertRaises(ValueError):
@@ -169,6 +184,7 @@ def testDoWithHyperParameters(self):
     self._exec_properties['module_file'] = self._module_file
     self._do(self._trainer_executor)
     self._verify_model_exports()
+    self._verify_model_run_exports()
 
 
 if __name__ == '__main__':

tfx/dsl/compiler/testdata/iris_pipeline_ir.pbtxt

Lines changed: 10 additions & 0 deletions
@@ -617,6 +617,16 @@ nodes {
           }
         }
       }
+      outputs {
+        key: "model_run"
+        value {
+          artifact_spec {
+            type {
+              name: "ModelRun"
+            }
+          }
+        }
+      }
     }
     parameters {
       parameters {

tfx/examples/airflow_workshop/setup/dags/taxi_utils.py

Lines changed: 7 additions & 2 deletions
@@ -322,8 +322,13 @@
 #         for i in range(num_dnn_layers)
 #     ])
 #
-#   # TODO(b/158106209): This log path might change in the future.
-#   log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')
+#   try:
+#     log_dir = fn_args.model_run_dir
+#   except KeyError:
+#     # TODO(b/158106209): use ModelRun instead of Model artifact for logging.
+#     log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')
+#
+#   # Write logs to path
 #   tensorboard_callback = tf.keras.callbacks.TensorBoard(
 #       log_dir=log_dir, update_freq='batch')
 #
