Add optimized hyperparameters to model info in db; format hyperparam display (cesium-ml#139)

acrellin · stefanv · commit 9c4c3d910de1 · 2016-12-09T10:23:47.000-08:00
diff --git a/cesium_app/handlers/model.py b/cesium_app/handlers/model.py
@@ -20,6 +20,56 @@
 from distributed.client import _wait
 
 
+def _build_model_compute_statistics(fset_path, model_type, model_params,
+                                    params_to_optimize, model_path):
+    '''Build model, serialize it to `model_path`, and return summary statistics.
+
+    Parameters
+    ----------
+    fset_path : str
+        Path to feature set NetCDF file.
+    model_type : str
+        Type of model to be built, e.g. 'RandomForestClassifier'.
+    model_params : dict
+        Dictionary with hyperparameter values to be used in model building.
+        Keys are parameter names, values are the associated parameter values.
+        These hyperparameters will be passed to the model constructor as-is
+        (for hyperparameter optimization, see `params_to_optimize`).
+    params_to_optimize : dict or list of dict
+        During hyperparameter optimization, various model parameters
+        are adjusted to give an optimal fit. This dictionary gives the
+        different values that should be explored for each parameter. E.g.,
+        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
+        6 combinations of alpha and beta and compare the resulting models'
+        goodness-of-fit. If None, only those hyperparameters specified in
+        `model_params` will be used (passed to model constructor as-is).
+    model_path : str
+        Path indicating where serialized model will be saved.
+
+    Returns
+    -------
+    score : float
+        The model's training score.
+    best_params : dict
+        Best hyperparameter values (parameter name -> best value) found by
+        `scikit-learn`'s `GridSearchCV`; an empty dict if no optimization is
+        performed (i.e. `params_to_optimize` is None or an empty dict).
+    '''
+    fset = featureset.from_netcdf(fset_path, engine=cfg['xr_engine'])
+    try:  # close the feature set file even if building/scoring/saving fails
+        computed_model = build_model.build_model_from_featureset(
+            featureset=fset, model_type=model_type,
+            model_parameters=model_params,
+            params_to_optimize=params_to_optimize)
+        score = build_model.score_model(computed_model, fset)
+        best_params = computed_model.best_params_ if params_to_optimize else {}
+        joblib.dump(computed_model, model_path)
+    finally:
+        fset.close()
+
+    return score, best_params
+
+
 class ModelHandler(BaseHandler):
     def _get_model(self, model_id):
         try:
@@ -42,14 +92,14 @@ def get(self, model_id=None):
         return self.success(model_info)
 
     @tornado.gen.coroutine
-    def _await_model(self, score_future, save_future, model):
+    def _await_model_statistics(self, model_stats_future, model):
         try:
-            yield save_future._result()
-            score = yield score_future._result()
+            score, best_params = yield model_stats_future._result()
 
             model.task_id = None
             model.finished = datetime.datetime.now()
             model.train_score = score
+            model.params.update(best_params)
             model.save()
 
             self.action('cesium/SHOW_NOTIFICATION',
@@ -97,30 +147,15 @@ def post(self):
 
         executor = yield self._get_executor()
 
-        fset = executor.submit(lambda path: featureset.from_netcdf(path,
-            engine=cfg['xr_engine']), fset.file.uri)
-        imputed_fset = executor.submit(featureset.Featureset.impute, fset)
-        computed_model = executor.submit(
-            build_model.build_model_from_featureset,
-            featureset=imputed_fset, model_type=model_type,
-            model_parameters=model_params,
-            params_to_optimize=params_to_optimize)
-        score_future = executor.submit(build_model.score_model, computed_model,
-                                       imputed_fset)
-        save_future = executor.submit(joblib.dump, computed_model, model_file.uri)
-
-        @tornado.gen.coroutine
-        def _wait_and_call(callback, *args, futures=[]):
-            yield _wait(futures_list)
-            return callback(*args)
-
-        model.task_id = save_future.key
+        model_stats_future = executor.submit(
+            _build_model_compute_statistics, fset.file.uri, model_type,
+            model_params, params_to_optimize, model_path)
+
+        model.task_id = model_stats_future.key
         model.save()
 
         loop = tornado.ioloop.IOLoop.current()
-        loop.add_callback(_wait_and_call, xr.Dataset.close, imputed_fset,
-                          futures=[computed_model, score_future, save_future])
-        loop.spawn_callback(self._await_model, score_future, save_future, model)
+        loop.spawn_callback(self._await_model_statistics, model_stats_future, model)
 
         return self.success(data={'message': "Model training begun."},
                             action='cesium/FETCH_MODELS')
diff --git a/cesium_app/tests/frontend/test_build_model.py b/cesium_app/tests/frontend/test_build_model.py
@@ -132,7 +132,7 @@ def test_model_info_display(driver):
         driver.find_element_by_xpath("//td[contains(text(),'{}')]".format(m.name)).click()
         assert driver.find_element_by_xpath("//th[contains(text(),'Model Type')]")\
                      .is_displayed()
-        assert driver.find_element_by_xpath("//th[contains(text(),'Hyper "
-                                            "Parameters')]").is_displayed()
+        assert driver.find_element_by_xpath("//th[contains(text(),'Hyper"
+                                            "parameters')]").is_displayed()
         assert driver.find_element_by_xpath("//th[contains(text(),'Training "
                                             "Data Score')]").is_displayed()
diff --git a/cesium_app/tests/frontend/test_datasets.py b/cesium_app/tests/frontend/test_datasets.py
@@ -35,7 +35,6 @@ def test_add_new_dataset(driver):
         driver.implicitly_wait(1)
         status_td = driver.find_element_by_xpath(
             "//div[contains(text(),'Successfully uploaded new dataset')]")
-        assert test_dataset_name in driver.page_source
 
 
 def test_dataset_info_display(driver):
@@ -62,4 +61,3 @@ def test_delete_dataset(driver):
         driver.implicitly_wait(1)
         status_td = driver.find_element_by_xpath(
             "//div[contains(text(),'Dataset deleted')]")
-        assert test_dataset_name not in driver.page_source
diff --git a/cesium_app/tests/frontend/test_pipeline_sequentially.py b/cesium_app/tests/frontend/test_pipeline_sequentially.py
@@ -25,8 +25,7 @@ def test_pipeline_sequentially(driver):
     driver.implicitly_wait(1)
     status_td = driver.find_element_by_xpath(
         "//div[contains(text(),'Added new project')]")
-    time.sleep(0.1)
-    assert test_proj_name in driver.page_source
+    driver.refresh()
 
     # Ensure new project is selected
     proj_select = Select(driver.find_element_by_css_selector('[name=project]'))
@@ -54,7 +53,11 @@ def test_pipeline_sequentially(driver):
     driver.implicitly_wait(1)
     status_td = driver.find_element_by_xpath(
         "//div[contains(text(),'Successfully uploaded new dataset')]")
-    assert test_dataset_name in driver.page_source
+    driver.refresh()
+
+    # Ensure new project is selected
+    proj_select = Select(driver.find_element_by_css_selector('[name=project]'))
+    proj_select.select_by_visible_text(test_proj_name)
 
     # Generate new feature set
     test_featureset_name = str(uuid.uuid4())
diff --git a/cesium_app/tests/frontend/test_projects.py b/cesium_app/tests/frontend/test_projects.py
@@ -25,8 +25,7 @@ def test_create_project(driver):
     driver.implicitly_wait(1)
     status_td = driver.find_element_by_xpath(
         "//div[contains(text(),'Added new project')]")
-    time.sleep(0.1)
-    assert test_proj_name in driver.page_source
+    driver.refresh()
 
     proj_select = Select(driver.find_element_by_css_selector('[name=project]'))
     proj_select.select_by_visible_text(test_proj_name)
diff --git a/public/scripts/Models.jsx b/public/scripts/Models.jsx
@@ -165,7 +165,7 @@ let ModelInfo = props => (
     <thead>
       <tr>
         <th>Model Type</th>
-        <th>Hyper Parameters</th>
+        <th>Hyperparameters</th>
         <th>Training Data Score</th>
       </tr>
     </thead>
@@ -175,7 +175,18 @@ let ModelInfo = props => (
           {props.model.type}
         </td>
         <td>
-          {JSON.stringify(props.model.params, null, 4)}
+          <table>
+            <tbody>
+              {
+                Object.keys(props.model.params).map(param => (
+                  <tr>
+                    <td>{param}</td>
+                    <td style={{ paddingLeft: "5px" }}>{JSON.stringify(props.model.params[param])}</td>
+                  </tr>
+                ))
+              }
+            </tbody>
+          </table>
         </td>
         <td>
           {props.model.train_score}