From 98ddbaa894a582520fb2816128bbb8ba270cdf4d Mon Sep 17 00:00:00 2001
From: EdenWuyifan <yfw215@nyu.edu>
Date: Thu, 2 May 2024 14:00:55 -0400
Subject: [PATCH] fix bugs and add scoring type

---
 alpha_automl/pipeline_search/agent_environment.py   | 13 +++++++++----
 alpha_automl/pipeline_search/agent_lab.py           |  2 +-
 alpha_automl/pipeline_synthesis/pipeline_builder.py |  9 ++++-----
 alpha_automl/pipeline_synthesis/setup_search.py     |  7 +++++--
 alpha_automl/resource/primitives_hierarchy.json     | 11 ++++++++---
 5 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/alpha_automl/pipeline_search/agent_environment.py b/alpha_automl/pipeline_search/agent_environment.py
index afeede47..96b0315a 100644
--- a/alpha_automl/pipeline_search/agent_environment.py
+++ b/alpha_automl/pipeline_search/agent_environment.py
@@ -27,6 +27,12 @@ def __init__(self, config: EnvContext):
         self.board = self.game.getInitBoard()  # initial board
         self.step_stack = ["S"]  # stack for steps
         self.metadata = self.board[: self.game.m]
+
+        if self.metadata[0] == 2:  # scoring type 2: regression error metrics (lower is better)
+            self.scoring_type = "error"
+        else:  # higher-is-better scores: classification, clustering, regression r2
+            self.scoring_type = "precision"
+
         self.observation_space = Dict(
             {
                 "board": Box(
@@ -93,11 +99,10 @@ def step(self, action):
         game_end = self.game.getGameEnded(self.board)
         if game_end == 1:  # pipeline score over threshold
             try:
-                if self.game.problem == "REGRESSION":
-                    # reward = 10 + (100 / self.game.getEvaluation(self.board))
-                    reward = 10 + (self.game.getEvaluation(self.board)) ** 3 * 100
+                if self.scoring_type == "error":
+                    reward = 10 + (100 / self.game.getEvaluation(self.board))
                 else:
-                    reward = 10 + (self.game.getEvaluation(self.board)) ** 2 * 100
+                    reward = 10 + (self.game.getEvaluation(self.board)) ** 3 * 100
             except Exception as e:
                 logger.critical(f"[PIPELINE FOUND] Error happened: {str(e)}")
         elif game_end == 2:  # finished but invalid
diff --git a/alpha_automl/pipeline_search/agent_lab.py b/alpha_automl/pipeline_search/agent_lab.py
index 386ffd4f..1efd9297 100644
--- a/alpha_automl/pipeline_search/agent_lab.py
+++ b/alpha_automl/pipeline_search/agent_lab.py
@@ -82,7 +82,7 @@ def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_
     while True:
         if (
             time.time() > timeout
-            or (best_unchanged_iter >= 600 and result["episode_reward_mean"] >= 0)
+            or (best_unchanged_iter >= 10 and result["episode_reward_mean"] >= 0)
             # or result["episode_reward_mean"] >= 70
         ):
             logger.debug(f"Training timeout reached")
diff --git a/alpha_automl/pipeline_synthesis/pipeline_builder.py b/alpha_automl/pipeline_synthesis/pipeline_builder.py
index 858fe7bd..342afad6 100644
--- a/alpha_automl/pipeline_synthesis/pipeline_builder.py
+++ b/alpha_automl/pipeline_synthesis/pipeline_builder.py
@@ -16,14 +16,13 @@
     "lightgbm.LGBMClassifier": {'verbose': -1},
     "lightgbm.LGBMRegressor": {'verbose': -1},
     "catboost.CatBoostRegressor": {
-        'depth': 8,
         'grow_policy': 'Depthwise',
-        'l2_leaf_reg': 2.7997999596449104,
-        'learning_rate': 0.031375015734637225,
-        'max_ctr_complexity': 2,
-        'one_hot_max_size': 3,
         'logging_level': 'Silent'
     },
+    "catboost.CatBoostClassifier": {
+        'grow_policy': 'Depthwise',
+        'logging_level': 'Silent'
+    }
 }
 
 
diff --git a/alpha_automl/pipeline_synthesis/setup_search.py b/alpha_automl/pipeline_synthesis/setup_search.py
index e54dbd64..d4911105 100644
--- a/alpha_automl/pipeline_synthesis/setup_search.py
+++ b/alpha_automl/pipeline_synthesis/setup_search.py
@@ -170,16 +170,19 @@ def compute_metafeatures(metric, metadata):
         "mean_squared_error",
         "mean_squared_log_error",
         "median_absolute_error",
-        "r2_score",
     ]:
         scoring_type = 2
+    elif metric in [
+        "r2_score",
+    ]:
+        scoring_type = 3
     elif metric in [
         "adjusted_mutual_info_score",
         "rand_score",
         "mutual_info_score",
         "normalized_mutual_info_score",
     ]:
-        scoring_type = 3
+        scoring_type = 4
     metafeatures.append(scoring_type)
 
     # IMPUTE
diff --git a/alpha_automl/resource/primitives_hierarchy.json b/alpha_automl/resource/primitives_hierarchy.json
index 58a29ff8..92121a4c 100644
--- a/alpha_automl/resource/primitives_hierarchy.json
+++ b/alpha_automl/resource/primitives_hierarchy.json
@@ -19,7 +19,8 @@
         "sklearn.svm.SVC",
         "sklearn.tree.DecisionTreeClassifier",
         "xgboost.XGBClassifier",
-        "lightgbm.LGBMClassifier"
+        "lightgbm.LGBMClassifier",
+        "catboost.CatBoostClassifier"
     ],
     "CLUSTERER": [
         "sklearn.cluster.KMeans",
@@ -32,10 +33,14 @@
         "alpha_automl.builtin_primitives.datetime_encoder.DummyEncoder"
     ],
     "FEATURE_SCALER": [
-        "sklearn.preprocessing.RobustScaler"
+        "sklearn.preprocessing.MaxAbsScaler",
+        "sklearn.preprocessing.RobustScaler",
+        "sklearn.preprocessing.StandardScaler"
     ],
     "FEATURE_SELECTOR": [
-        "sklearn.feature_selection.SelectPercentile"
+        "sklearn.feature_selection.GenericUnivariateSelect",
+        "sklearn.feature_selection.SelectPercentile",
+        "sklearn.feature_selection.SelectKBest"
     ],
     "IMPUTER": [
         "sklearn.impute.SimpleImputer"