WIP: Add SMAC optimizer #44

Closed · wants to merge 8 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -57,3 +57,5 @@ desktop.ini
tmp
lightning_logs
**/lightning_logs
+smac3_output
+**/smac3_output
46 changes: 35 additions & 11 deletions alpha_automl/automl_api.py
@@ -6,9 +6,10 @@
from multiprocessing import set_start_method
from sklearn.preprocessing import LabelEncoder
from alpha_automl.automl_manager import AutoMLManager
-from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting
+from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting, score_pipeline
from alpha_automl.utils import make_d3m_pipelines, hide_logs, get_start_method, check_input_for_multiprocessing
from alpha_automl.visualization import plot_comparison_pipelines
+from alpha_automl.hyperparameter_tuning.smac import SmacOptimizer


logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
@@ -22,7 +23,7 @@ class BaseAutoML():

    def __init__(self, output_folder, time_bound=15, metric=None, split_strategy='holdout', time_bound_run=5, task=None,
                 score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, start_mode='auto',
-                 verbose=False):
+                 verbose=False, optimizing=False, optimizing_number=10):
        """
        Create/instantiate a BaseAutoML object.

@@ -39,6 +40,8 @@
        :param split_strategy_kwargs: Additional arguments for splitting_strategy.
        :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
        :param verbose: Whether or not to show additional logs
+        :param optimizing: Whether or not to tune the top pipelines with the SMAC3 optimizer
+        :param optimizing_number: The number of top pipelines to optimize
        """

        self.output_folder = output_folder
@@ -56,7 +59,7 @@
        self.X = None
        self.y = None
        self.leaderboard = None
-        self.automl_manager = AutoMLManager(output_folder, time_bound, time_bound_run, task, verbose)
+        self.automl_manager = AutoMLManager(output_folder, time_bound*0.8, time_bound_run, task, verbose)  # 80% of time_bound for the search

        if not verbose:
            hide_logs()
@@ -67,6 +70,10 @@
        check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric')
        check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy')

+        self.optimizing = optimizing
+        self.optimizing_number = optimizing_number
+        self.optimizing_timelimit = time_bound*0.2  # remaining 20% of time_bound for SMAC tuning
+
    def fit(self, X, y):
        """
        Search for pipelines and fit the best pipeline.
@@ -99,12 +106,28 @@ def fit(self, X, y):
        logger.info(f'Found {len(pipelines)} pipelines')
        sign = get_sign_sorting(self.scorer._score_func, self.score_sorting)
        sorted_pipelines = sorted(pipelines, key=lambda x: x.get_score() * sign, reverse=True)

        leaderboard_data = []

+        # [SMAC] Tune the top pipelines with SMAC before building the leaderboard
+        if self.optimizing:
+            optimizer = SmacOptimizer(X=X, y=y, splitter=self.splitter, scorer=self.scorer,
+                                      n_trials=200, time_limit=self.optimizing_timelimit)
+            for index, pipeline in enumerate(sorted_pipelines, start=1):
+                pipeline_id = PIPELINE_PREFIX + str(index)
+                if index <= self.optimizing_number:
+                    opt_pipeline = optimizer.optimize_pipeline(pipeline.get_pipeline())
+                    opt_score, _, _ = score_pipeline(opt_pipeline, X, y, self.scorer, self.splitter)
+                    if opt_score * sign >= pipeline.get_score() * sign:
+                        logger.info(f'[SMAC] {pipeline_id} successfully optimized: {pipeline.get_score()} => {opt_score}')
+                        pipeline.set_pipeline(opt_pipeline)
+                        pipeline.set_score(opt_score)
+                else:
+                    sorted_pipelines = sorted(pipelines, key=lambda x: x.get_score() * sign, reverse=True)  # re-rank with tuned scores
+                    break
+
        for index, pipeline in enumerate(sorted_pipelines, start=1):
            pipeline_id = PIPELINE_PREFIX + str(index)
            self.pipelines[pipeline_id] = pipeline

            leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])

        self.leaderboard = pd.DataFrame(leaderboard_data, columns=['ranking', 'pipeline', self.metric])
@@ -278,7 +301,7 @@ class AutoMLClassifier(BaseAutoML):

    def __init__(self, output_folder, time_bound=15, metric='accuracy_score', split_strategy='holdout',
                 time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None,
-                 start_mode='auto', verbose=False):
+                 start_mode='auto', verbose=False, optimizing=False, optimizing_number=10):
        """
        Create/instantiate an AutoMLClassifier object.

@@ -299,7 +322,7 @@
        self.label_enconder = LabelEncoder()
        task = 'CLASSIFICATION'
        super().__init__(output_folder, time_bound, metric, split_strategy, time_bound_run, task, score_sorting,
-                         metric_kwargs, split_strategy_kwargs, start_mode, verbose)
+                         metric_kwargs, split_strategy_kwargs, start_mode, verbose, optimizing, optimizing_number)

    def fit(self, X, y):
        y = self.label_enconder.fit_transform(y)
@@ -331,9 +354,9 @@ def score_pipeline(self, X, y, pipeline_id):

class AutoMLRegressor(BaseAutoML):

-    def __init__(self, output_folder, time_bound=15, metric='mean_absolute_error', split_strategy='holdout',
+    def __init__(self, output_folder, time_bound=15, metric='mean_squared_error', split_strategy='holdout',
                 time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None,
-                 start_mode='auto', verbose=False):
+                 start_mode='auto', verbose=False, optimizing=False, optimizing_number=10):
        """
        Create/instantiate an AutoMLRegressor object.

@@ -353,12 +376,13 @@

        task = 'REGRESSION'
        super().__init__(output_folder, time_bound, metric, split_strategy, time_bound_run, task, score_sorting,
-                         metric_kwargs, split_strategy_kwargs, start_mode, verbose)
+                         metric_kwargs, split_strategy_kwargs, start_mode, verbose, optimizing, optimizing_number)


class AutoMLTimeSeries(BaseAutoML):
    def __init__(self, output_folder, time_bound=15, metric='mean_squared_error', split_strategy='timeseries',
-                 time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, verbose=False, date_column=None, target_column=None):
+                 time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None,
+                 verbose=False, date_column=None, target_column=None):
        """
        Create/instantiate an AutoMLTimeSeries object.

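For reviewers who want to exercise the new flags end to end, here is a minimal sketch. It assumes the top-level alpha_automl package exposes AutoMLClassifier, and X_train/y_train are placeholder data:

from alpha_automl import AutoMLClassifier  # import path assumed from the package layout

# Enable SMAC tuning of the 5 best pipelines; per the changes above, roughly 80% of
# time_bound goes to the pipeline search and the remaining 20% to SMAC tuning
automl = AutoMLClassifier('tmp/', time_bound=15, optimizing=True, optimizing_number=5)
automl.fit(X_train, y_train)
print(automl.leaderboard)  # leaderboard reflects the re-ranked, tuned pipelines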
210 changes: 210 additions & 0 deletions alpha_automl/hyperparameter_tuning/smac.py
@@ -0,0 +1,210 @@
import json
import logging
from os.path import dirname, join

import numpy as np
from ConfigSpace import (
    Categorical,
    Configuration,
    ConfigurationSpace,
    Constant,
    Float,
    Integer,
)
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from smac import HyperparameterOptimizationFacade, Scenario

from alpha_automl.primitive_loader import PRIMITIVE_TYPES
from alpha_automl.scorer import make_scorer, make_splitter
from alpha_automl.utils import create_object

logger = logging.getLogger(__name__)
SMAC_PARAMETERS_PATH = join(dirname(__file__), "smac_parameters.json")


def load_smac_parameters():
    with open(SMAC_PARAMETERS_PATH) as fin:
        primitives = json.load(fin)
    logger.info("[SMAC] smac_parameters loaded")

    return primitives


SMAC_DICT = load_smac_parameters()
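# SMAC_DICT maps each primitive name to its tunable hyperparameters; every entry
# carries "type", "value" and "default" keys, consumed by cast_hyperparameter below.
# Hypothetical sketch of one smac_parameters.json entry (the primitive name and
# ranges are illustrative, not taken from the actual file):
#
#   "sklearn.ensemble.RandomForestClassifier": {
#       "n_estimators": {"type": "Integer", "value": [10, 500], "default": 100},
#       "criterion": {"type": "Categorical", "value": ["gini", "entropy"], "default": "gini"}
#   }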


def gen_pipeline(config, pipeline):
    # Rebuild the pipeline step by step, instantiating each primitive with the
    # hyperparameter values sampled by SMAC in `config`
    new_pipeline = make_pipeline()
    for step_name, step_obj in pipeline.steps:
        step_type = PRIMITIVE_TYPES[step_name]

        if step_type == "COLUMN_TRANSFORMER":
            transformers = []
            for trans_name, _, trans_index in step_obj.__dict__["transformers"]:
                trans_prim_name = trans_name.split("-")[0]
                trans_obj = create_object(
                    trans_prim_name, get_primitive_params(config, trans_prim_name)
                )
                transformers.append((trans_name, trans_obj, trans_index))
            transformer_obj = ColumnTransformer(transformers, remainder='passthrough')
            new_pipeline.steps.append([step_name, transformer_obj])
        elif step_type == "SEMISUPERVISED_CLASSIFIER":
            classifier_name = find_classifier_prim_name(step_obj.__dict__["base_estimator"])
            classifier_obj = create_object(
                classifier_name, get_primitive_params(config, classifier_name)
            )
            step_obj.base_estimator = classifier_obj
            new_pipeline.steps.append([step_name, step_obj])
        else:
            new_pipeline.steps.append(
                [step_name, create_object(step_name, get_primitive_params(config, step_name))]
            )

    return new_pipeline


def get_primitive_params(config, step_name):
    params = list(SMAC_DICT[step_name].keys())
    class_params = {}
    for param in params:
        class_params[param] = config[param]
    logger.debug(f"[SMAC] {step_name}: {class_params}")
    return class_params


def gen_configspace(pipeline):
    # Build the configuration space that defines all hyperparameters and their ranges
    configspace = ConfigurationSpace(seed=0)
    for primitive, prim_obj in pipeline.steps:
        step_type = PRIMITIVE_TYPES[primitive]
        try:
            params = SMAC_DICT[primitive]
            configspace.add_hyperparameters(cast_primitive(params))
            if step_type == "COLUMN_TRANSFORMER":
                for trans_name, _, _ in prim_obj.__dict__["transformers"]:
                    trans_prim_name = trans_name.split("-")[0]
                    params = SMAC_DICT[trans_prim_name]
                    configspace.add_hyperparameters(cast_primitive(params))
            elif step_type == "SEMISUPERVISED_CLASSIFIER":
                logger.debug(prim_obj.__dict__)
                classifier_name = find_classifier_prim_name(prim_obj.__dict__["base_estimator"])
                params = SMAC_DICT[classifier_name]
                configspace.add_hyperparameters(cast_primitive(params))
        except Exception as e:
            logger.critical(f"[SMAC] {str(e)}")
    return configspace


def find_classifier_prim_name(classifier_obj):
    # Map a classifier instance back to the matching primitive name in SMAC_DICT
    classifier_name = classifier_obj.__class__.__name__
    for prim_name in SMAC_DICT.keys():
        if classifier_name in prim_name:
            classifier_name = prim_name
    return classifier_name


def cast_primitive(params):
    new_hyperparameters = []
    for name, conf in params.items():
        config_space = cast_hyperparameter(name, conf)
        if config_space is not None:
            new_hyperparameters.append(config_space)

    return new_hyperparameters


def cast_hyperparameter(param_name, param_conf):
    config_space = None
    try:
        param_type = param_conf["type"]
        param_value = param_conf["value"]
        param_default = param_conf["default"]
    except Exception as e:
        logger.critical(f"[SMAC] {str(e)}")
        return
    if param_type == "Categorical":
        config_space = Categorical(param_name, param_value, default=param_default)
    elif param_type == "Integer":
        min_value = int(param_value[0])
        max_value = int(param_value[1])
        config_space = Integer(param_name, (min_value, max_value), default=param_default)
    elif param_type == "Float":
        min_value = float(param_value[0])
        max_value = float(param_value[1])
        config_space = Float(param_name, (min_value, max_value), default=param_default)
    elif param_type == "Constant":
        config_space = Constant(param_name, param_value)
    else:
        logger.error(f"Unknown param_type {param_type}")

    return config_space
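# Worked example (hypothetical spec): {"type": "Float", "value": [0.01, 1.0], "default": 0.1}
# for a parameter named "C" becomes Float("C", (0.01, 1.0), default=0.1). Specs with a
# missing key or an unknown "type" yield None, which cast_primitive filters out.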


class SmacOptimizer:
    def __init__(
        self,
        X=None,
        y=None,
        n_trials=50,
        splitter=make_splitter("holdout"),
        scorer=make_scorer("accuracy_score"),
        time_limit=300,
    ):
        self.pipeline = None
        self.X = X
        self.y = y
        self.n_trials = n_trials
        self.splitter = splitter
        self.scorer = scorer
        self.time_limit = time_limit

    def train(self, config: Configuration, seed: int = 0) -> float:
        pipeline = gen_pipeline(config, self.pipeline)
        scores = cross_val_score(
            pipeline,
            self.X,
            self.y,
            cv=self.splitter,
            scoring=self.scorer,
            error_score="raise",
        )
        # SMAC minimizes the returned value, so report cost = 1 - mean CV score
        return 1 - np.mean(scores)

    def optimize_pipeline(self, pipeline):
        self.pipeline = pipeline
        if self.pipeline is None:
            logger.critical("[SMAC] optimize_pipeline received a None pipeline!")
            return
        optimized_conf = self._optimize_pipeline(self.pipeline)
        optimized_pipeline = gen_pipeline(optimized_conf, self.pipeline)
        logger.debug(f"[SMAC] {pipeline} successfully optimized!")
        return optimized_pipeline

    def _optimize_pipeline(self, pipeline):
        scenario = Scenario(
            gen_configspace(pipeline),
            deterministic=True,
            n_trials=self.n_trials,
            walltime_limit=self.time_limit,
        )

        smac = HyperparameterOptimizationFacade(scenario, self.train)
        return smac.optimize()
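The optimizer can also be exercised standalone, outside of fit(). A minimal sketch, assuming a sklearn pipeline whose step names appear in both PRIMITIVE_TYPES and smac_parameters.json (X_train, y_train and pipeline are placeholders):

from alpha_automl.hyperparameter_tuning.smac import SmacOptimizer
from alpha_automl.scorer import make_scorer, make_splitter

# n_trials and time_limit bound the SMAC run (time_limit feeds the Scenario's walltime_limit)
optimizer = SmacOptimizer(X=X_train, y=y_train,
                          splitter=make_splitter('holdout'),
                          scorer=make_scorer('accuracy_score'),
                          n_trials=50, time_limit=180)
tuned_pipeline = optimizer.optimize_pipeline(pipeline)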