Skip to content

Commit

Permalink
fix: Fixed bug where group folds and sample weights couldn't be used …
Browse files Browse the repository at this point in the history
…in the same automl instance (#1405)
  • Loading branch information
dannycg1996 authored Feb 15, 2025
1 parent 0ef9b00 commit d0a1195
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 5 deletions.
6 changes: 3 additions & 3 deletions flaml/automl/task/generic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,10 +769,10 @@ def evaluate_model_CV(
if not is_spark_dataframe:
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
if weight is not None:
fit_kwargs["sample_weight"], weight_val = (
weight[train_index],
weight[val_index],
fit_kwargs["sample_weight"] = (
weight[train_index] if isinstance(weight, np.ndarray) else weight.iloc[train_index]
)
weight_val = weight[val_index] if isinstance(weight, np.ndarray) else weight.iloc[val_index]
if groups is not None:
fit_kwargs["groups"] = (
groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
Expand Down
40 changes: 38 additions & 2 deletions test/automl/test_split.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split
Expand Down Expand Up @@ -59,8 +60,6 @@ def test_groups_for_classification_task():

X, y = load_wine(return_X_y=True)

import numpy as np

automl = AutoML()
automl_settings = {
"time_budget": 2,
Expand Down Expand Up @@ -118,6 +117,43 @@ def test_groups_for_regression_task():
automl.fit(X_train, y_train, **automl_settings)


def test_groups_with_sample_weights():
    """Verify that sample weights can be combined with group-based CV splits.

    Regression test for https://github.com/microsoft/FLAML/issues/1396.
    The labels, groups and sample weights are deliberately passed as pandas
    Series: after ``train_test_split`` their index is shuffled and
    non-contiguous, so the fold indices produced during cross-validation must
    be applied to them positionally.
    """
    # Seeded generator so the random groups and weights are identical on every
    # run; an unseeded global RNG would make failures hard to reproduce.
    rng = np.random.default_rng(42)

    iris_dict_data = load_iris(as_frame=True)  # Bunch holding pandas objects
    iris_data = iris_dict_data["frame"]  # pandas dataframe: data + target
    n_samples = iris_data.shape[0]
    iris_data["cluster"] = rng.integers(0, 5, n_samples)  # group ids in [0, 5)
    automl = AutoML()

    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    sample_weight = pd.Series(rng.random(n_samples))

    # Only the training halves are needed; the held-out halves are discarded.
    (
        X_train,
        _,
        y_train,
        _,
        groups_train,
        _,
        sample_weight_train,
        _,
    ) = train_test_split(X, y, iris_data["cluster"], sample_weight, random_state=42)

    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "log_file_name": "error.log",
        "log_type": "all",
        "estimator_list": ["lgbm"],
        "eval_method": "cv",
        "split_type": "group",
        "groups": groups_train,
        "sample_weight": sample_weight_train,
    }
    automl.fit(X_train, y_train, **automl_settings)
    assert automl.model is not None


def test_stratified_groupkfold():
from minio.error import ServerError
from sklearn.model_selection import StratifiedGroupKFold
Expand Down

0 comments on commit d0a1195

Please sign in to comment.