fix: Fixed bug where group folds and sample weights couldn't be used in the same automl instance (#1405)
microsoft · Feb 15, 2025 · d0a1195 · d0a1195
1 parent 0ef9b00
commit d0a1195
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 5 deletions.
diff --git a/flaml/automl/task/generic_task.py b/flaml/automl/task/generic_task.py
@@ -769,10 +769,10 @@ def evaluate_model_CV(
             if not is_spark_dataframe:
                 y_train, y_val = y_train_split[train_index], y_train_split[val_index]
                 if weight is not None:
-                    fit_kwargs["sample_weight"], weight_val = (
-                        weight[train_index],
-                        weight[val_index],
+                    fit_kwargs["sample_weight"] = (
+                        weight[train_index] if isinstance(weight, np.ndarray) else weight.iloc[train_index]
                     )
+                    weight_val = weight[val_index] if isinstance(weight, np.ndarray) else weight.iloc[val_index]
                 if groups is not None:
                     fit_kwargs["groups"] = (
                         groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]

diff --git a/test/automl/test_split.py b/test/automl/test_split.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 from sklearn.datasets import fetch_openml, load_iris
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import GroupKFold, KFold, train_test_split
@@ -59,8 +60,6 @@ def test_groups_for_classification_task():
 
         X, y = load_wine(return_X_y=True)
 
-    import numpy as np
-
     automl = AutoML()
     automl_settings = {
         "time_budget": 2,
@@ -118,6 +117,43 @@ def test_groups_for_regression_task():
     automl.fit(X_train, y_train, **automl_settings)
 
 
+def test_groups_with_sample_weights():
+    """Verify that sample weights can be combined with group splits, i.e. that https://github.com/microsoft/FLAML/issues/1396 remains fixed."""
+    iris_dict_data = load_iris(as_frame=True)  # as_frame=True: pandas-backed bunch, not numpy arrays
+    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
+    iris_data["cluster"] = np.random.randint(0, 5, iris_data.shape[0])  # unseeded random group id in [0, 5) per row
+    automl = AutoML()
+
+    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()  # features as a numpy array
+    y = iris_data["petal width (cm)"]  # regression target, kept as a pandas Series
+    sample_weight = pd.Series(np.random.rand(X.shape[0]))  # weights as a pandas Series rather than a numpy array
+    (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        groups_train,
+        groups_test,
+        sample_weight_train,
+        sample_weight_test,
+    ) = train_test_split(X, y, iris_data["cluster"], sample_weight, random_state=42)  # one consistent row split for all four inputs
+    automl_settings = {
+        "max_iter": 5,
+        "time_budget": -1,
+        "metric": "r2",
+        "task": "regression",
+        "log_file_name": "error.log",
+        "log_type": "all",
+        "estimator_list": ["lgbm"],
+        "eval_method": "cv",
+        "split_type": "group",
+        "groups": groups_train,  # pandas Series of group ids for the training rows
+        "sample_weight": sample_weight_train,  # pandas Series of weights for the training rows
+    }
+    automl.fit(X_train, y_train, **automl_settings)
+    assert automl.model is not None  # only checks that fitting produced a model; scores are not asserted
+
+
 def test_stratified_groupkfold():
     from minio.error import ServerError
     from sklearn.model_selection import StratifiedGroupKFold