Commit

added stats output for features

ahmed-khalil-hafsi committed May 21, 2023
1 parent 0c546b2 commit da9f0dd

Showing 6 changed files with 67 additions and 13 deletions.
44 changes: 42 additions & 2 deletions modules/hyper_param_opt.py
@@ -1,3 +1,43 @@
 # TODO
-def hyper_parameter_tuning(model):
-    return None
+from sklearn import preprocessing
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from xgboost import XGBRegressor
+
+
+def hyper_parameter_tuning(my_pipeline,models,X_train,y_train):
+    grid_param = [
+        [{"model": [Ridge()],
+          "model__alpha": [0.5, 1, 2],
+          # Add other parameters here
+          }],
+        [{"model": [XGBRegressor(objective ='reg:squarederror')],
+          "model__n_estimators": [100, 200],
+          # Add other parameters here
+          }],
+        [{"model": [RandomForestRegressor(random_state=0)],
+          "model__n_estimators": [100, 200],
+          # Add other parameters here
+          }]
+    ]
+
+    for i in range(len(models)):
+        # Bundle preprocessing and modeling code in a pipeline
+        my_pipeline = Pipeline(steps=[('preprocessor', preprocessing),
+                                      ('model', models[i][1])
+                                      ])
+
+        gd_sr = GridSearchCV(estimator=my_pipeline,
+                             param_grid=grid_param[i],
+                             scoring='accuracy',
+                             cv=5,
+                             n_jobs=-1)
+
+        gd_sr.fit(X_train, y_train)
+
+        best_parameters = gd_sr.best_params_
+        best_result = gd_sr.best_score_
+
+        #console.print(f"Model: {models[i][0]}, Best Parameters: {best_parameters}, Best Score: {best_result}")
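Note: as committed, hyper_parameter_tuning has a few bugs. The my_pipeline argument is ignored and rebuilt inside the loop around the sklearn preprocessing module, which is not a transformer and will fail Pipeline validation; scoring='accuracy' is a classification metric that GridSearchCV rejects for regression targets; and the best parameters and scores are computed but never returned or printed (the print is commented out). A possible correction, sketched under the assumption that the caller passes a ColumnTransformer-style preprocessor and a list of (name, estimator) pairs:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


def hyper_parameter_tuning(preprocessor, models, X_train, y_train):
    # One grid per entry in `models`; the order must match.
    grid_param = [
        {"model__alpha": [0.5, 1, 2]},        # Ridge
        {"model__n_estimators": [100, 200]},  # XGBRegressor
        {"model__n_estimators": [100, 200]},  # RandomForestRegressor
    ]

    results = []
    for i, (name, estimator) in enumerate(models):
        # Bundle the caller's preprocessor and the candidate model
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', estimator)])

        # 'accuracy' only works for classifiers; a regression score such as
        # neg_mean_absolute_error matches the MAE reporting in p2predict_train.py
        gd_sr = GridSearchCV(estimator=pipeline,
                             param_grid=grid_param[i],
                             scoring='neg_mean_absolute_error',
                             cv=5,
                             n_jobs=-1)
        gd_sr.fit(X_train, y_train)
        results.append((name, gd_sr.best_params_, gd_sr.best_score_))

    return results


# Example call (names hypothetical, not from the repository):
# models = [('ridge', Ridge()),
#           ('xgboost', XGBRegressor(objective='reg:squarederror')),
#           ('random_forest', RandomForestRegressor(random_state=0))]
# best = hyper_parameter_tuning(preprocessor, models, X_train, y_train)
```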
1 change: 1 addition & 0 deletions modules/input_checks.py
@@ -0,0 +1 @@
+#TODO
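modules/input_checks.py arrives as a bare stub. Purely as an illustration of where this TODO might go (every name and message below is invented, not part of the commit), a first pass at input sanity checking could look like:

```python
import os

import pandas as pd


def check_input_file(path, target):
    # Fail fast on a missing or empty file instead of deep inside training
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Input file not found: {path}")
    data = pd.read_csv(path)
    if data.empty:
        raise ValueError(f"Input file has no rows: {path}")
    if target not in data.columns:
        raise KeyError(f"Target column '{target}' not found in {list(data.columns)}")
    return data
```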
Empty file added modules/outliers.py
2 changes: 1 addition & 1 deletion modules/p2predict_feature_selection.py
@@ -78,7 +78,7 @@ def get_most_predictable_features(data, target_column):
             ('num', numerical_transformer, numerical_cols),
             ('cat', categorical_transformer, categorical_cols)])
 
-    # Define the model
+    # Use a shallow random forest to estimate feature importance
     model = RandomForestRegressor(random_state=0)
 
     # Bundle preprocessing and modeling code in a pipeline
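Only the comment changes in this hunk, but it pins down the intent: the feature ranking in get_most_predictable_features comes from a RandomForestRegressor fitted behind the ColumnTransformer shown above. A minimal sketch of that pattern (column names hypothetical, not from the repository):

```python
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Hypothetical columns; the real ones are derived from the input DataFrame
numerical_cols = ['quantity', 'weight']
categorical_cols = ['region']

preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=0))])

# After pipeline.fit(X, y), impurity-based importances are available via
# pipeline.named_steps['model'].feature_importances_
```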
33 changes: 23 additions & 10 deletions p2predict_train.py
@@ -1,6 +1,6 @@


-# Machine learning libs
+# Math, Machine learning libs
 import random
 import pandas as pd
 from sklearn.model_selection import train_test_split
@@ -15,10 +15,11 @@
 from sklearn.inspection import permutation_importance
 from sklearn.metrics import mean_absolute_error
 from sklearn.metrics import r2_score
 
+# P2Predict
 from modules.p2predict_feature_selection import get_most_predictable_features
 from modules.hyper_param_opt import hyper_parameter_tuning
 
 
 # Plotting Module
 from modules import plotting
 import webbrowser
@@ -77,7 +78,6 @@ def train_model(X_train,y_train,numerical_cols, categorical_cols, algorithm):
         ('onehot', OneHotEncoder(handle_unknown='ignore'))
     ])
 
-
     # Bundle preprocessing for numerical and categorical data
     preprocessor = ColumnTransformer(
         transformers=[
@@ -103,7 +103,7 @@ def train_model(X_train,y_train,numerical_cols, categorical_cols, algorithm):
     # Preprocessing of training data, fit model
     my_pipeline.fit(X_train, y_train)
 
-    # Define the model
+    # Get model weights
     if algorithm == 'ridge':
         importance = model.coef_
     elif algorithm == 'xgboost':
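The hunk is cut off here, but the branch structure suggests per-algorithm weight extraction. A hedged sketch of how such a helper might read weights back out of a fitted pipeline (helper name hypothetical; the branches follow the ridge/xgboost pattern shown above):

```python
def get_model_weights(my_pipeline, algorithm):
    # Pull the fitted estimator out of the pipeline's 'model' step
    model = my_pipeline.named_steps['model']
    if algorithm == 'ridge':
        return model.coef_                 # linear coefficients
    elif algorithm in ('xgboost', 'random_forest'):
        return model.feature_importances_  # impurity-based importances
    raise ValueError(f"Unsupported algorithm: {algorithm}")
```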
@@ -125,7 +125,7 @@ def train_model(X_train,y_train,numerical_cols, categorical_cols, algorithm):



-def compute_feature_importance(X,y,model):
+def calculate_feature_importance(X,y,model):
     result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
     importance_normalized = result.importances_mean / sum(result.importances_mean)
     return importance_normalized
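One caveat on the renamed function: result.importances_mean can contain negative values (features whose shuffling improves the score by chance), so dividing by the raw sum can flip signs or divide by zero. A guarded variant, assuming the same permutation_importance call:

```python
import numpy as np
from sklearn.inspection import permutation_importance


def calculate_feature_importance(X, y, model):
    result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
    # Clip chance-level negative importances to zero, then normalize to
    # proportions; fall back to the raw zeros if nothing is informative.
    means = np.clip(result.importances_mean, 0, None)
    total = means.sum()
    return means / total if total > 0 else means
```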
@@ -208,6 +208,8 @@ def main(input, target, algorithm, silent,training_features):
     file = input
     data = load_data(file)
 
+    # TODO add input sanity checks
+
     if not target:
         target = questionary.select('Enter target column',choices=data.columns.tolist()).ask()
 
@@ -234,11 +236,16 @@ def main(input, target, algorithm, silent,training_features):

     target_column = target
 
 
 
     # Prepare data for training. Split X and Y variables into a set for training and a set for testing.
     X_train, X_test, y_train, y_test, numerical_cols, categorical_cols = prepare_data(data,selected_columns,target_column)
 
+    console.print("Feature characterization... ")
+    print_feature_stats(data[numerical_cols])
+
+    # Start model training
-    console.print("Training the model, this may take a few minutes...", style="blue")
+    console.print("Training the model, this may take a few minutes...", style='bold blue')
     model = train_model(X_train,y_train,numerical_cols,categorical_cols,algorithm)
 
     # Calculate model accuracy
@@ -270,22 +277,28 @@ def main(input, target, algorithm, silent,training_features):



-def check_normalization(data):
+def print_feature_stats(data):
     console = Console()
     table = Table(show_header=True, header_style="bold magenta")
     table.add_column("Feature")
     table.add_column("Min")
     table.add_column("Max")
     table.add_column("Mean")
+    table.add_column("Median")
     table.add_column("Standard Deviation")
+    table.add_column("Skewness")
+    table.add_column("Kurtosis")
 
     for col in data.columns:
         min_val = data[col].min()
         max_val = data[col].max()
-        mean_val = data[col].mean()
-        std_val = data[col].std()
+        mean_val = round(data[col].mean(),ndigits=4)
+        median_val = round(data[col].median(),ndigits=4)
+        std_val = round(data[col].std(),ndigits=4)
+        skewness = round(data[col].skew(),ndigits=4)
+        curt = round(data[col].kurt(),ndigits=4)
 
-        table.add_row(col, str(min_val), str(max_val), str(mean_val), str(std_val))
+        table.add_row(col, str(min_val), str(max_val), str(mean_val), str(median_val), str(std_val), str(skewness), str(curt))
 
     console.print(table)
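The statistics behind the new print_feature_stats come straight from pandas Series methods; note that .kurt() reports excess kurtosis (about 0 for a normal distribution) and .skew() is 0 for symmetric data. A standalone sketch of the same rich table with made-up numbers:

```python
import pandas as pd
from rich.console import Console
from rich.table import Table

# Made-up numeric data, for illustration only
data = pd.DataFrame({'price': [9.5, 10.1, 10.4, 11.0, 35.0],
                     'weight': [1.0, 1.2, 1.1, 1.3, 1.2]})

table = Table(show_header=True, header_style="bold magenta")
for name in ("Feature", "Min", "Max", "Mean", "Median",
             "Standard Deviation", "Skewness", "Kurtosis"):
    table.add_column(name)

for col in data.columns:
    s = data[col]
    table.add_row(col, str(s.min()), str(s.max()),
                  str(round(s.mean(), 4)), str(round(s.median(), 4)),
                  str(round(s.std(), 4)), str(round(s.skew(), 4)),
                  str(round(s.kurt(), 4)))

Console().print(table)
```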
Binary file modified reports/example.pdf
