Commit

added stats output for features

ahmed-khalil-hafsi committed May 21, 2023
1 parent 0c546b2 commit da9f0dd

Showing 6 changed files with 67 additions and 13 deletions.
44 changes: 42 additions & 2 deletions modules/hyper_param_opt.py
@@ -1,3 +1,43 @@
 # TODO
-def hyper_parameter_tuning(model):
-    return None
+from sklearn import preprocessing
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from xgboost import XGBRegressor
+
+
+def hyper_parameter_tuning(my_pipeline,models,X_train,y_train):
+    grid_param = [
+        [{"model": [Ridge()],
+          "model__alpha": [0.5, 1, 2],
+          # Add other parameters here
+          }],
+        [{"model": [XGBRegressor(objective ='reg:squarederror')],
+          "model__n_estimators": [100, 200],
+          # Add other parameters here
+          }],
+        [{"model": [RandomForestRegressor(random_state=0)],
+          "model__n_estimators": [100, 200],
+          # Add other parameters here
+          }]
+    ]
+
+    for i in range(len(models)):
+        # Bundle preprocessing and modeling code in a pipeline
+        my_pipeline = Pipeline(steps=[('preprocessor', preprocessing),
+                                      ('model', models[i][1])
+                                      ])
+
+        gd_sr = GridSearchCV(estimator=my_pipeline,
+                             param_grid=grid_param[i],
+                             scoring='accuracy',
+                             cv=5,
+                             n_jobs=-1)
+
+        gd_sr.fit(X_train, y_train)
+
+        best_parameters = gd_sr.best_params_
+        best_result = gd_sr.best_score_
+
+        #console.print(f"Model: {models[i][0]}, Best Parameters: {best_parameters}, Best Score: {best_result}")
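Note: as committed, hyper_parameter_tuning has a few bugs. The my_pipeline argument is ignored and rebuilt inside the loop around the sklearn preprocessing module, which is not a transformer and will fail Pipeline validation; scoring='accuracy' is a classification metric that GridSearchCV rejects for regression targets; and the best parameters and scores are computed but never returned or printed (the print is commented out). A possible correction, sketched under the assumption that the caller passes a ColumnTransformer-style preprocessor and a list of (name, estimator) pairs:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


def hyper_parameter_tuning(preprocessor, models, X_train, y_train):
    # One grid per entry in `models`; the order must match.
    grid_param = [
        {"model__alpha": [0.5, 1, 2]},        # Ridge
        {"model__n_estimators": [100, 200]},  # XGBRegressor
        {"model__n_estimators": [100, 200]},  # RandomForestRegressor
    ]

    results = []
    for i, (name, estimator) in enumerate(models):
        # Bundle the caller's preprocessor and the candidate model
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', estimator)])

        # 'accuracy' only works for classifiers; a regression score such as
        # neg_mean_absolute_error matches the MAE reporting in p2predict_train.py
        gd_sr = GridSearchCV(estimator=pipeline,
                             param_grid=grid_param[i],
                             scoring='neg_mean_absolute_error',
                             cv=5,
                             n_jobs=-1)
        gd_sr.fit(X_train, y_train)
        results.append((name, gd_sr.best_params_, gd_sr.best_score_))

    return results


# Example call (names hypothetical, not from the repository):
# models = [('ridge', Ridge()),
#           ('xgboost', XGBRegressor(objective='reg:squarederror')),
#           ('random_forest', RandomForestRegressor(random_state=0))]
# best = hyper_parameter_tuning(preprocessor, models, X_train, y_train)
```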
1 change: 1 addition & 0 deletions modules/input_checks.py
@@ -0,0 +1 @@
+#TODO
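modules/input_checks.py arrives as a bare stub. Purely as an illustration of where this TODO might go (every name and message below is invented, not part of the commit), a first pass at input sanity checking could look like:

```python
import os

import pandas as pd


def check_input_file(path, target):
    # Fail fast on a missing or empty file instead of deep inside training
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Input file not found: {path}")
    data = pd.read_csv(path)
    if data.empty:
        raise ValueError(f"Input file has no rows: {path}")
    if target not in data.columns:
        raise KeyError(f"Target column '{target}' not found in {list(data.columns)}")
    return data
```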
Empty file added modules/outliers.py
2 changes: 1 addition & 1 deletion modules/p2predict_feature_selection.py
@@ -78,7 +78,7 @@ def get_most_predictable_features(data, target_column):
             ('num', numerical_transformer, numerical_cols),
             ('cat', categorical_transformer, categorical_cols)])
 
-    # Define the model
+    # Use a shallow random forest to estimate feature importance
     model = RandomForestRegressor(random_state=0)
 
     # Bundle preprocessing and modeling code in a pipeline
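Only the comment changes in this hunk, but it pins down the intent: the feature ranking in get_most_predictable_features comes from a RandomForestRegressor fitted behind the ColumnTransformer shown above. A minimal sketch of that pattern (column names hypothetical, not from the repository):

```python
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Hypothetical columns; the real ones are derived from the input DataFrame
numerical_cols = ['quantity', 'weight']
categorical_cols = ['region']

preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=0))])

# After pipeline.fit(X, y), impurity-based importances are available via
# pipeline.named_steps['model'].feature_importances_
```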
33 changes: 23 additions & 10 deletions p2predict_train.py
@@ -1,6 +1,6 @@


-# Machine learning libs
+# Math, Machine learning libs
 import random
 import pandas as pd
 from sklearn.model_selection import train_test_split
@@ -15,10 +15,11 @@
 from sklearn.inspection import permutation_importance
 from sklearn.metrics import mean_absolute_error
 from sklearn.metrics import r2_score
 
+# P2Predict
 from modules.p2predict_feature_selection import get_most_predictable_features
 from modules.hyper_param_opt import hyper_parameter_tuning
 
 
 # Plotting Module
 from modules import plotting
 import webbrowser
@@ -77,7 +78,6 @@ def train_model(X_train,y_train,numerical_cols, categorical_cols, algorithm):
         ('onehot', OneHotEncoder(handle_unknown='ignore'))
     ])
 
-
     # Bundle preprocessing for numerical and categorical data
     preprocessor = ColumnTransformer(
         transformers=[
@@ -103,7 +103,7 @@ def train_model(X_train,y_train,numerical_cols, categorical_cols, algorithm):
     # Preprocessing of training data, fit model
     my_pipeline.fit(X_train, y_train)
 
-    # Define the model
+    # Get model weights
     if algorithm == 'ridge':
         importance = model.coef_
     elif algorithm == 'xgboost':
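The hunk is cut off here, but the branch structure suggests per-algorithm weight extraction. A hedged sketch of how such a helper might read weights back out of a fitted pipeline (helper name hypothetical; the branches follow the ridge/xgboost pattern shown above):

```python
def get_model_weights(my_pipeline, algorithm):
    # Pull the fitted estimator out of the pipeline's 'model' step
    model = my_pipeline.named_steps['model']
    if algorithm == 'ridge':
        return model.coef_                 # linear coefficients
    elif algorithm in ('xgboost', 'random_forest'):
        return model.feature_importances_  # impurity-based importances
    raise ValueError(f"Unsupported algorithm: {algorithm}")
```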
@@ -125,7 +125,7 @@ def train_model(X_train,y_train,numerical_cols, categorical_cols, algorithm):



-def compute_feature_importance(X,y,model):
+def calculate_feature_importance(X,y,model):
     result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
     importance_normalized = result.importances_mean / sum(result.importances_mean)
     return importance_normalized
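One caveat on the renamed function: result.importances_mean can contain negative values (features whose shuffling improves the score by chance), so dividing by the raw sum can flip signs or divide by zero. A guarded variant, assuming the same permutation_importance call:

```python
import numpy as np
from sklearn.inspection import permutation_importance


def calculate_feature_importance(X, y, model):
    result = permutation_importance(model, X, y, n_repeats=10, random_state=0)
    # Clip chance-level negative importances to zero, then normalize to
    # proportions; fall back to the raw zeros if nothing is informative.
    means = np.clip(result.importances_mean, 0, None)
    total = means.sum()
    return means / total if total > 0 else means
```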
@@ -208,6 +208,8 @@ def main(input, target, algorithm, silent,training_features):
     file = input
     data = load_data(file)
 
+    # TODO add input sanity checks
+
     if not target:
         target = questionary.select('Enter target column',choices=data.columns.tolist()).ask()
 
@@ -234,11 +236,16 @@ def main(input, target, algorithm, silent,training_features):

     target_column = target
 
 
 
     # Prepare data for training. Split X and Y variables into a set for training and a set for testing.
     X_train, X_test, y_train, y_test, numerical_cols, categorical_cols = prepare_data(data,selected_columns,target_column)
 
+    console.print("Feature characterization... ")
+    print_feature_stats(data[numerical_cols])
+
+    # Start model training
-    console.print("Training the model, this may take a few minutes...", style="blue")
+    console.print("Training the model, this may take a few minutes...", style='bold blue')
     model = train_model(X_train,y_train,numerical_cols,categorical_cols,algorithm)
 
     # Calculate model accuracy
@@ -270,22 +277,28 @@ def main(input, target, algorithm, silent,training_features):



-def check_normalization(data):
+def print_feature_stats(data):
     console = Console()
     table = Table(show_header=True, header_style="bold magenta")
     table.add_column("Feature")
     table.add_column("Min")
     table.add_column("Max")
     table.add_column("Mean")
+    table.add_column("Median")
     table.add_column("Standard Deviation")
+    table.add_column("Skewness")
+    table.add_column("Kurtosis")
 
     for col in data.columns:
         min_val = data[col].min()
         max_val = data[col].max()
-        mean_val = data[col].mean()
-        std_val = data[col].std()
+        mean_val = round(data[col].mean(),ndigits=4)
+        median_val = round(data[col].median(),ndigits=4)
+        std_val = round(data[col].std(),ndigits=4)
+        skewness = round(data[col].skew(),ndigits=4)
+        curt = round(data[col].kurt(),ndigits=4)
 
-        table.add_row(col, str(min_val), str(max_val), str(mean_val), str(std_val))
+        table.add_row(col, str(min_val), str(max_val), str(mean_val), str(median_val), str(std_val), str(skewness), str(curt))
 
     console.print(table)
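The statistics behind the new print_feature_stats come straight from pandas Series methods; note that .kurt() reports excess kurtosis (about 0 for a normal distribution) and .skew() is 0 for symmetric data. A standalone sketch of the same rich table with made-up numbers:

```python
import pandas as pd
from rich.console import Console
from rich.table import Table

# Made-up numeric data, for illustration only
data = pd.DataFrame({'price': [9.5, 10.1, 10.4, 11.0, 35.0],
                     'weight': [1.0, 1.2, 1.1, 1.3, 1.2]})

table = Table(show_header=True, header_style="bold magenta")
for name in ("Feature", "Min", "Max", "Mean", "Median",
             "Standard Deviation", "Skewness", "Kurtosis"):
    table.add_column(name)

for col in data.columns:
    s = data[col]
    table.add_row(col, str(s.min()), str(s.max()),
                  str(round(s.mean(), 4)), str(round(s.median(), 4)),
                  str(round(s.std(), 4)), str(round(s.skew(), 4)),
                  str(round(s.kurt(), 4)))

Console().print(table)
```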
Binary file modified reports/example.pdf
