diff --git a/project_1_simple_linear_regression/Salary_Data.csv b/project_1_simple_linear_regression/Salary_Data.csv new file mode 100644 index 0000000..a6863aa --- /dev/null +++ b/project_1_simple_linear_regression/Salary_Data.csv @@ -0,0 +1,31 @@ +YearsExperience,Salary +1.1,39343.00 +1.3,46205.00 +1.5,37731.00 +2.0,43525.00 +2.2,39891.00 +2.9,56642.00 +3.0,60150.00 +3.2,54445.00 +3.2,64445.00 +3.7,57189.00 +3.9,63218.00 +4.0,55794.00 +4.0,56957.00 +4.1,57081.00 +4.5,61111.00 +4.9,67938.00 +5.1,66029.00 +5.3,83088.00 +5.9,81363.00 +6.0,93940.00 +6.8,91738.00 +7.1,98273.00 +7.9,101302.00 +8.2,113812.00 +8.7,109431.00 +9.0,105582.00 +9.5,116969.00 +9.6,112635.00 +10.3,122391.00 +10.5,121872.00 diff --git a/project_1_simple_linear_regression/img_1_dataset copy.png b/project_1_simple_linear_regression/img_1_dataset copy.png new file mode 100644 index 0000000..76d9c4a Binary files /dev/null and b/project_1_simple_linear_regression/img_1_dataset copy.png differ diff --git a/project_1_simple_linear_regression/img_1_dataset.png b/project_1_simple_linear_regression/img_1_dataset.png new file mode 100644 index 0000000..418edc0 Binary files /dev/null and b/project_1_simple_linear_regression/img_1_dataset.png differ diff --git a/project_1_simple_linear_regression/img_2_x_y.png b/project_1_simple_linear_regression/img_2_x_y.png new file mode 100644 index 0000000..071c433 Binary files /dev/null and b/project_1_simple_linear_regression/img_2_x_y.png differ diff --git a/project_1_simple_linear_regression/img_3_train_test_data.png b/project_1_simple_linear_regression/img_3_train_test_data.png new file mode 100644 index 0000000..ddb71cc Binary files /dev/null and b/project_1_simple_linear_regression/img_3_train_test_data.png differ diff --git a/project_1_simple_linear_regression/img_4_compare_results.png b/project_1_simple_linear_regression/img_4_compare_results.png new file mode 100644 index 0000000..0e11a30 Binary files /dev/null and b/project_1_simple_linear_regression/img_4_compare_results.png differ diff --git a/project_1_simple_linear_regression/img_5_plot_test.png b/project_1_simple_linear_regression/img_5_plot_test.png new file mode 100644 index 0000000..4a81d3a Binary files /dev/null and b/project_1_simple_linear_regression/img_5_plot_test.png differ diff --git a/project_1_simple_linear_regression/img_5_plot_training.png b/project_1_simple_linear_regression/img_5_plot_training.png new file mode 100644 index 0000000..8e4c80b Binary files /dev/null and b/project_1_simple_linear_regression/img_5_plot_training.png differ diff --git a/project_1_simple_linear_regression/project_1 b/project_1_simple_linear_regression/project_1 new file mode 100644 index 0000000..8a8ac39 --- /dev/null +++ b/project_1_simple_linear_regression/project_1 @@ -0,0 +1,131 @@ +I'm basically writing this blog for myself because I've been wanting to learn Machine Learning for a while now but have never really got to it. So this blog is more like a journal for me to write about my daily progress - (hopefully I will be making some progress every day). + +#100DaysOfMLCode #100ProjectsInML + +The best approach for me to learn anything is by working on sample projects. No matter how simple the project is, it helps me better understand the concepts. So I will be working through some small mini projects as part of this learning journey. + +There are 100's of excellent resources out there to help you get started. I stumbled upon this A-Z Machine learning course on Udemy and I'll be walking through those examples in the first few weeks. 
+
+
+Today I'll be going through "Simple Linear Regression".
+
+Dataset
+First let's look at the dataset. It is Salary_Data.csv and can be found here.
+It has 2 columns - "Years of Experience" and "Salary" - for 30 employees in a company. So in this example, we will train a Simple Linear Regression model to learn the correlation between the number of years of experience of each employee and their respective salary. Once the model is trained, we will be able to make some sample predictions.
+
+Below is a sample screenshot of the dataset.
+
+
+So let's get started.
+
+Step 1: Load the Dataset
+
+Below is the code snippet for loading the dataset.
+We will be using the pandas dataframe.
+Here X is the independent variable, which is the "Years of Experience",
+and y is the dependent variable, which is the "Salary".
+
+So for X, we specify dataset.iloc[:, :-1].values
+which simply means take all rows and all columns except the last one.
+
+And for y, we specify dataset.iloc[:, 1].values
+which simply means take all rows and only the column with index 1 - in Python indexes begin at 0, so index 1 here is the second column, which is Salary.
+
+# Step 1 Load Data
+import pandas as pd
+dataset = pd.read_csv('Salary_Data.csv')
+X = dataset.iloc[:, :-1].values
+y = dataset.iloc[:,1].values
+
+Below is the sample screenshot of X and y
+
+Step 2: Split dataset into training set and test set
+
+Next we have to split the dataset into training and testing. We will use the training dataset for training the model and then check the performance of the model on the test dataset.
+
+For this we will use the train_test_split method from the model_selection library.
+We are providing a test_size of 1/3, which means the test set will contain 10 observations and the training set will contain 20 observations.
+The random_state=0 is required only if you want to compare your results with mine.
+
+# Step 2: Split data into training and testing
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)
+
+Below is the sample screenshot of X_train, y_train, X_test and y_test
+
+Step 3: Fit Simple Linear Regression model to training set
+
+This is a very simple step. We will be using the LinearRegression class from the library sklearn.linear_model. First we create an object of the LinearRegression class and call the fit method passing X_train and y_train.
+
+# Step 3: Fit Simple Linear Regression to Training Data
+from sklearn.linear_model import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+
+Step 4: Predict the test set
+Using the regressor we trained in the previous step, we will now predict the results of the test set and compare the predicted values with the actual values.
+
+# Step 4: Make Prediction
+y_pred = regressor.predict(X_test)
+
+Now we have y_pred, which holds the predicted values from our model, and y_test, which holds the actual values.
+Let us compare and see how well our model did. As you can see from the screenshot below - our basic model did pretty well.
+
+If we take the first employee - the actual salary is 37731 and our model predicted 40835.1 - which is not too bad. There are some predictions that are off but some are pretty close.
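+A quick way to put the actual and predicted values side by side is a small comparison table - a minimal sketch of my own (the variable name comparison is just for illustration):
+
+# Compare actual vs predicted values side by side
+comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
+print(comparison)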
+Step 5 - Visualizing the training set
+
+Let's visualize the results.
+First we'll plot the actual data points of the training set - X_train and y_train
+plt.scatter(X_train, y_train, color = 'red')
+
+Next we'll plot the regression line - which is the predicted values for X_train
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+
+# Step 5 - Visualize training set results
+import matplotlib.pyplot as plt
+# plot the actual data points of training set
+plt.scatter(X_train, y_train, color = 'red')
+# plot the regression line
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Training set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+
+Step 6 - Visualizing the test set
+
+Let's visualize the results.
+First we'll plot the actual data points of the test set - X_test and y_test
+plt.scatter(X_test, y_test, color = 'red')
+
+Next we'll plot the regression line - which is the same as above
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+
+# Step 6 - Visualize test set results
+import matplotlib.pyplot as plt
+# plot the actual data points of test set
+plt.scatter(X_test, y_test, color = 'red')
+# plot the regression line
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Test set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+Step 7 - Make new predictions
+We can also make brand new predictions for data points that do not exist in the dataset.
+For example, for a person with 15 years of experience:
+
+new_salary_pred = regressor.predict([[15]])
+
+# Step 7 - Make new prediction
+new_salary_pred = regressor.predict([[15]])
+
+Here is the full source code
+
+
+
diff --git a/project_1_simple_linear_regression/simple_linear_regression.py b/project_1_simple_linear_regression/simple_linear_regression.py
new file mode 100644
index 0000000..67f33ae
--- /dev/null
+++ b/project_1_simple_linear_regression/simple_linear_regression.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep 1 19:14:35 2019
+@author: omairaasim
+"""
+
+# Step 1 Load Data
+import pandas as pd
+dataset = pd.read_csv('Salary_Data.csv')
+X = dataset.iloc[:, :-1].values
+y = dataset.iloc[:,1].values
+
+# Step 2: Split data into training and testing
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)
+
+# Step 3: Fit Simple Linear Regression to Training Data
+from sklearn.linear_model import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+# Step 4: Make Prediction
+y_pred = regressor.predict(X_test)
+
+# Step 5 - Visualize training set results
+import matplotlib.pyplot as plt
+# plot the actual data points of training set
+plt.scatter(X_train, y_train, color = 'red')
+# plot the regression line
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Training set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+# Step 6 - Visualize test set results
+import matplotlib.pyplot as plt
+# plot the actual data points of test set
+plt.scatter(X_test, y_test, color = 'red')
+# plot the regression line (same as above)
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Test set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+# Step 7 - Make new prediction
+new_salary_pred = regressor.predict([[15]])
diff --git
a/project_2_multiple_linear_regression/50_Startups.csv b/project_2_multiple_linear_regression/50_Startups.csv new file mode 100644 index 0000000..14ffb86 --- /dev/null +++ b/project_2_multiple_linear_regression/50_Startups.csv @@ -0,0 +1,51 @@ +R&D Spend,Administration,Marketing Spend,State,Profit +165349.2,136897.8,471784.1,New York,192261.83 +162597.7,151377.59,443898.53,California,191792.06 +153441.51,101145.55,407934.54,Florida,191050.39 +144372.41,118671.85,383199.62,New York,182901.99 +142107.34,91391.77,366168.42,Florida,166187.94 +131876.9,99814.71,362861.36,New York,156991.12 +134615.46,147198.87,127716.82,California,156122.51 +130298.13,145530.06,323876.68,Florida,155752.6 +120542.52,148718.95,311613.29,New York,152211.77 +123334.88,108679.17,304981.62,California,149759.96 +101913.08,110594.11,229160.95,Florida,146121.95 +100671.96,91790.61,249744.55,California,144259.4 +93863.75,127320.38,249839.44,Florida,141585.52 +91992.39,135495.07,252664.93,California,134307.35 +119943.24,156547.42,256512.92,Florida,132602.65 +114523.61,122616.84,261776.23,New York,129917.04 +78013.11,121597.55,264346.06,California,126992.93 +94657.16,145077.58,282574.31,New York,125370.37 +91749.16,114175.79,294919.57,Florida,124266.9 +86419.7,153514.11,0,New York,122776.86 +76253.86,113867.3,298664.47,California,118474.03 +78389.47,153773.43,299737.29,New York,111313.02 +73994.56,122782.75,303319.26,Florida,110352.25 +67532.53,105751.03,304768.73,Florida,108733.99 +77044.01,99281.34,140574.81,New York,108552.04 +64664.71,139553.16,137962.62,California,107404.34 +75328.87,144135.98,134050.07,Florida,105733.54 +72107.6,127864.55,353183.81,New York,105008.31 +66051.52,182645.56,118148.2,Florida,103282.38 +65605.48,153032.06,107138.38,New York,101004.64 +61994.48,115641.28,91131.24,Florida,99937.59 +61136.38,152701.92,88218.23,New York,97483.56 +63408.86,129219.61,46085.25,California,97427.84 +55493.95,103057.49,214634.81,Florida,96778.92 +46426.07,157693.92,210797.67,California,96712.8 +46014.02,85047.44,205517.64,New York,96479.51 +28663.76,127056.21,201126.82,Florida,90708.19 +44069.95,51283.14,197029.42,California,89949.14 +20229.59,65947.93,185265.1,New York,81229.06 +38558.51,82982.09,174999.3,California,81005.76 +28754.33,118546.05,172795.67,California,78239.91 +27892.92,84710.77,164470.71,Florida,77798.83 +23640.93,96189.63,148001.11,California,71498.49 +15505.73,127382.3,35534.17,New York,69758.98 +22177.74,154806.14,28334.72,California,65200.33 +1000.23,124153.04,1903.93,New York,64926.08 +1315.46,115816.21,297114.46,Florida,49490.75 +0,135426.92,0,California,42559.73 +542.05,51743.15,0,New York,35673.41 +0,116983.8,45173.06,California,14681.4 \ No newline at end of file diff --git a/project_2_multiple_linear_regression/img_10_p_value_2.png b/project_2_multiple_linear_regression/img_10_p_value_2.png new file mode 100644 index 0000000..6bd06ca Binary files /dev/null and b/project_2_multiple_linear_regression/img_10_p_value_2.png differ diff --git a/project_2_multiple_linear_regression/img_11_p_value_3.png b/project_2_multiple_linear_regression/img_11_p_value_3.png new file mode 100644 index 0000000..0d4ac29 Binary files /dev/null and b/project_2_multiple_linear_regression/img_11_p_value_3.png differ diff --git a/project_2_multiple_linear_regression/img_12_p_value_4.png b/project_2_multiple_linear_regression/img_12_p_value_4.png new file mode 100644 index 0000000..22170d0 Binary files /dev/null and b/project_2_multiple_linear_regression/img_12_p_value_4.png differ diff --git 
a/project_2_multiple_linear_regression/img_1_dataset.png b/project_2_multiple_linear_regression/img_1_dataset.png new file mode 100644 index 0000000..eb647c8 Binary files /dev/null and b/project_2_multiple_linear_regression/img_1_dataset.png differ diff --git a/project_2_multiple_linear_regression/img_2_x_y.png b/project_2_multiple_linear_regression/img_2_x_y.png new file mode 100644 index 0000000..263f238 Binary files /dev/null and b/project_2_multiple_linear_regression/img_2_x_y.png differ diff --git a/project_2_multiple_linear_regression/img_3_convert_text_to_num.png b/project_2_multiple_linear_regression/img_3_convert_text_to_num.png new file mode 100644 index 0000000..07dd5fe Binary files /dev/null and b/project_2_multiple_linear_regression/img_3_convert_text_to_num.png differ diff --git a/project_2_multiple_linear_regression/img_4_dummy_variables.png b/project_2_multiple_linear_regression/img_4_dummy_variables.png new file mode 100644 index 0000000..bbef7d9 Binary files /dev/null and b/project_2_multiple_linear_regression/img_4_dummy_variables.png differ diff --git a/project_2_multiple_linear_regression/img_5_dummy_org.png b/project_2_multiple_linear_regression/img_5_dummy_org.png new file mode 100644 index 0000000..881e9f1 Binary files /dev/null and b/project_2_multiple_linear_regression/img_5_dummy_org.png differ diff --git a/project_2_multiple_linear_regression/img_6_train_test_data.png b/project_2_multiple_linear_regression/img_6_train_test_data.png new file mode 100644 index 0000000..73fd2db Binary files /dev/null and b/project_2_multiple_linear_regression/img_6_train_test_data.png differ diff --git a/project_2_multiple_linear_regression/img_7_compare_results.png b/project_2_multiple_linear_regression/img_7_compare_results.png new file mode 100644 index 0000000..47488cd Binary files /dev/null and b/project_2_multiple_linear_regression/img_7_compare_results.png differ diff --git a/project_2_multiple_linear_regression/img_8_add_ones.png b/project_2_multiple_linear_regression/img_8_add_ones.png new file mode 100644 index 0000000..2cb2076 Binary files /dev/null and b/project_2_multiple_linear_regression/img_8_add_ones.png differ diff --git a/project_2_multiple_linear_regression/img_9_p_value_1.png b/project_2_multiple_linear_regression/img_9_p_value_1.png new file mode 100644 index 0000000..7c075f3 Binary files /dev/null and b/project_2_multiple_linear_regression/img_9_p_value_1.png differ diff --git a/project_2_multiple_linear_regression/multiple_linear_regression.py b/project_2_multiple_linear_regression/multiple_linear_regression.py new file mode 100644 index 0000000..8840033 --- /dev/null +++ b/project_2_multiple_linear_regression/multiple_linear_regression.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Nov 30 19:45:38 2018 + +@author: omairaasim +""" + +# Step 1 - Load Data +import pandas as pd +dataset = pd.read_csv("50_Startups.csv") +X = dataset.iloc[:,:-1].values +y = dataset.iloc[:,4].values + +# Step 2 - Encode Categorical Data +from sklearn.preprocessing import LabelEncoder, OneHotEncoder +labelEncoder_X = LabelEncoder() +X[:,3] = labelEncoder_X.fit_transform(X[:,3]) + +oneHotEncoder = OneHotEncoder(categorical_features=[3]) +X = oneHotEncoder.fit_transform(X).toarray() + +# Step 3 - Dummy Trap +X = X[:,1:] + +# Step 4 - Split Data +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0) + +# Step 5 - Fit Regressor +from sklearn.linear_model 
import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+# Step 6 - Predict
+y_pred = regressor.predict(X_test)
+
+# Add ones
+import numpy as np
+ones = np.ones(shape = (50,1), dtype=int)
+X = np.append(arr = ones, values= X, axis=1)
+
+# Backward Elimination
+import statsmodels.api as sm
+X_opt = X[:,[0,1,2,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,1,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,3,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,3]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
diff --git a/project_2_multiple_linear_regression/multiple_linear_regression_banner.png b/project_2_multiple_linear_regression/multiple_linear_regression_banner.png
new file mode 100644
index 0000000..35258cf
Binary files /dev/null and b/project_2_multiple_linear_regression/multiple_linear_regression_banner.png differ
diff --git a/project_2_multiple_linear_regression/project_2 b/project_2_multiple_linear_regression/project_2
new file mode 100644
index 0000000..7bcd42a
--- /dev/null
+++ b/project_2_multiple_linear_regression/project_2
@@ -0,0 +1,214 @@
+In project 2 of Machine Learning, I'm going to be looking at Multiple Linear Regression. Unlike Simple Linear Regression, where there is one independent variable and one dependent variable, in Multiple Linear Regression there are several independent variables that could have an effect on determining the dependent variable.
+
+I'll be using the example from the A-Z Machine Learning course from Udemy.
+
+Let's dive right in.
+
+Dataset
+The dataset we will be using for this project can be found here.
+It contains data about 50 startups.
+It has 5 columns - "R&D Spend", "Administration", "Marketing Spend", "State", "Profit".
+The first 3 columns indicate how much each startup spends on Research and Development, how much it spends on Administration, and how much it spends on Marketing.
+The State column indicates which state the startup is based in. And the last column states the profit made by the startup.
+
+Project Objective
+We want our model to predict the profit based on the independent variables described above. So Profit is the dependent variable and the other 4 are independent variables.
+
+Step 1: Load the Dataset
+
+Below is the code snippet for loading the dataset.
+We will be using the pandas dataframe.
+Here X contains all the independent variables, which are "R&D Spend", "Administration", "Marketing Spend" and "State",
+and y is the dependent variable, which is the "Profit".
+
+So for X, we specify dataset.iloc[:, :-1].values
+which simply means take all rows and all columns except the last one.
+
+And for y, we specify dataset.iloc[:, 4].values
+which simply means take all rows and only the column with index 4 - in Python indexes begin at 0, so index 4 here is the fifth column, which is "Profit".
+
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("50_Startups.csv")
+X = dataset.iloc[:,:-1].values
+y = dataset.iloc[:,4].values
+
+***************************************************
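+Before encoding anything, it's worth a quick peek at the "State" column to see the raw text labels (an illustrative check of my own):
+
+# Column index 3 of X still holds text at this point
+print(X[:3, 3])   # the first three rows are 'New York', 'California', 'Florida'
+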
+Step 2: Convert text variable to numbers
+We can see that our dataset has a categorical variable, "State", which we have to encode.
+Here the "State" variable is at index 3.
+We use the LabelEncoder class to convert text to numbers.
+
+# Step 2 - Convert text variable "State" to numbers
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+labelEncoder_X = LabelEncoder()
+X[:,3] = labelEncoder_X.fit_transform(X[:,3])
+
+Once we run the above code snippet, we will see that all States have been converted to numbers.
+For example New York has been converted to 2, California to 0 and Florida to 1.
+
+****************************************************
+
+Step 3: Use OneHotEncoder to introduce Dummy variables
+If we leave the dataset in the above state, it will not be right, because New York has been assigned the value 2 and California 0 - so the model might assume New York is somehow greater than California, which is not right; the numbers carry no order.
+
+So to avoid this we have to introduce dummy variables using OneHotEncoder as shown below.
+
+# Step 3 - Use OneHotEncoder to introduce dummy variables
+oneHotEncoder = OneHotEncoder(categorical_features=[3])
+X = oneHotEncoder.fit_transform(X).toarray()
+
+After running the above code snippet, let's examine the dataset - we can see that 3 dummy variables have been added, as we had 3 different States.
+
+Let's compare the X dataset with the original dataset.
+- Let's look at the first entry at index 0 - in the original dataset the state was "New York" - and after encoding, the 3rd dummy variable has the value 1, which means the 3rd dummy variable represents the state New York.
+- Let's look at the second entry at index 1 - in the original dataset the state was "California" - and after encoding, the 1st dummy variable has the value 1, which means the 1st dummy variable represents the state California.
+- Let's look at the third entry at index 2 - in the original dataset the state was "Florida" - and after encoding, the 2nd dummy variable has the value 1, which means the 2nd dummy variable represents the state Florida.
+
+
+Step 4: Dummy Variable Trap
+We have to remove one of the dummy variables - keeping all three makes them perfectly collinear (they always sum to 1), which is known as the dummy variable trap. You can read up on the trap for the full reasoning.
+In the below code snippet, we are removing the first column.
+
+# Step 4 - Dummy Trap
+X = X[:,1:]
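+One note if you are following along with a recent scikit-learn: the categorical_features argument was removed in version 0.22, so Steps 2-4 will not run as written there. A minimal sketch of the modern equivalent using ColumnTransformer - my adaptation, not from the course - starting again from the X loaded in Step 1:
+
+# Modern replacement for Steps 2-4 (scikit-learn >= 0.22)
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import OneHotEncoder
+# drop="first" drops one dummy column, which also takes care of the dummy variable trap
+ct = ColumnTransformer([("state", OneHotEncoder(drop="first"), [3])], remainder="passthrough")
+X = ct.fit_transform(X)
+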
+
+Step 5: Split dataset into training set and test set
+
+Next we have to split the dataset into training and testing. We will use the training dataset for training the model and then check the performance of the model on the test dataset.
+
+For this we will use the train_test_split method from the model_selection library.
+We are providing a test_size of 0.2, which means the test set will contain 10 observations and the training set will contain 40 observations.
+The random_state=0 is required only if you want to compare your results with mine.
+
+# Step 5 - Split Data
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
+
+Below is the sample screenshot of X_train, y_train, X_test and y_test
+
+Step 6: Fit Multiple Linear Regression model to training set
+This is a very simple step. We will be using the LinearRegression class from the library sklearn.linear_model. First we create an object of the LinearRegression class and call the fit method passing X_train and y_train.
+
+# Step 6 - Fit Regressor
+from sklearn.linear_model import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+Step 7: Predict the test set
+Using the regressor we trained in the previous step, we will now predict the results of the test set and compare the predicted values with the actual values.
+
+# Step 7 - Predict
+y_pred = regressor.predict(X_test)
+
+Now we have y_pred, which holds the predicted values from our model, and y_test, which holds the actual values.
+Let us compare and see how well our model did. As you can see from the screenshot below - our basic model did pretty well.
+
+If we take the first startup - the actual profit is 103282 and our model predicted 103015 - which is almost perfect. There are some predictions that are off, like the second startup - the actual profit is 144259 and our model predicted 132582.
+
+Step 8: Backward Elimination
+In the model that we just built, we used all the independent variables, but it's possible that some independent variables are more significant than others and have a greater impact on the profit, while some are not significant - meaning if we remove them from the model, we may get better predictions.
+
+So we are going to use the backward elimination process to see which independent variables we must include in the model and which to exclude.
+
+The first step is to add a column of 1's to our X dataset as the first column, to account for the constant (intercept) term.
+This column corresponds to x0 = 1, which is associated with the constant b0 in the multiple linear regression equation
+y = b0 + b1 * x1 + b2 * x2 + ... + bn * xn
+
+# Add ones
+import numpy as np
+ones = np.ones(shape = (50,1), dtype=int)
+X = np.append(arr = ones, values= X, axis=1)
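+As an aside, statsmodels can prepend that column of ones for you in one line (assuming the import statsmodels.api as sm used in the next step):
+
+# Equivalent to the manual np.append above
+# X = sm.add_constant(X)
+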
+Now we will start the backward elimination process. Since we will be creating a new, optimal matrix of features, we will call it X_opt. This will contain only the independent features that are significant in predicting profit.
+
+To begin with, we will include all independent variables in X_opt
+X_opt = X[:,[0,1,2,3,4,5]]
+
+Next we need to select a significance level (SL) - here we decide on a significance level of 0.05. So if the p value of an independent variable is greater than SL, we will remove that independent variable and repeat the process with the remaining independent variables.
+
+Next we create a new regressor of the OLS class (Ordinary Least Squares) from the statsmodels library.
+It takes 2 arguments
+- endog : which is the dependent variable
+- exog : which is the matrix containing all independent variables
+
+Now we need to fit the OLS algorithm as shown below:
+
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+
+Then we will look at the summary to see which independent variable has a p value higher than SL (0.05)
+
+regressor_OLS.summary()
+
+Below all the steps are outlined
+
+# Backward Elimination
+import statsmodels.api as sm
+X_opt = X[:,[0,1,2,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+Here is the screenshot of the summary
+
+
+Let's examine the output
+x1 and x2 are the 2 dummy variables we added for State
+x3 is R&D Spend
+x4 is Administration Spend
+x5 is Marketing Spend
+
+We have to look for the highest p value greater than 0.05, which in this case is 0.99 (99%) for x2
+So we have to remove x2 (the 2nd dummy variable for State), which has index 2
+
+X_opt = X[:,[0,1,3,4,5]]
+
+Now let's repeat the process after removing the independent variable with the highest p value
+
+X_opt = X[:,[0,1,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+Here is the screenshot of the summary
+
+Let's examine the output. We have to look for the highest p value greater than 0.05, which in this case is 0.94 (94%) for x1
+So we have to remove x1 (the 1st dummy variable for State), which has index 1
+
+X_opt = X[:,[0,3,4,5]]
+
+Now let's repeat the process after removing the independent variable with the highest p value
+
+X_opt = X[:,[0,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+Here is the screenshot of the summary
+
+Let's examine the output. We have to again look for the highest p value greater than 0.05, which in this case is 0.602 (60%) for x2
+So we have to remove x2 (Administration Spend), which has index 4
+
+X_opt = X[:,[0,3,5]]
+
+Now let's repeat the process after removing the independent variable with the highest p value
+
+X_opt = X[:,[0,3,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+Here is the screenshot of the summary
+
+Let's examine the output. We have to again look for the highest p value greater than 0.05, which in this case is 0.06 (6%) for x2
+So we have to remove x2 (Marketing Spend), which has index 5 in X_opt
+
+X_opt = X[:,[0,3]]
+
+Now let's repeat the process after removing the independent variable with the highest p value
+
+X_opt = X[:,[0,3]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+Finally we are left with only 1 independent variable, which is R&D Spend.
+
+So we can build our model again, but this time using only one independent variable - R&D Spend - and our predictions should be better than the first time. A minimal sketch of that final model follows.
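+# Rebuild with R&D Spend only (my own sketch, not from the course)
+X_rd = dataset.iloc[:, 0:1].values   # R&D Spend is the first column of the original dataset
+X_rd_train, X_rd_test, y_rd_train, y_rd_test = train_test_split(X_rd, y, test_size=0.2, random_state=0)
+rd_regressor = LinearRegression()
+rd_regressor.fit(X_rd_train, y_rd_train)
+y_rd_pred = rd_regressor.predict(X_rd_test)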
\ No newline at end of file diff --git a/project_3_polynomial_regression/Position_Salaries.csv b/project_3_polynomial_regression/Position_Salaries.csv new file mode 100644 index 0000000..76d9d3e --- /dev/null +++ b/project_3_polynomial_regression/Position_Salaries.csv @@ -0,0 +1,11 @@ +Position,Level,Salary +Business Analyst,1,45000 +Junior Consultant,2,50000 +Senior Consultant,3,60000 +Manager,4,80000 +Country Manager,5,110000 +Region Manager,6,150000 +Partner,7,200000 +Senior Partner,8,300000 +C-level,9,500000 +CEO,10,1000000 \ No newline at end of file diff --git a/project_3_polynomial_regression/img_1_dataset.png b/project_3_polynomial_regression/img_1_dataset.png new file mode 100644 index 0000000..3d46494 Binary files /dev/null and b/project_3_polynomial_regression/img_1_dataset.png differ diff --git a/project_3_polynomial_regression/img_2_x_y.png b/project_3_polynomial_regression/img_2_x_y.png new file mode 100644 index 0000000..dd2f930 Binary files /dev/null and b/project_3_polynomial_regression/img_2_x_y.png differ diff --git a/project_3_polynomial_regression/img_3_plot_linear.png b/project_3_polynomial_regression/img_3_plot_linear.png new file mode 100644 index 0000000..3a8f096 Binary files /dev/null and b/project_3_polynomial_regression/img_3_plot_linear.png differ diff --git a/project_3_polynomial_regression/img_4_x_poly.png b/project_3_polynomial_regression/img_4_x_poly.png new file mode 100644 index 0000000..4163101 Binary files /dev/null and b/project_3_polynomial_regression/img_4_x_poly.png differ diff --git a/project_3_polynomial_regression/img_5_plot_poly_degree2.png b/project_3_polynomial_regression/img_5_plot_poly_degree2.png new file mode 100644 index 0000000..c8b3944 Binary files /dev/null and b/project_3_polynomial_regression/img_5_plot_poly_degree2.png differ diff --git a/project_3_polynomial_regression/img_6_plot_poly_degree3.png b/project_3_polynomial_regression/img_6_plot_poly_degree3.png new file mode 100644 index 0000000..bd64ceb Binary files /dev/null and b/project_3_polynomial_regression/img_6_plot_poly_degree3.png differ diff --git a/project_3_polynomial_regression/img_7_plot_poly_degree4.png b/project_3_polynomial_regression/img_7_plot_poly_degree4.png new file mode 100644 index 0000000..3610bd4 Binary files /dev/null and b/project_3_polynomial_regression/img_7_plot_poly_degree4.png differ diff --git a/project_3_polynomial_regression/poly_regression.py b/project_3_polynomial_regression/poly_regression.py new file mode 100644 index 0000000..c8cc33c --- /dev/null +++ b/project_3_polynomial_regression/poly_regression.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" + +@author: omairaasim +""" + +# Step 1 - Load Data +import pandas as pd +dataset = pd.read_csv("Position_Salaries.csv") +X = dataset.iloc[:, 1:2].values +y = dataset.iloc[:, 2].values + +# Step 2 - Fitting Linear Regression +from sklearn.linear_model import LinearRegression +lin_reg = LinearRegression() +lin_reg.fit(X,y) + +# Step 3 - Visualize Linear Regression Results +import matplotlib.pyplot as plt + +plt.scatter(X,y, color="red") +plt.plot(X, lin_reg.predict(X)) +plt.title("Linear Regression") +plt.xlabel("Level") +plt.ylabel("Salary") +plt.show() + +# Step 4 Linear Regression prediction +lin_reg.predict([[6.5]]) + +# Step 5 - Convert X to polynomial format +from sklearn.preprocessing import PolynomialFeatures +poly_reg = PolynomialFeatures(degree=4) +X_poly = poly_reg.fit_transform(X) + + +# Step 6 - Passing X_poly to LinearRegression +lin_reg_2 = LinearRegression() 
+lin_reg_2.fit(X_poly,y)
+
+# Step 7 - Visualize Poly Regression Results
+plt.scatter(X,y, color="red")
+plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)))
+plt.title("Poly Regression - Degree 4")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()
+
+
+# Step 8 Polynomial Regression prediction
+lin_reg_2.predict(poly_reg.fit_transform([[6.5]]))
\ No newline at end of file
diff --git a/project_3_polynomial_regression/project_3 b/project_3_polynomial_regression/project_3
new file mode 100644
index 0000000..21f6b29
--- /dev/null
+++ b/project_3_polynomial_regression/project_3
@@ -0,0 +1,200 @@
+#100DaysOfMLCode #100ProjectsInML
+
+Today I'll be looking at the Polynomial Regression example from the A-Z Machine Learning course on Udemy.
+
+If you look at the image above, which lists the equations for all 3 types of Regression, you will notice that in Polynomial Regression we have the same variable x1, but raised to different powers.
+
+For example
+- instead of x2 - we have x1 raised to the power 2
+- instead of x3 - we have x1 raised to the power 3
+
+Let's explore the dataset.
+
+Dataset
+First let's look at the dataset. It is Position_Salaries.csv and can be found here.
+It has 3 columns - "Position", "Level" and "Salary" - and describes the approximate salary range for an employee based on what level he falls under.
+
+For example, if an employee is a Manager - he falls in Level 4 and should get around $80,000.
+
+Below is the screenshot of the dataset.
+
+Project Objective
+Let's assume the above table is what the HR team of a company uses to determine what salary to offer to a new employee. For our project, let's take an example that an employee has applied for the role of a Regional Manager and has already worked as a Regional Manager for 2 years. So based on the table above - he falls between level 6 and level 7 - let's say he falls under level 6.5.
+
+We want to build a model to predict what salary we should offer this new employee.
+
+Let's get started.
+
+Step 1: Load the Dataset
+
+If we look at the dataset, we need to predict the salary for an employee who falls under Level 6.5 - so we really do not need the first column, "Position".
+
+Here X is the independent variable, which is the "Level",
+and y is the dependent variable, which is the "Salary".
+
+So for X, we specify
+
+X = dataset.iloc[:, 1:2].values
+
+which simply means take all rows and all columns from index 1 up to index 2, but not including index 2 (the upper bound of the range is not included).
+
+And for y, we specify dataset.iloc[:, 2].values
+which simply means take all rows and only the column with index 2 - in Python indexes begin at 0, so index 2 here is the third column, which is Salary.
+
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("Position_Salaries.csv")
+X = dataset.iloc[:, 1:2].values
+y = dataset.iloc[:, 2].values
+
+*********************************
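+Before fitting anything, it helps to scatter-plot the raw data - the salary clearly grows non-linearly with the level (a quick sketch of my own):
+
+# Quick look at the raw data
+import matplotlib.pyplot as plt
+plt.scatter(X, y, color="red")
+plt.title("Level vs Salary - raw data")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()
+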
+Step 2: Fit Linear Regression model to dataset
+
+First we will build a simple linear regression model to see what prediction it makes, and then compare it to the prediction made by Polynomial Regression to see which is more accurate.
+
+We will be using the LinearRegression class from the library sklearn.linear_model. We create an object of the LinearRegression class and call the fit method passing X and y.
+
+# Step 2 - Fitting Linear Regression
+from sklearn.linear_model import LinearRegression
+lin_reg = LinearRegression()
+lin_reg.fit(X,y)
+
+********************************
+
+Step 3: Visualize Linear Regression Results
+
+Let's plot the graph to look at the results for Linear Regression
+
+# Step 3 - Visualize Linear Regression Results
+import matplotlib.pyplot as plt
+
+plt.scatter(X,y, color="red")
+plt.plot(X, lin_reg.predict(X))
+plt.title("Linear Regression")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()
+
+If we look at the graph, the straight line suggests that a person at level 6.5 would be offered a salary of around $300k. We will confirm this in the next step.
+
+***********************************
+
+Step 4: Predict Linear Regression Results
+
+# Step 4 prediction
+lin_reg.predict([[6.5]])
+
+We can see that the prediction is way off, as it predicts $330k.
+
+Now let's check the predictions by implementing Polynomial Regression.
+
+***********************************
+
+Step 5: Convert X to polynomial format
+
+For Polynomial Regression, we need to transform our matrix X to X_poly, where X_poly will contain X to the power of n - depending upon the degree we choose. If we choose degree 2, then X_poly will contain X and X to the power 2. If we choose degree 3, then X_poly will contain X, X to the power 2 and X to the power 3.
+
+We will be using the PolynomialFeatures class from the sklearn.preprocessing library for this purpose. When we create an object of this class, we have to pass the degree parameter. Let's begin by choosing a degree of 2. Then we call the fit_transform method to transform matrix X.
+
+# Step 5 - Convert X to polynomial format
+from sklearn.preprocessing import PolynomialFeatures
+poly_reg = PolynomialFeatures(degree=2)
+X_poly = poly_reg.fit_transform(X)
+
+Let's look at X_poly
+
+If you see, the 2nd column is the actual levels from 1 to 10 present in X.
+The 3rd column contains X raised to the power 2 (as we chose degree 2).
+The first column contains just 1's - this is automatically added by the PolynomialFeatures class to include the constant b0.
+
+*********************************
+
+Step 6: Fitting Polynomial Regression
+
+Now we will create a new linear regression object called lin_reg_2 and pass X_poly to it instead of the X that we passed in Step 2.
+
+# Step 6 - Passing X_poly to LinearRegression
+lin_reg_2 = LinearRegression()
+lin_reg_2.fit(X_poly,y)
+
+*********************************
+
+Step 7: Visualize Poly Regression Results
+
+Let's plot the graph to look at the results for Polynomial Regression
+
+# Step 7 - Visualize Poly Regression Results
+plt.scatter(X,y, color="red")
+plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)))
+plt.title("Poly Regression Degree 2")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()
+
+If we look at the graph, we can see that a person at level 6.5 should be offered a salary of around $190k. We will confirm this in the next step.
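+Before that prediction, one quick aside: the curve above is drawn only at the 10 integer levels, so it looks slightly jagged. You can draw a smoother curve by evaluating the model on a dense grid - the same trick used in the decision tree and random forest projects later (a sketch of my own):
+
+# Smoother curve: evaluate the degree-2 model on a 0.1-step grid
+import numpy as np
+X_grid = np.arange(X.min(), X.max(), 0.1).reshape(-1, 1)
+plt.scatter(X, y, color="red")
+plt.plot(X_grid, lin_reg_2.predict(poly_reg.transform(X_grid)), color="blue")
+plt.title("Poly Regression Degree 2 - dense grid")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()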
+
+********************************
+
+Step 8: Predict Polynomial Regression Results
+
+We get a prediction of $189k
+
+# Step 8 prediction
+lin_reg_2.predict(poly_reg.fit_transform([[6.5]]))
+
+********************************
+
+Step 9 - Change degree to 3 and run steps 5-8
+
+# Step 5 - Convert X to polynomial format
+from sklearn.preprocessing import PolynomialFeatures
+poly_reg = PolynomialFeatures(degree=3)
+X_poly = poly_reg.fit_transform(X)
+
+# Step 6 - Passing X_poly to LinearRegression
+lin_reg_2 = LinearRegression()
+lin_reg_2.fit(X_poly,y)
+
+# Step 7 - Visualize Poly Regression Results
+plt.scatter(X,y, color="red")
+plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)))
+plt.title("Poly Regression Degree 3")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()
+
+# Step 8 prediction
+lin_reg_2.predict(poly_reg.fit_transform([[6.5]]))
+
+We get a prediction of $133k
+
+*********************************
+
+Step 10 - Change degree to 4 and run steps 5-8
+
+# Step 5 - Convert X to polynomial format
+from sklearn.preprocessing import PolynomialFeatures
+poly_reg = PolynomialFeatures(degree=4)
+X_poly = poly_reg.fit_transform(X)
+
+# Step 6 - Passing X_poly to LinearRegression
+lin_reg_2 = LinearRegression()
+lin_reg_2.fit(X_poly,y)
+
+# Step 7 - Visualize Poly Regression Results
+plt.scatter(X,y, color="red")
+plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)))
+plt.title("Poly Regression Degree 4")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()
+
+# Step 8 prediction
+lin_reg_2.predict(poly_reg.fit_transform([[6.5]]))
+
+We get a prediction of $158k, which looks reasonable based on our dataset.
+
+So in this case, by using Linear Regression we got a prediction of $330k, and by using Polynomial Regression we got a prediction of $158k.
+
+Here is the full source code.
\ No newline at end of file
diff --git a/project_3_polynomial_regression/project_3_banner.png b/project_3_polynomial_regression/project_3_banner.png
new file mode 100644
index 0000000..f4b5145
Binary files /dev/null and b/project_3_polynomial_regression/project_3_banner.png differ
diff --git a/project_4_support_vector_regression/Position_Salaries.csv b/project_4_support_vector_regression/Position_Salaries.csv
new file mode 100644
index 0000000..76d9d3e
--- /dev/null
+++ b/project_4_support_vector_regression/Position_Salaries.csv
@@ -0,0 +1,11 @@
+Position,Level,Salary
+Business Analyst,1,45000
+Junior Consultant,2,50000
+Senior Consultant,3,60000
+Manager,4,80000
+Country Manager,5,110000
+Region Manager,6,150000
+Partner,7,200000
+Senior Partner,8,300000
+C-level,9,500000
+CEO,10,1000000
\ No newline at end of file
diff --git a/project_4_support_vector_regression/img_1_dataset.png b/project_4_support_vector_regression/img_1_dataset.png
new file mode 100644
index 0000000..3d46494
Binary files /dev/null and b/project_4_support_vector_regression/img_1_dataset.png differ
diff --git a/project_4_support_vector_regression/img_2_x_y.png b/project_4_support_vector_regression/img_2_x_y.png
new file mode 100644
index 0000000..dd2f930
Binary files /dev/null and b/project_4_support_vector_regression/img_2_x_y.png differ
diff --git a/project_4_support_vector_regression/project_4 b/project_4_support_vector_regression/project_4
new file mode 100644
index 0000000..065e509
--- /dev/null
+++ b/project_4_support_vector_regression/project_4
@@ -0,0 +1,101 @@
+
+Today I'll be looking at the Support Vector Regression (SVR) example from the A-Z Machine Learning course on Udemy.
+
+#100DaysOfMLCode #100ProjectsInML
+
+We will be working on the same problem that we worked on in Project 3. Here, instead of using Polynomial Regression, we will use Support Vector Regression and see whether the prediction is better or worse compared to Polynomial Regression.
+
+Let's explore the dataset.
+
+Dataset
+First let's look at the dataset. It is Position_Salaries.csv and can be found here.
+It has 3 columns - "Position", "Level" and "Salary" - and describes the approximate salary range for an employee based on what level he falls under.
+
+For example, if an employee is a Manager - he falls in Level 4 and should get around $80,000.
+
+Below is the screenshot of the dataset.
+
+Project Objective
+
+Let's assume the above table is what the HR team of a company uses to determine what salary to offer to a new employee. For our project, let's take an example that an employee has applied for the role of a Regional Manager and has already worked as a Regional Manager for 2 years. So based on the table above - he falls between level 6 and level 7 - let's say he falls under level 6.5.
+
+We want to build a model to predict what salary we should offer this new employee.
+
+Let's get started.
+
+Step 1: Load the Dataset
+
+If we look at the dataset, we need to predict the salary for an employee who falls under Level 6.5 - so we really do not need the first column, "Position".
+
+Here X is the independent variable, which is the "Level",
+and y is the dependent variable, which is the "Salary".
+
+So for X, we specify
+
+X = dataset.iloc[:, 1:2].values
+
+which simply means take all rows and all columns from index 1 up to index 2, but not including index 2 (the upper bound of the range is not included).
+
+And for y, we specify
+
+y = dataset.iloc[:, 2:].values
+
+which simply means take all rows and only the column with index 2, which is Salary. Note the 2: range - it keeps y as a 2D array (10 rows, 1 column), which is the shape StandardScaler expects in the next step.
+
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("Position_Salaries.csv")
+X = dataset.iloc[: ,1:2].values
+y = dataset.iloc[:, 2:].values
+
+
+Step 2 - Feature Scaling
+
+Unlike the LinearRegression class, the SVR class does not apply feature scaling for us, so we scale both X and y ourselves with StandardScaler.
+
+# Step 2 - Feature Scaling
+from sklearn.preprocessing import StandardScaler
+sc_X = StandardScaler()
+sc_y = StandardScaler()
+X = sc_X.fit_transform(X)
+y = sc_y.fit_transform(y)
+
+
+Step 3 - Fit SVR
+
+We will be using the SVR class from the library sklearn.svm. First we create an object of the SVR class and pass the kernel parameter as "rbf" (Radial Basis Function), and then call the fit method passing X and y.
+
+# Step 3 - Fit SVR
+from sklearn.svm import SVR
+regressor = SVR(kernel = "rbf")
+regressor.fit(X, y.ravel())   # ravel() flattens y to the 1D shape fit expects
+
+
+Step 4 - Visualization
+
+# Step 4 - Visualization
+import matplotlib.pyplot as plt
+plt.scatter(X, y , color="red")
+plt.plot(X, regressor.predict(X), color="blue")
+plt.title("SVR")
+plt.xlabel("Position")
+plt.ylabel("Salary")
+plt.show()
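+One thing to note about the plot: both axes are in scaled units. If you'd rather see it in the original units, you can inverse-transform everything first (a sketch of my own, assuming the scalers from Step 2):
+
+# Same plot, back in original units
+plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color="red")
+plt.plot(sc_X.inverse_transform(X), sc_y.inverse_transform(regressor.predict(X).reshape(-1, 1)), color="blue")
+plt.title("SVR (original scale)")
+plt.xlabel("Level")
+plt.ylabel("Salary")
+plt.show()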
+Step 5 - Make Predictions
+
+Since we want to predict the salary for an employee at level 6.5, first we have to apply the same feature scaling to the value 6.5.
+Then we do the prediction.
+Finally, since the predicted value is still in the scaled units, we have to inverse-transform it to get the actual value.
+These steps are outlined below.
+
+
+# Step 5 - Predictions
+import numpy as np
+# First, scale 6.5 with the same scaler used on X
+sc_X_val = sc_X.transform(np.array([[6.5]]))
+# Second, predict the (scaled) value
+scaled_y_pred = regressor.predict(sc_X_val)
+# Third - since this is scaled - we have to inverse transform (reshaped to 2D for the scaler)
+y_pred = sc_y.inverse_transform(scaled_y_pred.reshape(-1, 1))
+
+
+We can see that the predicted value is $170k, whereas with Polynomial Regression we got $158k.
diff --git a/project_4_support_vector_regression/svr_2019.py b/project_4_support_vector_regression/svr_2019.py
new file mode 100644
index 0000000..e7d4da2
--- /dev/null
+++ b/project_4_support_vector_regression/svr_2019.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Dec 1 19:28:27 2018
+
+@author: omairaasim
+"""
+
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("Position_Salaries.csv")
+X = dataset.iloc[: ,1:2].values
+y = dataset.iloc[:, 2:].values
+
+# Step 2 - Feature Scaling
+from sklearn.preprocessing import StandardScaler
+sc_X = StandardScaler()
+sc_y = StandardScaler()
+X = sc_X.fit_transform(X)
+y = sc_y.fit_transform(y)
+
+# Step 3 - Fit SVR
+from sklearn.svm import SVR
+regressor = SVR(kernel = "rbf")
+regressor.fit(X, y.ravel())   # ravel() flattens y to the 1D shape fit expects
+
+# Step 4 - Visualization
+import matplotlib.pyplot as plt
+plt.scatter(X, y , color="red")
+plt.plot(X, regressor.predict(X), color="blue")
+plt.title("SVR")
+plt.xlabel("Position")
+plt.ylabel("Salary")
+plt.show()
+
+# Step 5 - Predict Results
+import numpy as np
+# First, scale 6.5 with the same scaler used on X
+sc_X_val = sc_X.transform(np.array([[6.5]]))
+# Second, predict the (scaled) value
+scaled_y_pred = regressor.predict(sc_X_val)
+# Third - since this is scaled - we have to inverse transform (reshaped to 2D for the scaler)
+y_pred = sc_y.inverse_transform(scaled_y_pred.reshape(-1, 1))
diff --git a/project_4_support_vector_regression/svr_banner.png b/project_4_support_vector_regression/svr_banner.png
new file mode 100644
index 0000000..995b98a
Binary files /dev/null and b/project_4_support_vector_regression/svr_banner.png differ
diff --git a/project_4_support_vector_regression/svr_plot.png b/project_4_support_vector_regression/svr_plot.png
new file mode 100644
index 0000000..cd4255a
Binary files /dev/null and b/project_4_support_vector_regression/svr_plot.png differ
diff --git a/project_5_decision_tree_regression/Position_Salaries.csv b/project_5_decision_tree_regression/Position_Salaries.csv
new file mode 100644
index 0000000..76d9d3e
--- /dev/null
+++ b/project_5_decision_tree_regression/Position_Salaries.csv
@@ -0,0 +1,11 @@
+Position,Level,Salary
+Business Analyst,1,45000
+Junior Consultant,2,50000
+Senior Consultant,3,60000
+Manager,4,80000
+Country Manager,5,110000
+Region Manager,6,150000
+Partner,7,200000
+Senior Partner,8,300000
+C-level,9,500000
+CEO,10,1000000
\ No newline at end of file
diff --git a/project_5_decision_tree_regression/Screen Shot 2019-09-05 at 8.38.22 PM.png b/project_5_decision_tree_regression/Screen Shot 2019-09-05 at 8.38.22 PM.png
new file mode 100644
index 0000000..b8aa496
Binary files /dev/null and b/project_5_decision_tree_regression/Screen Shot 2019-09-05 at 8.38.22 PM.png differ
diff --git a/project_5_decision_tree_regression/Screen Shot 2019-09-05 at 8.43.43 PM.png b/project_5_decision_tree_regression/Screen Shot 2019-09-05 at 8.43.43 PM.png
new file mode 100644
index 0000000..5b9450a
Binary files /dev/null and b/project_5_decision_tree_regression/Screen Shot 2019-09-05 at 8.43.43 PM.png differ
diff --git a/project_5_decision_tree_regression/decision_tree.py b/project_5_decision_tree_regression/decision_tree.py
new file mode 100644
index 0000000..b09983e
--- /dev/null
+++ b/project_5_decision_tree_regression/decision_tree.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Dec 1 20:35:24 2018
+
+@author: omairaasim
+"""
+
+# Step 1 - Load Dataset
+import pandas as pd
+dataset = pd.read_csv("Position_Salaries.csv")
+X = dataset.iloc[:, 1:2].values
+y = dataset.iloc[:, 2].values
+
+# Step 2 - Fit Decision Tree Regressor
+from sklearn.tree import DecisionTreeRegressor
+regressor = DecisionTreeRegressor(criterion="squared_error")  # called "mse" in older scikit-learn
+regressor.fit(X, y)
+
+# Step 3 - Visualize
+import matplotlib.pyplot as plt
+
+import numpy as np
+X_grid = np.arange(min(X), max(X), 0.01)
+X_grid = X_grid.reshape((len(X_grid),1))
+
+plt.scatter(X, y, color="red")
+plt.plot(X_grid, regressor.predict(X_grid), color="blue")
+plt.title("Decision Tree Regressor")
+plt.xlabel("Position")
+plt.ylabel("Salary")
+plt.show()
+
+# Step 4 - Predict
+regressor.predict([[6.5]])
\ No newline at end of file
diff --git a/project_5_decision_tree_regression/img_0_scatter_plot.png b/project_5_decision_tree_regression/img_0_scatter_plot.png
new file mode 100644
index 0000000..affdf5d
Binary files /dev/null and b/project_5_decision_tree_regression/img_0_scatter_plot.png differ
diff --git a/project_5_decision_tree_regression/img_1_dataset.png b/project_5_decision_tree_regression/img_1_dataset.png
new file mode 100644
index 0000000..3d46494
Binary files /dev/null and b/project_5_decision_tree_regression/img_1_dataset.png differ
diff --git a/project_5_decision_tree_regression/img_2_x_y.png b/project_5_decision_tree_regression/img_2_x_y.png
new file mode 100644
index 0000000..dd2f930
Binary files /dev/null and b/project_5_decision_tree_regression/img_2_x_y.png differ
diff --git a/project_5_decision_tree_regression/img_split_1.png b/project_5_decision_tree_regression/img_split_1.png
new file mode 100644
index 0000000..e2a6e3a
Binary files /dev/null and b/project_5_decision_tree_regression/img_split_1.png differ
diff --git a/project_5_decision_tree_regression/img_split_2.png b/project_5_decision_tree_regression/img_split_2.png
new file mode 100644
index 0000000..e7dfe67
Binary files /dev/null and b/project_5_decision_tree_regression/img_split_2.png differ
diff --git a/project_5_decision_tree_regression/img_split_3.png b/project_5_decision_tree_regression/img_split_3.png
new file mode 100644
index 0000000..b115da8
Binary files /dev/null and b/project_5_decision_tree_regression/img_split_3.png differ
diff --git a/project_5_decision_tree_regression/img_split_4.png b/project_5_decision_tree_regression/img_split_4.png
new file mode 100644
index 0000000..8bde73b
Binary files /dev/null and b/project_5_decision_tree_regression/img_split_4.png differ
diff --git a/project_5_decision_tree_regression/project_5 b/project_5_decision_tree_regression/project_5
new file mode 100644
index 0000000..a97b6f6
--- /dev/null
+++ b/project_5_decision_tree_regression/project_5
@@ -0,0 +1,101 @@
+Today we will be looking at one of the most popular regression models, called the Decision Tree.
+
+#100DaysOfMLCode #100ProjectsInML
+
+I will be solving the same problem of predicting the salary of a new employee based on his position level.
+
+I have solved the same problem in project 3 using Polynomial Regression - you can check it out here.
+And the same problem has been solved in project 4 using Support Vector Regression - you can check that project here.
+
+Let's understand Decision Trees.
+
+The decision tree regression model is a non-linear, non-continuous model.
+
+Below is a scatter plot which represents our dataset. It has 2 independent variables, X1 and X2, and what we are trying to predict is a 3rd dependent variable, y.
+
+Insert image
+
+Now once we run the decision tree algorithm, the scatter plot will be split up into segments. Each of the segments created by these splits is called a leaf. The way the splits are made is based on the principle of information entropy. It is a mathematical concept and is quite complex - if you want to learn more, you can read up on information entropy.
+
+
+Let's walk through an example scenario so we understand how decision trees work. Let's say the algorithm makes the first split at X1 = 20 - so the scatter plot is divided into 2 segments - the first segment is where X1 < 20 and the second segment is where X1 > 20.
+
+Insert image
+
+Insert image
+
+
+Now let's say split 2 happens at X2 = 170 - but it only applies to points where X1 > 20.
+
+Insert image
+
+Insert image
+
+Next, split 3 happens at X2 = 200 - but it applies to points where X1 < 20.
+
+Insert image
+
+Insert image
+
+
+Finally, split 4 happens at X1 = 40 - but it applies to points where X1 > 20 and X2 < 170.
+
+Insert image
+
+So now our decision tree is done.
+
+Now how do we determine the value of a new data point? It's very simple - we take the average of the y values of the points in each terminal leaf. The diagram below shows an example average for each of the terminal leaves.
+
+Insert image
+
+Now let's say we have a new data point where X1 = 30 and X2 = 50. It falls in the leaf whose average is -64.1, so the decision tree algorithm will predict the value of y as -64.1. From the below diagram, we can see how it arrives at that value.
+
+Insert image
+
+
+Dataset
+First let's look at the dataset. It is Position_Salaries.csv and can be found here.
+It has 3 columns - "Position", "Level" and "Salary" - and describes the approximate salary range for an employee based on what level he falls under.
+
+For example, if an employee is a Manager - he falls in Level 4 and should get around $80,000.
+
+Below is the screenshot of the dataset.
+
+Project Objective
+
+Let's assume the above table is what the HR team of a company uses to determine what salary to offer to a new employee. For our project, let's take an example that an employee has applied for the role of a Regional Manager and has already worked as a Regional Manager for 2 years. So based on the table above - he falls between level 6 and level 7 - let's say he falls under level 6.5.
+
+We want to build a model to predict what salary we should offer this new employee.
+
+Let's get started.
+
+
+Step 1: Load the Dataset
+
+If we look at the dataset, we need to predict the salary for an employee who falls under Level 6.5 - so we really do not need the first column, "Position".
+
+Here X is the independent variable, which is the "Level",
+and y is the dependent variable, which is the "Salary".
+
+So for X, we specify
+
+X = dataset.iloc[:, 1:2].values
+
+which simply means take all rows and all columns from index 1 up to index 2, but not including index 2 (the upper bound of the range is not included).
+
+And for y, we specify
+
+dataset.iloc[:, 2].values
+
+which simply means take all rows and only the column with index 2 - in Python indexes begin at 0, so index 2 here is the third column, which is Salary.
+
+Step 2 - Fit Decision Tree Regressor
+
+We will be using the DecisionTreeRegressor class from the library sklearn.tree. First we create an object of the DecisionTreeRegressor class, pass the criterion parameter as "squared_error" (mean squared error - older scikit-learn versions called this "mse"), and then call the fit method passing X and y, as in the snippet below.
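+# Step 2 - Fit Decision Tree Regressor (from decision_tree.py)
+from sklearn.tree import DecisionTreeRegressor
+regressor = DecisionTreeRegressor(criterion="squared_error")  # called "mse" in older scikit-learn
+regressor.fit(X, y)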
+
+Step 3 - Visualize
+Let's plot the graph to look at the results for Decision Tree Regression. For decision trees we have to predict over a dense grid of points (steps of 0.01 in the code) rather than just the 10 levels, because the model is non-continuous - plotting only the original points would hide the step shape of the predictions.
+
+Step 4: Predict Decision Tree Regression Results
+
+We get a prediction of $150k
diff --git a/project_6_random_forest_regression/Position_Salaries.csv b/project_6_random_forest_regression/Position_Salaries.csv
new file mode 100644
index 0000000..76d9d3e
--- /dev/null
+++ b/project_6_random_forest_regression/Position_Salaries.csv
@@ -0,0 +1,11 @@
+Position,Level,Salary
+Business Analyst,1,45000
+Junior Consultant,2,50000
+Senior Consultant,3,60000
+Manager,4,80000
+Country Manager,5,110000
+Region Manager,6,150000
+Partner,7,200000
+Senior Partner,8,300000
+C-level,9,500000
+CEO,10,1000000
\ No newline at end of file
diff --git a/project_6_random_forest_regression/img_1_dataset.png b/project_6_random_forest_regression/img_1_dataset.png
new file mode 100644
index 0000000..3d46494
Binary files /dev/null and b/project_6_random_forest_regression/img_1_dataset.png differ
diff --git a/project_6_random_forest_regression/img_2_x_y.png b/project_6_random_forest_regression/img_2_x_y.png
new file mode 100644
index 0000000..dd2f930
Binary files /dev/null and b/project_6_random_forest_regression/img_2_x_y.png differ
diff --git a/project_6_random_forest_regression/img_3_plot_10_trees.png b/project_6_random_forest_regression/img_3_plot_10_trees.png
new file mode 100644
index 0000000..020a1b4
Binary files /dev/null and b/project_6_random_forest_regression/img_3_plot_10_trees.png differ
diff --git a/project_6_random_forest_regression/img_4_plot_100_trees.png b/project_6_random_forest_regression/img_4_plot_100_trees.png
new file mode 100644
index 0000000..62e3c20
Binary files /dev/null and b/project_6_random_forest_regression/img_4_plot_100_trees.png differ
diff --git a/project_6_random_forest_regression/img_5_plot_300_trees.png b/project_6_random_forest_regression/img_5_plot_300_trees.png
new file mode 100644
index 0000000..f70134c
Binary files /dev/null and b/project_6_random_forest_regression/img_5_plot_300_trees.png differ
diff --git a/project_6_random_forest_regression/project_6 b/project_6_random_forest_regression/project_6
new file mode 100644
index 0000000..a8069af
--- /dev/null
+++ b/project_6_random_forest_regression/project_6
@@ -0,0 +1,134 @@
+Today I will be writing about the Random Forest Regression model. Random Forest is a form of Ensemble Learning. Ensemble learning, in simple terms, is when you take the same algorithm multiple times and put the results together to make something more powerful than the original version. Unlike the Decision Tree model, where we built a single Decision Tree to predict the value for a new data point, in Random Forest we build many Decision Trees - often several hundred of them.
+
+So instead of getting 1 prediction, in Random Forest we get many predictions for y (say 500 trees give out 500 predictions). We then take the average of all the predictions and assign that to y.
+
+#100DaysOfMLCode #100ProjectsInML
+
+I have solved the same problem in project 3 using Polynomial Regression - you can check it out here.
+We then solved it using Support Vector Regression - you can check that project here.
+And in the last project, we used Decision Tree Regression - it's available here.
+
+Today, we will use the Random Forest model and see how good our prediction is.
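+As a quick illustration of that averaging, here is a small sketch of my own - you can run it after fitting the regressor in Step 2 below:
+
+# The forest's prediction is just the mean of its individual trees' predictions
+import numpy as np
+tree_preds = [tree.predict([[6.5]])[0] for tree in regressor.estimators_]
+print(np.mean(tree_preds), regressor.predict([[6.5]])[0])  # the two values match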
+
+Dataset
+
+First let's look at the dataset. It is Position_Salaries.csv and can be found here
+It has 3 columns - "Position", "Level" and "Salary" - and describes the approximate salary for an employee based on the level he falls under.
+
+For example, if an employee is a Manager - he falls in Level 4 and should get around $80,000.
+
+Below is the screenshot of the dataset.
+
+Project Objective
+
+Let's assume the above table is what the HR team of a company uses to determine what salary to offer to a new employee. For our project, let's take an example of an employee who has applied for the role of Regional Manager and has already worked as a Regional Manager for 2 years. Based on the table above, he falls between level 6 and level 7 - let's say he falls at level 6.5.
+
+We want to build a model to predict what salary we should offer this new employee.
+
+Let's get started.
+
+Step 1: Load the Dataset
+
+If we look at the dataset, we need to predict the salary for an employee who falls under Level 6.5 - so we really do not need the first column "Position".
+
+Here X is the independent variable which is the "Level"
+and y is the dependent variable which is the "Salary"
+
+So for X, we specify
+
+X = dataset.iloc[:, 1:2].values
+
+which simply means take all rows and all columns from index 1 up to index 2 but not including index 2 (the upper bound of the range is not included)
+
+And for y, we specify
+
+dataset.iloc[:, 2].values
+
+which simply means take all rows and only the column with index 2 - in Python indexes begin at 0 - so index 2 here is the third column, which is Salary
+
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("Position_Salaries.csv")
+X = dataset.iloc[:, 1:2].values
+y = dataset.iloc[:, 2].values
+
+Step 2 - Fit Random Forest Regressor
+
+We will be using the RandomForestRegressor class from the library sklearn.ensemble. First we create an object of the RandomForestRegressor class.
+
+When initializing the class, we need to specify the number of trees, so we pass the parameter n_estimators, which specifies the number of trees we want to use. The second parameter, random_state=0, is just so that our results match. We then call the fit method passing X and y.
+
+First let's run it with n_estimators set to 10 trees.
+
+# Step 2 - Fit Regressor
+from sklearn.ensemble import RandomForestRegressor
+regressor = RandomForestRegressor(n_estimators=10, random_state=0)
+regressor.fit(X, y)
+
+Step 3 - Visualize
+
+Let's plot the graph to look at the results for Random Forest Regression. As with Decision Trees, we have to plot over a fine grid of continuous points.
+
+# Step 3 - Visualize
+import matplotlib.pyplot as plt
+import numpy as np
+X_grid = np.arange(min(X), max(X), 0.01)
+X_grid = X_grid.reshape((len(X_grid), 1))
+
+plt.scatter(X, y, color="red")
+plt.plot(X_grid, regressor.predict(X_grid), color="blue")
+plt.title("Random Forest Regressor - 10 Trees")
+plt.xlabel("Position")
+plt.ylabel("Salaries")
+plt.show()
+
+Step 4: Predict Random Forest Regression Results
+
+We get a prediction of $167k
+
+# Step 4 - Predict
+regressor.predict([[6.5]])
+
+Step 5: Increase the number of trees to 100
+
+# Refit with 100 trees (imports from above are reused)
+regressor = RandomForestRegressor(n_estimators=100, random_state=0)
+regressor.fit(X, y)
+
+X_grid = np.arange(min(X), max(X), 0.01)
+X_grid = X_grid.reshape((len(X_grid), 1))
+plt.scatter(X, y, color="red")
+plt.plot(X_grid, regressor.predict(X_grid), color="blue")
+plt.title("Random Forest Regressor - 100 Trees")
+plt.xlabel("Position")
+plt.ylabel("Salaries")
+plt.show()
+
+regressor.predict([[6.5]])
+
+We get a prediction of $158k
+
+Step 6: Increase the number of trees to 300
+
+# Refit with 300 trees
+regressor = RandomForestRegressor(n_estimators=300, random_state=0)
+regressor.fit(X, y)
+
+X_grid = np.arange(min(X), max(X), 0.01)
+X_grid = X_grid.reshape((len(X_grid), 1))
+plt.scatter(X, y, color="red")
+plt.plot(X_grid, regressor.predict(X_grid), color="blue")
+plt.title("Random Forest Regressor - 300 Trees")
+plt.xlabel("Position")
+plt.ylabel("Salaries")
+plt.show()
+
+regressor.predict([[6.5]])
+
+We get a prediction of $160k
+
+So to compare our results with the previous regression models:
+Polynomial Regression gave a prediction of $158k
+Support Vector Regression gave a prediction of $170k
+Decision Tree Regression gave a prediction of $150k
+Random Forest Regression with 300 trees gave a prediction of $160k
\ No newline at end of file
diff --git a/project_6_random_forest_regression/random_forest.py b/project_6_random_forest_regression/random_forest.py
new file mode 100644
index 0000000..d0ffdfd
--- /dev/null
+++ b/project_6_random_forest_regression/random_forest.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Dec 2 11:54:30 2018
+
+@author: omairaasim
+"""
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("Position_Salaries.csv")
+X = dataset.iloc[:, 1:2].values
+y = dataset.iloc[:, 2].values
+
+# Step 2 - Fit Regressor
+from sklearn.ensemble import RandomForestRegressor
+regressor = RandomForestRegressor(n_estimators=100, random_state=0)
+regressor.fit(X, y)
+
+# Step 3 - Visualize
+import matplotlib.pyplot as plt
+import numpy as np
+X_grid = np.arange(min(X), max(X), 0.01)
+X_grid = X_grid.reshape((len(X_grid), 1))
+
+plt.scatter(X, y, color="red")
+plt.plot(X_grid, regressor.predict(X_grid), color="blue")
+plt.title("Random Forest Regressor - 100 Trees")
+plt.xlabel("Position")
+plt.ylabel("Salaries")
+plt.show()
+
+# Step 4 - Predict salary for level 6.5
+print(regressor.predict([[6.5]]))
\ No newline at end of file