Add files via upload

omairaasim · Sep 6, 2019 · 37775e9 · 37775e9
1 parent 089d1ad
commit 37775e9
Show file tree

Hide file tree

Showing 64 changed files with 1,282 additions and 0 deletions.
diff --git a/project_1_simple_linear_regression/Salary_Data.csv b/project_1_simple_linear_regression/Salary_Data.csv
@@ -0,0 +1,31 @@
+YearsExperience,Salary
+1.1,39343.00
+1.3,46205.00
+1.5,37731.00
+2.0,43525.00
+2.2,39891.00
+2.9,56642.00
+3.0,60150.00
+3.2,54445.00
+3.2,64445.00
+3.7,57189.00
+3.9,63218.00
+4.0,55794.00
+4.0,56957.00
+4.1,57081.00
+4.5,61111.00
+4.9,67938.00
+5.1,66029.00
+5.3,83088.00
+5.9,81363.00
+6.0,93940.00
+6.8,91738.00
+7.1,98273.00
+7.9,101302.00
+8.2,113812.00
+8.7,109431.00
+9.0,105582.00
+9.5,116969.00
+9.6,112635.00
+10.3,122391.00
+10.5,121872.00
diff --git a/project_1_simple_linear_regression/img_1_dataset copy.png b/project_1_simple_linear_regression/img_1_dataset copy.png
diff --git a/project_1_simple_linear_regression/img_1_dataset.png b/project_1_simple_linear_regression/img_1_dataset.png
diff --git a/project_1_simple_linear_regression/img_2_x_y.png b/project_1_simple_linear_regression/img_2_x_y.png
diff --git a/project_1_simple_linear_regression/img_3_train_test_data.png b/project_1_simple_linear_regression/img_3_train_test_data.png
diff --git a/project_1_simple_linear_regression/img_4_compare_results.png b/project_1_simple_linear_regression/img_4_compare_results.png
diff --git a/project_1_simple_linear_regression/img_5_plot_test.png b/project_1_simple_linear_regression/img_5_plot_test.png
diff --git a/project_1_simple_linear_regression/img_5_plot_training.png b/project_1_simple_linear_regression/img_5_plot_training.png
diff --git a/project_1_simple_linear_regression/project_1 b/project_1_simple_linear_regression/project_1
@@ -0,0 +1,131 @@
+I'm basically writing this blog for myself because I've been wanting to learn Machine Learning for a while now but have never really got to it. So this blog is more like a journal for me to write about my daily progress - (hopefully I will be making some progress every day).
+
+#100DaysOfMLCode #100ProjectsInML
+
+The best approach for me to learn anything is by working on sample projects. No matter how simple the project is, it helps me better understand the concepts. So I will be working through some small mini projects as part of this learning journey.
+
+There are hundreds of excellent resources out there to help you get started. I stumbled upon this A-Z Machine Learning course on Udemy and I'll be walking through those examples in the first few weeks.
+
+
+
+Today I'll be going through "Simple Linear Regression"
+
+Dataset
+First, let's look at the dataset. It is Salary_Data.csv and can be found here
+It has 2 columns - "Years of Experience" and "Salary" for 30 employees in a company. So in this example, we will train a Simple Linear Regression model to learn the correlation between the number of years of experience of each employee and their respective salary. Once the model is trained, we will be able to do some sample predictions.   
+
+Below is a sample screenshot of the dataset.
+
+
+So let's get started.
+
+Step 1: Load the Dataset
+
+Below is the code snippet for loading the dataset. 
+We will be using the pandas dataframe.
+Here X is the independent variable which is the "Years of Experience"
+and y is the dependent variable which is the "Salary"
+
+So for X, we specify dataset.iloc[:, :-1].values
+which simply means take all rows and all columns except last one
+
+And for y, we specify dataset.iloc[:, 1].values
+which simply means take all rows and only the column with index 1. In Python, indexes begin at 0, so index 1 here is the second column, which is Salary.
+
+# Step 1 Load Data
+import pandas as pd
+dataset = pd.read_csv('Salary_Data.csv')
+X = dataset.iloc[:, :-1].values
+y = dataset.iloc[:,1].values
+
+Below is the sample screenshot of X and y
+
+Step 2: Split dataset into training set and test set
+
+Next we have to split the dataset into training and testing. We will use the training dataset for training the model and then check the performance of the model on the test dataset.
+
+For this we will use the train_test_split function from the sklearn.model_selection module.
+We are providing a test_size of 1/3 which means test set will contain 10 observations and training set will contain 20 observations
+The random_state=0 is required only if you want to compare your results with mine.
+
+# Step 2: Split data into training and testing
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)
+
+Below is the sample screenshot of X_train, y_train, X_test and y_test
+
+Step 3: Fit Simple Linear Regression model to training set
+
+This is a very simple step. We will be using the LinearRegression class from the library sklearn.linear_model. First we create an object of the LinearRegression class and call the fit method passing the X_train and y_train. 
+
+# Step 3: Fit Simple Linear Regression to Training Data
+from sklearn.linear_model import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+
+Step 4: Predict the test set
+Using the regressor we trained in the previous step, we will now use it to predict the results of the test set and compare the predicted values with the actual values.
+
+# Step 4: Make Prediction
+y_pred = regressor.predict(X_test)
+
+Now we have the y_pred which are the predicted values from our Model and y_test which are the actual values. 
+Let us compare and see how well our model did. As you can see from the screenshot below, our basic model did pretty well.
+
+If we take the first employee - the actual salary is 37731 and our model predicted 40835.1 - which is not too bad. There are some predictions that are off but some are pretty close.
+
+Step 5 - Visualizing the training set
+
+Let's visualize the results.
+First we'll plot the actual data points of the training set - X_train and y_train
+plt.scatter(X_train, y_train, color = 'red')
+
+Next we'll plot the regression line - which is the predicted values for the X_train
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+
+# Step 5 - Visualize training set results
+import matplotlib.pyplot as plt
+# plot the actual data points of training set
+plt.scatter(X_train, y_train, color = 'red')
+# plot the regression line
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Training set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+
+Step 6 - Visualizing the test set
+
+Let's visualize the results.
+First we'll plot the actual data points of the test set - X_test and y_test
+plt.scatter(X_test, y_test, color = 'red')
+
+Next we'll plot the regression line - which is the same as above
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+
+# Step 6 - Visualize test set results
+import matplotlib.pyplot as plt
+# plot the actual data points of test set
+plt.scatter(X_test, y_test, color = 'red')
+# plot the regression line
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Test set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+Step 7 - Make new predictions
+We can also make brand new predictions for data points that do not exist in the dataset.
+For example, for a person with 15 years of experience:
+
+new_salary_pred = regressor.predict([[15]])
+
+# Step 7 - Make new prediction
+new_salary_pred = regressor.predict([[15]])
+
+Here is the full source code
+
+
+
diff --git a/project_1_simple_linear_regression/simple_linear_regression.py b/project_1_simple_linear_regression/simple_linear_regression.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep  1 19:14:35 2019
+@author: omairaasim
+"""
+
+# Step 1 Load Data
+import pandas as pd
+dataset = pd.read_csv('Salary_Data.csv')
+X = dataset.iloc[:, :-1].values
+y = dataset.iloc[:,1].values
+
+# Step 2: Split data into training and testing
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)
+
+# Step 3: Fit Simple Linear Regression to Training Data
+from sklearn.linear_model import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+# Step 4: Make Prediction
+y_pred = regressor.predict(X_test)
+
+# Step 5 - Visualize training set results
+import matplotlib.pyplot as plt
+# plot the actual data points of training set
+plt.scatter(X_train, y_train, color = 'red')
+# plot the regression line
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Training set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+# Step 6 - Visualize test set results
+import matplotlib.pyplot as plt
+# plot the actual data points of test set
+plt.scatter(X_test, y_test, color = 'red')
+# plot the regression line (same as above)
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs Experience (Test set)')
+plt.xlabel('Years of Experience')
+plt.ylabel('Salary')
+plt.show()
+
+# Step 7 - Make new prediction
+new_salary_pred = regressor.predict([[15]])
diff --git a/project_2_multiple_linear_regression/50_Startups.csv b/project_2_multiple_linear_regression/50_Startups.csv
@@ -0,0 +1,51 @@
+R&D Spend,Administration,Marketing Spend,State,Profit
+165349.2,136897.8,471784.1,New York,192261.83
+162597.7,151377.59,443898.53,California,191792.06
+153441.51,101145.55,407934.54,Florida,191050.39
+144372.41,118671.85,383199.62,New York,182901.99
+142107.34,91391.77,366168.42,Florida,166187.94
+131876.9,99814.71,362861.36,New York,156991.12
+134615.46,147198.87,127716.82,California,156122.51
+130298.13,145530.06,323876.68,Florida,155752.6
+120542.52,148718.95,311613.29,New York,152211.77
+123334.88,108679.17,304981.62,California,149759.96
+101913.08,110594.11,229160.95,Florida,146121.95
+100671.96,91790.61,249744.55,California,144259.4
+93863.75,127320.38,249839.44,Florida,141585.52
+91992.39,135495.07,252664.93,California,134307.35
+119943.24,156547.42,256512.92,Florida,132602.65
+114523.61,122616.84,261776.23,New York,129917.04
+78013.11,121597.55,264346.06,California,126992.93
+94657.16,145077.58,282574.31,New York,125370.37
+91749.16,114175.79,294919.57,Florida,124266.9
+86419.7,153514.11,0,New York,122776.86
+76253.86,113867.3,298664.47,California,118474.03
+78389.47,153773.43,299737.29,New York,111313.02
+73994.56,122782.75,303319.26,Florida,110352.25
+67532.53,105751.03,304768.73,Florida,108733.99
+77044.01,99281.34,140574.81,New York,108552.04
+64664.71,139553.16,137962.62,California,107404.34
+75328.87,144135.98,134050.07,Florida,105733.54
+72107.6,127864.55,353183.81,New York,105008.31
+66051.52,182645.56,118148.2,Florida,103282.38
+65605.48,153032.06,107138.38,New York,101004.64
+61994.48,115641.28,91131.24,Florida,99937.59
+61136.38,152701.92,88218.23,New York,97483.56
+63408.86,129219.61,46085.25,California,97427.84
+55493.95,103057.49,214634.81,Florida,96778.92
+46426.07,157693.92,210797.67,California,96712.8
+46014.02,85047.44,205517.64,New York,96479.51
+28663.76,127056.21,201126.82,Florida,90708.19
+44069.95,51283.14,197029.42,California,89949.14
+20229.59,65947.93,185265.1,New York,81229.06
+38558.51,82982.09,174999.3,California,81005.76
+28754.33,118546.05,172795.67,California,78239.91
+27892.92,84710.77,164470.71,Florida,77798.83
+23640.93,96189.63,148001.11,California,71498.49
+15505.73,127382.3,35534.17,New York,69758.98
+22177.74,154806.14,28334.72,California,65200.33
+1000.23,124153.04,1903.93,New York,64926.08
+1315.46,115816.21,297114.46,Florida,49490.75
+0,135426.92,0,California,42559.73
+542.05,51743.15,0,New York,35673.41
+0,116983.8,45173.06,California,14681.4
diff --git a/project_2_multiple_linear_regression/img_10_p_value_2.png b/project_2_multiple_linear_regression/img_10_p_value_2.png
diff --git a/project_2_multiple_linear_regression/img_11_p_value_3.png b/project_2_multiple_linear_regression/img_11_p_value_3.png
diff --git a/project_2_multiple_linear_regression/img_12_p_value_4.png b/project_2_multiple_linear_regression/img_12_p_value_4.png
diff --git a/project_2_multiple_linear_regression/img_1_dataset.png b/project_2_multiple_linear_regression/img_1_dataset.png
diff --git a/project_2_multiple_linear_regression/img_2_x_y.png b/project_2_multiple_linear_regression/img_2_x_y.png
diff --git a/project_2_multiple_linear_regression/img_3_convert_text_to_num.png b/project_2_multiple_linear_regression/img_3_convert_text_to_num.png
diff --git a/project_2_multiple_linear_regression/img_4_dummy_variables.png b/project_2_multiple_linear_regression/img_4_dummy_variables.png
diff --git a/project_2_multiple_linear_regression/img_5_dummy_org.png b/project_2_multiple_linear_regression/img_5_dummy_org.png
diff --git a/project_2_multiple_linear_regression/img_6_train_test_data.png b/project_2_multiple_linear_regression/img_6_train_test_data.png
diff --git a/project_2_multiple_linear_regression/img_7_compare_results.png b/project_2_multiple_linear_regression/img_7_compare_results.png
diff --git a/project_2_multiple_linear_regression/img_8_add_ones.png b/project_2_multiple_linear_regression/img_8_add_ones.png
diff --git a/project_2_multiple_linear_regression/img_9_p_value_1.png b/project_2_multiple_linear_regression/img_9_p_value_1.png
diff --git a/project_2_multiple_linear_regression/multiple_linear_regression.py b/project_2_multiple_linear_regression/multiple_linear_regression.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov 30 19:45:38 2018
+
+@author: omairaasim
+"""
+
+# Step 1 - Load Data
+import pandas as pd
+dataset = pd.read_csv("50_Startups.csv")
+X = dataset.iloc[:,:-1].values
+y = dataset.iloc[:,4].values
+
+# Step 2 - Encode Categorical Data
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+labelEncoder_X = LabelEncoder()
+X[:,3] = labelEncoder_X.fit_transform(X[:,3])
+
+oneHotEncoder = OneHotEncoder(categorical_features=[3])
+X = oneHotEncoder.fit_transform(X).toarray()
+
+# Step 3 - Dummy Trap
+X = X[:,1:]
+
+# Step 4 - Split Data
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
+
+# Step 5 - Fit Regressor
+from sklearn.linear_model import LinearRegression
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+# Step 6 - Predict
+y_pred = regressor.predict(X_test)
+
+# Add ones
+import numpy as np
+ones = np.ones(shape = (50,1), dtype=int)
+X = np.append(arr = ones, values= X, axis=1)
+
+# Backward Elimination
+import statsmodels.formula.api as sm
+X_opt = X[:,[0,1,2,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,1,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,3,4,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,3,5]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
+
+X_opt = X[:,[0,3]]
+regressor_OLS = sm.OLS(endog = y, exog=X_opt).fit()
+regressor_OLS.summary()
diff --git a/project_2_multiple_linear_regression/multiple_linear_regression_banner.png b/project_2_multiple_linear_regression/multiple_linear_regression_banner.png