From d08aff1e9ec1f8be403d069b2ab8a74cfdf58a46 Mon Sep 17 00:00:00 2001 From: Anuragsarkar12 Date: Tue, 21 May 2024 20:59:54 +0530 Subject: [PATCH] Performed feature selection --- .../diabetesclassification-checkpoint.ipynb | 213 +++++++++++------- .../diabetesclassification.ipynb | 213 +++++++++++------- 2 files changed, 258 insertions(+), 168 deletions(-) diff --git a/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb b/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb index 0f854e2..fc0e11b 100644 --- a/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb +++ b/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 2, "id": "312c95a1", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 3, "id": "aea1b45b", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 4, "id": "2ac5c17b", "metadata": {}, "outputs": [ @@ -275,7 +275,7 @@ "[5132 rows x 11 columns]" ] }, - "execution_count": 38, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -286,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 5, "id": "c4484360", "metadata": {}, "outputs": [ @@ -408,7 +408,7 @@ "4 4 50 F 24 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 39, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 6, "id": "ea7c6dcb", "metadata": {}, "outputs": [ @@ -437,7 +437,7 @@ "(5132, 11)" ] }, - "execution_count": 40, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -456,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 41, + 
"execution_count": 7, "id": "a3c2ff78", "metadata": {}, "outputs": [ @@ -499,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 8, "id": "a9cc7614", "metadata": {}, "outputs": [], @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 9, "id": "cdee3c52", "metadata": { "scrolled": true @@ -554,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 10, "id": "2f353609", "metadata": {}, "outputs": [ @@ -572,7 +572,7 @@ "Name: Age, dtype: float64" ] }, - "execution_count": 44, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -583,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 11, "id": "b7887949", "metadata": {}, "outputs": [ @@ -601,7 +601,7 @@ "Name: BMI, dtype: float64" ] }, - "execution_count": 45, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -612,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 12, "id": "fdf468b3", "metadata": {}, "outputs": [ @@ -630,7 +630,7 @@ "Name: Chol, dtype: float64" ] }, - "execution_count": 46, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -641,7 +641,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 13, "id": "7877549d", "metadata": {}, "outputs": [ @@ -659,7 +659,7 @@ "Name: TG, dtype: float64" ] }, - "execution_count": 47, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -670,7 +670,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 14, "id": "71ec5fcb", "metadata": {}, "outputs": [ @@ -688,7 +688,7 @@ "Name: HDL, dtype: float64" ] }, - "execution_count": 48, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 15, "id": "bbd3682c", "metadata": {}, "outputs": [ @@ -717,7 +717,7 @@ 
"Name: LDL, dtype: float64" ] }, - "execution_count": 49, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 16, "id": "c4603e52", "metadata": {}, "outputs": [ @@ -746,7 +746,7 @@ "Name: Cr, dtype: float64" ] }, - "execution_count": 50, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -757,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 17, "id": "f98b84c7", "metadata": {}, "outputs": [ @@ -775,7 +775,7 @@ "Name: BUN, dtype: float64" ] }, - "execution_count": 51, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -794,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 18, "id": "a9add765", "metadata": {}, "outputs": [ @@ -822,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 19, "id": "752e448c", "metadata": {}, "outputs": [], @@ -835,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 20, "id": "a8441035", "metadata": {}, "outputs": [], @@ -846,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 21, "id": "378ecf72", "metadata": {}, "outputs": [ @@ -892,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 22, "id": "39477c2a", "metadata": {}, "outputs": [ @@ -953,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 23, "id": "651dd699", "metadata": {}, "outputs": [], @@ -969,7 +969,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 24, "id": "e1801370", "metadata": {}, "outputs": [], @@ -979,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 25, "id": "eb6127cc", "metadata": {}, "outputs": [ @@ -1089,7 +1089,7 @@ "4 50 24.0 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 59, + "execution_count": 25, 
"metadata": {}, "output_type": "execute_result" } @@ -1100,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 26, "id": "12304d13", "metadata": {}, "outputs": [], @@ -1111,7 +1111,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 27, "id": "dfd08af6", "metadata": {}, "outputs": [], @@ -1144,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 28, "id": "237cd728", "metadata": {}, "outputs": [ @@ -1154,7 +1154,7 @@ "['Chol', 'Cr', 'BUN']" ] }, - "execution_count": 62, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1173,17 +1173,17 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 29, "id": "40e0c236", "metadata": {}, "outputs": [], "source": [ - "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)" + "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 30, "id": "6f5836ec", "metadata": {}, "outputs": [ @@ -1338,7 +1338,7 @@ "[5132 rows x 6 columns]" ] }, - "execution_count": 64, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1357,33 +1357,51 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 40, "id": "67e27ab3", "metadata": {}, + "outputs": [], + "source": [ + "y_f= data_new_kbest['Diagnosis']\n", + "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "param_grid = {\n", + " 'n_estimators': [100, 200, 500],\n", + " 'max_depth': [3, 5, 8],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': [5, 'sqrt', 'log2']\n", + "}\n", + "search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5) \n", 
+ "search.fit(X_train, y_train)\n", + "best_model = search.best_estimator_\n", + "best_score = search.best_score_\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "476a4ae1", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.8198636806231743\n", - "[[536 68]\n", - " [117 306]]\n" + "Best Model: RandomForestClassifier(max_depth=8, max_features='log2', min_samples_leaf=2,\n", + " n_estimators=200)\n", + "Best Score: 0.8285018270401949\n" ] } ], "source": [ - "y_f= data_new_kbest['Diagnosis']\n", - "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n", - "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "rfc = RandomForestClassifier(n_estimators=100) \n", - "rfc.fit(X_train, y_train)\n", - "y_pred = rfc.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(\"Accuracy:\", accuracy)\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "print(cm)\n", - "\n" + "\n", + "print(\"Best Model:\", search.best_estimator_)\n", + "print(\"Best Score:\", search.best_score_)\n" ] }, { @@ -1391,14 +1409,16 @@ "id": "8e6b9595", "metadata": {}, "source": [ - "Applying a random forest classifier on the dataset obtained by using f-statistic in selectkbest, we also bobserve the model accuracy and the confusion matrix" + "Applying a random forest classifier along with hyperparameter tuning on the dataset obtained by using f-statistic in selectkbest, we also observe the model accuracy " ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 32, "id": "f63ffb4d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -1433,7 +1453,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 33, "id": "22dbb42e", "metadata": {}, "outputs": [], @@ -1443,7 +1463,7 @@ }, { "cell_type": "code", - 
"execution_count": 75, + "execution_count": 42, "id": "d96d1966", "metadata": {}, "outputs": [ @@ -1634,7 +1654,7 @@ "[5132 rows x 9 columns]" ] }, - "execution_count": 75, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -1653,32 +1673,49 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 43, "id": "084425a3", "metadata": {}, + "outputs": [], + "source": [ + "y_rfecv= data_new_rfecv['Diagnosis']\n", + "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "param_grid = {\n", + " 'n_estimators': [100, 200, 500],\n", + " 'max_depth': [3, 5, 8],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': [5, 'sqrt', 'log2']\n", + "}\n", + "search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5) \n", + "search.fit(X_train, y_train)\n", + "best_model = search.best_estimator_\n", + "best_score = search.best_score_\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "8ce4d337", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.818889970788705\n", - "[[539 65]\n", - " [121 302]]\n" + "Best Model: RandomForestClassifier(max_depth=5, max_features='log2', n_estimators=200)\n", + "Best Score: 0.8294762484774665\n" ] } ], "source": [ - "y_rfecv= data_new_rfecv['Diagnosis']\n", - "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n", - "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "rfc = RandomForestClassifier(n_estimators=100) \n", - "rfc.fit(X_train, y_train)\n", - "y_pred = rfc.predict(X_test)\n", - "accuracy = 
accuracy_score(y_test, y_pred)\n", - "print(\"Accuracy:\", accuracy)\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "print(cm)" + "print(\"Best Model:\", search.best_estimator_)\n", + "print(\"Best Score:\", search.best_score_)\n" ] }, { @@ -1686,7 +1723,7 @@ "id": "a52ed238", "metadata": {}, "source": [ - "Applying a random forest classifier on the dataset obtained by using RFECV, we also bobserve the model accuracy and the confusion matrix" + "Applying a random forest classifier along with hyperparameter tuning on the dataset obtained by using RFECV, we also observe the model accuracy " ] }, { @@ -1694,16 +1731,24 @@ "id": "59349ad6", "metadata": {}, "source": [ - "Since, we observe that the accuracy score of the model trained on datset generated by selectkbest is slightly higher that that trained by dataset generate by rfecv, hence we will use the 5 features selected by selectkbest as they have the most significant relationship with diagnosis" + "Thus, we are able to compare the accuracy of two rfc models with hyperparameter tuning trained on datasets developed by applying two different feature selection methods" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "82916004", "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb0a472", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/Diabetes Classification/diabetesclassification.ipynb b/Diabetes Classification/diabetesclassification.ipynb index 0f854e2..fc0e11b 100644 --- a/Diabetes Classification/diabetesclassification.ipynb +++ b/Diabetes Classification/diabetesclassification.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 2, "id": "312c95a1", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 3, "id": "aea1b45b", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ 
}, { "cell_type": "code", - "execution_count": 38, + "execution_count": 4, "id": "2ac5c17b", "metadata": {}, "outputs": [ @@ -275,7 +275,7 @@ "[5132 rows x 11 columns]" ] }, - "execution_count": 38, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -286,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 5, "id": "c4484360", "metadata": {}, "outputs": [ @@ -408,7 +408,7 @@ "4 4 50 F 24 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 39, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 6, "id": "ea7c6dcb", "metadata": {}, "outputs": [ @@ -437,7 +437,7 @@ "(5132, 11)" ] }, - "execution_count": 40, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -456,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 7, "id": "a3c2ff78", "metadata": {}, "outputs": [ @@ -499,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 8, "id": "a9cc7614", "metadata": {}, "outputs": [], @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 9, "id": "cdee3c52", "metadata": { "scrolled": true @@ -554,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 10, "id": "2f353609", "metadata": {}, "outputs": [ @@ -572,7 +572,7 @@ "Name: Age, dtype: float64" ] }, - "execution_count": 44, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -583,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 11, "id": "b7887949", "metadata": {}, "outputs": [ @@ -601,7 +601,7 @@ "Name: BMI, dtype: float64" ] }, - "execution_count": 45, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -612,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 12, "id": "fdf468b3", 
"metadata": {}, "outputs": [ @@ -630,7 +630,7 @@ "Name: Chol, dtype: float64" ] }, - "execution_count": 46, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -641,7 +641,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 13, "id": "7877549d", "metadata": {}, "outputs": [ @@ -659,7 +659,7 @@ "Name: TG, dtype: float64" ] }, - "execution_count": 47, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -670,7 +670,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 14, "id": "71ec5fcb", "metadata": {}, "outputs": [ @@ -688,7 +688,7 @@ "Name: HDL, dtype: float64" ] }, - "execution_count": 48, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 15, "id": "bbd3682c", "metadata": {}, "outputs": [ @@ -717,7 +717,7 @@ "Name: LDL, dtype: float64" ] }, - "execution_count": 49, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 16, "id": "c4603e52", "metadata": {}, "outputs": [ @@ -746,7 +746,7 @@ "Name: Cr, dtype: float64" ] }, - "execution_count": 50, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -757,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 17, "id": "f98b84c7", "metadata": {}, "outputs": [ @@ -775,7 +775,7 @@ "Name: BUN, dtype: float64" ] }, - "execution_count": 51, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -794,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 18, "id": "a9add765", "metadata": {}, "outputs": [ @@ -822,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 19, "id": "752e448c", "metadata": {}, "outputs": [], @@ -835,7 +835,7 @@ }, { "cell_type": "code", - 
"execution_count": 54, + "execution_count": 20, "id": "a8441035", "metadata": {}, "outputs": [], @@ -846,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 21, "id": "378ecf72", "metadata": {}, "outputs": [ @@ -892,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 22, "id": "39477c2a", "metadata": {}, "outputs": [ @@ -953,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 23, "id": "651dd699", "metadata": {}, "outputs": [], @@ -969,7 +969,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 24, "id": "e1801370", "metadata": {}, "outputs": [], @@ -979,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 25, "id": "eb6127cc", "metadata": {}, "outputs": [ @@ -1089,7 +1089,7 @@ "4 50 24.0 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 59, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1100,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 26, "id": "12304d13", "metadata": {}, "outputs": [], @@ -1111,7 +1111,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 27, "id": "dfd08af6", "metadata": {}, "outputs": [], @@ -1144,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 28, "id": "237cd728", "metadata": {}, "outputs": [ @@ -1154,7 +1154,7 @@ "['Chol', 'Cr', 'BUN']" ] }, - "execution_count": 62, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1173,17 +1173,17 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 29, "id": "40e0c236", "metadata": {}, "outputs": [], "source": [ - "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)" + "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 30, "id": "6f5836ec", "metadata": {}, "outputs": [ @@ -1338,7 +1338,7 @@ 
"[5132 rows x 6 columns]" ] }, - "execution_count": 64, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1357,33 +1357,51 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 40, "id": "67e27ab3", "metadata": {}, + "outputs": [], + "source": [ + "y_f= data_new_kbest['Diagnosis']\n", + "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "param_grid = {\n", + " 'n_estimators': [100, 200, 500],\n", + " 'max_depth': [3, 5, 8],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': [5, 'sqrt', 'log2']\n", + "}\n", + "search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5) \n", + "search.fit(X_train, y_train)\n", + "best_model = search.best_estimator_\n", + "best_score = search.best_score_\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "476a4ae1", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.8198636806231743\n", - "[[536 68]\n", - " [117 306]]\n" + "Best Model: RandomForestClassifier(max_depth=8, max_features='log2', min_samples_leaf=2,\n", + " n_estimators=200)\n", + "Best Score: 0.8285018270401949\n" ] } ], "source": [ - "y_f= data_new_kbest['Diagnosis']\n", - "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n", - "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "rfc = RandomForestClassifier(n_estimators=100) \n", - "rfc.fit(X_train, y_train)\n", - "y_pred = rfc.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(\"Accuracy:\", accuracy)\n", - "cm = confusion_matrix(y_test, y_pred)\n", - 
"print(cm)\n", - "\n" + "\n", + "print(\"Best Model:\", search.best_estimator_)\n", + "print(\"Best Score:\", search.best_score_)\n" ] }, { @@ -1391,14 +1409,16 @@ "id": "8e6b9595", "metadata": {}, "source": [ - "Applying a random forest classifier on the dataset obtained by using f-statistic in selectkbest, we also bobserve the model accuracy and the confusion matrix" + "Applying a random forest classifier along with hyperparameter tuning on the dataset obtained by using f-statistic in selectkbest, we also observe the model accuracy " ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 32, "id": "f63ffb4d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -1433,7 +1453,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 33, "id": "22dbb42e", "metadata": {}, "outputs": [], @@ -1443,7 +1463,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 42, "id": "d96d1966", "metadata": {}, "outputs": [ @@ -1634,7 +1654,7 @@ "[5132 rows x 9 columns]" ] }, - "execution_count": 75, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -1653,32 +1673,49 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 43, "id": "084425a3", "metadata": {}, + "outputs": [], + "source": [ + "y_rfecv= data_new_rfecv['Diagnosis']\n", + "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "param_grid = {\n", + " 'n_estimators': [100, 200, 500],\n", + " 'max_depth': [3, 5, 8],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': [5, 'sqrt', 'log2']\n", + "}\n", + "search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5) \n", + 
"search.fit(X_train, y_train)\n", + "best_model = search.best_estimator_\n", + "best_score = search.best_score_\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "8ce4d337", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.818889970788705\n", - "[[539 65]\n", - " [121 302]]\n" + "Best Model: RandomForestClassifier(max_depth=5, max_features='log2', n_estimators=200)\n", + "Best Score: 0.8294762484774665\n" ] } ], "source": [ - "y_rfecv= data_new_rfecv['Diagnosis']\n", - "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n", - "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "rfc = RandomForestClassifier(n_estimators=100) \n", - "rfc.fit(X_train, y_train)\n", - "y_pred = rfc.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(\"Accuracy:\", accuracy)\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "print(cm)" + "print(\"Best Model:\", search.best_estimator_)\n", + "print(\"Best Score:\", search.best_score_)\n" ] }, { @@ -1686,7 +1723,7 @@ "id": "a52ed238", "metadata": {}, "source": [ - "Applying a random forest classifier on the dataset obtained by using RFECV, we also bobserve the model accuracy and the confusion matrix" + "Applying a random forest classifier along with hyperparameter tuning on the dataset obtained by using RFECV, we also observe the model accuracy " ] }, { @@ -1694,16 +1731,24 @@ "id": "59349ad6", "metadata": {}, "source": [ - "Since, we observe that the accuracy score of the model trained on datset generated by selectkbest is slightly higher that that trained by dataset generate by rfecv, hence we will use the 5 features selected by selectkbest as they have the most significant relationship with diagnosis" + "Thus, we are able to compare the accuracy of two rfc models with hyperparameter tuning trained on datasets 
developed by applying two different feature selection methods" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "82916004", "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb0a472", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {