refresh

the-mrinal · the-mrinal · commit 96430e133d11 · 2018-05-16T17:53:13.000+05:30
diff --git a/15. NaiveBayes/.ipynb_checkpoints/NaiveBayesMine-checkpoint.ipynb b/15. NaiveBayes/.ipynb_checkpoints/NaiveBayesMine-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/15. NaiveBayes/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/15. NaiveBayes/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/15. NaiveBayes/NaiveBayesMine.ipynb b/15. NaiveBayes/NaiveBayesMine.ipynb
@@ -0,0 +1,228 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fit(X_train,Y_train):\n",
+    "    result = {}\n",
+    "    class_values = set(Y_train)\n",
+    "    for curr_value in class_values:\n",
+    "        result[curr_value] = {}\n",
+    "        result[\"total_data\"] = len(Y_train)\n",
+    "        curr_class_rows = (Y_train == curr_value)\n",
+    "        X_train_curr = X_train[curr_class_rows]\n",
+    "        Y_train_curr = Y_train[curr_class_rows]\n",
+    "        num_features = X_train.shape[1]\n",
+    "        result[curr_value][\"total_count\"] = len(Y_train_curr)\n",
+    "        for j in range(1,num_features+1):\n",
+    "            result[curr_value][j] = {}\n",
+    "            all_possible_values = set(X_train[:,j-1])\n",
+    "            for this_value in all_possible_values:\n",
+    "                result[curr_value][j][this_value] = (X_train_curr[:,j-1]==this_value).sum()\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def probablity(dictionary,x,current_class):\n",
+    "    output= np.log(dictionary[current_class][\"total_count\"])-np.log(dictionary[\"total_data\"])\n",
+    "    num_features = len(dictionary[current_class].keys())-1;\n",
+    "    for j in range(1,num_features+1):\n",
+    "        xj = x[j-1]\n",
+    "        count_current_class_with_value_xj = dictionary[current_class][j][xj] + 1 \n",
+    "        count_current_class = dictionary[current_class][\"total_count\"] + len(dictionary[current_class][j].keys())\n",
+    "        current_xj_prob = np.log(count_current_class_with_value_xj) -np.log(count_current_class)\n",
+    "        output = output + current_xj_prob\n",
+    "    return output "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predictSinglePoint(dictionary,x):\n",
+    "    classes = dictionary.keys()\n",
+    "    best_p = -1000\n",
+    "    best_class = -1\n",
+    "    first_run = True\n",
+    "    for current_class in classes:\n",
+    "        if(current_class == \"total_data\"):\n",
+    "            continue\n",
+    "        p_curr_class = probablity(dictionary,x,current_class)\n",
+    "        if(first_run or p_curr_class > best_p):\n",
+    "            best_p = p_curr_class\n",
+    "            best_class = current_class\n",
+    "        first_run = False\n",
+    "    return best_class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(dictionary,X_test):\n",
+    "    Y_pred = []\n",
+    "    for x in X_test:\n",
+    "        x_class = predictSinglePoint(dictionary,x)\n",
+    "        Y_pred.append(x_class)\n",
+    "    return Y_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def makelabelled(column):\n",
+    "    second_limit = column.mean()\n",
+    "    first_limit = 0.5 * second_limit\n",
+    "    third_limit = 1.5 * second_limit\n",
+    "    for i in range(0,len(column)):\n",
+    "        if(column[i]<first_limit):\n",
+    "            column[i] = 0\n",
+    "        elif(column[i] < second_limit):\n",
+    "            column[i] = 1\n",
+    "        elif(column[i]<third_limit):\n",
+    "            column[i] = 2\n",
+    "        else:\n",
+    "            column[i] = 3\n",
+    "    return column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import datasets\n",
+    "iris = datasets.load_iris()\n",
+    "x = iris.data\n",
+    "y = iris.target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(0,x.shape[-1]):\n",
+    "    x[:,i] = makelabelled(x[:,i])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import model_selection\n",
+    "X_train,X_test,Y_train,Y_test = model_selection.train_test_split(x,y,test_size=0.25,random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dictionary = fit(X_train,Y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y_pred = predict(dictionary,X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "             precision    recall  f1-score   support\n",
+      "\n",
+      "          0       1.00      1.00      1.00        13\n",
+      "          1       0.94      1.00      0.97        16\n",
+      "          2       1.00      0.89      0.94         9\n",
+      "\n",
+      "avg / total       0.98      0.97      0.97        38\n",
+      "\n",
+      "[[13  0  0]\n",
+      " [ 0 16  0]\n",
+      " [ 0  1  8]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import classification_report,confusion_matrix\n",
+    "print(classification_report(Y_test,Y_pred))\n",
+    "print(confusion_matrix(Y_test,Y_pred))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Implementation of Multinomial Naive Bayes from Scratch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/15. NaiveBayes/Untitled.ipynb b/15. NaiveBayes/Untitled.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/17. textClassificationProject/textClassification.ipynb b/17. textClassificationProject/textClassification.ipynb