diff --git a/numpy_ml/linear_models/README.md b/numpy_ml/linear_models/README.md
index 743a57c..53ac166 100644
--- a/numpy_ml/linear_models/README.md
+++ b/numpy_ml/linear_models/README.md
@@ -8,6 +8,7 @@ The `lm.py` module implements:
 3. [Bayesian linear regression](https://en.wikipedia.org/wiki/Bayesian_linear_regression) with maximum a posteriori parameter estimates via [conjugacy](https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions)
     - Known coefficient prior mean and known error variance
     - Known coefficient prior mean and unknown error variance
+4. [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) with Gaussian feature likelihoods
 
 ## Plots

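Before the implementation below, here is a minimal usage sketch of the classifier this PR adds. It is illustrative only and not part of the diff: the synthetic data and variable names are made up, but the `fit` / `predict` / `parameters` interface is the one introduced in `naive_bayes.py`.

```python
# Minimal usage sketch (illustrative; assumes numpy-ml with this PR applied).
import numpy as np

from numpy_ml.linear_models import GaussianNBClassifier

X = np.random.randn(100, 5)             # 100 examples, 5 real-valued features
y = np.random.randint(0, 3, size=100)   # 3 class labels

model = GaussianNBClassifier(eps=1e-6)  # eps is added to the per-class feature variances
model.fit(X, y)

preds = model.predict(X)                # shape (100,), entries drawn from model.labels
print(model.parameters["mean"].shape)   # (3, 5): per-class feature means
print(model.parameters["prior"].sum())  # empirical class priors sum to 1.0
```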
diff --git a/numpy_ml/linear_models/__init__.py b/numpy_ml/linear_models/__init__.py
index b29e90b..a0a762a 100644
--- a/numpy_ml/linear_models/__init__.py
+++ b/numpy_ml/linear_models/__init__.py
@@ -1 +1,2 @@
 from .lm import *
+from .naive_bayes import *
diff --git a/numpy_ml/linear_models/naive_bayes.py b/numpy_ml/linear_models/naive_bayes.py
new file mode 100644
index 0000000..d174fd1
--- /dev/null
+++ b/numpy_ml/linear_models/naive_bayes.py
@@ -0,0 +1,211 @@
+import numpy as np
+
+
+class GaussianNBClassifier:
+    def __init__(self, eps=1e-6):
+        r"""
+        A naive Bayes classifier for real-valued data.
+
+        Notes
+        -----
+        The naive Bayes model assumes the features of each training example
+        :math:`\mathbf{x}` are mutually independent given the example label
+        :math:`y`:
+
+        .. math::
+
+            P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)
+
+        where :math:`M` is the dimension of the `i`th example
+        :math:`\mathbf{x}_i` and :math:`y_i` is the label associated with the
+        `i`th example.
+
+        Combining the conditional independence assumption with a simple
+        application of Bayes' theorem gives the naive Bayes classification
+        rule:
+
+        .. math::
+
+            \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
+                    &= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
+                    &= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)
+
+        In the final expression, the prior class probability :math:`P(y)` can
+        be specified in advance or estimated empirically from the training
+        data.
+
+        In the Gaussian version of the naive Bayes model, the feature
+        likelihood is assumed to be normally distributed for each class:
+
+        .. math::
+
+            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)
+
+        where :math:`\theta` is the set of model parameters :math:`\{\mu_1,
+        \Sigma_1, \ldots, \mu_K, \Sigma_K\}`, :math:`K` is the total number of
+        unique classes present in the data, and the parameters for the Gaussian
+        associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
+        (where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
+        training examples with label :math:`c`.
+
+        Parameters
+        ----------
+        eps : float
+            A value added to the variance to prevent numerical error. Default
+            is 1e-6.
+
+        Attributes
+        ----------
+        parameters : dict
+            Dictionary of model parameters: "mean", the `(K, M)` array of
+            feature means under each class, "sigma", the `(K, M)` array of
+            feature variances under each class, and "prior", the `(K,)` array
+            of empirical prior probabilities for each class label.
+        hyperparameters : dict
+            Dictionary of model hyperparameters.
+        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
+            An array containing the unique class labels for the training
+            examples.
+        """
+        self.labels = None
+        self.hyperparameters = {"eps": eps}
+        self.parameters = {
+            "mean": None,   # shape: (K, M)
+            "sigma": None,  # shape: (K, M)
+            "prior": None,  # shape: (K,)
+        }
+
+    def fit(self, X, y):
+        """
+        Fit the model parameters via maximum likelihood.
+
+        Notes
+        -----
+        The model parameters are stored in the :py:attr:`parameters` attribute.
+        The following keys are present:
+
+        mean : :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
+            Feature means for each of the `K` label classes
+        sigma : :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
+            Feature variances for each of the `K` label classes
+        prior : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
+            Prior probability of each of the `K` label classes, estimated
+            empirically from the training data
+
+        Parameters
+        ----------
+        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+            A dataset consisting of `N` examples, each of dimension `M`
+        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
+            The class label for each of the `N` examples in `X`
+
+        Returns
+        -------
+        self : object
+        """
+        P = self.parameters
+        H = self.hyperparameters
+
+        self.labels = np.unique(y)
+
+        K = len(self.labels)
+        N, M = X.shape
+
+        P["mean"] = np.zeros((K, M))
+        P["sigma"] = np.zeros((K, M))
+        P["prior"] = np.zeros((K,))
+
+        for i, c in enumerate(self.labels):
+            X_c = X[y == c, :]
+
+            P["mean"][i, :] = np.mean(X_c, axis=0)
+            P["sigma"][i, :] = np.var(X_c, axis=0) + H["eps"]
+            P["prior"][i] = X_c.shape[0] / N
+        return self
+
+    def predict(self, X):
+        """
+        Use the trained classifier to predict the class label for each example
+        in **X**.
+
+        Parameters
+        ----------
+        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+            A dataset of `N` examples, each of dimension `M`
+
+        Returns
+        -------
+        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
+            The predicted class labels for each example in `X`
+        """
+        return self.labels[self._log_posterior(X).argmax(axis=1)]
+
+    def _log_posterior(self, X):
+        r"""
+        Compute the (unnormalized) log posterior for each class.
+
+        Parameters
+        ----------
+        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+            A dataset of `N` examples, each of dimension `M`
+
+        Returns
+        -------
+        log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
+            Unnormalized log posterior probability of each class for each
+            example in `X`
+        """
+        K = len(self.labels)
+        log_posterior = np.zeros((X.shape[0], K))
+        for i in range(K):
+            log_posterior[:, i] = self._log_class_posterior(X, i)
+        return log_posterior
+
+    def _log_class_posterior(self, X, class_idx):
+        r"""
+        Compute the (unnormalized) log posterior for the label at index
+        `class_idx` in :py:attr:`labels`.
+
+        Notes
+        -----
+        The unnormalized log posterior for example :math:`\mathbf{x}_i` and
+        class :math:`c` is:
+
+        .. math::
+
+            \log P(y_i = c \mid \mathbf{x}_i, \theta)
+                &\propto \log P(y = c \mid \theta) +
+                    \log P(\mathbf{x}_i \mid y_i = c, \theta) \\
+                &\propto \log P(y = c \mid \theta) +
+                    \sum_{j=1}^M \log P(x_j \mid y_i = c, \theta)
+
+        In the Gaussian naive Bayes model, the feature likelihood for class
+        :math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)`, is assumed to
+        be normally distributed
+
+        .. math::
+
+            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)
+
+        Parameters
+        ----------
+        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+            A dataset of `N` examples, each of dimension `M`
+        class_idx : int
+            The index of the current class in :py:attr:`labels`
+
+        Returns
+        -------
+        log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
+            Unnormalized log probability of the label at index `class_idx`
+            in :py:attr:`labels` for each example in `X`
+        """
+        P = self.parameters
+        mu = P["mean"][class_idx]
+        prior = P["prior"][class_idx]
+        sigsq = P["sigma"][class_idx]
+
+        # log likelihood = sum_j log N(x_j | mu_j, sigsq_j)
+        log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))
+        log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)
+        return log_likelihood + np.log(prior)
diff --git a/numpy_ml/tests/test_naive_bayes.py b/numpy_ml/tests/test_naive_bayes.py
new file mode 100644
index 0000000..34780d5
--- /dev/null
+++ b/numpy_ml/tests/test_naive_bayes.py
@@ -0,0 +1,72 @@
+import numpy as np
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from sklearn import naive_bayes
+
+from numpy_ml.linear_models import GaussianNBClassifier
+from numpy_ml.utils.testing import random_tensor
+
+
+def test_GaussianNB(N=10):
+    np.random.seed(12345)
+    N = np.inf if N is None else N
+
+    i = 1
+    while i < N + 1:
+        n_ex = np.random.randint(1, 300)
+        n_feats = np.random.randint(1, 100)
+        n_classes = np.random.randint(2, 10)
+
+        X = random_tensor((n_ex, n_feats), standardize=True)
+        y = np.random.randint(0, n_classes, size=n_ex)
+
+        X_test = random_tensor((n_ex, n_feats), standardize=True)
+
+        NB = GaussianNBClassifier(eps=1e-09)
+        NB.fit(X, y)
+
+        preds = NB.predict(X_test)
+
+        sklearn_NB = naive_bayes.GaussianNB()
+        sklearn_NB.fit(X, y)
+
+        sk_preds = sklearn_NB.predict(X_test)
+
+        P = NB.parameters
+        for c in range(len(NB.labels)):
+            jointi = np.log(sklearn_NB.class_prior_[c])
+            jointi_mine = np.log(P["prior"][c])
+
+            np.testing.assert_almost_equal(jointi, jointi_mine)
+
+            n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[c, :]))
+            n_ij_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][c]))
+
+            np.testing.assert_almost_equal(n_ij_mine, n_ij)
+
+            n_ij2 = n_ij - 0.5 * np.sum(
+                ((X_test - sklearn_NB.theta_[c, :]) ** 2) / (sklearn_NB.sigma_[c, :]), 1
+            )
+
+            n_ij2_mine = n_ij_mine - 0.5 * np.sum(
+                ((X_test - P["mean"][c]) ** 2) / (P["sigma"][c]), 1
+            )
+            np.testing.assert_almost_equal(n_ij2_mine, n_ij2, decimal=4)
+
+            llh = jointi + n_ij2
+            llh_mine = jointi_mine + n_ij2_mine
+
+            np.testing.assert_almost_equal(llh_mine, llh, decimal=4)
+
+        np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
+        np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
+        np.testing.assert_almost_equal(P["sigma"], sklearn_NB.sigma_)
+        np.testing.assert_almost_equal(
+            sklearn_NB._joint_log_likelihood(X_test),
+            NB._log_posterior(X_test),
+            decimal=4,
+        )
+        np.testing.assert_almost_equal(preds, sk_preds)
+        print("PASSED")
+        i += 1
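As a sanity check on the log-posterior formula documented in `_log_class_posterior`, the sketch below recomputes the unnormalized log posterior directly from the fitted parameters with plain NumPy and compares it against `_log_posterior` and `predict`. It mirrors what the new test does against scikit-learn, but is illustrative only: the data is synthetic and the snippet is not part of the diff.

```python
# Standalone sanity check (illustrative, not part of the diff): recompute
# log P(y=c) + sum_j log N(x_j | mu_{c,j}, sigma^2_{c,j}) with plain NumPy
# and compare against GaussianNBClassifier._log_posterior.
import numpy as np

from numpy_ml.linear_models import GaussianNBClassifier

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
y = rng.randint(0, 3, size=50)

nb = GaussianNBClassifier().fit(X, y)
mean = nb.parameters["mean"]     # (K, M) per-class feature means
var = nb.parameters["sigma"]     # (K, M) per-class feature variances (+ eps)
prior = nb.parameters["prior"]   # (K,)  empirical class priors

# Gaussian log likelihood of every example under every class, shape (N, K)
log_lik = -0.5 * np.sum(np.log(2 * np.pi * var), axis=1)
log_lik = log_lik - 0.5 * np.sum(
    (X[:, None, :] - mean[None, :, :]) ** 2 / var[None, :, :], axis=2
)
log_post = log_lik + np.log(prior)[None, :]  # unnormalized log posterior, (N, K)

np.testing.assert_allclose(log_post, nb._log_posterior(X), rtol=1e-6)
assert (nb.labels[log_post.argmax(axis=1)] == nb.predict(X)).all()
```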