diff --git a/numpy_ml/linear_models/README.md b/numpy_ml/linear_models/README.md
index 743a57c..53ac166 100644
--- a/numpy_ml/linear_models/README.md
+++ b/numpy_ml/linear_models/README.md
@@ -8,6 +8,7 @@ The `lm.py` module implements:
3. [Bayesian linear regression](https://en.wikipedia.org/wiki/Bayesian_linear_regression) with maximum a posteriori parameter estimates via [conjugacy](https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions)
- Known coefficient prior mean and known error variance
- Known coefficient prior mean and unknown error variance
+4. [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) with Gaussian feature likelihoods, implemented in `naive_bayes.py` (see the usage sketch below)
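+
+A minimal usage sketch for the new classifier (`X_train`, `X_test`, and `y_train` are placeholder arrays: real-valued features of shape `(N, M)` and `(N', M)`, and integer labels of shape `(N,)`):
+
+```python
+from numpy_ml.linear_models import GaussianNBClassifier
+
+model = GaussianNBClassifier()   # `eps` (default 1e-6) is added to the per-class feature variances
+model.fit(X_train, y_train)      # estimate per-class means, variances, and empirical priors via MLE
+y_pred = model.predict(X_test)   # maximum a posteriori class label for each row of X_test
+```
+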
## Plots
diff --git a/numpy_ml/linear_models/__init__.py b/numpy_ml/linear_models/__init__.py
index b29e90b..a0a762a 100644
--- a/numpy_ml/linear_models/__init__.py
+++ b/numpy_ml/linear_models/__init__.py
@@ -1 +1,2 @@
from .lm import *
+from .naive_bayes import *
diff --git a/numpy_ml/linear_models/naive_bayes.py b/numpy_ml/linear_models/naive_bayes.py
new file mode 100644
index 0000000..d174fd1
--- /dev/null
+++ b/numpy_ml/linear_models/naive_bayes.py
@@ -0,0 +1,211 @@
+import numpy as np
+
+
+class GaussianNBClassifier:
+ def __init__(self, eps=1e-6):
+ r"""
+ A naive Bayes classifier for real-valued data.
+
+ Notes
+ -----
+ The naive Bayes model assumes the features of each training example
+ :math:`\mathbf{x}` are mutually independent given the example label
+ :math:`y`:
+
+ .. math::
+
+ P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)
+
+ where :math:`M` is the dimensionality (number of features) of the `i`th
+ example :math:`\mathbf{x}_i` and :math:`y_i` is the label associated
+ with the `i`th example.
+
+ Combining the conditional independence assumption with a simple
+ application of Bayes' theorem gives the naive Bayes classification
+ rule:
+
+ .. math::
+
+ \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
+ &= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
+ &= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)
+
+ In the final expression, the prior class probability :math:`P(y)` can
+ be specified in advance or estimated empirically from the training
+ data; this implementation estimates it empirically during `fit`.
+
+ In the Gaussian version of the naive Bayes model, the feature
+ likelihood is assumed to be normally distributed for each class:
+
+ .. math::
+
+ \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)
+
+ where :math:`\theta` is the set of model parameters: :math:`\{\mu_1,
+ \Sigma_1, \ldots, \mu_K, \Sigma_K\}`, :math:`K` is the total number of
+ unique classes present in the data, and the parameters for the Gaussian
+ associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
+ (where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
+ training examples with label :math:`c`.
+
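+ Because the features are assumed conditionally independent given the
+ class, :math:`\Sigma_c` is diagonal with entries :math:`\sigma^2_{c,1},
+ \ldots, \sigma^2_{c,M}`, and the decision rule implemented here reduces
+ to
+
+ .. math::
+
+ \hat{y}_i = \arg \max_c \left[ \log P(y = c) +
+ \sum_{j=1}^M \log \mathcal{N}(x_{i,j} \mid \mu_{c,j}, \sigma^2_{c,j}) \right]
+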
+ Parameters
+ ----------
+ eps : float
+ A value added to the variance to prevent numerical error. Default
+ is 1e-6.
+
+ Attributes
+ ----------
+ parameters : dict
+ Dictionary of model parameters: "mean", the `(K, M)` array of
+ feature means under each class, "sigma", the `(K, M)` array of
+ feature variances under each class, and "prior", the `(K,)` array of
+ empirical prior probabilities for each class label.
+ hyperparameters : dict
+ Dictionary of model hyperparameters
+ labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
+ An array containing the unique class labels for the training
+ examples.
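+
+ Examples
+ --------
+ A minimal usage sketch (here `X` is a hypothetical `(N, M)` array of
+ real-valued features and `y` a `(N,)` array of integer labels):
+
+ >>> import numpy as np
+ >>> X = np.random.rand(100, 5)
+ >>> y = np.random.randint(0, 3, size=100)
+ >>> model = GaussianNBClassifier()
+ >>> preds = model.fit(X, y).predict(X)
+ >>> preds.shape
+ (100,)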
+ """
+ self.labels = None
+ self.hyperparameters = {"eps": eps}
+ self.parameters = {
+ "mean": None, # shape: (K, M)
+ "sigma": None, # shape: (K, M)
+ "prior": None, # shape: (K,)
+ }
+
+ def fit(self, X, y):
+ """
+ Fit the model parameters via maximum likelihood.
+
+ Notes
+ -----
+ The model parameters are stored in the :py:attr:`parameters` attribute.
+ The following keys are present:
+
+ mean : :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
+ Feature means for each of the `K` label classes
+ sigma : :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
+ Feature variances for each of the `K` label classes
+ prior : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
+ Prior probability of each of the `K` label classes, estimated
+ empirically from the training data
+
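+ These are the per-class maximum likelihood estimates. Writing
+ :math:`N_c` for the number of training examples with label :math:`c`
+ and :math:`\epsilon` for the variance-smoothing constant `eps`,
+
+ .. math::
+
+ \mu_{c,j} = \frac{1}{N_c} \sum_{i : y_i = c} x_{i,j} \qquad
+ \sigma^2_{c,j} = \frac{1}{N_c} \sum_{i : y_i = c}
+ (x_{i,j} - \mu_{c,j})^2 + \epsilon \qquad
+ P(y = c) = \frac{N_c}{N}
+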
+ Parameters
+ ----------
+ X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+ A dataset consisting of `N` examples, each of dimension `M`
+ y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
+ The class label for each of the `N` examples in `X`
+
+ Returns
+ -------
+ self : object
+ """
+ P = self.parameters
+ H = self.hyperparameters
+
+ self.labels = np.unique(y)
+
+ K = len(self.labels)
+ N, M = X.shape
+
+ P["mean"] = np.zeros((K, M))
+ P["sigma"] = np.zeros((K, M))
+ P["prior"] = np.zeros((K,))
+
+ for i, c in enumerate(self.labels):
+ X_c = X[y == c, :]
+
+ P["mean"][i, :] = np.mean(X_c, axis=0)
+ P["sigma"][i, :] = np.var(X_c, axis=0) + H["eps"]
+ P["prior"][i] = X_c.shape[0] / N
+ return self
+
+ def predict(self, X):
+ """
+ Use the trained classifier to predict the class label for each example
+ in **X**.
+
+ Parameters
+ ----------
+ X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+ A dataset of `N` examples, each of dimension `M`
+
+ Returns
+ -------
+ labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
+ The predicted class labels for each example in `X`
+ """
+ # choose, for each example, the label with the highest (unnormalized)
+ # log posterior
+ return self.labels[self._log_posterior(X).argmax(axis=1)]
+
+ def _log_posterior(self, X):
+ r"""
+ Compute the (unnormalized) log posterior for each class.
+
+ Parameters
+ ----------
+ X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+ A dataset of `N` examples, each of dimension `M`
+
+ Returns
+ -------
+ log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
+ Unnormalized log posterior probability of each class for each
+ example in `X`
+ """
+ K = len(self.labels)
+ log_posterior = np.zeros((X.shape[0], K))
+ for i in range(K):
+ log_posterior[:, i] = self._log_class_posterior(X, i)
+ return log_posterior
+
+ def _log_class_posterior(self, X, class_idx):
+ r"""
+ Compute the (unnormalized) log posterior for the label at index
+ `class_idx` in :py:attr:`labels`.
+
+ Notes
+ -----
+ The unnormalized log posterior for example :math:`\mathbf{x}_i` and class
+ :math:`c` is
+
+ .. math::
+
+ \log P(y_i = c \mid \mathbf{x}_i, \theta)
+ &\propto \log P(y = c \mid \theta) +
+ \log P(\mathbf{x}_i \mid y_i = c, \theta) \\
+ &\propto \log P(y = c \mid \theta) +
+ \sum_{j=1}^M \log P(x_{i,j} \mid y_i = c, \theta)
+
+ In the Gaussian naive Bayes model, the feature likelihood for class
+ :math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)` is assumed to
+ be normally distributed
+
+ .. math::
+
+ \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)
+
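+ With a diagonal :math:`\Sigma_c = \text{diag}(\sigma^2_{c,1}, \ldots,
+ \sigma^2_{c,M})`, the log likelihood term computed below expands to
+
+ .. math::
+
+ \log P(\mathbf{x}_i \mid y_i = c, \theta) =
+ -\frac{1}{2} \sum_{j=1}^M \left[ \log(2 \pi \sigma^2_{c,j}) +
+ \frac{(x_{i,j} - \mu_{c,j})^2}{\sigma^2_{c,j}} \right]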
+
+ Parameters
+ ----------
+ X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+ A dataset of `N` examples, each of dimension `M`
+ class_idx : int
+ The index of the current class in :py:attr:`labels`
+
+ Returns
+ -------
+ log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
+ Unnormalized log probability of the label at index `class_idx`
+ in :py:attr:`labels` for each example in `X`
+ """
+ P = self.parameters
+ mu = P["mean"][class_idx]
+ prior = P["prior"][class_idx]
+ sigsq = P["sigma"][class_idx]
+
+ # log likelihood of each example under N(mu, diag(sigsq)) for this
+ # class, summed over the M conditionally independent features
+ log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))
+ log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)
+ return log_likelihood + np.log(prior)
diff --git a/numpy_ml/tests/test_naive_bayes.py b/numpy_ml/tests/test_naive_bayes.py
new file mode 100644
index 0000000..34780d5
--- /dev/null
+++ b/numpy_ml/tests/test_naive_bayes.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+from sklearn import naive_bayes
+
+from numpy_ml.linear_models import GaussianNBClassifier
+from numpy_ml.utils.testing import random_tensor
+
+
+def test_GaussianNB(N=10):
+ np.random.seed(12345)
+ N = np.inf if N is None else N
+
+ i = 1
+ while i < N + 1:
+ n_ex = np.random.randint(1, 300)
+ n_feats = np.random.randint(1, 100)
+ n_classes = np.random.randint(2, 10)
+
+ X = random_tensor((n_ex, n_feats), standardize=True)
+ y = np.random.randint(0, n_classes, size=n_ex)
+
+ X_test = random_tensor((n_ex, n_feats), standardize=True)
+
+ NB = GaussianNBClassifier(eps=1e-09)
+ NB.fit(X, y)
+
+ preds = NB.predict(X_test)
+
+ sklearn_NB = naive_bayes.GaussianNB()
+ sklearn_NB.fit(X, y)
+
+ sk_preds = sklearn_NB.predict(X_test)
+
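+ # Compare each term of the per-class joint log likelihood against the
+ # same quantity computed from sklearn's fitted parameters
+ # (class_prior_, theta_, sigma_), then check the full posteriors and
+ # the final predictions.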
+ for j in range(len(NB.labels)):  # use `j` so the outer while-loop counter `i` is not clobbered
+ P = NB.parameters
+ jointi = np.log(sklearn_NB.class_prior_[j])
+ jointi_mine = np.log(P["prior"][j])
+
+ np.testing.assert_almost_equal(jointi, jointi_mine)
+
+ n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[j, :]))
+ n_ij_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][j]))
+
+ np.testing.assert_almost_equal(n_ij_mine, n_ij)
+
+ n_ij2 = n_ij - 0.5 * np.sum(
+ ((X_test - sklearn_NB.theta_[j, :]) ** 2) / (sklearn_NB.sigma_[j, :]), 1
+ )
+
+ n_ij2_mine = n_ij_mine - 0.5 * np.sum(
+ ((X_test - P["mean"][j]) ** 2) / (P["sigma"][j]), 1
+ )
+ np.testing.assert_almost_equal(n_ij2_mine, n_ij2, decimal=4)
+
+ llh = jointi + n_ij2
+ llh_mine = jointi_mine + n_ij2_mine
+
+ np.testing.assert_almost_equal(llh_mine, llh, decimal=4)
+
+ np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
+ np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
+ np.testing.assert_almost_equal(P["sigma"], sklearn_NB.sigma_)
+ np.testing.assert_almost_equal(
+ sklearn_NB._joint_log_likelihood(X_test),
+ NB._log_posterior(X_test),
+ decimal=4,
+ )
+ np.testing.assert_almost_equal(preds, sk_preds)
+ print("PASSED")
+ i += 1