Implementation of LODA (Lightweight On-line Detection of Anomalies) #1342

Open
wants to merge 1 commit into base: main
2 changes: 2 additions & 0 deletions river/anomaly/__init__.py
@@ -17,6 +17,7 @@
from .filter import QuantileFilter, ThresholdFilter
from .gaussian import GaussianScorer
from .hst import HalfSpaceTrees
from .loda import LODA
from .lof import LocalOutlierFactor
from .svm import OneClassSVM

@@ -29,4 +30,5 @@
"QuantileFilter",
"ThresholdFilter",
"LocalOutlierFactor",
"LODA",
]
113 changes: 113 additions & 0 deletions river/anomaly/loda.py
@@ -0,0 +1,113 @@
from __future__ import annotations

import math

import numpy as np

from river import anomaly, utils

__all__ = ["LODA"]


class LODA(anomaly.base.AnomalyDetector):
"""LODA (Lightweight on-line detector of anomalies)

LODA [^1] comprises a collection of one-dimensional histograms, each approximating
the probability density of the input data projected onto a single projection vector. Its output
on a sample is the negative average of the logarithms of the probabilities estimated on the individual projection vectors.

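Concretely, with projection vectors $w_1, \dots, w_k$ (where $k$ equals `n_random_cuts`) and $\hat{p}_i$
the histogram density estimated on the $i$-th projection, the score returned by `score_one` is, up to a
constant factor, $-\frac{1}{k} \sum_{i=1}^{k} \log \hat{p}_i(w_i^T x)$: higher scores indicate more
anomalous samples.
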
The paper shows that an ensemble of such very weak detectors can lead to a strong anomaly detector, with
performance equal to or even better than that of state-of-the-art methods.

This implementation within `River` is adapted from the versions implemented by the
[PyOD - Python Outlier Detection](https://pyod.readthedocs.io/en/latest/_modules/pyod/models/loda.html) and
[PySAD - Python Streaming Anomaly Detection](https://pysad.readthedocs.io/en/latest/_modules/pysad/models/loda.html)
frameworks.

Parameters
----------
n_bins
Number of bins of the histograms generated by the algorithm.
n_random_cuts
Number of random cuts, i.e. the number of random projections (and histograms) in the ensemble.

References
----------
[^1]: Pevný, T. (2015). LODA: Lightweight on-line detector of anomalies. Machine Learning, 102(2), 275–304.

Examples
--------

>>> import pandas as pd
>>> from river import anomaly
>>> from river import datasets

>>> cc_df = pd.DataFrame(datasets.CreditCard())

>>> loda = anomaly.LODA(n_bins=10, n_random_cuts=100)

>>> for x, _ in datasets.CreditCard().take(10_000):
... loda.learn_one(x)

>>> loda.n_features
30

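>>> # Score an observation that was not part of the 10,000 samples learned above.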
>>> loda.score_one(cc_df[0][10_001])
9.091044415623026e-16

"""

def __init__(self, n_bins=10, n_random_cuts=100):
self.n_bins = n_bins
self.n_random_cuts = n_random_cuts

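# Per-projection state, allocated lazily on the first call to `learn_one`
# (once the input dimensionality is known): one ensemble weight, one random
# projection vector, one histogram and its bin limits per random cut.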
self.weights = []
self.projections_ = []
self.histograms_ = []
self.limits_ = []
self.n_bins_ = []

self.n_features = 0
self.n_zero_features = 0
self.n_nonzero_features = 0

self.init = True

def learn_one(self, x):
x_np = utils.dict2numpy(x)

if self.init:
self.n_features = len(x)
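# As in the LODA paper, roughly sqrt(n_features) components of each
# projection vector are kept non-zero; the remaining components are zeroed
# out in the update loop below, yielding sparse random projections.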
self.n_nonzero_features = math.sqrt(self.n_features)
self.n_zero_features = self.n_features - np.int_(self.n_nonzero_features)

self.weights = np.ones(self.n_random_cuts) / self.n_random_cuts
self.projections_ = np.random.rand(self.n_random_cuts, self.n_features)
self.histograms_ = np.zeros((self.n_random_cuts, self.n_bins))
self.limits_ = np.zeros((self.n_random_cuts, self.n_bins + 1))

self.init = False

x_np = x_np.reshape(1, -1)

for i in range(self.n_random_cuts):
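# Zero out a random subset of this projection's components (sparse
# projection), project the incoming sample, and rebuild the histogram
# estimate of the projected density. A small constant is added to every
# bin so that `score_one` never takes the logarithm of zero.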
rands = np.random.permutation(self.n_features)[: self.n_zero_features]
self.projections_[i, rands] = 0
projected_data = self.projections_[i, :].dot(x_np.T)
self.histograms_[i, :], self.limits_[i, :] = np.histogram(
projected_data, bins=self.n_bins, density=False
)
self.histograms_[i, :] += 1e-12
self.histograms_[i, :] /= np.sum(self.histograms_[i, :])

def score_one(self, x):
x_np = utils.dict2numpy(x).reshape(1, -1)

pred_scores = np.zeros([x_np.shape[0], 1])
for i in range(self.n_random_cuts):
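# Project the sample, locate the histogram bin it falls into, and
# accumulate the negative weighted log-probability of that bin.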
projected_data = self.projections_[i, :].dot(x_np.T)
inds = np.searchsorted(self.limits_[i, : self.n_bins - 1], projected_data, side="left")
pred_scores[:, 0] += -self.weights[i] * np.log(self.histograms_[i, inds])
pred_scores /= self.n_random_cuts

return pred_scores.ravel().item()