# Source file: river/anomaly/loda.py
#
# Companion change in river/anomaly/__init__.py: import the detector with
# `from .loda import LODA` (between the `.hst` and `.lof` imports) and append
# "LODA" to that module's `__all__`.
from __future__ import annotations

import math

import numpy as np

from river import anomaly, utils

__all__ = ["LODA"]


class LODA(anomaly.base.AnomalyDetector):
    """LODA (Lightweight on-line detector of anomalies).

    LODA [^1] comprises a collection of one-dimensional histograms, each approximating
    the probability density of the input data projected onto a single sparse random
    projection vector. Its output on a sample is the average, over all projections, of
    the negative logarithm of the probability estimated by the matching histogram, so
    that higher scores indicate more anomalous samples.

    LODA shows that an ensemble of very weak detectors can lead to a very strong anomaly
    detector with performance equal to or even better than state-of-the-art methods.

    This implementation within `River` is adapted from the versions implemented by the
    [PyOD - Python Outlier Detection](https://pyod.readthedocs.io/en/latest/_modules/pyod/models/loda.html) and
    [PySAD - Python Streaming Anomaly Detection](https://pysad.readthedocs.io/en/latest/_modules/pysad/models/loda.html)
    frameworks.

    Implementation notes:

    - The projection vectors are drawn once, when the first sample is seen, from a
      standard normal distribution; all but `int(sqrt(n_features))` randomly chosen
      entries of each vector are then set to zero.
    - Each histogram has `n_bins` equal-width bins. Its range starts as
      `[v - 0.5, v + 0.5]` around the first projected value `v` and is widened whenever
      a projected value falls outside of it. On widening, the accumulated counts are
      redistributed proportionally to the overlap between old and new bins, which keeps
      the total mass but is only an approximation of the true histogram.
    - Features are matched by their position in the sample dictionary, so every sample
      must list the same features in the same order as the first one.

    Parameters
    ----------
    n_bins
        Number of bins of the histograms generated by the algorithm.
    n_random_cuts
        Number of random cuts, i.e. of random projections and associated histograms.
    seed
        Random seed used to draw the projection vectors.

    References
    ----------
    [^1] Pevný, T. 2015. LODA: Lightweight on-line detector of anomalies. Machine Learning. 102, 2 (2015), 275–304.

    Examples
    --------

    >>> from river import anomaly

    >>> loda = anomaly.LODA(n_bins=10, n_random_cuts=100, seed=42)

    >>> for _ in range(100):
    ...     loda.learn_one({"a": 1.0, "b": 2.0})

    >>> loda.n_features
    2

    >>> loda.score_one({"a": 1.0, "b": 2.0}) < loda.score_one({"a": 100.0, "b": 200.0})
    True

    """

    def __init__(self, n_bins: int = 10, n_random_cuts: int = 100, seed: int | None = None):
        if n_bins < 1:
            raise ValueError("n_bins must be a positive integer")

        self.n_bins = n_bins
        self.n_random_cuts = n_random_cuts
        self.seed = seed

        self.weights = []
        self.projections_ = []
        self.histograms_ = []
        self.limits_ = []
        self.n_bins_ = []

        self.n_features = 0
        self.n_zero_features = 0
        self.n_nonzero_features = 0

        # True until the first non-empty sample has been learned.
        self.init = True

        self._rng = np.random.default_rng(seed)
        # Raw (unnormalised) bin counts; `histograms_` holds the smoothed probabilities.
        self._counts = None

    def learn_one(self, x):
        """Update every projection histogram with the sample `x`.

        Empty samples are ignored. A `ValueError` is raised when `x` contains
        non-finite values.
        """
        x_np = utils.dict2numpy(x).reshape(-1)

        # An empty sample carries no information and must not freeze `n_features` at 0.
        if x_np.size == 0:
            return

        if not np.isfinite(x_np).all():
            raise ValueError("LODA requires finite feature values")

        if self.init:
            self._setup(x_np)

        projected = self.projections_ @ x_np
        for i in range(self.n_random_cuts):
            self._add_to_histogram(i, projected[i])

    def score_one(self, x):
        """Return the anomaly score of `x`; higher means more anomalous.

        The score is the weighted average of `-log(p)` over all projections, where `p`
        is the smoothed probability of the bin the projected sample falls into. Values
        outside a histogram's range are assigned to its nearest edge bin. `0.0` is
        returned while no sample has been learned yet.
        """
        if self.init:
            return 0.0

        x_np = utils.dict2numpy(x).reshape(-1)
        projected = self.projections_ @ x_np

        neg_log_probs = np.empty(self.n_random_cuts)
        for i in range(self.n_random_cuts):
            bin_idx = self._bin_index(self.limits_[i], projected[i])
            neg_log_probs[i] = -np.log(self.histograms_[i, bin_idx])

        # The weights already sum to one, so this dot product is the average.
        return float(self.weights @ neg_log_probs)

    def _setup(self, x_np):
        """Draw the sparse projections and allocate the histograms from the first sample."""
        self.n_features = x_np.size
        self.n_nonzero_features = math.sqrt(self.n_features)
        self.n_zero_features = self.n_features - int(self.n_nonzero_features)

        self.weights = np.ones(self.n_random_cuts) / self.n_random_cuts

        # The projections are sparsified exactly once: zeroing a new random subset on
        # every update would eventually wipe out every projection vector.
        self.projections_ = self._rng.standard_normal((self.n_random_cuts, self.n_features))
        for row in self.projections_:
            zeroed = self._rng.permutation(self.n_features)[: self.n_zero_features]
            row[zeroed] = 0.0

        # Initial range of width 1 centred on the first projected value, which is the
        # range `np.histogram` picks for a single value.
        first = self.projections_ @ x_np
        unit_edges = np.linspace(-0.5, 0.5, self.n_bins + 1)
        self.limits_ = first[:, None] + unit_edges[None, :]

        self._counts = np.zeros((self.n_random_cuts, self.n_bins))
        self.histograms_ = np.full((self.n_random_cuts, self.n_bins), 1.0 / self.n_bins)

        self.init = False

    def _add_to_histogram(self, i, value):
        """Add one projected value to histogram `i`, widening its range if required."""
        counts = self._counts[i]
        edges = self.limits_[i]

        if value < edges[0] or value > edges[-1]:
            new_edges = np.linspace(
                min(edges[0], value), max(edges[-1], value), self.n_bins + 1
            )
            # Interpolating the cumulative counts at the new edges redistributes each
            # old bin proportionally to its overlap with the new bins.
            cumulative = np.concatenate(([0.0], np.cumsum(counts)))
            counts[:] = np.diff(np.interp(new_edges, edges, cumulative))
            edges[:] = new_edges

        counts[self._bin_index(edges, value)] += 1.0

        # Additive smoothing keeps every probability strictly positive for the log.
        smoothed = counts + 1e-12
        self.histograms_[i] = smoothed / smoothed.sum()

    @staticmethod
    def _bin_index(edges, value):
        """Return the bin of `value`, clamping out-of-range values to the edge bins."""
        # Counting the interior edges that are <= value yields k for a value in
        # [edges[k], edges[k + 1]) and never leaves [0, n_bins - 1].
        return int(np.searchsorted(edges[1:-1], value, side="right"))