Implementation of LODA (Lightweight On-line Detection of Anomalies) #1342

Open
wants to merge 1 commit into base: main
2 changes: 2 additions & 0 deletions river/anomaly/__init__.py
@@ -17,6 +17,7 @@
from .filter import QuantileFilter, ThresholdFilter
from .gaussian import GaussianScorer
from .hst import HalfSpaceTrees
from .loda import LODA
from .lof import LocalOutlierFactor
from .svm import OneClassSVM

@@ -29,4 +30,5 @@
"QuantileFilter",
"ThresholdFilter",
"LocalOutlierFactor",
"LODA",
]
113 changes: 113 additions & 0 deletions river/anomaly/loda.py
@@ -0,0 +1,113 @@
from __future__ import annotations

import math

import numpy as np

from river import anomaly, utils

__all__ = ["LODA"]


class LODA(anomaly.base.AnomalyDetector):
"""LODA (Lightweight on-line detector of anomalies)

LODA [^1] comprises a collection of one-dimensional histograms, each approximating
the probability density of the input data projected onto a single projection vector. Its output
on a sample is the negative average of the logarithms of the probabilities estimated on the individual projection vectors.

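Concretely, with projection vectors $w_1, \dots, w_k$ (where $k$ equals `n_random_cuts`) and $\hat{p}_i$
the histogram density estimated on the $i$-th projection, the score returned by `score_one` is, up to a
constant factor, $-\frac{1}{k} \sum_{i=1}^{k} \log \hat{p}_i(w_i^T x)$: higher scores indicate more
anomalous samples.
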
The paper shows that an ensemble of such very weak detectors can lead to a strong anomaly detector, with
performance equal to or even better than that of state-of-the-art methods.

This implementation within `River` is adapted from the versions implemented by the
[PyOD - Python Outlier Detection](https://pyod.readthedocs.io/en/latest/_modules/pyod/models/loda.html) and
[PySAD - Python Streaming Anomaly Detection](https://pysad.readthedocs.io/en/latest/_modules/pysad/models/loda.html)
frameworks.

Parameters
----------
n_bins
Number of bins of the histograms generated by the algorithm.
n_random_cuts
Number of random cuts, i.e. the number of random projections (and histograms) in the ensemble.

References
----------
[^1]: Pevný, T. (2015). LODA: Lightweight on-line detector of anomalies. Machine Learning, 102(2), 275–304.

Examples
--------

>>> import pandas as pd
>>> from river import anomaly
>>> from river import datasets

>>> cc_df = pd.DataFrame(datasets.CreditCard())

>>> loda = anomaly.LODA(n_bins=10, n_random_cuts=100)

>>> for x, _ in datasets.CreditCard().take(10_000):
... loda.learn_one(x)

>>> loda.n_features
30

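>>> # Score an observation that was not part of the 10,000 samples learned above.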
>>> loda.score_one(cc_df[0][10_001])
9.091044415623026e-16

"""

def __init__(self, n_bins=10, n_random_cuts=100):
self.n_bins = n_bins
self.n_random_cuts = n_random_cuts

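# Per-projection state, allocated lazily on the first call to `learn_one`
# (once the input dimensionality is known): one ensemble weight, one random
# projection vector, one histogram and its bin limits per random cut.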
self.weights = []
self.projections_ = []
self.histograms_ = []
self.limits_ = []
self.n_bins_ = []

self.n_features = 0
self.n_zero_features = 0
self.n_nonzero_features = 0

self.init = True

def learn_one(self, x):
x_np = utils.dict2numpy(x)

if self.init:
self.n_features = len(x)
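# As in the LODA paper, roughly sqrt(n_features) components of each
# projection vector are kept non-zero; the remaining components are zeroed
# out in the update loop below, yielding sparse random projections.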
self.n_nonzero_features = math.sqrt(self.n_features)
self.n_zero_features = self.n_features - np.int_(self.n_nonzero_features)

self.weights = np.ones(self.n_random_cuts) / self.n_random_cuts
self.projections_ = np.random.rand(self.n_random_cuts, self.n_features)
self.histograms_ = np.zeros((self.n_random_cuts, self.n_bins))
self.limits_ = np.zeros((self.n_random_cuts, self.n_bins + 1))

self.init = False

x_np = x_np.reshape(1, -1)

for i in range(self.n_random_cuts):
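# Zero out a random subset of this projection's components (sparse
# projection), project the incoming sample, and rebuild the histogram
# estimate of the projected density. A small constant is added to every
# bin so that `score_one` never takes the logarithm of zero.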
rands = np.random.permutation(self.n_features)[: self.n_zero_features]
self.projections_[i, rands] = 0
projected_data = self.projections_[i, :].dot(x_np.T)
self.histograms_[i, :], self.limits_[i, :] = np.histogram(
projected_data, bins=self.n_bins, density=False
)
self.histograms_[i, :] += 1e-12
self.histograms_[i, :] /= np.sum(self.histograms_[i, :])

def score_one(self, x):
x_np = utils.dict2numpy(x).reshape(1, -1)

pred_scores = np.zeros([x_np.shape[0], 1])
for i in range(self.n_random_cuts):
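# Project the sample, locate the histogram bin it falls into, and
# accumulate the negative weighted log-probability of that bin.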
projected_data = self.projections_[i, :].dot(x_np.T)
inds = np.searchsorted(self.limits_[i, : self.n_bins - 1], projected_data, side="left")
pred_scores[:, 0] += -self.weights[i] * np.log(self.histograms_[i, inds])
pred_scores /= self.n_random_cuts

return pred_scores.ravel().item()