import logging
import numpy as np
from sklearn.ensemble import IsolationForest
from typing import Dict, Union
from alibi_detect.base import BaseDetector, FitMixin, ThresholdMixin, outlier_prediction_dict
logger = logging.getLogger(__name__)
class IForest(BaseDetector, FitMixin, ThresholdMixin):
    """
    Outlier detector for tabular data using isolation forests.

    Wraps a scikit-learn ``IsolationForest``. The outlier score of an instance is the
    negated ``decision_function`` output, and instances whose score lies above
    ``threshold`` are flagged as outliers.
    """

    def __init__(self,
                 threshold: Union[float, None] = None,
                 n_estimators: int = 100,
                 max_samples: Union[str, int, float] = 'auto',
                 max_features: Union[int, float] = 1.,
                 bootstrap: bool = False,
                 n_jobs: int = 1,
                 data_type: str = 'tabular'
                 ) -> None:
        """
        Outlier detector for tabular data using isolation forests.

        Parameters
        ----------
        threshold
            Threshold used for outlier score to determine outliers. If None, it has to be
            set (e.g. via `infer_threshold`) before `predict` can be called.
        n_estimators
            Number of base estimators in the ensemble.
        max_samples
            Number of samples to draw from the training data to train each base estimator.
            If int, draw 'max_samples' samples.
            If float, draw 'max_samples * number of samples' samples.
            If 'auto', max_samples = min(256, number of samples)
        max_features
            Number of features to draw from the training data to train each base estimator.
            If int, draw 'max_features' features.
            If float, draw 'max_features * number of features' features.
        bootstrap
            Whether to fit individual trees on random subsets of the training data, sampled with replacement.
        n_jobs
            Number of jobs to run in parallel for 'fit' and 'predict'.
        data_type
            Optionally specify the data type (tabular, image or time-series). Added to metadata.
        """
        super().__init__()

        if threshold is None:
            logger.warning('No threshold level set. Need to infer threshold using `infer_threshold`.')

        self.threshold = threshold
        self.isolationforest = IsolationForest(n_estimators=n_estimators,
                                               max_samples=max_samples,
                                               max_features=max_features,
                                               bootstrap=bootstrap,
                                               n_jobs=n_jobs)

        # set metadata
        self.meta['detector_type'] = 'outlier'
        self.meta['data_type'] = data_type
        self.meta['online'] = False

    def fit(self,
            X: np.ndarray,
            sample_weight: Union[np.ndarray, None] = None
            ) -> None:
        """
        Fit isolation forest.

        Parameters
        ----------
        X
            Training batch.
        sample_weight
            Sample weights. Passed through unchanged to the underlying isolation forest.
        """
        self.isolationforest.fit(X, sample_weight=sample_weight)

    def infer_threshold(self,
                        X: np.ndarray,
                        threshold_perc: float = 95.
                        ) -> None:
        """
        Update threshold by a value inferred from the percentage of instances considered to be
        outliers in a sample of the dataset.

        Parameters
        ----------
        X
            Batch of instances.
        threshold_perc
            Percentage of X considered to be normal based on the outlier score.
        """
        # compute outlier scores
        iscore = self.score(X)

        # the threshold is the score below which `threshold_perc` percent of X falls
        self.threshold = np.percentile(iscore, threshold_perc)

    def score(self, X: np.ndarray) -> np.ndarray:
        """
        Compute outlier scores.

        Parameters
        ----------
        X
            Batch of instances to analyze.

        Returns
        -------
        Array with outlier scores for each instance in the batch. The decision function of the
        isolation forest is negated so that higher scores correspond to more outlying instances.
        """
        return - self.isolationforest.decision_function(X)

    def predict(self,
                X: np.ndarray,
                return_instance_score: bool = True
                ) -> Dict[str, dict]:
        """
        Compute outlier scores and transform into outlier predictions.

        Parameters
        ----------
        X
            Batch of instances.
        return_instance_score
            Whether to return instance level outlier scores.

        Returns
        -------
        Dictionary containing ``'meta'`` and ``'data'`` dictionaries.
            - ``'meta'`` has the model's metadata.
            - ``'data'`` contains the outlier predictions and instance level outlier scores.

        Raises
        ------
        TypeError
            If no threshold has been set yet.
        """
        # fail early with an actionable message instead of an opaque comparison error
        if self.threshold is None:
            raise TypeError('No threshold level set. Need to infer threshold using `infer_threshold` '
                            'before calling `predict`.')

        # compute outlier scores
        iscore = self.score(X)

        # values above threshold are outliers
        outlier_pred = (iscore > self.threshold).astype(int)

        # populate output dict
        od = outlier_prediction_dict()
        od['meta'] = self.meta
        od['data']['is_outlier'] = outlier_pred
        if return_instance_score:
            od['data']['instance_score'] = iscore
        return od