# Source code for alibi_detect.od.isolationforest

import logging
from typing import Any, Dict, Optional, Union

import numpy as np
from sklearn.ensemble import IsolationForest

from alibi_detect.base import BaseDetector, FitMixin, ThresholdMixin, outlier_prediction_dict

logger = logging.getLogger(__name__)



class IForest(BaseDetector, FitMixin, ThresholdMixin):
    """
    Outlier detector for tabular data using isolation forests.

    Wraps a scikit-learn `IsolationForest` and uses its negated decision function as the
    outlier score. Instances whose score exceeds `threshold` are predicted to be outliers.
    """

    def __init__(self,
                 threshold: Optional[float] = None,
                 n_estimators: int = 100,
                 max_samples: Union[str, int, float] = 'auto',
                 max_features: Union[int, float] = 1.,
                 bootstrap: bool = False,
                 n_jobs: int = 1,
                 data_type: str = 'tabular'
                 ) -> None:
        """
        Outlier detector for tabular data using isolation forests.

        Parameters
        ----------
        threshold
            Threshold used for outlier score to determine outliers. If left as ``None`` it has to be
            set with `infer_threshold` before calling `predict`.
        n_estimators
            Number of base estimators in the ensemble.
        max_samples
            Number of samples to draw from the training data to train each base estimator.
            If int, draw 'max_samples' samples.
            If float, draw 'max_samples * number of samples' samples.
            If 'auto', max_samples = min(256, number of samples)
        max_features
            Number of features to draw from the training data to train each base estimator.
            If int, draw 'max_features' features.
            If float, draw 'max_features * number of features' features.
        bootstrap
            Whether to fit individual trees on random subsets of the training data, sampled with replacement.
        n_jobs
            Number of jobs to run in parallel for 'fit' and 'predict'.
        data_type
            Optionally specify the data type (tabular, image or time-series). Added to metadata.
        """
        super().__init__()

        if threshold is None:
            logger.warning('No threshold level set. Need to infer threshold using `infer_threshold`.')

        self.threshold = threshold
        self.isolationforest = IsolationForest(n_estimators=n_estimators,
                                               max_samples=max_samples,
                                               max_features=max_features,
                                               bootstrap=bootstrap,
                                               n_jobs=n_jobs)

        # set metadata
        self.meta['detector_type'] = 'outlier'
        self.meta['data_type'] = data_type
        self.meta['online'] = False

    def fit(self,
            X: np.ndarray,
            sample_weight: Optional[np.ndarray] = None
            ) -> None:
        """
        Fit isolation forest.

        Parameters
        ----------
        X
            Training batch.
        sample_weight
            Sample weights. Passed through unchanged to the underlying isolation forest.
        """
        self.isolationforest.fit(X, sample_weight=sample_weight)

    def infer_threshold(self,
                        X: np.ndarray,
                        threshold_perc: float = 95.
                        ) -> None:
        """
        Update threshold by a value inferred from the percentage of instances considered to be
        outliers in a sample of the dataset.

        Parameters
        ----------
        X
            Batch of instances.
        threshold_perc
            Percentage of X considered to be normal based on the outlier score.
        """
        # compute outlier scores
        iscore = self.score(X)

        # the new threshold is the `threshold_perc`-th percentile of the scores, so that
        # share of X falls at or below it
        self.threshold = np.percentile(iscore, threshold_perc)

    def score(self, X: np.ndarray) -> np.ndarray:
        """
        Compute outlier scores.

        Parameters
        ----------
        X
            Batch of instances to analyze.

        Returns
        -------
        Array with outlier scores for each instance in the batch.
        """
        # negate the decision function so that larger scores are the ones `predict`
        # flags as outliers (it tests `score > threshold`)
        return -self.isolationforest.decision_function(X)

    def predict(self,
                X: np.ndarray,
                return_instance_score: bool = True
                ) -> Dict[str, Dict[str, Any]]:
        """
        Compute outlier scores and transform into outlier predictions.

        Parameters
        ----------
        X
            Batch of instances.
        return_instance_score
            Whether to return instance level outlier scores.

        Returns
        -------
        Dictionary containing ``'meta'`` and ``'data'`` dictionaries.
            - ``'meta'`` has the model's metadata.
            - ``'data'`` contains the outlier predictions and instance level outlier scores.

        Raises
        ------
        TypeError
            If no threshold has been set, neither at initialisation nor via `infer_threshold`.
        """
        # compute outlier scores
        iscore = self.score(X)

        # checked after scoring so that any error raised while scoring still takes precedence
        if self.threshold is None:
            raise TypeError('No threshold set. Pass `threshold` at initialisation or call '
                            '`infer_threshold` before `predict`.')

        # values above threshold are outliers
        outlier_pred = (iscore > self.threshold).astype(int)

        # populate output dict
        od = outlier_prediction_dict()
        od['meta'] = self.meta
        od['data']['is_outlier'] = outlier_pred
        if return_instance_score:
            od['data']['instance_score'] = iscore
        return od