Source code for alibi_detect.od.prophet

import logging
from typing import Dict, List, Optional, Union

import pandas as pd
from prophet import Prophet

from alibi_detect.base import BaseDetector, FitMixin, outlier_prediction_dict

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)



[docs]
class OutlierProphet(BaseDetector, FitMixin):
    """
    Outlier detector for time series data built on a Prophet forecasting model.

    The detector fits a Prophet model on inlier data and flags an observation as
    an outlier when it falls outside the forecast's uncertainty interval.
    """

    def __init__(self,
                 threshold: float = .8,
                 growth: str = 'linear',
                 cap: Optional[float] = None,
                 holidays: Optional[pd.DataFrame] = None,
                 holidays_prior_scale: float = 10.,
                 country_holidays: Optional[str] = None,
                 changepoint_prior_scale: float = .05,
                 changepoint_range: float = .8,
                 seasonality_mode: str = 'additive',
                 daily_seasonality: Union[str, bool, int] = 'auto',
                 weekly_seasonality: Union[str, bool, int] = 'auto',
                 yearly_seasonality: Union[str, bool, int] = 'auto',
                 add_seasonality: Optional[List] = None,
                 seasonality_prior_scale: float = 10.,
                 uncertainty_samples: int = 1000,
                 mcmc_samples: int = 0
                 ) -> None:
        """
        Outlier detector for time series data using Prophet.
        See https://facebook.github.io/prophet/ for more details.

        Parameters
        ----------
        threshold
            Width of the uncertainty intervals of the forecast, used as outlier threshold.
            Equivalent to `interval_width`. If the instance lies outside of the uncertainty intervals,
            it is flagged as an outlier. If `mcmc_samples` equals 0, it is the uncertainty in the trend
            using the MAP estimate of the extrapolated model. If `mcmc_samples` >0, then uncertainty
            over all parameters is used.
        growth
            'linear' or 'logistic' to specify a linear or logistic trend.
        cap
            Growth cap in case growth equals 'logistic'. If not ``None``, it is added as a
            constant `cap` column to the data passed to the Prophet model.
        holidays
            pandas DataFrame with columns `holiday` (string) and `ds` (dates) and optionally
            columns `lower_window` and `upper_window` which specify a range of days around
            the date to be included as holidays.
        holidays_prior_scale
            Parameter controlling the strength of the holiday components model.
            Larger values allow the model to fit larger holiday effects, potentially leading to overfitting.
        country_holidays
            Include country-specific holidays via country abbreviations.
            The holidays for each country are provided by the holidays package in Python.
            A list of available countries and the country name to use is available on:
            https://github.com/dr-prodigy/python-holidays. Additionally, Prophet includes holidays for:
            Brazil (BR), Indonesia (ID), India (IN), Malaysia (MY), Vietnam (VN), Thailand (TH),
            Philippines (PH), Turkey (TU), Pakistan (PK), Bangladesh (BD), Egypt (EG), China (CN) and Russia (RU).
        changepoint_prior_scale
            Parameter controlling the flexibility of the automatic changepoint selection.
            Large values will allow many changepoints, potentially leading to overfitting.
        changepoint_range
            Proportion of history in which trend changepoints will be estimated.
            Higher values mean more changepoints, potentially leading to overfitting.
        seasonality_mode
            Either 'additive' or 'multiplicative'.
        daily_seasonality
            Can be 'auto', True, False, or a number of Fourier terms to generate.
        weekly_seasonality
            Can be 'auto', True, False, or a number of Fourier terms to generate.
        yearly_seasonality
            Can be 'auto', True, False, or a number of Fourier terms to generate.
        add_seasonality
            Manually add one or more seasonality components. Pass a list of dicts containing the keys
            `name`, `period`, `fourier_order` (obligatory), `prior_scale` and `mode` (optional).
        seasonality_prior_scale
            Parameter controlling the strength of the seasonality model. Larger values allow the model to
            fit larger seasonal fluctuations, potentially leading to overfitting.
        uncertainty_samples
            Number of simulated draws used to estimate uncertainty intervals.
        mcmc_samples
            If >0, will do full Bayesian inference with the specified number of MCMC samples.
            If 0, will do MAP estimation.
        """
        super().__init__()

        # initialize Prophet model; `threshold` is forwarded as Prophet's `interval_width`
        # TODO: add conditional seasonalities
        prophet_kwargs = {
            'growth': growth,
            'interval_width': threshold,
            'holidays': holidays,
            'holidays_prior_scale': holidays_prior_scale,
            'changepoint_prior_scale': changepoint_prior_scale,
            'changepoint_range': changepoint_range,
            'seasonality_mode': seasonality_mode,
            'daily_seasonality': daily_seasonality,
            'weekly_seasonality': weekly_seasonality,
            'yearly_seasonality': yearly_seasonality,
            'seasonality_prior_scale': seasonality_prior_scale,
            'uncertainty_samples': uncertainty_samples,
            'mcmc_samples': mcmc_samples
        }
        self.model = Prophet(**prophet_kwargs)

        # optional components are registered on the model after construction
        if country_holidays:
            self.model.add_country_holidays(country_name=country_holidays)
        if add_seasonality:
            for seasonality in add_seasonality:
                self.model.add_seasonality(**seasonality)
        self.cap = cap

        # set metadata
        self.meta['detector_type'] = 'outlier'
        self.meta['data_type'] = 'time-series'
        self.meta['online'] = False

    def _with_cap(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return the data to hand to the Prophet model, with the growth cap attached.

        When no cap is configured, `df` is returned unchanged. Otherwise a new
        DataFrame with a constant `cap` column is returned, so the caller's
        DataFrame is never modified.
        """
        if self.cap is None:
            return df
        return df.assign(cap=self.cap)

    def fit(self, df: pd.DataFrame) -> None:
        """
        Fit Prophet model on normal (inlier) data.

        Parameters
        ----------
        df
            Dataframe with columns `ds` with timestamps and `y` with target values.
            The DataFrame is not modified; the `cap` column (if any) is added to a copy.
        """
        self.model.fit(self._with_cap(df))

    def score(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute outlier scores.

        The score is the signed distance of `y` to the nearest bound of the forecast
        uncertainty interval: `y - yhat_upper` for observations at or above the
        forecast `yhat`, and `yhat_lower - y` for observations below it. A positive
        score therefore means the observation lies outside the interval.

        Parameters
        ----------
        df
            DataFrame with columns `ds` with timestamps and `y` with values which
            need to be flagged as outlier or not. The DataFrame is not modified.

        Returns
        -------
        Forecast DataFrame returned by the Prophet model, extended with a `y` column holding \
        the observed values and a `score` column holding the outlier score of each instance.
        """
        forecast = self.model.predict(self._with_cap(df))
        # NOTE(review): `y` is attached by position, which assumes the forecast rows come back
        # in the same order as the rows of `df` (presumably chronological by `ds`) - confirm
        # for inputs that are not sorted by `ds`.
        forecast['y'] = df['y'].values
        forecast['score'] = (
                (forecast['y'] - forecast['yhat_upper']) * (forecast['y'] >= forecast['yhat']) +
                (forecast['yhat_lower'] - forecast['y']) * (forecast['y'] < forecast['yhat'])
        )
        return forecast

    def predict(self,
                df: pd.DataFrame,
                return_instance_score: bool = True,
                return_forecast: bool = True
                ) -> Dict[str, Dict]:
        """
        Compute outlier scores and transform into outlier predictions.

        Parameters
        ----------
        df
            DataFrame with columns `ds` with timestamps and `y` with values which
            need to be flagged as outlier or not.
        return_instance_score
            Whether to return instance level outlier scores.
        return_forecast
            Whether to return the model forecast.

        Returns
        -------
        Dictionary containing ``'meta'`` and ``'data'`` dictionaries.
            - ``'meta'`` has the model's metadata.
            - ``'data'`` contains the outlier predictions (``'is_outlier'``) and, when requested, \
            the instance level outlier scores (``'instance_score'``) and the model forecast (``'forecast'``).
        """
        # compute outlier scores
        forecast = self.score(df)
        timestamps = df['ds'].values
        iscore = pd.DataFrame(data={
            'ds': timestamps,
            'instance_score': forecast['score']
        })

        # a positive score means the value lies outside the uncertainty interval
        outlier_pred = pd.DataFrame(data={
            'ds': timestamps,
            'is_outlier': (forecast['score'] > 0.).astype(int)
        })

        # populate output dict
        od = outlier_prediction_dict()
        od['meta'] = self.meta
        od['data']['is_outlier'] = outlier_pred
        if return_instance_score:
            od['data']['instance_score'] = iscore
        if return_forecast:
            od['data']['forecast'] = forecast
        return od