# Source code for alibi_detect.cd.chisquare (ChiSquareDrift detector)

from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np
from scipy.stats import chi2_contingency

from alibi_detect.cd.base import BaseUnivariateDrift
from alibi_detect.utils.warnings import deprecated_alias

class ChiSquareDrift(BaseUnivariateDrift):
    """Chi-Squared data drift detector with Bonferroni or FDR correction for multivariate data."""

    @deprecated_alias(preprocess_x_ref='preprocess_at_init')
    def __init__(
            self,
            x_ref: Union[np.ndarray, list],
            p_val: float = .05,
            categories_per_feature: Optional[Dict[int, int]] = None,
            x_ref_preprocessed: bool = False,
            preprocess_at_init: bool = True,
            update_x_ref: Optional[Dict[str, int]] = None,
            preprocess_fn: Optional[Callable] = None,
            correction: str = 'bonferroni',
            n_features: Optional[int] = None,
            input_shape: Optional[tuple] = None,
            data_type: Optional[str] = None
    ) -> None:
        """
        Chi-Squared data drift detector with Bonferroni or False Discovery Rate (FDR)
        correction for multivariate data.

        Parameters
        ----------
        x_ref
            Data used as reference distribution.
        p_val
            p-value used for significance of the Chi-Squared test for each feature.
            If the FDR correction method is used, this corresponds to the acceptable q-value.
        categories_per_feature
            Optional dictionary with as keys the feature column index and as values the number
            of possible categorical values for that feature or a list with the possible values.
            If you know how many categories are present for a given feature you could pass this
            in the `categories_per_feature` dict in the Dict[int, int] format, e.g. {0: 3, 3: 2}.
            If you pass N categories this will assume the possible values for the feature are
            [0, ..., N-1]. You can also explicitly pass the possible categories in the
            Dict[int, List[int]] format, e.g. {0: [0, 1, 2], 3: [0, 55]}. Note that the
            categories can be arbitrary int values. If it is not specified,
            `categories_per_feature` is inferred from `x_ref`.
        x_ref_preprocessed
            Whether the given reference data `x_ref` has been preprocessed yet. If
            `x_ref_preprocessed=True`, only the test data `x` will be preprocessed at prediction
            time. If `x_ref_preprocessed=False`, the reference data will also be preprocessed.
        preprocess_at_init
            Whether to preprocess the reference data when the detector is instantiated.
            Otherwise, the reference data will be preprocessed at prediction time. Only applies
            if `x_ref_preprocessed=False`.
        update_x_ref
            Reference data can optionally be updated to the last n instances seen by the
            detector or via reservoir sampling with size n. For the former, the parameter
            equals {'last': n} while for reservoir sampling {'reservoir_sampling': n} is passed.
        preprocess_fn
            Function to preprocess the data before computing the data drift metrics.
            Typically a dimensionality reduction technique.
        correction
            Correction type for multivariate data. Either 'bonferroni' or 'fdr'
            (False Discovery Rate).
        n_features
            Number of features used in the Chi-Squared test. No need to pass it if no
            preprocessing takes place. In case of a preprocessing step, this can also be
            inferred automatically but could be more expensive to compute.
        input_shape
            Shape of input data.
        data_type
            Optionally specify the data type (tabular, image or time-series).
            Added to metadata.
        """
        super().__init__(
            x_ref=x_ref,
            p_val=p_val,
            x_ref_preprocessed=x_ref_preprocessed,
            preprocess_at_init=preprocess_at_init,
            update_x_ref=update_x_ref,
            preprocess_fn=preprocess_fn,
            correction=correction,
            n_features=n_features,
            input_shape=input_shape,
            data_type=data_type
        )
        # Set config
        self._set_config(locals())

        # Construct the per-feature category lists from the user-specified dict.
        if isinstance(categories_per_feature, dict):
            vals = list(categories_per_feature.values())
            int_types = (int, np.int16, np.int32, np.int64)
            if all(isinstance(v, int_types) for v in vals):
                # Dict[int, int] format: N categories -> possible values [0, ..., N-1].
                categories_per_feature = {f: list(np.arange(v))  # type: ignore
                                          for f, v in categories_per_feature.items()}
            elif not (all(isinstance(val, list) for val in vals) and
                      all(isinstance(v, int_types) for val in vals for v in val)):  # type: ignore
                # Fix: the original condition (`not all(... list ...) and all(... for v in val)`)
                # iterated over non-list values and raised TypeError instead of the intended
                # ValueError. Short-circuiting `not (all lists and all ints)` rejects every
                # dict that is not Dict[int, int] or Dict[int, List[int]] cleanly.
                raise ValueError('categories_per_feature needs to be None or one of '
                                 'Dict[int, int], Dict[int, List[int]]')
        else:
            # Infer the possible categories for each feature from the reference data.
            x_flat = self.x_ref.reshape(self.x_ref.shape[0], -1)
            categories_per_feature = {f: list(np.unique(x_flat[:, f]))  # type: ignore
                                      for f in range(self.n_features)}
        self.x_ref_categories = categories_per_feature

    def feature_score(self, x_ref: np.ndarray, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute Chi-Squared test statistic and p-values per feature.

        Parameters
        ----------
        x_ref
            Reference instances to compare distribution with.
        x
            Batch of instances.

        Returns
        -------
        Feature level p-values and Chi-Squared statistics.
        """
        x_ref = x_ref.reshape(x_ref.shape[0], -1)
        x = x.reshape(x.shape[0], -1)
        # Apply counts on the union of categories per variable in both the
        # reference and the test data, so both contingency rows align.
        x_categories = {f: list(np.unique(x[:, f])) for f in range(self.n_features)}
        all_categories = {f: list(set().union(self.x_ref_categories[f], x_categories[f]))  # type: ignore
                          for f in range(self.n_features)}
        x_ref_count = self._get_counts(x_ref, all_categories)
        x_count = self._get_counts(x, all_categories)
        p_val = np.zeros(self.n_features, dtype=np.float32)
        dist = np.zeros_like(p_val)
        for f in range(self.n_features):
            # Apply the Chi-Squared test to the 2 x n_categories contingency table.
            contingency_table = np.vstack((x_ref_count[f], x_count[f]))
            dist[f], p_val[f], _, _ = chi2_contingency(contingency_table)
        return p_val, dist

    def _get_counts(self, x: np.ndarray, categories: Dict[int, List[int]]) -> Dict[int, List[int]]:
        """Utility method for getting the counts of categories for each categorical variable."""
        return {f: [(x[:, f] == v).sum() for v in vals] for f, vals in categories.items()}