import numpy as np
from scipy.stats import ks_2samp
from typing import Callable, Dict, Optional, Tuple, Union
from alibi_detect.cd.base import BaseUnivariateDrift
from alibi_detect.utils.warnings import deprecated_alias
[docs]
class KSDrift(BaseUnivariateDrift):
[docs]
@deprecated_alias(preprocess_x_ref='preprocess_at_init')
def __init__(
self,
x_ref: Union[np.ndarray, list],
p_val: float = .05,
x_ref_preprocessed: bool = False,
preprocess_at_init: bool = True,
update_x_ref: Optional[Dict[str, int]] = None,
preprocess_fn: Optional[Callable] = None,
correction: str = 'bonferroni',
alternative: str = 'two-sided',
n_features: Optional[int] = None,
input_shape: Optional[tuple] = None,
data_type: Optional[str] = None
) -> None:
"""
Kolmogorov-Smirnov (K-S) data drift detector with Bonferroni or False Discovery Rate (FDR)
correction for multivariate data.
Parameters
----------
x_ref
Data used as reference distribution.
p_val
p-value used for significance of the K-S test for each feature. If the FDR correction method
is used, this corresponds to the acceptable q-value.
x_ref_preprocessed
Whether the given reference data `x_ref` has been preprocessed yet. If `x_ref_preprocessed=True`, only
the test data `x` will be preprocessed at prediction time. If `x_ref_preprocessed=False`, the reference
data will also be preprocessed.
preprocess_at_init
Whether to preprocess the reference data when the detector is instantiated. Otherwise, the reference
data will be preprocessed at prediction time. Only applies if `x_ref_preprocessed=False`.
update_x_ref
Reference data can optionally be updated to the last n instances seen by the detector
or via reservoir sampling with size n. For the former, the parameter equals {'last': n} while
for reservoir sampling {'reservoir_sampling': n} is passed.
preprocess_fn
Function to preprocess the data before computing the data drift metrics.
Typically a dimensionality reduction technique.
correction
Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False Discovery Rate).
alternative
Defines the alternative hypothesis. Options are 'two-sided', 'less' or 'greater'.
n_features
Number of features used in the K-S test. No need to pass it if no preprocessing takes place.
In case of a preprocessing step, this can also be inferred automatically but could be more
expensive to compute.
input_shape
Shape of input data.
data_type
Optionally specify the data type (tabular, image or time-series). Added to metadata.
"""
super().__init__(
x_ref=x_ref,
p_val=p_val,
x_ref_preprocessed=x_ref_preprocessed,
preprocess_at_init=preprocess_at_init,
update_x_ref=update_x_ref,
preprocess_fn=preprocess_fn,
correction=correction,
n_features=n_features,
input_shape=input_shape,
data_type=data_type
)
# Set config
self._set_config(locals())
# Other attributes
self.alternative = alternative
[docs]
def feature_score(self, x_ref: np.ndarray, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""
Compute K-S scores and statistics per feature.
Parameters
----------
x_ref
Reference instances to compare distribution with.
x
Batch of instances.
Returns
-------
Feature level p-values and K-S statistics.
"""
x = x.reshape(x.shape[0], -1)
x_ref = x_ref.reshape(x_ref.shape[0], -1)
p_val = np.zeros(self.n_features, dtype=np.float32)
dist = np.zeros_like(p_val)
for f in range(self.n_features):
# TODO: update to 'exact' when bug fix is released in scipy 1.5
dist[f], p_val[f] = ks_2samp(x_ref[:, f], x[:, f], alternative=self.alternative, mode='asymp')
return p_val, dist