Source code for

import numpy as np
from scipy.stats import ks_2samp
from typing import Callable, Dict, Optional, Tuple, Union
from import BaseUnivariateDrift

[docs]class KSDrift(BaseUnivariateDrift):
[docs] def __init__( self, x_ref: Union[np.ndarray, list], p_val: float = .05, preprocess_x_ref: bool = True, update_x_ref: Optional[Dict[str, int]] = None, preprocess_fn: Optional[Callable] = None, correction: str = 'bonferroni', alternative: str = 'two-sided', n_features: Optional[int] = None, input_shape: Optional[tuple] = None, data_type: Optional[str] = None ) -> None: """ Kolmogorov-Smirnov (K-S) data drift detector with Bonferroni or False Discovery Rate (FDR) correction for multivariate data. Parameters ---------- x_ref Data used as reference distribution. p_val p-value used for significance of the K-S test for each feature. If the FDR correction method is used, this corresponds to the acceptable q-value. preprocess_x_ref Whether to already preprocess and store the reference data. update_x_ref Reference data can optionally be updated to the last n instances seen by the detector or via reservoir sampling with size n. For the former, the parameter equals {'last': n} while for reservoir sampling {'reservoir_sampling': n} is passed. preprocess_fn Function to preprocess the data before computing the data drift metrics. Typically a dimensionality reduction technique. correction Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False Discovery Rate). alternative Defines the alternative hypothesis. Options are 'two-sided', 'less' or 'greater'. n_features Number of features used in the K-S test. No need to pass it if no preprocessing takes place. In case of a preprocessing step, this can also be inferred automatically but could be more expensive to compute. input_shape Shape of input data. data_type Optionally specify the data type (tabular, image or time-series). Added to metadata. """ super().__init__( x_ref=x_ref, p_val=p_val, preprocess_x_ref=preprocess_x_ref, update_x_ref=update_x_ref, preprocess_fn=preprocess_fn, correction=correction, n_features=n_features, input_shape=input_shape, data_type=data_type ) self.alternative = alternative
[docs] def feature_score(self, x_ref: np.ndarray, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ Compute K-S scores and statistics per feature. Parameters ---------- x_ref Reference instances to compare distribution with. x Batch of instances. Returns ------- Feature level p-values and K-S statistics. """ x = x.reshape(x.shape[0], -1) x_ref = x_ref.reshape(x_ref.shape[0], -1) p_val = np.zeros(self.n_features, dtype=np.float32) dist = np.zeros_like(p_val) for f in range(self.n_features): # TODO: update to 'exact' when bug fix is released in scipy 1.5 dist[f], p_val[f] = ks_2samp(x_ref[:, f], x[:, f], alternative=self.alternative, mode='asymp') return p_val, dist