Source code for alibi_detect.utils.discretizer

from typing import Callable, Dict, List, Optional

import numpy as np


class Discretizer(object):
    """
    Map continuous feature columns to ordinal bin indices using percentile borders.

    The borders are computed once from the data passed to the constructor. Columns listed as
    categorical are never touched. For every discretized column the instance keeps a list of
    human readable bin names in `names` and a callable in `lambdas` that maps raw values to
    bin indices.
    """

    def __init__(self, data: np.ndarray, categorical_features: List[int], feature_names: List[str],
                 percentiles: Optional[List[int]] = None) -> None:
        """
        Initialize the discretizer.

        Parameters
        ----------
        data
            Data to discretize. Indexed as `data[:, feature]`, so it must be 2-dimensional.
        categorical_features
            List of indices corresponding to the categorical columns. These features will not be discretized.
            The other features will be considered continuous and therefore discretized.
        feature_names
            List with feature names, indexed by column position.
        percentiles
            Percentiles used for discretization. Defaults to `[25, 50, 75]` when omitted or `None`.
        """
        # Build the lookup set once instead of scanning the list for every column.
        categorical = set(categorical_features)
        self.to_discretize = [col for col in range(data.shape[1]) if col not in categorical]

        # A fresh list per instance: a shared default list could be mutated through
        # `self.percentiles` and leak into every later instance.
        self.percentiles = [25, 50, 75] if percentiles is None else percentiles

        # Identical percentile values would create empty bins, so collapse duplicates.
        # `np.unique` also returns the borders sorted, which `np.searchsorted` requires.
        borders_per_feature = [np.unique(qts) for qts in self.bins(data)]

        self.names: Dict[int, list] = {}
        self.lambdas: Dict[int, Callable] = {}
        for feature, qts in zip(self.to_discretize, borders_per_feature):
            name = feature_names[feature]

            # k borders produce k + 1 bins: one open bin at each end and k - 1 bins in between.
            bin_names = ['%s <= %.2f' % (name, qts[0])]
            bin_names.extend('%.2f < %s <= %.2f' % (low, name, high) for low, high in zip(qts[:-1], qts[1:]))
            bin_names.append('%s > %.2f' % (name, qts[-1]))
            self.names[feature] = bin_names

            # Bind `qts` as a default argument so each lambda keeps its own borders rather than
            # the value of the loop variable after the last iteration. With the default
            # side='left', searchsorted returns i such that qts[i - 1] < x <= qts[i], which is
            # exactly the bin described by `bin_names[i]`.
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)

    def bins(self, data: np.ndarray) -> List[np.ndarray]:
        """
        Compute the percentile borders of every feature that is discretized.

        Parameters
        ----------
        data
            Data to discretize

        Returns
        -------
        List with bin values for each feature that is discretized, in the order of `self.to_discretize`.
        """
        return [
            np.array(np.percentile(data[:, feature], self.percentiles))
            for feature in self.to_discretize
        ]

    def discretize(self, data: np.ndarray) -> np.ndarray:
        """
        Replace the values of the discretized features by the index of the bin they fall in.

        Parameters
        ----------
        data
            Data to discretize. Either a single instance (1-dimensional) or a batch of instances
            (2-dimensional).

        Returns
        -------
        Discretized version of data with the same dimension. The input is left untouched; the
        result is a copy and therefore keeps the dtype of `data`.
        """
        data_disc = data.copy()
        single_instance = data.ndim == 1  # loop invariant, so evaluate it once
        for feature, to_bin in self.lambdas.items():
            if single_instance:
                data_disc[feature] = int(to_bin(data_disc[feature]))
            else:
                data_disc[:, feature] = to_bin(data_disc[:, feature]).astype(int)
        return data_disc