Source code for alibi_detect.utils.mapping

import numpy as np
from typing import Tuple, List



[docs]
def ohe2ord_shape(shape: tuple, cat_vars: dict = None, is_ohe: bool = False) -> tuple:
    """
    Infer shape of instance if the categorical variables have ordinal instead of one-hot encoding.

    Parameters
    ----------
    shape
        Instance shape, starting with batch dimension.
    cat_vars
        Dict with as keys the categorical columns and as values
        the number of categories per categorical variable.
        `None` or an empty dict means there are no categorical variables.
    is_ohe
        Whether instance is OHE.

    Returns
    -------
    Tuple with shape of instance with ordinal encoding of categorical variables.
    If `is_ohe` is False, `shape` is returned unchanged. Otherwise the result is the
    2-tuple `(shape[0], shape[-1] - n)`, where `n` is the number of columns removed
    by collapsing each one-hot group into a single ordinal column.
    """
    if not is_ohe:
        return shape
    # A variable with v categories takes v one-hot columns but only 1 ordinal column,
    # so each one removes v - 1 columns. `cat_vars or {}` guards the `None` default,
    # which would otherwise fail on `.values()`.
    n_cols_ohe = sum(v - 1 for v in (cat_vars or {}).values())
    return (shape[0], shape[-1] - n_cols_ohe)




[docs]
def ord2num(data: np.ndarray, dist: dict) -> np.ndarray:
    """
    Map ordinal category codes to numerical values, column by column.

    Parameters
    ----------
    data
        Numpy array with the categorical data. It is not modified; the
        function works on a float32 copy.
    dist
        Dict with as keys the categorical variables (column indices) and as values
        the numerical value for each category, indexable by the integer category code.

    Returns
    -------
    Float32 numpy array in which every column listed in `dist` has its category
    codes replaced by the mapped numerical values. Other columns are only cast.
    """
    X = data.astype(np.float32, copy=True)
    is_matrix = isinstance(X, np.matrix)
    for col, mapping in dist.items():
        codes = X[:, col].copy()
        # Look up each code individually so that any int-indexable mapping works.
        mapped = np.array([mapping[int(code)] for code in codes])
        # np.matrix column slices are 2D, so the replacement must be a column vector.
        X[:, col] = mapped.reshape(-1, 1) if is_matrix else mapped
    return X.astype(np.float32)




[docs]
def num2ord(data: np.ndarray, dist: dict) -> np.ndarray:
    """
    Map numerical values back to category codes by nearest-value lookup.

    Each entry of a column listed in `dist` is replaced by the index of the
    closest value (smallest absolute difference) in that column's mapping.

    Parameters
    ----------
    data
        Numpy array with the numerical data. It is not modified; the
        function works on a copy.
    dist
        Dict with as keys the categorical variables (column indices) and as values
        a numpy array with the numerical value for each category.

    Returns
    -------
    Copy of `data` in which the columns listed in `dist` hold category indices.
    """
    X = data.copy()
    for col, values in dist.items():
        # Broadcast the (n, 1) column against the (1, n_categories) row to get
        # every instance-to-category distance at once.
        distances = np.abs(X[:, col].reshape(-1, 1) - values.reshape(1, -1))
        X[:, col] = np.argmin(distances, axis=1)
    return X




[docs]
def ord2ohe(X_ord: np.ndarray, cat_vars_ord: dict) -> Tuple[np.ndarray, dict]:
    """
    Expand ordinal encoded categorical columns into one-hot encoded blocks.

    Parameters
    ----------
    X_ord
        2D array with a mixture of ordinal encoded and numerical variables.
    cat_vars_ord
        Dict with as keys the categorical columns and as values
        the number of categories per categorical variable.

    Returns
    -------
    One-hot equivalent of the ordinal encoded data, and a dict with as keys the first
    column of each one-hot block in the output and as values the number of categories.
    """
    n_rows, n_cols = X_ord.shape
    row_idx = np.arange(n_rows)
    pieces = []
    cat_vars_ohe = {}
    ohe_start = 0  # column index in the one-hot output where the next piece begins
    for col in range(n_cols):
        if col not in cat_vars_ord:
            # Numerical column: copied through as a single output column.
            pieces.append(X_ord[:, col].reshape(n_rows, 1))
            ohe_start += 1
            continue
        n_cat = cat_vars_ord[col]
        one_hot = np.zeros((n_rows, n_cat), dtype=np.float32)
        # Set, for every row, the position given by its category code.
        one_hot[row_idx, X_ord[:, col].astype(int)] = 1.
        cat_vars_ohe[ohe_start] = n_cat
        pieces.append(one_hot)
        ohe_start += n_cat
    return np.concatenate(pieces, axis=1), cat_vars_ohe




[docs]
def ohe2ord(X_ohe: np.ndarray, cat_vars_ohe: dict) -> Tuple[np.ndarray, dict]:
    """
    Collapse one-hot encoded blocks into single ordinal encoded columns.

    Parameters
    ----------
    X_ohe
        2D array with a mixture of one-hot encoded and numerical variables.
    cat_vars_ohe
        Dict with as keys the first column index for each one-hot encoded categorical variable
        and as values the number of categories per categorical variable.

    Returns
    -------
    Ordinal equivalent of the one-hot encoded data, and a dict with as keys the categorical
    columns in the output and as values the number of categories.

    Notes
    -----
    Each one-hot block is checked with an `assert` that the sum over all of its
    entries (truncated to int) equals the number of rows. Individual rows are not
    checked.
    """
    n_rows, n_cols = X_ohe.shape
    columns: List = []
    cat_vars_ord = {}
    src = 0  # current column in the one-hot input
    dst = 0  # column the next piece occupies in the ordinal output
    while src < n_cols:
        if src in cat_vars_ohe:
            width = cat_vars_ohe[src]
            block = X_ohe[:, src:src + width]
            assert int(np.sum(block, axis=1).sum()) == n_rows
            # The ordinal code is the position of the largest entry in each row.
            codes = np.argmax(block, axis=1)
            cat_vars_ord[dst] = width
            columns.append(codes.reshape(n_rows, 1))
            src += width  # skip the whole one-hot block
        else:
            columns.append(X_ohe[:, src].reshape(n_rows, 1))
            src += 1
        dst += 1
    return np.concatenate(columns, axis=1), cat_vars_ord