# Source code for alibi_detect.utils.data

import numpy as np
import pandas as pd
from typing import Tuple, Union



[docs]
class Bunch(dict):
    """
    Dictionary subclass used as a container for internal datasets.
    Each key is reachable both with item syntax and as an attribute.
    """

    def __init__(self, **kwargs):
        # The keyword arguments become the initial key/value pairs.
        super(Bunch, self).__init__(kwargs)

    def __setattr__(self, key, value):
        # Attribute assignment is stored as a dictionary item.
        self.__setitem__(key, value)

    def __dir__(self):
        # Only the stored keys are advertised as attributes.
        attribute_names = self.keys()
        return attribute_names

    def __getattr__(self, key):
        # Only reached when normal attribute lookup fails; a missing key is
        # reported as a missing attribute.
        try:
            value = self[key]
        except KeyError:
            raise AttributeError(key)
        else:
            return value




[docs]
def sample_df(df: pd.DataFrame,
              n: int):
    """
    Draw n rows at random from the dataframe df.

    Rows are drawn without replacement as long as n does not exceed the number
    of rows in df; a larger n switches to sampling with replacement.
    """
    oversampling = n > df.shape[0]
    return df.sample(n=n, replace=oversampling)




[docs]
def create_outlier_batch(data: np.ndarray,
                         target: np.ndarray,
                         n_samples: int,
                         perc_outlier: int) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    Create a shuffled batch with a defined percentage of outliers.

    Parameters
    ----------
    data
        Instances to sample from, one row per instance.
    target
        Labels aligned with the rows of `data`. Rows labelled 0 are treated as
        inliers and rows labelled 1 as outliers; rows carrying any other label
        are never sampled.
    n_samples
        Number of instances in the returned batch.
    perc_outlier
        Percentage (between 0 and 100) of the batch made up of outliers. For a
        single-instance batch it is used as the probability (in percent) that
        the instance is an outlier.

    Returns
    -------
    Bunch with the sampled instances under `data`, their labels under `target`
    and `target_names` set to ['normal', 'outlier'].

    Raises
    ------
    ValueError
        If `perc_outlier` is outside [0, 100], or if instances of a class are
        requested while `target` contains no instance of that class.
    """
    if not 0 <= perc_outlier <= 100:
        raise ValueError(
            '`perc_outlier` must be between 0 and 100, got {}.'.format(perc_outlier)
        )

    # create df
    df = pd.DataFrame(data=data)
    df['target'] = target

    # separate inlier and outlier data
    normal = df[df['target'] == 0]
    outlier = df[df['target'] == 1]

    if n_samples == 1:
        # a single instance cannot be split by percentage, so its class is drawn at random
        n_outlier = np.random.binomial(1, .01 * perc_outlier)
        n_normal = 1 - n_outlier
    else:
        # Multiply before dividing: `perc_outlier * .01 * n_samples` can land just
        # below the intended integer (29 * .01 * 100 == 28.999...) and lose a row.
        n_outlier = int(perc_outlier * n_samples / 100)
        # Take the complement instead of truncating a second time so the batch
        # always holds exactly n_samples rows.
        n_normal = n_samples - n_outlier

    # fail with a clear message instead of an opaque sampling error
    for label, pool, count in (('normal', normal, n_normal), ('outlier', outlier, n_outlier)):
        if count > 0 and pool.shape[0] == 0:
            raise ValueError(
                'Cannot draw {} {} instance(s): `target` contains no {} instances.'.format(
                    count, label, label
                )
            )

    # draw samples
    batch_normal = sample_df(normal, n_normal)
    batch_outlier = sample_df(outlier, n_outlier)

    # combine both classes and shuffle the rows
    batch = pd.concat([batch_normal, batch_outlier])
    batch = batch.sample(frac=1).reset_index(drop=True)

    is_outlier = batch['target'].values
    batch.drop(columns=['target'], inplace=True)

    return Bunch(data=batch.values, target=is_outlier, target_names=['normal', 'outlier'])