import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from typing import Dict, List, Union
# TODO: This should inherit from collections.UserDict, not dict.
class Bunch(dict):
    """
    Container object for internal datasets.

    A dictionary subclass whose entries can also be read and written with
    attribute syntax, i.e. ``bunch.key`` is interchangeable with
    ``bunch['key']``.
    """

    def __init__(self, **kwargs):
        # Seed the underlying dictionary with the supplied keyword arguments.
        super(Bunch, self).__init__(**kwargs)

    def __getattr__(self, key):
        # Python only calls this hook after regular attribute lookup has
        # failed, so real dict methods are never shadowed by stored keys.
        try:
            item = self[key]
        except KeyError:
            # Attribute access must signal a miss with AttributeError so that
            # `hasattr` / `getattr(..., default)` behave as expected.
            raise AttributeError(key)
        return item

    def __setattr__(self, key, value):
        # Attribute assignment is redirected into the dictionary itself.
        self[key] = value

    def __dir__(self):
        # Advertise the stored keys as the attributes of the object.
        return self.keys()
[docs]
def gen_category_map(data: Union[pd.DataFrame, np.ndarray],
                     categorical_columns: Union[List[int], List[str], None] = None) -> Dict[int, list]:
    """
    Build a map from the indices of categorical columns to the categories found in each of them.

    Parameters
    ----------
    data
        2-dimensional `pandas` dataframe or `numpy` array.
    categorical_columns
        A list of columns indicating categorical variables. Optional if passing a `pandas` dataframe as inference
        will be used based on dtype ``'O'``. If passing a `numpy` array this is compulsory and every entry must be
        an integer position (Python ``int`` or `numpy` integer scalar).

    Returns
    -------
    category_map
        A dictionary with keys being the indices of the categorical columns and values being lists of categories for
        that column. Implicitly each category is mapped to the index of its position in the list.

    Raises
    ------
    TypeError
        If `data` is not 2-dimensional.
    ValueError
        If `data` is a `numpy` array and `categorical_columns` is missing or contains non-integer entries.
    """
    if data.ndim != 2:
        raise TypeError('Expected a 2-dimensional dataframe or array')

    if isinstance(data, np.ndarray):
        # if numpy array, we need categorical_columns, otherwise impossible to infer
        if categorical_columns is None:
            raise ValueError('If passing a numpy array, `categorical_columns` is required')

        # materialise once: the columns are iterated for validation and again when building the map
        categorical_columns = list(categorical_columns)
        if not all(isinstance(ix, (int, np.integer)) for ix in categorical_columns):
            raise ValueError('If passing a numpy array, `categorical_columns` must be a list of integers')

        # numpy integer scalars (e.g. the output of `np.where`) are accepted and turned into plain ints so that
        # they are used as positions below and become ordinary `int` keys in the returned map
        categorical_columns = [int(ix) if isinstance(ix, np.integer) else ix for ix in categorical_columns]
        data = pd.DataFrame(data)

    # infer categorical columns; only columns whose dtype compares equal to 'O' (object) are picked up
    if categorical_columns is None:
        categorical_columns = [ix for ix, dtype in enumerate(data.dtypes) if dtype == 'O']

    # create the map
    category_map: Dict[int, list] = {}
    for col in categorical_columns:
        # anything that is not an int is treated as a column label and translated to its position
        if not isinstance(col, int):
            col = int(data.columns.get_loc(col))

        # only the fitted classes are needed, so there is no reason to also compute the encoded column
        le = LabelEncoder()
        le.fit(data.iloc[:, col])
        category_map[col] = list(le.classes_)

    return category_map