Source code for alibi.datasets.default

import logging
import pkgutil
import tarfile
import json
from io import BytesIO, StringIO
from typing import Optional, Tuple, Union, Dict

import numpy as np
import pandas as pd
import PIL.Image
import requests
from requests import RequestException
from sklearn.preprocessing import LabelEncoder

from alibi.utils.data import Bunch

logger = logging.getLogger(__name__)

ADULT_URLS = ['https://storage.googleapis.com/seldon-datasets/adult/adult.data',
              'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
              'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data']

MOVIESENTIMENT_URLS = ['https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz',
                       'http://www.cs.cornell.edu/People/pabo/movie-review-data/rt-polaritydata.tar.gz']

#  TODO change storage format.
IMAGENET_URLS = ['https://storage.googleapis.com/seldon-datasets/imagenet10/imagenet10.tar.gz']


def fetch_imagenet_10(url_id: int = 0) -> Dict:
    """
    Sample dataset extracted from imagenet in a dictionary format. The train set contains 1000 random samples,
    100 for each of the following 10 selected classes:

    * stingray
    * trilobite
    * centipede
    * slug
    * snail
    * Rhodesian ridgeback
    * beagle
    * golden retriever
    * sea lion
    * espresso

    The test set contains 50 random samples, 5 for each of the classes above.

    Parameters
    ----------
    url_id
        Index specifying which URL to use for downloading.

    Returns
    -------
    Dictionary with the following keys:

    * trainset - train set tuple (X_train, y_train)
    * testset - test set tuple (X_test, y_test)
    * int_to_str_labels - map from target to target name
    * str_to_int_labels - map from target name to target
    * mean_channels - mean channel values
    """
    url = IMAGENET_URLS[url_id]
    try:
        resp = requests.get(url, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise

    tar = tarfile.open(fileobj=BytesIO(resp.content), mode="r:gz")

    def keystoint(x):
        return {int(k): v for k, v in x}

    int_to_str_labels = json.load(tar.extractfile('imagenet10/int_to_str_labels.json'),  # type: ignore[arg-type]
                                  object_pairs_hook=keystoint)
    str_to_int_labels = json.load(tar.extractfile('imagenet10/str_to_int_labels.json'))  # type: ignore[arg-type]

    # hack to load npy files from a tar archive
    # see https://github.com/numpy/numpy/issues/7989
    mean_channels_af = BytesIO()
    mean_channels_af.write(tar.extractfile('imagenet10/mean_channels.npy').read())  # type: ignore[union-attr]
    mean_channels_af.seek(0)
    mean_channels = np.load(mean_channels_af)

    X_train_af = BytesIO()
    X_train_af.write(tar.extractfile('imagenet10/trainset/X.npy').read())  # type: ignore[union-attr]
    X_train_af.seek(0)
    X_train = np.load(X_train_af)

    y_train_af = BytesIO()
    y_train_af.write(tar.extractfile('imagenet10/trainset/y.npy').read())  # type: ignore[union-attr]
    y_train_af.seek(0)
    y_train = np.load(y_train_af)

    X_test_af = BytesIO()
    X_test_af.write(tar.extractfile('imagenet10/testset/X.npy').read())  # type: ignore[union-attr]
    X_test_af.seek(0)
    X_test = np.load(X_test_af)

    y_test_af = BytesIO()
    y_test_af.write(tar.extractfile('imagenet10/testset/y.npy').read())  # type: ignore[union-attr]
    y_test_af.seek(0)
    y_test = np.load(y_test_af)

    # building dataset dict
    imagenet10 = {'trainset': (X_train, y_train),
                  'testset': (X_test, y_test),
                  'int_to_str_labels': int_to_str_labels,
                  'str_to_int_labels': str_to_int_labels,
                  'mean_channels': mean_channels}

    return imagenet10
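# Example usage sketch, assuming the IMAGENET_URLS download above succeeds:
#
#   >>> imagenet10 = fetch_imagenet_10()
#   >>> X_train, y_train = imagenet10['trainset']    # 1000 training samples
#   >>> X_test, y_test = imagenet10['testset']       # 50 test samples
#   >>> imagenet10['int_to_str_labels'][y_train[0]]  # class name of the first training sample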
def load_cats(target_size: tuple = (299, 299), return_X_y: bool = False) -> \
        Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    A small sample of Imagenet-like public domain images of cats used primarily for examples.
    The images were hand-collected using flickr.com by searching for various cat types,
    filtered by images in the public domain.

    Parameters
    ----------
    target_size
        Size of the returned images, used to crop images for a specified model input size.
    return_X_y
        If ``True``, return features `X` and labels `y` as `numpy` arrays. If ``False`` return a `Bunch` object.

    Returns
    -------
    Bunch
        Bunch object with fields 'data', 'target' and 'target_names'. Both `targets` and `target_names`
        are taken from the original Imagenet.
    (data, target)
        Tuple if ``return_X_y=True``.
    """
    tar = tarfile.open(fileobj=BytesIO(pkgutil.get_data(__name__, "../data/cats.tar.gz")),  # type: ignore[arg-type]
                       mode='r:gz')
    images = []
    target = []
    target_names = []
    for member in tar.getmembers():
        # data
        img = tar.extractfile(member).read()  # type: ignore[union-attr]
        img = PIL.Image.open(BytesIO(img))
        img = np.expand_dims(img.resize(target_size), axis=0)
        images.append(img)

        # labels
        name = member.name.split('_')
        target.append(int(name.pop(1)))
        target_names.append('_'.join(name).split('.')[0])
    tar.close()

    images = np.concatenate(images, axis=0)
    targets = np.asarray(target)

    if return_X_y:
        return images, targets  # type: ignore[return-value] # TODO: allow redefinition
    else:
        return Bunch(data=images, target=targets, target_names=target_names)
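# Example usage sketch; the shape comment assumes the bundled images are RGB:
#
#   >>> cats = load_cats(target_size=(299, 299))
#   >>> cats.data.shape       # (n_images, 299, 299, 3)
#   >>> cats.target_names[0]  # Imagenet-style class name parsed from the file name
#   >>> X, y = load_cats(return_X_y=True)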
# deprecated
def fetch_imagenet(category: str = 'Persian cat',
                   nb_images: int = 10,
                   target_size: tuple = (299, 299),
                   min_std: float = 10.,
                   seed: int = 42,
                   return_X_y: bool = False) -> None:
    import warnings
    warnings.warn("""The Imagenet API is no longer publicly available, as a result `fetch_imagenet` is deprecated.
    To download images from Imagenet please follow instructions on http://image-net.org/download""")
def fetch_movie_sentiment(return_X_y: bool = False, url_id: int = 0) -> Union[Bunch, Tuple[list, list]]:
    """
    The movie review dataset, equally split between negative and positive reviews.

    Parameters
    ----------
    return_X_y
        If ``True``, return features `X` and labels `y` as `Python` lists. If ``False`` return a `Bunch` object.
    url_id
        Index specifying which URL to use for downloading.

    Returns
    -------
    Bunch
        Movie reviews and sentiment labels (0 means 'negative' and 1 means 'positive').
    (data, target)
        Tuple if ``return_X_y=True``.
    """
    url = MOVIESENTIMENT_URLS[url_id]
    try:
        resp = requests.get(url, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise

    tar = tarfile.open(fileobj=BytesIO(resp.content), mode="r:gz")
    data = []
    labels = []
    for i, member in enumerate(tar.getnames()[1:]):
        f = tar.extractfile(member)
        for line in f.readlines():  # type: ignore[union-attr]
            try:
                line.decode('utf8')
            except UnicodeDecodeError:
                continue
            data.append(line.decode('utf8').strip())
            labels.append(i)
    tar.close()

    if return_X_y:
        return data, labels

    target_names = ['negative', 'positive']
    return Bunch(data=data, target=labels, target_names=target_names)
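# Example usage sketch, assuming one of the MOVIESENTIMENT_URLS is reachable:
#
#   >>> movies = fetch_movie_sentiment()
#   >>> movies.target_names               # ['negative', 'positive']
#   >>> movies.data[0], movies.target[0]  # first review and its sentiment label
#   >>> data, labels = fetch_movie_sentiment(return_X_y=True)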
def fetch_adult(features_drop: Optional[list] = None, return_X_y: bool = False, url_id: int = 0) -> \
        Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    Downloads and pre-processes the 'adult' dataset.
    More info: http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/

    Parameters
    ----------
    features_drop
        List of features to be dropped from the dataset, by default drops ``["fnlwgt", "Education-Num"]``.
    return_X_y
        If ``True``, return features `X` and labels `y` as `numpy` arrays. If ``False`` return a `Bunch` object.
    url_id
        Index specifying which URL to use for downloading.

    Returns
    -------
    Bunch
        Dataset, labels, a list of features and a dictionary containing a list with the potential categories
        for each categorical feature where the key refers to the feature column.
    (data, target)
        Tuple if ``return_X_y=True``.
    """
    if features_drop is None:
        features_drop = ["fnlwgt", "Education-Num"]

    # download data
    dataset_url = ADULT_URLS[url_id]
    raw_features = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital Status',
                    'Occupation', 'Relationship', 'Race', 'Sex', 'Capital Gain', 'Capital Loss',
                    'Hours per week', 'Country', 'Target']
    try:
        resp = requests.get(dataset_url)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise

    raw_data = pd.read_csv(StringIO(resp.text), names=raw_features, delimiter=', ', engine='python').fillna('?')

    # get labels, features and drop unnecessary features
    labels = (raw_data['Target'] == '>50K').astype(int).values
    features_drop += ['Target']
    data = raw_data.drop(features_drop, axis=1)
    features = list(data.columns)

    # map categorical features
    education_map = {
        '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout', '1st-4th': 'Dropout',
        '5th-6th': 'Dropout', '7th-8th': 'Dropout', '9th': 'Dropout', 'Preschool': 'Dropout',
        'HS-grad': 'High School grad', 'Some-college': 'High School grad',
        'Masters': 'Masters', 'Prof-school': 'Prof-School',
        'Assoc-acdm': 'Associates', 'Assoc-voc': 'Associates'
    }
    occupation_map = {
        "Adm-clerical": "Admin", "Armed-Forces": "Military", "Craft-repair": "Blue-Collar",
        "Exec-managerial": "White-Collar", "Farming-fishing": "Blue-Collar",
        "Handlers-cleaners": "Blue-Collar", "Machine-op-inspct": "Blue-Collar",
        "Other-service": "Service", "Priv-house-serv": "Service", "Prof-specialty": "Professional",
        "Protective-serv": "Other", "Sales": "Sales", "Tech-support": "Other",
        "Transport-moving": "Blue-Collar"
    }
    country_map = {
        'Cambodia': 'SE-Asia', 'Canada': 'British-Commonwealth', 'China': 'China',
        'Columbia': 'South-America', 'Cuba': 'Other', 'Dominican-Republic': 'Latin-America',
        'Ecuador': 'South-America', 'El-Salvador': 'South-America', 'England': 'British-Commonwealth',
        'France': 'Euro_1', 'Germany': 'Euro_1', 'Greece': 'Euro_2', 'Guatemala': 'Latin-America',
        'Haiti': 'Latin-America', 'Holand-Netherlands': 'Euro_1', 'Honduras': 'Latin-America',
        'Hong': 'China', 'Hungary': 'Euro_2', 'India': 'British-Commonwealth', 'Iran': 'Other',
        'Ireland': 'British-Commonwealth', 'Italy': 'Euro_1', 'Jamaica': 'Latin-America',
        'Japan': 'Other', 'Laos': 'SE-Asia', 'Mexico': 'Latin-America', 'Nicaragua': 'Latin-America',
        'Outlying-US(Guam-USVI-etc)': 'Latin-America', 'Peru': 'South-America',
        'Philippines': 'SE-Asia', 'Poland': 'Euro_2', 'Portugal': 'Euro_2',
        'Puerto-Rico': 'Latin-America', 'Scotland': 'British-Commonwealth', 'South': 'Euro_2',
        'Taiwan': 'China', 'Thailand': 'SE-Asia', 'Trinadad&Tobago': 'Latin-America',
        'United-States': 'United-States', 'Vietnam': 'SE-Asia'
    }
    married_map = {
        'Never-married': 'Never-Married', 'Married-AF-spouse': 'Married',
        'Married-civ-spouse': 'Married', 'Married-spouse-absent': 'Separated',
        'Separated': 'Separated', 'Divorced': 'Separated', 'Widowed': 'Widowed'
    }
    mapping = {'Education': education_map, 'Occupation': occupation_map,
               'Country': country_map, 'Marital Status': married_map}

    data_copy = data.copy()
    for f, f_map in mapping.items():
        data_tmp = data_copy[f].values
        for key, value in f_map.items():
            data_tmp[data_tmp == key] = value
        data[f] = data_tmp

    # get categorical features and apply label encoding
    categorical_features = [f for f in features if data[f].dtype == 'O']
    category_map = {}
    for f in categorical_features:
        le = LabelEncoder()
        data_tmp = le.fit_transform(data[f].values)
        data[f] = data_tmp
        category_map[features.index(f)] = list(le.classes_)

    # only return data values
    data = data.values
    target_names = ['<=50K', '>50K']

    if return_X_y:
        return data, labels

    return Bunch(data=data, target=labels, feature_names=features, target_names=target_names,
                 category_map=category_map)
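# Example usage sketch, assuming one of the ADULT_URLS is reachable:
#
#   >>> adult = fetch_adult()
#   >>> adult.feature_names  # columns left after dropping ['fnlwgt', 'Education-Num']
#   >>> adult.category_map   # {column index: list of category names} for categorical features
#   >>> X, y = adult.data, adult.target  # X is label-encoded; y is 0 ('<=50K') or 1 ('>50K')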