Source code for alibi.explainers.anchors.anchor_tabular_distributed

import logging
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import numpy as np
import ray

from alibi.api.interfaces import Explanation
from alibi.utils.discretizer import Discretizer
from alibi.utils.distributed import ActorPool
from alibi.utils.mapping import ohe_to_ord

from .anchor_base import AnchorBaseBeam
from .anchor_tabular import AnchorTabular, TabularSampler


class DistributedAnchorBaseBeam(AnchorBaseBeam):

    def __init__(self, samplers: List[Callable], **kwargs) -> None:

        super().__init__(samplers)
        self.chunksize = kwargs.get('chunksize', 1)
        self.sample_fcn = lambda actor, anchor, n_samples, compute_labels=True: \
            actor.__call__.remote(anchor, n_samples, compute_labels=compute_labels)
        self.pool = ActorPool(samplers)
        self.samplers = samplers

    def _get_coverage_samples(self, coverage_samples: int,  # type: ignore[override]
                              samplers: List[Callable]) -> np.ndarray:
        """
        Sends a request for a coverage set to a process running sampling tasks.

        Parameters
        ----------
        coverage_samples, samplers
            See :py:meth:`alibi.explainers.anchors.anchor_base.AnchorBaseBeam._get_coverage_samples` implementation.

        Returns
        -------
        See :py:meth:`alibi.explainers.anchors.anchor_base.AnchorBaseBeam._get_coverage_samples` implementation.
        """

        [coverage_data] = ray.get(
            self.sample_fcn(samplers[0], (0, ()), coverage_samples, compute_labels=False)
        )

        return coverage_data
    def draw_samples(self, anchors: list, batch_size: int) -> Tuple[np.ndarray, np.ndarray]:  # type: ignore[override]
        """
        Distributes sampling requests among processes running sampling tasks.

        Parameters
        ----------
        anchors, batch_size
            See :py:meth:`alibi.explainers.anchors.anchor_base.AnchorBaseBeam.draw_samples` implementation.

        Returns
        -------
        See :py:meth:`alibi.explainers.anchors.anchor_base.AnchorBaseBeam.draw_samples` implementation.
        """

        # partial anchors not generated by propose_anchors are not in the order dictionary
        for anchor in anchors:
            if anchor not in self.state['t_order']:
                self.state['t_order'][anchor] = list(anchor)

        pos, total = np.zeros((len(anchors),)), np.zeros((len(anchors),))
        order_map = [(i, tuple(self.state['t_order'][anchor])) for i, anchor in enumerate(anchors)]
        samples_iter = self.pool.map_unordered(
            partial(self.sample_fcn, n_samples=batch_size),
            order_map,
            self.chunksize,
        )
        for samples_batch in samples_iter:
            for samples in samples_batch:
                covered_true, covered_false, labels, *additionals, anchor_idx = samples
                positives, n_samples = self.update_state(
                    covered_true, covered_false, labels, additionals, anchors[anchor_idx],
                )
                # return statistics in the same order as the requests
                pos[anchor_idx], total[anchor_idx] = positives, n_samples

        return pos, total
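# A minimal sketch (illustrative only, not part of alibi) of the actor-pool
# pattern that DistributedAnchorBaseBeam.draw_samples relies on: a submit
# function is mapped over (position, anchor) requests and results are consumed
# in whatever order the actors finish. The _EchoSampler actor and the use of
# ray.util.ActorPool are assumptions for the demo; alibi ships its own
# ActorPool (imported above) that additionally supports chunked requests.
def _demo_actor_pool_pattern() -> None:
    from ray.util import ActorPool as RayActorPool

    if not ray.is_initialized():
        ray.init()

    @ray.remote
    class _EchoSampler:  # hypothetical stand-in for a RemoteSampler actor
        def __call__(self, anchor, n_samples, compute_labels=True):
            return anchor, n_samples

    pool = RayActorPool([_EchoSampler.remote() for _ in range(2)])
    requests = [(0, (1,)), (1, (1, 2))]  # (position, anchor) pairs, as in draw_samples
    results = pool.map_unordered(
        lambda actor, req: actor.__call__.remote(req, 10), requests
    )
    for (pos, anchor), n_samples in results:
        print(pos, anchor, n_samples)  # arrival order need not match submission order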
class RemoteSampler:
    """ A wrapper that facilitates the use of `TabularSampler` for distributed sampling."""

    def __init__(self, *args):
        self.train_id, self.d_train_id, self.sampler = args
        self.sampler = self.sampler.deferred_init(self.train_id, self.d_train_id)
    def __call__(self,
                 anchors_batch: Union[Tuple[int, tuple], List[Tuple[int, tuple]]],
                 num_samples: int,
                 compute_labels: bool = True) -> List:
        """
        Wrapper around :py:meth:`alibi.explainers.anchors.anchor_tabular.TabularSampler.__call__`. It allows
        sampling a batch of anchors in the same process, which can improve performance.

        Parameters
        ----------
        anchors_batch, num_samples, compute_labels
            See :py:meth:`alibi.explainers.anchors.anchor_tabular.TabularSampler.__call__` for details.

        Returns
        -------
        A list of result tuples. See :py:meth:`alibi.explainers.anchors.anchor_tabular.TabularSampler.__call__`
        for details.
        """

        if isinstance(anchors_batch, tuple):  # DistributedAnchorBaseBeam._get_coverage_samples call
            return self.sampler(anchors_batch, num_samples, compute_labels=compute_labels)
        elif len(anchors_batch) == 1:  # batch size = 1
            return [self.sampler(*anchors_batch, num_samples, compute_labels=compute_labels)]
        else:  # batch size > 1
            batch_result = []
            for anchor in anchors_batch:
                batch_result.append(self.sampler(anchor, num_samples, compute_labels=compute_labels))
            return batch_result
    def set_instance_label(self, X: np.ndarray) -> int:
        """
        Sets the remote sampler instance label.

        Parameters
        ----------
        X
            The instance to be explained.

        Returns
        -------
        label
            The label of the instance to be explained.
        """

        self.sampler.set_instance_label(X)
        label = self.sampler.instance_label

        return label
    def set_n_covered(self, n_covered: int) -> None:
        """
        Sets the remote sampler number of examples to save for inspection.

        Parameters
        ----------
        n_covered
            Number of examples where the result (and partial anchors) apply.
        """

        self.sampler.set_n_covered(n_covered)
    def _get_sampler(self) -> TabularSampler:
        """
        A getter that returns the underlying tabular sampler object.

        Returns
        -------
        The tabular sampler object used in the process.
        """

        return self.sampler
    def build_lookups(self, X: np.ndarray):
        """
        Wrapper around :py:meth:`alibi.explainers.anchors.anchor_tabular.TabularSampler.build_lookups`.

        Parameters
        ----------
        X
            See :py:meth:`alibi.explainers.anchors.anchor_tabular.TabularSampler.build_lookups`.

        Returns
        -------
        See :py:meth:`alibi.explainers.anchors.anchor_tabular.TabularSampler.build_lookups`.
        """

        cat_lookup_id, ord_lookup_id, enc2feat_idx_id = self.sampler.build_lookups(X)

        return [cat_lookup_id, ord_lookup_id, enc2feat_idx_id]
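# A minimal sketch (an illustrative assumption, mirroring what fit() below
# does) of turning RemoteSampler into a Ray actor: ray.put stores the raw and
# discretized training arrays in the shared object store once, so every actor
# reconstructs its TabularSampler from the same copies via deferred_init
# instead of receiving a serialized duplicate of the data.
def _demo_remote_sampler_actor(sampler: TabularSampler,
                               train_data: np.ndarray,
                               d_train_data: np.ndarray) -> list:
    train_id, d_train_id = ray.put(train_data), ray.put(d_train_data)
    actor = ray.remote(RemoteSampler).remote(train_id, d_train_id, sampler)
    # request coverage-style samples for the empty anchor and block on the result
    return ray.get(actor.__call__.remote((0, ()), 100, compute_labels=False))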
class DistributedAnchorTabular(AnchorTabular):

    def __init__(self,
                 predictor: Callable,
                 feature_names: List[str],
                 categorical_names: Optional[Dict[int, List[str]]] = None,
                 dtype: Type[np.generic] = np.float32,
                 ohe: bool = False,
                 seed: Optional[int] = None) -> None:
        super().__init__(predictor, feature_names, categorical_names, dtype, ohe, seed)

        if not ray.is_initialized():
            ray.init()
    def fit(self,  # type: ignore[override]
            train_data: np.ndarray,
            disc_perc: tuple = (25, 50, 75),
            **kwargs) -> "AnchorTabular":
        """
        Creates a list of handles to parallel processes that are used for submitting sampling tasks.

        Parameters
        ----------
        train_data, disc_perc, **kwargs
            See :py:meth:`alibi.explainers.anchors.anchor_tabular.AnchorTabular.fit`.
        """
        try:
            ncpu = kwargs['ncpu']
        except KeyError:
            logging.warning('DistributedAnchorTabular object has been initialised but kwargs did not contain '
                            'expected argument, ncpu. Defaulting to ncpu=2!')
            ncpu = 2

        # transform one-hot encodings to labels if ohe == True
        train_data = ohe_to_ord(X_ohe=train_data, cat_vars_ohe=self.cat_vars_ohe)[0] if self.ohe else train_data

        disc = Discretizer(train_data, self.numerical_features, self.feature_names, percentiles=disc_perc)
        d_train_data = disc.discretize(train_data)
        self.feature_values.update(disc.feature_intervals)

        sampler_args = (
            self._predictor,
            disc_perc,
            self.numerical_features,
            self.categorical_features,
            self.feature_names,
            self.feature_values,
        )
        train_data_id = ray.put(train_data)
        d_train_data_id = ray.put(d_train_data)
        samplers = [TabularSampler(*sampler_args, seed=self.seed) for _ in range(ncpu)]  # type: ignore[arg-type]
        d_samplers = []
        for sampler in samplers:
            d_samplers.append(
                ray.remote(RemoteSampler).remote(  # type: ignore[call-arg]
                    *(train_data_id, d_train_data_id, sampler)
                )
            )
        self.samplers = d_samplers

        # update metadata
        self.meta['params'].update(disc_perc=disc_perc)

        return self
    def _build_sampling_lookups(self, X: np.ndarray) -> None:
        """
        See :py:meth:`alibi.explainers.anchors.anchor_tabular.AnchorTabular._build_sampling_lookups` documentation.

        Parameters
        ----------
        X
            See :py:meth:`alibi.explainers.anchors.anchor_tabular.AnchorTabular._build_sampling_lookups`
            documentation.
        """

        lookups = [sampler.build_lookups.remote(X) for sampler in self.samplers][0]
        self.cat_lookup, self.ord_lookup, self.enc2feat_idx = ray.get(lookups)
    def explain(self,
                X: np.ndarray,
                threshold: float = 0.95,
                delta: float = 0.1,
                tau: float = 0.15,
                batch_size: int = 100,
                coverage_samples: int = 10000,
                beam_size: int = 1,
                stop_on_first: bool = False,
                max_anchor_size: Optional[int] = None,
                min_samples_start: int = 1,
                n_covered_ex: int = 10,
                binary_cache_size: int = 10000,
                cache_margin: int = 1000,
                verbose: bool = False,
                verbose_every: int = 1,
                **kwargs: Any) -> Explanation:
        """
        Explains the prediction made by a classifier on instance `X`. Sampling is done in parallel over a number of
        cores specified in `kwargs['ncpu']`.

        Parameters
        ----------
        X, threshold, delta, tau, batch_size, coverage_samples, beam_size, stop_on_first, max_anchor_size, \
        min_samples_start, n_covered_ex, binary_cache_size, cache_margin, verbose, verbose_every, **kwargs
            See :py:meth:`alibi.explainers.anchors.anchor_tabular.AnchorTabular.explain`.

        Returns
        -------
        See :py:meth:`alibi.explainers.anchors.anchor_tabular.AnchorTabular.explain`.
        """
        # transform one-hot encodings to labels if ohe == True
        X = ohe_to_ord(X_ohe=X.reshape(1, -1), cat_vars_ohe=self.cat_vars_ohe)[0].reshape(-1) if self.ohe else X

        # get params for storage in meta
        params = locals()
        remove = ['X', 'self']
        for key in remove:
            params.pop(key)

        for sampler in self.samplers:
            label = sampler.set_instance_label.remote(X)
            sampler.set_n_covered.remote(n_covered_ex)

        self.instance_label = ray.get(label)

        # build feature encoding and mappings from the instance values to database rows where similar records are found
        self._build_sampling_lookups(X)

        # get anchors and add metadata
        mab = DistributedAnchorBaseBeam(
            samplers=self.samplers,
            sample_cache_size=binary_cache_size,
            cache_margin=cache_margin,
            **kwargs,
        )
        result: Any = mab.anchor_beam(
            delta=delta,
            epsilon=tau,
            desired_confidence=threshold,
            beam_size=beam_size,
            min_samples_start=min_samples_start,
            max_anchor_size=max_anchor_size,
            batch_size=batch_size,
            coverage_samples=coverage_samples,
            verbose=verbose,
            verbose_every=verbose_every,
        )
        self.mab = mab

        return self._build_explanation(X, result, self.instance_label, params)
    def reset_predictor(self, predictor: Callable) -> None:
        """
        Resets the predictor function.

        Parameters
        ----------
        predictor
            New model prediction function.
        """
        raise NotImplementedError("Resetting predictor is currently not supported for distributed explainers.")
    # TODO: to support resetting a predictor we would need to re-run most of the code in `fit`, instantiating the
    #  instances of RemoteSampler anew
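# A minimal end-to-end sketch of the distributed explainer. The toy data and
# RandomForestClassifier (and scikit-learn itself) are assumptions for the
# demo only; any predictor returning class labels works.
def _demo_distributed_anchor_tabular() -> Explanation:
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=500, n_features=4, random_state=0)
    clf = RandomForestClassifier(random_state=0).fit(X, y)

    explainer = DistributedAnchorTabular(clf.predict, [f'f{i}' for i in range(4)])
    explainer.fit(X, disc_perc=(25, 50, 75), ncpu=2)  # spawns 2 RemoteSampler actors
    explanation = explainer.explain(X[0], threshold=0.95, batch_size=100)
    print(explanation.anchor)  # predicates making up the anchor

    return explanation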