# Source code for datalib.metrics._plot.cap_curve

"""
Module containing the implementation of the CAP Curve Display.
"""
import matplotlib.pyplot as plt
import numpy as np

from sklearn.utils.validation import _check_pos_label_consistency
from sklearn.utils import check_matplotlib_support
from sklearn.utils._plotting import _BinaryClassifierCurveDisplayMixin
from .. import cap_curve


class CAPCurveDisplay(_BinaryClassifierCurveDisplayMixin):
    """CAP Curve visualization.

    The display stores already-computed curve values and draws them with
    matplotlib. Use :meth:`from_estimator` or :meth:`from_predictions` to
    compute the curve and plot it in a single call.

    Parameters
    ----------
    cumulative_gains : ndarray
        Cumulative gain with each threshold (percentage of class 1).

    thresholds : ndarray
        Increasing thresholds (percentage of examples) on the decision
        function used to compute cap curve.

    positive_rate : float, default=None
        Rate of positive class examples. It is used as the x-coordinate at
        which the perfect curve reaches 1. If None, the perfect curve is
        never drawn.

    gini : float, default=None
        Gini score. If None, the gini score is not shown.

    estimator_name : str, default=None
        Name of estimator. If None, the estimator name is not shown.

    pos_label : str or int, default=None
        The class considered as the positive class when computing
        the CAP curve.
        By default, `estimator.classes_[1]` is considered
        as the positive class.

    Attributes
    ----------
    line_ : matplotlib Artist
        CAP Curve.
    ax_ : matplotlib Axes
        Axes with CAP Curve.
    figure_ : matplotlib Figure
        Figure containing the curve.
    """

    def __init__(
        self,
        *,
        cumulative_gains,
        thresholds,
        positive_rate=None,
        gini=None,
        estimator_name=None,
        pos_label=None,
    ):
        self.estimator_name = estimator_name
        self.cumulative_gains = cumulative_gains
        self.thresholds = thresholds
        self.positive_rate = positive_rate
        self.gini = gini
        self.pos_label = pos_label

    def plot(
        self,
        *,
        plot_random=False,
        plot_perfect=False,
        name=None,
        ax=None,
        **kwargs,
    ):
        """Plot visualization.

        Extra keyword arguments will be passed to matplotlib's ``plot``.

        Parameters
        ----------
        plot_random : bool, default=False
            Whether to plot the baseline random curve (the diagonal).

        plot_perfect : bool, default=False
            Whether to plot the baseline perfect curve. Has no effect when
            `positive_rate` is None.

        name : str, default=None
            Name of CAP Curve for labeling. If `None`, use `estimator_name` if
            not `None`, otherwise no labeling is shown.

        ax : matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.

        **kwargs : dict
            Keyword arguments passed to matplotlib's ``plot`` for the CAP
            curve line. A ``label`` given here overrides the default one.

        Returns
        -------
        display : :class:`~datalib.metrics.CAPCurveDisplay`
            Object that stores computed values.
        """
        check_matplotlib_support("CAPCurveDisplay.plot")

        name = self.estimator_name if name is None else name

        # Build the default legend entry from whatever is available.
        line_kwargs = {}
        if self.gini is not None and name is not None:
            line_kwargs["label"] = f"{name} (GINI = {self.gini:0.2f})"
        elif self.gini is not None:
            line_kwargs["label"] = f"Gini = {self.gini:0.2f}"
        elif name is not None:
            line_kwargs["label"] = name

        # User-supplied keywords take precedence over the defaults above.
        line_kwargs.update(kwargs)

        if ax is None:
            _, ax = plt.subplots()

        # The legend is needed as soon as any labelled line is drawn, not
        # only when the CAP curve itself carries a label.
        show_legend = "label" in line_kwargs

        # Truthiness (rather than ``is True``) so that values such as
        # ``numpy.bool_`` flags are honoured as well.
        if plot_random:
            ax.plot([0, 1], [0, 1], linestyle="--", label="Random Model")
            show_legend = True

        if plot_perfect and self.positive_rate is not None:
            # A perfect model captures every positive within the first
            # `positive_rate` share of the observations.
            ax.plot(
                [0, self.positive_rate, 1],
                [0, 1, 1],
                label="Perfect Model",
            )
            show_legend = True

        (self.line_,) = ax.plot(
            self.thresholds, self.cumulative_gains, **line_kwargs
        )
        info_pos_label = (
            f" (Positive label: {self.pos_label})"
            if self.pos_label is not None
            else ""
        )

        xlabel = "% of Observations" + info_pos_label
        ylabel = "% of Positive Observations" + info_pos_label
        ax.set(xlabel=xlabel, ylabel=ylabel)

        if show_legend:
            ax.legend(loc="lower right")

        self.ax_ = ax
        self.figure_ = ax.figure
        return self

    @classmethod
    def from_estimator(
        cls,
        estimator,
        X,
        y,
        *,
        sample_weight=None,
        response_method="auto",
        pos_label=None,
        plot_random=False,
        plot_perfect=False,
        name=None,
        ax=None,
        **kwargs,
    ):
        """Create a CAP Curve display from an estimator.

        Parameters
        ----------
        estimator : estimator instance
            Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
            in which the last estimator is a classifier.

        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input values.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        response_method : {'predict_proba', 'decision_function', 'auto'} \
                default='auto'
            Specifies whether to use :term:`predict_proba` or
            :term:`decision_function` as the target response. If set to 'auto',
            :term:`predict_proba` is tried first and if it does not exist
            :term:`decision_function` is tried next.

        pos_label : str or int, default=None
            The class considered as the positive class when computing the
            CAP curve. By default, `estimator.classes_[1]` is considered
            as the positive class.

        plot_random : bool, default=False
            Whether to plot the baseline random curve.

        plot_perfect : bool, default=False
            Whether to plot the baseline perfect curve.

        name : str, default=None
            Name of CAP Curve for labeling. If `None`, use the name of the
            estimator.

        ax : matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.

        **kwargs : dict
            Keyword arguments to be passed to matplotlib's `plot`.

        Returns
        -------
        display : :class:`~datalib.metrics.CAPCurveDisplay`
            The CAP Curve display.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from datalib.metrics import CAPCurveDisplay
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.svm import SVC
        >>> X, y = make_classification(random_state=0)
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
        >>> clf = SVC(random_state=0).fit(X_train, y_train)
        >>> CAPCurveDisplay.from_estimator(clf, X_test, y_test)
        <...>
        >>> plt.show()
        """
        check_matplotlib_support(f"{cls.__name__}.from_estimator")

        # The mixin helper picks the response method and resolves the
        # positive label and the display name.
        y_score, pos_label, name = cls._validate_and_get_response_values(
            estimator,
            X,
            y,
            response_method=response_method,
            pos_label=pos_label,
            name=name,
        )

        return cls.from_predictions(
            y_true=y,
            y_score=y_score,
            sample_weight=sample_weight,
            pos_label=pos_label,
            plot_random=plot_random,
            plot_perfect=plot_perfect,
            name=name,
            ax=ax,
            **kwargs,
        )

    @classmethod
    def from_predictions(
        cls,
        y_true,
        y_score,
        *,
        sample_weight=None,
        pos_label=None,
        plot_random=False,
        plot_perfect=False,
        name=None,
        ax=None,
        **kwargs,
    ):
        """Plot CAP curve given the true and predicted score.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            True labels.

        y_score : array-like of shape (n_samples,)
            Target scores, can either be probability estimates of the positive
            class, confidence values, or non-thresholded measure of decisions
            (as returned by "decision_function" on some classifiers).

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        pos_label : str or int, default=None
            The label of the positive class. When `pos_label=None`, if `y_true`
            is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an
            error will be raised.

        plot_random : bool, default=False
            Whether to plot the baseline random curve.

        plot_perfect : bool, default=False
            Whether to plot the baseline perfect curve. The curve is built
            from the share of `y_true` entries equal to `pos_label`.

        name : str, default=None
            Name of CAP curve for labeling. If `None`, name will be set to
            `"Classifier"`.

        ax : matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.

        **kwargs : dict
            Additional keywords arguments passed to matplotlib `plot` function.

        Returns
        -------
        display : :class:`~datalib.metrics.CAPCurveDisplay`
            Object that stores computed values.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from datalib.metrics import CAPCurveDisplay
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.svm import SVC
        >>> X, y = make_classification(random_state=0)
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
        >>> clf = SVC(random_state=0, probability=True).fit(X_train, y_train)
        >>> y_pred = clf.predict_proba(X_test)[:, 1]
        >>> CAPCurveDisplay.from_predictions(y_test, y_pred)
        <...>
        >>> plt.show()
        """
        check_matplotlib_support(f"{cls.__name__}.from_predictions")

        # NOTE(review): `cap_curve` is not given `pos_label`; confirm that it
        # handles label sets other than {0, 1} as intended.
        cumulative_gains, thresholds, gini = cap_curve(
            y_true, y_score, sample_weight
        )

        name = "Classifier" if name is None else name
        pos_label = _check_pos_label_consistency(pos_label, y_true)

        # Share of positives, counted against the resolved positive label so
        # that label sets such as {-1, 1} or strings give a valid rate
        # (summing the raw labels is only correct for {0, 1}).
        # NOTE(review): unweighted, as before; confirm whether
        # `sample_weight` should be taken into account here.
        positive_rate = (
            np.mean(np.asarray(y_true) == pos_label) if plot_perfect else None
        )

        # `cls` rather than the hard-coded class so subclasses get an
        # instance of their own type back.
        viz = cls(
            cumulative_gains=cumulative_gains,
            thresholds=thresholds,
            positive_rate=positive_rate,
            gini=gini,
            estimator_name=name,
            pos_label=pos_label,
        )

        return viz.plot(
            ax=ax,
            name=name,
            plot_random=plot_random,
            plot_perfect=plot_perfect,
            **kwargs,
        )