Source code for longling.ML.metrics.classification

# coding: utf-8
# 2020/4/13 @ tongshiwei

import logging
from longling.lib.candylib import as_list
from collections import OrderedDict
from sklearn.metrics import (classification_report as cr,
                             roc_auc_score, average_precision_score, accuracy_score
                             )

from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import unique_labels
import functools
from longling.ML.metrics.utils import POrderedDict

__all__ = ["classification_report"]


def multiclass2multilabel(y: list):
    """
    Parameters
    ----------
    y: 1d array-like of class labels

    Examples
    --------
    >>> multiclass2multilabel([0, 1, 5])
    matrix([[1., 0., 0.],
            [0., 1., 0.],
            [0., 0., 1.]])
    """
    from sklearn.preprocessing import OneHotEncoder
    # wrap each label in its own single-element list so that OneHotEncoder
    # sees one categorical feature with one value per sample
    ret = []
    for _y in y:
        ret.append([_y])
    return OneHotEncoder().fit_transform(ret).todense()
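
# Note: OneHotEncoder infers its categories from the values actually present in
# ``y``, so non-contiguous labels such as [0, 1, 5] are encoded into consecutive
# columns (hence the 3x3 identity in the doctest above).  A rough numpy-only
# sketch of the same transformation, kept as a comment for illustration:
#
#     classes = numpy.unique(y)
#     one_hot = (numpy.asarray(y)[:, None] == classes).astype(float)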


def classification_report(y_true, y_pred=None, y_score=None, labels=None, metrics=None,
                          sample_weight=None, average_options=None,
                          multiclass_to_multilabel=False, logger=logging, **kwargs):
    """
    Currently supports binary and multiclass classification.

    Parameters
    ----------
    y_true : list, 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : list or None, 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    y_score : array or None, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive class,
        confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).
        For binary y_true, y_score is supposed to be the score of the class with greater label.
    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    metrics : list of str
        Supported: precision, recall, f1, support, accuracy, auc, aupoc.
    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    average_options : str or list
        Default to "macro". Choices (one or many): "micro", "macro", "samples", "weighted".
    multiclass_to_multilabel : bool
    logger

    Returns
    -------
    ret : POrderedDict

    Examples
    --------
    >>> import numpy as np
    >>> # binary classification
    >>> y_true = np.array([0, 0, 1, 1, 0])
    >>> y_pred = np.array([0, 1, 0, 1, 0])
    >>> classification_report(y_true, y_pred)
               precision    recall        f1  support
    0           0.666667  0.666667  0.666667        3
    1           0.500000  0.500000  0.500000        2
    macro_avg   0.583333  0.583333  0.583333        5
    accuracy: 0.600000
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
    >>> classification_report(y_true, y_score=y_score)  # doctest: +NORMALIZE_WHITESPACE
    macro_auc: 0.750000  macro_aupoc: 0.833333
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_pred = [0, 0, 0, 1]
    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
    >>> classification_report(y_true, y_pred, y_score=y_score)  # doctest: +NORMALIZE_WHITESPACE
               precision  recall        f1  support
    0           0.666667    1.00  0.800000        2
    1           1.000000    0.50  0.666667        2
    macro_avg   0.833333    0.75  0.733333        4
    accuracy: 0.750000  macro_auc: 0.750000  macro_aupoc: 0.833333
    >>> # multiclass classification
    >>> y_true = [0, 1, 2, 2, 2]
    >>> y_pred = [0, 0, 2, 2, 1]
    >>> classification_report(y_true, y_pred)
               precision    recall        f1  support
    0                0.5  1.000000  0.666667        1
    1                0.0  0.000000  0.000000        1
    2                1.0  0.666667  0.800000        3
    macro_avg        0.5  0.555556  0.488889        5
    accuracy: 0.600000
    >>> # multiclass in multilabel
    >>> y_true = np.array([0, 0, 1, 1, 2, 1])
    >>> y_pred = np.array([2, 1, 0, 2, 1, 0])
    >>> y_score = np.array([
    ...     [0.15, 0.4, 0.45],
    ...     [0.1, 0.9, 0.0],
    ...     [0.33333, 0.333333, 0.333333],
    ...     [0.15, 0.4, 0.45],
    ...     [0.1, 0.9, 0.0],
    ...     [0.33333, 0.333333, 0.333333]
    ... ])
    >>> classification_report(
    ...     y_true, y_pred, y_score,
    ...     multiclass_to_multilabel=True,
    ...     metrics=["aupoc"]
    ... )
                  aupoc
    0          0.291667
    1          0.416667
    2          0.166667
    macro_avg  0.291667
    >>> classification_report(
    ...     y_true, y_pred, y_score,
    ...     multiclass_to_multilabel=True,
    ...     metrics=["auc", "aupoc"]
    ... )
                    auc     aupoc
    0          0.250000  0.291667
    1          0.055556  0.416667
    2          0.100000  0.166667
    macro_avg  0.135185  0.291667
    macro_auc: 0.194444
    >>> y_true = np.array([0, 1, 1, 1, 2, 1])
    >>> y_pred = np.array([2, 1, 0, 2, 1, 0])
    >>> y_score = np.array([
    ...     [0.45, 0.4, 0.15],
    ...     [0.1, 0.9, 0.0],
    ...     [0.33333, 0.333333, 0.333333],
    ...     [0.15, 0.4, 0.45],
    ...     [0.1, 0.9, 0.0],
    ...     [0.33333, 0.333333, 0.333333]
    ... ])
    >>> classification_report(
    ...     y_true, y_pred,
    ...     y_score,
    ...     multiclass_to_multilabel=True,
    ... )  # doctest: +NORMALIZE_WHITESPACE
               precision    recall        f1   auc     aupoc  support
    0           0.000000  0.000000  0.000000  1.00  1.000000        1
    1           0.500000  0.250000  0.333333  0.25  0.583333        4
    2           0.000000  0.000000  0.000000  0.10  0.166667        1
    macro_avg   0.166667  0.083333  0.111111  0.45  0.583333        6
    accuracy: 0.166667  macro_auc: 0.437500
    >>> classification_report(
    ...     y_true, y_pred,
    ...     y_score,
    ...     labels=[0, 1],
    ...     multiclass_to_multilabel=True,
    ... )  # doctest: +NORMALIZE_WHITESPACE
               precision  recall        f1   auc     aupoc  support
    0               0.00   0.000  0.000000  1.00  1.000000        1
    1               0.50   0.250  0.333333  0.25  0.583333        4
    macro_avg       0.25   0.125  0.166667  0.45  0.583333        5
    accuracy: 0.166667  macro_auc: 0.437500
    """
    if y_pred is not None:
        check_consistent_length(y_true, y_pred)
    if y_score is not None:
        check_consistent_length(y_true, y_score)

    assert y_pred is not None or y_score is not None

    average_options = set(as_list(average_options) if average_options else ["macro"])
    average_label_fmt = "{average}_avg"
    average_metric_fmt = "{average}_{metric}"

    if y_pred is not None:
        _unique_labels = unique_labels(y_true, y_pred)
    else:
        _unique_labels = unique_labels(y_true)
    labels = _unique_labels if labels is None else labels
    labels_set = set(labels)

    if not metrics:
        if y_pred is not None:
            metrics = [
                "accuracy", "precision", "recall", "f1",
            ]
        else:
            metrics = []
        if y_score is not None:
            metrics += [
                "auc", "aupoc",
            ]
        if y_pred is not None:
            metrics += ["support"]
    _metrics = set(metrics)

    ret = OrderedDict()

    # precision / recall / f1 / support / accuracy via sklearn's classification_report
    if _metrics & {"precision", "recall", "f1", "support", "accuracy"}:
        logger.info("evaluate %s" % ",".join(_metrics & {"precision", "recall", "f1", "support", "accuracy"}))
        cr_result = cr(y_true, y_pred, labels=labels, sample_weight=sample_weight, output_dict=True)
        if "accuracy" in cr_result:
            acc = cr_result.pop("accuracy")
        else:
            acc = accuracy_score(y_true, y_pred)
        if "accuracy" in _metrics:
            ret["accuracy"] = acc
        for key, value in cr_result.items():
            ret[key] = {}
            for k in _metrics & {"precision", "recall", "f1", "support", "accuracy"}:
                _k = k if k != "f1" else "f1-score"
                if _k in value:
                    ret[key][k] = value[_k]
        # rename sklearn's "macro avg"-style keys to "macro_avg" and drop the
        # averages that were not requested
        for average in ["micro", "macro", "samples", "weighted"]:
            _label = average_label_fmt.format(average=average)
            __label = " ".join(_label.split("_"))
            _prefix = __label.split(" ")[0]
            if _prefix in average_options:
                ret[_label] = ret.pop(__label)
            elif __label in ret:
                ret.pop(__label)

    # roc auc, reported per label when the multiclass problem is cast to multilabel
    if "auc" in _metrics:
        logger.info("evaluate auc")
        assert y_score is not None, "when evaluating auc, y_score is required"
        func = functools.partial(roc_auc_score, y_score=y_score, sample_weight=sample_weight,
                                 **kwargs.get("auc", {"multi_class": 'ovo'}))
        if multiclass_to_multilabel:
            _y_true = multiclass2multilabel(y_true)
            auc_score = func(y_true=_y_true, average=None)
            for _label, score in enumerate(auc_score):
                if _label not in labels_set:
                    continue
                if str(_label) not in ret:
                    ret[str(_label)] = {}
                ret[str(_label)]["auc"] = score
            for average in average_options:
                auc_score = func(y_true=_y_true, average=average)
                _label = average_label_fmt.format(average=average)
                if _label not in ret:
                    ret[_label] = {}
                ret[_label]["auc"] = auc_score
        for average in average_options:
            auc_score = func(y_true=y_true, average=average)
            _label = average_metric_fmt.format(average=average, metric="auc")
            ret[_label] = auc_score

    # average precision (aupoc), same treatment as auc
    if "aupoc" in _metrics:
        logger.info("evaluate aupoc")
        func = functools.partial(average_precision_score, y_score=y_score, sample_weight=sample_weight)
        if multiclass_to_multilabel:
            _y_true = multiclass2multilabel(y_true)
            aupoc = func(y_true=_y_true, average=None)
            for _label, score in enumerate(aupoc):
                if _label not in labels_set:
                    continue
                if str(_label) not in ret:
                    ret[str(_label)] = {}
                ret[str(_label)]["aupoc"] = score
            for average in average_options:
                _label = average_label_fmt.format(average=average)
                aupoc = func(y_true=_y_true, average=average)
                if _label not in ret:
                    ret[_label] = {}
                ret[_label]["aupoc"] = aupoc
        if len(_unique_labels) == 2:
            for average in average_options:
                aupoc = func(y_true=y_true)
                _label = average_metric_fmt.format(average=average, metric="aupoc")
                ret[_label] = aupoc

    # keep only the requested metrics, in the order they were asked for
    logger.info("sorting metrics")
    _ret = POrderedDict()
    for key in ret:
        if isinstance(ret[key], dict):
            _ret[key] = OrderedDict()
            for k in metrics:
                if k in ret[key]:
                    _ret[key][k] = ret[key][k]
        else:
            _ret[key] = ret[key]

    return _ret
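

if __name__ == '__main__':
    # Minimal usage sketch mirroring the binary-classification doctest above.
    # It assumes numpy is installed and relies on the fact, visible in the code
    # above, that the returned POrderedDict can be indexed like a nested dict.
    import numpy as np

    _y_true = np.array([0, 0, 1, 1, 0])
    _y_pred = np.array([0, 1, 0, 1, 0])

    report = classification_report(_y_true, _y_pred)
    print(report)                      # pretty-printed table, as in the doctests
    print(report["accuracy"])          # scalar metrics live under top-level keys
    print(report["macro_avg"]["f1"])   # per-label and averaged metrics are nested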