# Source code for gojo.plotting.classification

# Module with ad-hoc plotting functions to represent results for classification problems.
#
# Author: Fernando García Gutiérrez
# Email: ga.gu.fernando.concat@gmail.com
#
# STATUS: completed, functional, and documented.
#
import warnings
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    roc_auc_score)

from ..util.validation import (
    checkMultiInputTypes,
)


def confusionMatrix(
        df: pd.DataFrame,
        y_pred: str,
        y_true: str,
        average: str = None,
        y_pred_threshold: float or None = None,
        normalize: bool = True,
        labels: list = None,
        ax: mpl.axes.Axes = None,
        figsize: tuple = (5, 4),
        dpi: int = 100,
        cmap: str = 'Blues',
        alpha: float = 0.7,
        cm_font_size: int = 14,
        xaxis_label: str = None,
        yaxis_label: str = None,
        axis_label_size: int = 15,
        axis_label_pad: int = 15,
        axis_tick_size: int = 12,
        title: str = '',
        title_size: int = 15,
        title_pad: int = 15,
        save: str = None,
        save_kw: dict = None,
        show: bool = True):
    """ Function used to represent a confusion matrix from a pandas DataFrame with predictions and true values
    (e.g., returned by methods :meth:`gojo.core.report.CVReport.getTestPredictions` and
    :meth:`gojo.core.report.CVReport.getTrainPredictions`).

    Parameters
    ----------
    df : pd.DataFrame
        Pandas DataFrame with the model predictions (one column with the predicted
        values and one column with the ground-truth values).

    y_pred : str
        Column of `df` containing the values predicted by the model.

    y_true : str
        Column of `df` containing the ground-truth values.

    average : str, default=None
        Column that stratifies the predictions (e.g., at the folds level) to represent the
        mean and standard deviation values of the confusion matrix.

    y_pred_threshold : float or None, default=None
        Threshold used to binarize the model predictions (`pred > threshold -> 1`).
        If None, predictions are assumed to already be labels.

    normalize : bool, default=True
        Parameter indicating whether to express the normalized confusion matrix (as a percentage).

    labels : list, default=None
        Labels used to identify the classes. By default, they will be C1, C2, ..., CX.

    ax : matplotlib.axes.Axes, default=None
        Axes used to represent the figure. If None, a new figure is created.

    figsize : tuple, default=(5, 4)
        Figure size (only used when `ax` is None).

    dpi : int, default=100
        Figure dpi (only used when `ax` is None).

    cmap : str, default='Blues'
        Colormap.

    alpha : float, default=0.7
        Plot opacity.

    cm_font_size : int, default=14
        Confusion matrix font size.

    xaxis_label : str, default=None
        X-axis label. Defaults to "Predicted label".

    yaxis_label : str, default=None
        Y-axis label. Defaults to "True label".

    axis_label_size : int, default=15
        XY-axis label size.

    axis_label_pad : int, default=15
        XY-axis pad.

    axis_tick_size : int, default=12
        XY-ticks size.

    title : str, default=''
        Title.

    title_size : int, default=15
        Title size.

    title_pad : int, default=15
        Title pad.

    save : str, default=None
        Parameter indicating whether to save the generated plot. If None (default) the plot
        will not be saved.

    save_kw : dict, default=None
        Optional parameters for saving the plot. This parameter will not have effect if the
        `save` parameter was set as None.

    show : bool, default=True
        Parameter indicating whether to display the generated plot.

    Examples
    --------
    >>> from gojo import core
    >>> from gojo import plotting
    >>>
    >>> # ... data loading and model definition
    >>>
    >>> # perform the cross validation
    >>> cv_report = core.evalCrossVal(
    ...     X=X,
    ...     y=y,
    ...     model=model,
    ...     cv=util.getCrossValObj(cv=5)
    ... )
    >>>
    >>> # get the model predictions on the test data
    >>> predictions = cv_report.getTestPredictions()
    >>>
    >>> # plot the confusion matrix
    >>> plotting.confusionMatrix(
    ...     df=predictions,
    ...     y_pred='pred_labels',
    ...     y_true='true_labels',
    ...     average='n_fold',
    ...     normalize=True,
    ...     labels=['Class 1', 'Class 2'],
    ...     title='Confusion matrix',
    ... )
    """
    checkMultiInputTypes(
        ('df', df, [pd.DataFrame]),
        ('y_pred', y_pred, [str]),
        ('y_true', y_true, [str]),
        ('average', average, [str, type(None)]),
        # NOTE: int thresholds (e.g., 0) are also valid for the `>` comparison below
        ('y_pred_threshold', y_pred_threshold, [int, float, type(None)]),
        ('normalize', normalize, [bool]),
        ('labels', labels, [list, type(None)]),
        ('ax', ax, [mpl.axes.Axes, type(None)]),
        ('figsize', figsize, [tuple]),
        ('dpi', dpi, [int]),
        ('cmap', cmap, [str]),
        ('alpha', alpha, [float]),
        ('cm_font_size', cm_font_size, [int]),
        ('xaxis_label', xaxis_label, [str, type(None)]),
        ('yaxis_label', yaxis_label, [str, type(None)]),
        ('axis_label_size', axis_label_size, [int]),
        ('axis_label_pad', axis_label_pad, [int]),
        ('axis_tick_size', axis_tick_size, [int]),
        ('title', title, [str]),
        ('title_size', title_size, [int, float]),
        ('title_pad', title_pad, [int, float]),
        ('save', save, [str, type(None)]),
        ('save_kw', save_kw, [dict, type(None)]),
        ('show', show, [bool]))

    # avoid inplace modifications when resetting index and applying threshold
    df = df.copy().reset_index()

    if y_pred not in df.columns:
        raise TypeError('Missing "%s" column in dataframe. Available columns are: %r' % (y_pred, list(df.columns)))
    if y_true not in df.columns:
        raise TypeError('Missing "%s" column in dataframe. Available columns are: %r' % (y_true, list(df.columns)))

    # select default parameters
    if xaxis_label is None:
        xaxis_label = 'Predicted label'
    if yaxis_label is None:
        yaxis_label = 'True label'

    # font layout applied to the in-cell annotations
    font_layout = {
        'family': 'sans-serif',
        'weight': 'normal',
        'size': cm_font_size}

    # binarize input predictions
    if y_pred_threshold is not None:
        df[y_pred] = (df[y_pred] > y_pred_threshold).astype(int)

    # calculate confusion matrices
    cms = []
    if average is not None:
        # calculate a confusion matrix per fold
        if average not in df.columns:
            raise TypeError(
                'Missing "%s" column in dataframe. Available columns are: %r' % (average, list(df.columns)))
        for _, sub_df in df.groupby(average):
            cms.append(confusion_matrix(
                y_true=sub_df[y_true].values,
                y_pred=sub_df[y_pred].values,
                normalize='true' if normalize else None))
    else:
        # calculate a global confusion matrix
        cms.append(confusion_matrix(
            y_true=df[y_true].values,
            y_pred=df[y_pred].values,
            normalize='true' if normalize else None))

    # stack confusion matrices along a new leading axis -> (n_folds, n_classes, n_classes)
    cms = np.stack(cms)

    # if the values were normalized represent it as percentages
    if normalize:
        cms = cms * 100

    # format confusion matrix representation (string annotations placed in each cell)
    if average is None:
        assert cms.shape[0] == 1, 'Internal error (0)'
        cms = cms[0]
        avg_cms = cms
        cms_repr = np.empty(shape=cms.shape, dtype=object)
        for i in range(cms.shape[0]):
            for j in range(cms.shape[1]):
                cms_repr[i, j] = ('%.2f' % cms[i, j]) if normalize else ('%d' % cms[i, j])
    else:
        # mean +/- std across the levels of `average`
        avg_cms = cms.mean(axis=0)
        std_cms = cms.std(axis=0)
        cms_repr = np.empty(shape=cms.shape[1:], dtype=object)
        for i in range(cms_repr.shape[0]):
            for j in range(cms_repr.shape[1]):
                str_val = ('%.2f' % avg_cms[i, j]) + r'$\pm$' + ('%.2f' % std_cms[i, j])
                cms_repr[i, j] = str_val

    # default ticks labels
    if labels is None:
        labels = ['C%d' % (i + 1) for i in range(cms_repr.shape[0])]

    if len(labels) != cms_repr.shape[0]:
        raise TypeError(
            'Number of classes (%d), number of labels provided in the param "labels" (%d)' % (
                cms_repr.shape[0], len(labels)))

    # create figure layout
    with plt.style.context('bmh'):
        if ax is None:
            fig, ax = plt.subplots(figsize=figsize)
            fig.set_dpi(dpi)

        # represent the average value
        ax.matshow(avg_cms, cmap=cmap, alpha=alpha)

        # annotate confusion matrix
        for i in range(cms_repr.shape[0]):
            for j in range(cms_repr.shape[1]):
                ax.text(
                    x=j, y=i, s=cms_repr[i, j],
                    va='center', ha='center', fontdict=font_layout)

        # formal layout
        ax.grid(False)
        ax.set_title(title, size=title_size, pad=title_pad)
        ax.set_xlabel(xaxis_label, fontsize=axis_label_size, labelpad=axis_label_pad)
        ax.set_ylabel(yaxis_label, fontsize=axis_label_size, labelpad=axis_label_pad)

        # remove axis-ticks (setting tick labels without fixed locators emits a
        # matplotlib UserWarning, deliberately silenced here)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ax.set_yticklabels([''] + labels, fontsize=axis_tick_size)
            ax.set_xticklabels([''] + labels, fontsize=axis_tick_size)

        # change the position of X-axis ticks to the bottom
        ax.xaxis.set_ticks_position('bottom')

    # save figure if specified
    if save:
        save_kw = {} if save_kw is None else save_kw
        plt.savefig(save, **save_kw)

    if show:
        plt.show()
def roc(
        df: pd.DataFrame,
        y_pred: str,
        y_true: str,
        average: str = None,
        stratify: str = None,
        n_roc_points: int = 200,
        add_auc_info: bool = True,
        labels: dict = None,
        labels_order: list = None,
        show_random: bool = True,
        random_ls: str = 'dotted',
        random_lw: int or float = 1,
        random_color: str = 'black',
        random_label: str = 'Random',
        ax: mpl.axes.Axes = None,
        figsize: tuple = (5, 4),
        dpi: int = 100,
        style: str = 'ggplot',
        xaxis_label: str = None,
        yaxis_label: str = None,
        lw: float or int or list = None,
        ls: str or list = None,
        colors: list or str = None,
        err_alpha: float = 0.3,
        title: str = '',
        title_size: int or float = 15,
        title_pad: int = 15,
        hide_legend: bool = False,
        legend_pos: str = 'lower right',
        legend_size: int or float = 10,
        xlabel_size: float or int = 13,
        ylabel_size: float or int = 13,
        grid_alpha: float = 0.5,
        save: str = None,
        save_kw: dict = None,
        show: bool = True):
    """ Function used to represent a ROC curve from a pandas DataFrame with predictions and true values
    (e.g., returned by methods :meth:`gojo.core.report.CVReport.getTestPredictions` and
    :meth:`gojo.core.report.CVReport.getTrainPredictions`).

    Parameters
    ----------
    df : pd.DataFrame
        Pandas DataFrame with the model predictions (one column with the predicted
        scores and one column with the ground-truth values).

    y_pred : str
        Column of `df` containing the scores predicted by the model.

    y_true : str
        Column of `df` containing the ground-truth values.

    average : str, default=None
        Column that stratifies the predictions (e.g., at the folds level) to represent the
        mean and standard deviation of the ROC curve.

    stratify : str, default=None
        Column used to separate the predictions made by different models.

    n_roc_points : int, default=200
        Number of ROC points to be calculated in order to represent the ROC curve.

    add_auc_info : bool, default=True
        Parameter indicating whether to display the AUC value associated with each model
        in the legend.

    labels : dict, default=None
        Labels used to identify the models; if not provided the values of the variable
        specified in `stratify` or a default value of "Model" will be used. The labels
        should be provided as a dictionary where the key will be the value that identifies
        the model in the input data and the value will be the name given to the model.

    labels_order : list, default=None
        Order in which the labels will be displayed; by default they will be sorted, or
        if parameter `labels` is provided they will appear in the order defined in that
        input parameter.

    show_random : bool, default=True
        Indicates whether to display the ROC curve associated with a random model.

    random_ls : str, default='dotted'
        Random line style.

    random_lw : int or float, default=1
        Random line width.

    random_color : str, default='black'
        Random line color.

    random_label : str, default='Random'
        Random line label.

    ax : matplotlib.axes.Axes, default=None
        Axes used to represent the figure. If None, a new figure is created.

    figsize : tuple, default=(5, 4)
        Figure size (only used when `ax` is None).

    dpi : int, default=100
        Figure dpi (only used when `ax` is None).

    style : str, default='ggplot'
        Plot styling (see 'matplotlib.pyplot.styles').

    xaxis_label : str, default=None
        X-axis label. Defaults to "False positive rate".

    yaxis_label : str, default=None
        Y-axis label. Defaults to "True positive rate".

    lw : float or int or list, default=None
        Line width(s).

    ls : str or list, default=None
        Line style(s).

    colors : list or str, default=None
        Colors used for identifying the dataframe information. A string colormap can be
        provided.

    err_alpha : float, default=0.3
        Opacity of the error shadow.

    title : str, default=''
        Plot title.

    title_size : int or float, default=15
        Title font size.

    title_pad : int, default=15
        Title pad.

    hide_legend : bool, default=False
        Parameter indicating whether to hide the legend.

    legend_pos : str, default='lower right'
        Legend position.

    legend_size : int or float, default=10
        Legend size.

    xlabel_size : float or int, default=13
        Size of the x-label.

    ylabel_size : float or int, default=13
        Size of the y-label.

    grid_alpha : float, default=0.5
        Grid lines opacity.

    save : str, default=None
        Parameter indicating whether to save the generated plot. If None (default) the
        plot will not be saved.

    save_kw : dict, default=None
        Optional parameters for saving the plot. This parameter will not have effect if
        the `save` parameter was set as None.

    show : bool, default=True
        Parameter indicating whether to display the generated plot.

    Examples
    --------
    >>> from gojo import core
    >>> from gojo import plotting
    >>>
    >>> # ... model definition and data loading
    >>>
    >>> # train the models
    >>> model1.train(X_train, y_train)
    >>> model2.train(X_train, y_train)
    >>>
    >>> # perform inference on the new data
    >>> y_preds1 = model1.performInference(X_test)
    >>> y_preds2 = model2.performInference(X_test)
    >>>
    >>> # gather the predictions on a single dataframe
    >>> model1_df = pd.DataFrame({
    ...     'y_pred': y_preds1,
    ...     'y_true': y_test,
    ...     'model': ['Model 1'] * y_test.shape[0]
    ... })
    >>> model2_df = pd.DataFrame({
    ...     'y_pred': y_preds2,
    ...     'y_true': y_test,
    ...     'model': ['Model 2'] * y_test.shape[0]
    ... })
    >>> model_preds = pd.concat([model1_df, model2_df], axis=0)
    >>>
    >>> # display the ROC curve
    >>> plotting.roc(
    ...     df=model_preds,
    ...     y_pred='y_pred',
    ...     y_true='y_true',
    ...     stratify='model')
    """
    # check input parameters
    checkMultiInputTypes(
        ('df', df, [pd.DataFrame]),
        ('y_pred', y_pred, [str]),
        ('y_true', y_true, [str]),
        ('average', average, [str, type(None)]),
        ('stratify', stratify, [str, type(None)]),
        ('n_roc_points', n_roc_points, [int]),
        ('add_auc_info', add_auc_info, [bool]),
        ('labels', labels, [dict, type(None)]),
        ('labels_order', labels_order, [list, type(None)]),
        ('show_random', show_random, [bool]),
        ('random_ls', random_ls, [str]),
        ('random_lw', random_lw, [int, float]),
        ('random_color', random_color, [str]),
        ('random_label', random_label, [str]),
        ('ax', ax, [mpl.axes.Axes, type(None)]),
        ('figsize', figsize, [tuple]),
        ('dpi', dpi, [int]),
        ('style', style, [str]),
        ('xaxis_label', xaxis_label, [str, type(None)]),
        ('yaxis_label', yaxis_label, [str, type(None)]),
        ('lw', lw, [float, int, list, type(None)]),
        ('ls', ls, [str, list, type(None)]),
        ('colors', colors, [list, str, type(None)]),
        ('err_alpha', err_alpha, [float]),
        ('title', title, [str]),
        ('title_size', title_size, [int, float]),
        ('title_pad', title_pad, [int]),
        ('hide_legend', hide_legend, [bool]),
        ('legend_pos', legend_pos, [str]),
        ('legend_size', legend_size, [int, float]),
        ('xlabel_size', xlabel_size, [float, int]),
        ('ylabel_size', ylabel_size, [float, int]),
        ('grid_alpha', grid_alpha, [float]),
        ('save', save, [str, type(None)]),
        ('save_kw', save_kw, [dict, type(None)]),
        ('show', show, [bool]),
    )

    # make a copy of the input dataframe and reset the index
    df = df.copy().reset_index()

    # check variable existence
    if y_pred not in df.columns:
        raise TypeError(
            'Missing "y_pred" variable "%s". Available variables are: %r' % (y_pred, list(df.columns)))
    if y_true not in df.columns:
        raise TypeError(
            'Missing "y_true" variable "%s". Available variables are: %r' % (y_true, list(df.columns)))
    if average is not None:
        if average not in df.columns:
            raise TypeError(
                'Missing "average" variable "%s". Available variables are: %r' % (average, list(df.columns)))

    # select default parameters
    if xaxis_label is None:
        xaxis_label = 'False positive rate'
    if yaxis_label is None:
        yaxis_label = 'True positive rate'

    # extract predictions for individual models
    model_preds = []
    labels_ = []
    if stratify is not None:
        if stratify not in df.columns:
            raise TypeError(
                'Missing "stratify" variable "%s". Available variables are: %r' % (stratify, list(df.columns)))
        for label, preds_df in df.groupby(stratify):
            labels_.append(label)
            model_preds.append(preds_df)

        # sort label order (a labels_order list acts as an identity labels mapping)
        if labels is None and labels_order is not None:
            if len(set(labels_order)) != len(labels_order):
                raise ValueError('Duplicated model name in input labels (parameter labels_order) "%r"' % labels_order)
            labels = {l: l for l in labels_order}

        # rename and sort the models following the order of the `labels` mapping
        if labels is not None:
            sorted_labels = []
            sorted_preds = []
            for lkey, lval in labels.items():
                for idx, label_ in enumerate(labels_):
                    if label_ == lkey:
                        if lval in sorted_labels:
                            raise ValueError('Duplicated model name "%s"' % lval)
                        sorted_labels.append(lval)
                        sorted_preds.append(model_preds[idx])
            labels_ = sorted_labels
            model_preds = sorted_preds
    else:
        # no stratification: a single "Model" curve computed over the whole dataframe
        model_preds = [df]
        labels_ = ['Model']

    # select default labels
    labels = labels_

    # check labels shape
    if len(labels) != len(model_preds):
        raise TypeError(
            'Missing labels in "labels". Number of labels provided '
            '(%d) not match the number of input models (%d)' % (len(labels), len(model_preds)))

    # select line widths (scalar values are broadcast to all models)
    if lw is None:
        lw = [None] * len(labels)
    elif isinstance(lw, (float, int)):
        lw = [lw] * len(labels)
    if len(lw) != len(labels):
        raise TypeError(
            'Missmatch shape between input models (%d) and "lw" (%d)' % (len(labels), len(lw)))

    # select line styles (scalar values are broadcast to all models)
    if ls is None:
        ls = ['solid'] * len(labels)
    elif isinstance(ls, str):
        ls = [ls] * len(labels)
    if len(ls) != len(labels):
        raise TypeError(
            'Missmatch shape between input models (%d) and "ls" (%d)' % (len(labels), len(ls)))

    # get colormap colors when a colormap name was provided
    if isinstance(colors, str):
        cmap = plt.get_cmap(colors, len(labels) + 1)
        colors = [mpl.colors.to_hex(cmap(i)) for i in range(len(labels))]
    if not (colors is None or len(labels) == len(colors)):
        raise TypeError(
            'Missmatch shape between input models (%d) and "colors" (%d)' % (len(labels), len(colors)))

    # calculate ROC curves (all curves interpolated on a common FPR grid `xs`)
    roc_data = {}
    xs = np.linspace(0, 1, n_roc_points)
    for model_label, model_df in zip(labels, model_preds):
        # calculate ROC curves averaging the values
        if average is not None:
            all_tpr = []
            aucs = []
            for _, model_df_i in model_df.groupby(average):
                fpr, tpr, _ = roc_curve(model_df_i[y_true].values, model_df_i[y_pred].values)
                mean_tpr = np.interp(xs, fpr, tpr)
                mean_tpr[0] = 0.0  # force the curve to start at the origin
                all_tpr.append(mean_tpr)
                aucs.append(roc_auc_score(
                    y_true=model_df_i[y_true].values,
                    y_score=model_df_i[y_pred].values))

            # aggregated metrics
            mean_tpr = np.mean(all_tpr, axis=0)
            mean_tpr[-1] = 1.0  # force the curve to end at (1, 1)
            std_tpr = np.std(all_tpr, axis=0)
            mean_auc = np.mean(aucs, axis=0)

        # calculate a unique ROC curve
        else:
            fpr, tpr, _ = roc_curve(model_df[y_true].values, model_df[y_pred].values)
            mean_tpr = np.interp(xs, fpr, tpr)
            mean_tpr[0] = 0.0
            mean_tpr[-1] = 1.0
            std_tpr = np.zeros_like(mean_tpr)
            mean_auc = roc_auc_score(
                y_true=model_df[y_true].values,
                y_score=model_df[y_pred].values)

        # save model ROC information
        roc_data[model_label] = {
            'mean': mean_tpr,
            'std': std_tpr,
            'auc': mean_auc}

    # display the ROC curves
    with plt.style.context(style):
        if ax is None:
            fig, ax = plt.subplots(figsize=figsize)
            fig.set_dpi(dpi)

        for i, (label, roc_model_data) in enumerate(roc_data.items()):
            # select color (if specified)
            color = None if colors is None else colors[i]
            mean_roc = roc_model_data['mean']
            std_roc = roc_model_data['std']

            # add AUC information (if specified)
            if add_auc_info:
                label += ' (AUC=%.2f)' % roc_model_data['auc']

            # plot roc curve
            ax.plot(xs, mean_roc, color=color, lw=lw[i], ls=ls[i], label=label)
            if average is not None:
                ax.fill_between(
                    xs, mean_roc - std_roc, mean_roc + std_roc, color=color, alpha=err_alpha)

        # add legend information (proxy Line2D artists so the legend entries
        # keep full opacity regardless of the plotted lines)
        legend = []
        lines = ax.get_lines()
        for line in lines:
            legend.append(
                mpl.lines.Line2D(
                    [], [], alpha=1.0, color=line.get_color(), lw=line.get_lw(),
                    ls=line.get_ls(), label=line.get_label()))

        # random prediction line (diagonal)
        if show_random:
            ax.plot([0, 1], [0, 1], linestyle=random_ls, lw=random_lw, color=random_color, alpha=1)
            legend.append(
                mpl.lines.Line2D(
                    [], [], color=random_color, lw=random_lw, alpha=1.0,
                    linestyle=random_ls, label=random_label))

        if not hide_legend:
            ax.legend(handles=legend, loc=legend_pos, prop=dict(size=legend_size))

        # figure layout
        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
        ax.grid(alpha=grid_alpha)
        ax.set_xlabel(xaxis_label, size=xlabel_size)
        ax.set_ylabel(yaxis_label, size=ylabel_size)
        ax.set_title(title, size=title_size, pad=title_pad)

    # save figure if specified
    if save:
        save_kw = {} if save_kw is None else save_kw
        plt.savefig(save, **save_kw)

    if show:
        plt.show()