# Module containing the code typically used to train and evaluate Machine Learning models (e.g.,
# cross-validation subroutines).
#
# Author: Fernando García Gutiérrez
# Email: ga.gu.fernando.concat@gmail.com
#
# STATUS: completed, functional, and documented.
#
import os
import time
import platform
import pandas as pd
import numpy as np
import warnings
import joblib
import multiprocessing as mp
import optuna
from datetime import datetime
from pathlib import Path
from functools import partial
from typing import List
from copy import deepcopy
from tqdm import tqdm
from sklearn.model_selection import (
RepeatedKFold,
RepeatedStratifiedKFold,
LeaveOneOut)
from ..interfaces import (
Model,
Dataset,
Transform
)
from .evaluation import (
Metric
)
from .report import (
CVReport
)
from ..util.validation import (
checkMultiInputTypes,
checkInputType,
checkCallable,
)
from ..util.splitter import (
SimpleSplitter,
InstanceLevelKFoldSplitter,
PredefinedSplitter
)
from ..exception import (
UnfittedTransform
)
from ..util.io import pprint
def _getModelPredictions(model: Model, X: np.ndarray, op_instance_args: dict) -> np.ndarray:
""" Subroutine that return the model predictions. Model prediction order resolution:
The predictions will be returned as numpy.arrays.
"""
checkMultiInputTypes(
('X', X, [np.ndarray]),
('model', model, [Model]),
('op_instance_args', op_instance_args, [dict])
)
predictions = model.performInference(X, **op_instance_args)
checkInputType('model.performInference() -> out', predictions, [np.ndarray])
return predictions
def _fitModelAndPredict(model: Model, X_train: np.ndarray, X_test: np.ndarray,
y_train: np.ndarray = None, op_train_instance_args: dict = None,
op_test_instance_args: dict = None) -> np.ndarray:
""" Subroutine used to fit a model and make the predictions. """
checkMultiInputTypes(
('X_train', X_train, [np.ndarray]),
('X_test', X_test, [np.ndarray]),
('y_train', y_train, [np.ndarray, type(None)]),
('model', model, [Model]),
('op_train_instance_args', op_train_instance_args, [dict, type(None)]),
('op_test_instance_args', op_test_instance_args, [dict, type(None)]))
op_train_instance_args = {} if op_train_instance_args is None else op_train_instance_args
op_test_instance_args = {} if op_test_instance_args is None else op_test_instance_args
if model.is_fitted:
warnings.warn(
'Providing a fitted model to "gojo.core.loops._fitModelAndPredict()". The model provided will be '
'automatically reset using "model.resetFit()" and re-fitted.')
model.resetFit()
model.train(X_train, y_train, **op_train_instance_args)
predictions = _getModelPredictions(model=model, X=X_test, op_instance_args=op_test_instance_args)
return predictions
def _applyTransforms(transforms: List[Transform], X: np.ndarray, y: np.ndarray = None,
op_instance_args: dict = None) -> np.ndarray:
""" Subroutine that applies the provided transforms. """
checkMultiInputTypes(
('transforms', transforms, [list]),
('X', X, [np.ndarray]),
('y', y, [np.ndarray, type(None)]),
('op_instance_args', op_instance_args, [dict, type(None)])
)
op_instance_args = {} if op_instance_args is None else op_instance_args
if len(transforms) == 0:
raise TypeError('Parameter "transforms" is an empty list.')
for i, transform in enumerate(transforms):
checkInputType('transforms[%d]' % i, transform, [Transform])
# check for unfitted transforms
if not transform.is_fitted:
raise UnfittedTransform()
X = transform.transform(X=X, y=y, **op_instance_args)
return X
def _fitAndApplyTransforms(transforms: List[Transform], X_train: np.ndarray, X_test: np.ndarray,
y_train: np.ndarray = None, y_test: np.ndarray = None,
op_train_instance_args: dict = None, op_test_instance_args: dict = None) -> tuple:
""" Subroutine used to fit transforms and make predictions.
NOTE: This functions performs inplace modification of the input transforms.
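A minimal usage sketch (the 'scaler' Transform instance is an assumption for illustration):
>>> X_train_t, X_test_t = _fitAndApplyTransforms(
>>>     transforms=[scaler], X_train=X_train, X_test=X_test, y_train=y_train)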
"""
checkMultiInputTypes(
('transforms', transforms, [list]),
('X_train', X_train, [np.ndarray]),
('X_test', X_test, [np.ndarray]),
('y_train', y_train, [np.ndarray, type(None)]),
('y_test', y_test, [np.ndarray, type(None)]),
('op_train_instance_args', op_train_instance_args, [dict, type(None)]),
('op_test_instance_args', op_test_instance_args, [dict, type(None)]),
)
op_train_instance_args = {} if op_train_instance_args is None else op_train_instance_args
op_test_instance_args = {} if op_test_instance_args is None else op_test_instance_args
if len(transforms) == 0:
raise TypeError('Parameter "transforms" is an empty list.')
for i, transform in enumerate(transforms):
checkInputType('transforms[%d]' % i, transform, [Transform])
# check for fitted transforms
if transform.is_fitted:
warnings.warn(
'Providing a fitted transform to "gojo.core.loops._fitAndApplyTransforms()". The transform provided '
'will be automatically reset using "transform.resetFit()" and re-fitted.')
transform.resetFit()
# fit the transformations based on the training data, and apply the transformation
# to the training/test data
transform.fit(X=X_train, y=y_train, **op_train_instance_args)
X_train = transform.transform(X=X_train, y=y_train, **op_train_instance_args)
X_test = transform.transform(X=X_test, y=y_test, **op_test_instance_args)
return X_train, X_test
def _evalCrossValFold(
_n_fold: int,
_model: Model,
_X_train: np.ndarray,
_y_train: np.ndarray or None,
_X_test: np.ndarray,
_y_test: np.ndarray,
_train_idx: np.ndarray,
_test_idx: np.ndarray,
_predict_train: bool,
_return_model: bool,
_reset_model_fit: bool,
_transforms: list or None,
_return_transforms: bool,
_reset_transforms: bool,
_op_instance_args: dict) -> tuple:
""" Subroutine used internally to train and perform the predictions of a model in relation to a fold. This
subroutine has been segmented to allow parallelization of training.
Parameters
----------
_model : gojo.interfaces.Model
Model to be trained and used to make the predictions on '_X_test'.
_X_train : np.ndarray
Data used for model training.
_y_train : np.ndarray or None
Labels used for model training.
_X_test : np.ndarray
Data on which the trained model will make inferences.
_y_test : np.ndarray
Ground-truth labels associated with '_X_test'.
_train_idx : np.ndarray
Indices of the training instances within the input dataset.
_test_idx : np.ndarray
Indices of the test instances within the input dataset.
_predict_train : bool
Parameter indicating whether to return the predictions on the data used
to train the models.
_return_model : bool
Parameter indicating whether to return a deepcopy of the trained model.
_reset_model_fit : bool
Parameter indicating if the model should be reset by calling the 'resetFit()'
method.
_transforms : list or None
Transformations that will be applied to the data before training the model. These
transformations will be adjusted based on the training data and will be used to transform
the training and test data. They will be applied sequentially.
_return_transforms : bool
Parameter indicating whether to return the transforms.
_reset_transforms : bool
Parameter indicating if the transforms should be reset by calling the
'resetFit()' method.
_op_instance_args : dict
Instance-level optional arguments; each value is split according to the train/test indices before use.
Returns
-------
(_n_fold, y_pred_test, y_pred_train, y_true_test, y_true_train, test_idx, train_idx, trained_model,
_transforms) : tuple
Elements selected according to the input parameters of the method. The tuple contains two-element
sub-tuples, where the first element is a key identifying the information and the second is the
associated value.
IMPORTANT NOTE: If the input parameter '_reset_model_fit' is set to False, the input model will remain
trained (in-place modifications will take place). The same applies to the transforms (see the
'_transforms' and '_reset_transforms' parameters).
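As a hypothetical consumer of the returned tuple, the key/value pairs can be gathered into a
dictionary:
>>> results = dict(_evalCrossValFold(...))
>>> y_pred_test = results['pred_test']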
"""
# separate instance-level parameters if provided
_op_train_instance_args = {}
_op_test_instance_args = {}
if len(_op_instance_args) > 0:
for _var_name, _var_values in _op_instance_args.items():
_op_train_instance_args[_var_name] = [_var_values[_idx] for _idx in _train_idx]
_op_test_instance_args[_var_name] = [_var_values[_idx] for _idx in _test_idx]
# fit transformations to the training data and apply to the test data
if _transforms is not None:
_X_train, _X_test = _fitAndApplyTransforms(
transforms=_transforms,
X_train=_X_train,
X_test=_X_test,
y_train=_y_train,
y_test=_y_test,
op_train_instance_args=_op_train_instance_args,
op_test_instance_args=_op_test_instance_args)
# train the model and make the predictions on the test data
y_pred_test = _fitModelAndPredict(
model=_model,
X_train=_X_train,
X_test=_X_test,
y_train=_y_train,
op_train_instance_args=_op_train_instance_args,
op_test_instance_args=_op_test_instance_args)
# make predictions on the training data and save training data information
y_pred_train = None
y_true_train = None
train_idx = None
if _predict_train:
y_pred_train = _getModelPredictions(model=_model, X=_X_train, op_instance_args=_op_train_instance_args)
y_true_train = _y_train
train_idx = _train_idx
trained_model = None
if _return_model:
trained_model = _model.copy()
transforms = None
if _return_transforms and _transforms is not None:
transforms = [_trans.copy() for _trans in _transforms]
# reset transforms
if _reset_transforms and _transforms is not None:
for _transform in _transforms:
_transform.resetFit()
# reset trained model
if _reset_model_fit:
_model.resetFit()
return (
('n_fold', _n_fold),
('pred_test', y_pred_test),
('pred_train', y_pred_train),
('true_test', _y_test),
('true_train', y_true_train),
('test_idx', _test_idx),
('train_idx', train_idx),
('trained_model', trained_model),
('transforms', transforms)
)
def _createCVReport(cv_results: list, X_dataset: Dataset, y_dataset: Dataset) -> CVReport:
# HACK: a little hard-coded...
cv_report = CVReport(
raw_results=cv_results,
X_dataset=X_dataset,
y_dataset=y_dataset,
n_fold_key='n_fold',
pred_test_key='pred_test',
true_test_key='true_test',
pred_train_key='pred_train',
true_train_key='true_train',
test_idx_key='test_idx',
train_idx_key='train_idx',
trained_model_key='trained_model',
fitted_transforms_key='transforms'
)
return cv_report
def evalCrossVal(
X: np.ndarray or pd.DataFrame,
y: np.ndarray or pd.DataFrame or pd.Series,
model: Model,
cv: RepeatedKFold or RepeatedStratifiedKFold or LeaveOneOut or SimpleSplitter,
transforms: List[Transform] or None = None,
verbose: int = -1,
n_jobs: int = 1,
save_train_preds: bool = False,
save_transforms: bool = False,
save_models: bool = False,
op_instance_args: dict = None) -> CVReport:
""" Subroutine used to evaluate a model according to a cross-validation scheme provided by the `cv` argument.
Parameters
----------
X : np.ndarray or pd.DataFrame
Variables used to fit the model.
y : np.ndarray or pd.DataFrame or pd.Series
Target prediction variable.
model : :class:`gojo.interfaces.Model`
Model to be trained. The input model must follow the :class:`gojo.interfaces.Model` interface.
cv : Cross-validation splitter
Cross-validation schema. For more information about cross-validation see the `sklearn.model_selection` module.
The gojo module implements useful functions for easy loading of cross-validation objects (see
:func:`gojo.util.getCrossValObj`). Supported splitters are :class:`sklearn.model_selection.RepeatedKFold`,
:class:`sklearn.model_selection.RepeatedStratifiedKFold`, :class:`sklearn.model_selection.LeaveOneOut`,
:class:`gojo.util.splitter.SimpleSplitter`, :class:`gojo.util.splitter.InstanceLevelKFoldSplitter` or
:class:`gojo.util.splitter.PredefinedSplitter`.
transforms : List[Transform] or None, default=None
Transformations applied to the data before being provided to the models. These transformations will be fitted
using the training data, and will be applied to both training and test data. For more information see the
module :py:mod:`gojo.core.transform`.
verbose : int, default=-1
Verbosity level.
n_jobs : int, default=1
Number of jobs used for parallelization.
save_train_preds : bool, default=False
Parameter that indicates whether the predictions made on the training set will be saved in
:class:`gojo.core.report.CVReport`. For large training sets this may involve higher computational and
storage costs.
save_transforms : bool, default=False
Parameter that indicates whether the fitted transforms will be saved in :class:`gojo.core.report.CVReport`.
save_models : bool, default=False
Parameter that indicates whether the fitted models will be saved in :class:`gojo.core.report.CVReport`. For
larger models this may involve higher computational and storage costs.
op_instance_args : dict, default=None
Instance-level optional arguments. This parameter should be a dictionary whose values must be lists or
array-like iterables containing the same number of elements as there are instances in `X` and `y`.
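As an illustrative sketch (the 'sample_weight' key is a hypothetical per-instance variable accepted
by the underlying model):
>>> op_instance_args = {
>>> 'sample_weight': np.ones(len(X)),  # one entry per instance in X/y
>>> }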
Returns
-------
cv_obj : :class:`gojo.core.report.CVReport`
Cross validation report. For more information see :class:`gojo.core.report.CVReport`.
Examples
--------
>>> import pandas as pd
>>> from sklearn import datasets
>>> from sklearn.svm import SVC
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.decomposition import PCA
>>>
>>> # GOJO libraries
>>> import gojo
>>> from gojo import core
>>> from gojo import interfaces
>>>
>>> N_JOBS = 8
>>>
>>> # load test dataset (Wine)
>>> wine_dt = datasets.load_wine()
>>>
>>> # create the target variable. Classification problem 1 vs rest
>>> # to see the target names you can use wine_dt['target_names']
>>> y = (wine_dt['target'] == 1).astype(int)
>>> X = wine_dt['data']
>>>
>>> # previous model transforms
>>> transforms = [
>>> interfaces.SKLearnTransformWrapper(StandardScaler),
>>> interfaces.SKLearnTransformWrapper(PCA, n_components=5)
>>> ]
>>>
>>> # default model
>>> model = interfaces.SklearnModelWrapper(
>>> SVC, kernel='poly', degree=1, coef0=0.0,
>>> cache_size=1000, class_weight=None
>>> )
>>>
>>> # evaluate the model using a simple cross-validation strategy with
>>> # default parameters
>>> cv_report = core.evalCrossVal(
>>> X=X, y=y,
>>> model=model,
>>> cv=gojo.util.splitter.getCrossValObj(cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
>>> transforms=transforms,
>>> verbose=True,
>>> save_train_preds=True,
>>> save_models=False,
>>> save_transforms=False,
>>> n_jobs=N_JOBS
>>> )
>>>
>>> scores = cv_report.getScores(core.getDefaultMetrics('binary_classification', bin_threshold=0.5))
>>> results = pd.concat([
>>> pd.DataFrame(scores['train'].mean(axis=0)).round(decimals=3),
>>> pd.DataFrame(scores['test'].mean(axis=0)).round(decimals=3)],
>>> axis=1).drop(index=['n_fold'])
>>> results.columns = ['Train', 'Test']
>>> results
>>>
"""
checkMultiInputTypes(
('X', X, [np.ndarray, pd.DataFrame]),
('y', y, [np.ndarray, pd.DataFrame, pd.Series]),
('model', model, [Model]),
('cv', cv, [RepeatedKFold, RepeatedStratifiedKFold, LeaveOneOut, SimpleSplitter, InstanceLevelKFoldSplitter,
PredefinedSplitter]),
('transforms', transforms, [list, type(None)]),
('verbose', verbose, [int]),
('n_jobs', n_jobs, [int]),
('save_models', save_models, [bool]),
('save_transforms', save_transforms, [bool]),
('save_train_preds', save_train_preds, [bool]),
('op_instance_args', op_instance_args, [dict, type(None)])
)
# create the model datasets
X_dt = Dataset(X)
y_dt = Dataset(y)
# check op_instance_args argument
if op_instance_args is not None:
for var_name, var_values in op_instance_args.items():
checkInputType('op_instance_args["%s"]' % var_name, var_values, [list, np.ndarray])
if len(X_dt) != len(var_values):
raise TypeError(
'Mismatch in X shape (%d) and op_instance_args["%s"] shape (%d).' % (
len(X_dt), var_name, len(var_values)))
else:
op_instance_args = {}
# check data lengths
if len(X_dt) != len(y_dt):
raise TypeError('Mismatch in X shape (%d) and y shape (%d).' % (len(X_dt), len(y_dt)))
# verbose parameters
verbose = np.inf if verbose < 0 else verbose  # negative values activate full verbosity
show_pbar = False
# levels > 0 should display a tqdm loading bar
if verbose > 0:
show_pbar = True
# train the model and make the predictions according to the cross-validation
# schema provided
if n_jobs == 1:
cv_results = [
_evalCrossValFold(
_n_fold=i,
_model=model,
_X_train=X_dt.array_data[train_idx],
_y_train=y_dt.array_data[train_idx],
_X_test=X_dt.array_data[test_idx],
_y_test=y_dt.array_data[test_idx],
_train_idx=train_idx,
_test_idx=test_idx,
_predict_train=save_train_preds,
_return_model=save_models,
_reset_model_fit=True, # inplace modifications take place inside this function
_transforms=transforms,
_return_transforms=save_transforms,
_reset_transforms=True, # inplace modifications take place inside this function
_op_instance_args=op_instance_args
) for i, (train_idx, test_idx) in tqdm(
enumerate(cv.split(X_dt.array_data, y_dt.array_data)),
desc='Performing cross-validation...', disable=not show_pbar)
]
else:
if n_jobs == -1:
n_jobs = mp.cpu_count()
if n_jobs <= 0:
raise TypeError(
'Parameter "n_jobs" cannot be less than 1 (only -1 is allowed, indicating that all CPU cores should be used).')
cv_results = joblib.Parallel(n_jobs=n_jobs, backend='loky')(
joblib.delayed(_evalCrossValFold)(
_n_fold=i,
_model=model,
_X_train=X_dt.array_data[train_idx],
_y_train=y_dt.array_data[train_idx],
_X_test=X_dt.array_data[test_idx],
_y_test=y_dt.array_data[test_idx],
_train_idx=train_idx,
_test_idx=test_idx,
_predict_train=save_train_preds,
_return_model=save_models,
# with the parallel backend the workers operate on copies, so in-place modifications do not propagate
# back; computation could be saved by setting this to False, but resetting is kept as a safeguard
_reset_model_fit=True,
_transforms=transforms,
_return_transforms=save_transforms,
_reset_transforms=True, # inplace modifications take place inside this function
_op_instance_args=op_instance_args
) for i, (train_idx, test_idx) in tqdm(
enumerate(cv.split(X_dt.array_data, y_dt.array_data)),
desc='Performing cross-validation...', disable=not show_pbar)
)
# the model should not remain fitted after the execution of the previous subroutines
if model.is_fitted:
warnings.warn(
'Detected a fitted model after cross-validation procedure in "gojo.core.loops.evalCrossVal(...)"')
cv_report = _createCVReport(
cv_results=cv_results,
X_dataset=X_dt,
y_dataset=y_dt,
)
# add instance-level parameters as metadata if not None
cv_report.addMetadata(
op_instance_args=op_instance_args)
return cv_report
def evalCrossValNestedHPO(
X: np.ndarray or pd.DataFrame,
y: np.ndarray or pd.DataFrame or pd.Series,
model: Model,
search_space: dict,
outer_cv: RepeatedKFold or RepeatedStratifiedKFold or LeaveOneOut or SimpleSplitter,
inner_cv: RepeatedKFold or RepeatedStratifiedKFold or LeaveOneOut or SimpleSplitter,
hpo_sampler: optuna.samplers.BaseSampler,
hpo_n_trials: int,
minimization: bool,
metrics: List[Metric],
objective_metric: str = None,
agg_function: callable = None,
transforms: List[Transform] or None = None,
verbose: int = -1,
n_jobs: int = 1,
inner_cv_n_jobs: int = 1,
save_train_preds: bool = False,
save_transforms: bool = False,
save_models: bool = False,
op_instance_args: dict = None,
enable_experimental: bool = False) -> CVReport:
""" Subroutine used to evaluate a model according to a cross-validation scheme provided by the `outer_cv`
argument. This function also performs a nested cross-validation for hyperparameter optimization (HPO) based on
the `optuna` library.
Parameters
----------
X : np.ndarray or pd.DataFrame
Variables used to fit the model.
y : np.ndarray or pd.DataFrame or pd.Series
Target prediction variable.
model : :class:`gojo.interfaces.Model`
Model to be trained. The input model must follow the :class:`gojo.interfaces.Model` interface.
search_space : dict
Search space used for performing the HPO. For more information about distributions and sampling strategies
consult `optuna <https://optuna.org>`_.
>>> search_space = {
>>> # sample from an integer distribution
>>> 'max_depth': ('suggest_int', (2, 10)),
>>> # ... from a uniform distribution
>>> 'max_samples': ('suggest_float', (0.5, 1.0)),
>>> }
Keyword arguments can be passed by providing a dictionary in the third position, whose keys correspond
to the parameter names:
>>> search_space = {
>>> # sample from an integer distribution in log space
>>> 'max_depth': ('suggest_int', (2, 40), dict(step=1, log=True)),
>>> # ... from a uniform distribution
>>> 'max_samples': ('suggest_float', (0.5, 1.0)),
>>> }
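Internally, each entry resolves to a call on the optuna trial object; the first entry above is
roughly equivalent to:
>>> trial.suggest_int('max_depth', 2, 40, step=1, log=True)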
outer_cv : Cross-validation splitter
Cross-validation schema. For more information about cross-validation see the `sklearn.model_selection` module.
The gojo module implements useful functions for easy loading of cross-validation objects (see
:func:`gojo.util.getCrossValObj`). Supported splitters are :class:`sklearn.model_selection.RepeatedKFold`,
:class:`sklearn.model_selection.RepeatedStratifiedKFold`, :class:`sklearn.model_selection.LeaveOneOut`,
:class:`gojo.util.splitter.SimpleSplitter`, :class:`gojo.util.splitter.InstanceLevelKFoldSplitter` or
:class:`gojo.util.splitter.PredefinedSplitter`
inner_cv : Cross-validation splitter
Inner cross-validation schema used for evaluating model performance in the nested cross-validation used to
optimize the model hyperparameters. For more information about cross-validation see the `sklearn.model_selection`
module. The gojo module implements useful functions for easy loading of cross-validation objects (see
:func:`gojo.util.getCrossValObj`). Supported splitters are :class:`sklearn.model_selection.RepeatedKFold`,
:class:`sklearn.model_selection.RepeatedStratifiedKFold`, :class:`sklearn.model_selection.LeaveOneOut`,
:class:`gojo.util.splitter.SimpleSplitter`, :class:`gojo.util.splitter.InstanceLevelKFoldSplitter` or
:class:`gojo.util.splitter.PredefinedSplitter`.
hpo_sampler : optuna.samplers.BaseSampler
Sampler used to suggest model hyperparameters. For more information see `optuna <https://optuna.org>`_.
hpo_n_trials : int
Number of HPO iterations.
minimization : bool
Parameter indicating if the HPO objective function must be minimized. If `minimization=False` the objective
function will be maximized.
metrics : List[:class:`gojo.core.evaluation.Metric`]
Metrics used within the nested-cross validation to evaluate the hyperparameter configuration.
objective_metric : str, default=None
Indicates which of the metrics provided via the `metrics` parameter is to be optimized within the HPO. The
metric must be provided as a string and must be included in the list of metrics provided. If this parameter
is not provided, an aggregation function must be provided by means of the `agg_function` parameter.
agg_function : callable, default=None
This function will receive the scores returned by :meth:`gojo.core.report.CVReport.getScores` (computed from
the metrics evaluated on each of the folds generated by `inner_cv`) and must return a scalar score that will
be maximized/minimized within the HPO. If `objective_metric` is not provided, this parameter must be provided.
If both parameters are provided, `objective_metric` will be ignored.
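A minimal sketch (assuming 'f1_score' is among the computed metrics), rewarding a high mean test
score while penalizing its variability across folds:
>>> def agg_function(scores: dict) -> float:
>>>     test_scores = scores['test']['f1_score']
>>>     return float(test_scores.mean() - test_scores.std())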
transforms : List[Transform] or None, default=None
Transformations applied to the data before being provided to the models. These transformations will be fitted
using the training data, and will be applied to both training and test data. For more information see the
module :py:mod:`gojo.core.transform`.
verbose : int, default=-1
Verbosity level.
inner_cv_n_jobs : int, default=1
Number of cores used to parallelise the inner cross-validation.
n_jobs : int, default=1
Number of jobs used for parallelization. Parallelisation will be done at the `optuna` trial level and will
depend on a temporary database that will be created and automatically removed once the optimization ends.
This is an experimental feature; to enable it you have to specify `enable_experimental=True`.
save_train_preds : bool, default=False
Parameter that indicates whether the predictions made on the training set will be saved in
:class:`gojo.core.report.CVReport`. For large training sets this may involve higher computational and
storage costs.
save_transforms : bool, default=False
Parameter that indicates whether the fitted transforms will be saved in :class:`gojo.core.report.CVReport`.
save_models : bool, default=False
Parameter that indicates whether the fitted models will be saved in :class:`gojo.core.report.CVReport`. For
larger models this may involve higher computational and storage costs.
op_instance_args : dict, default=None
Instance-level optional arguments. This parameter should be a dictionary whose values must be lists or
array-like iterables containing the same number of elements as there are instances in `X` and `y`.
enable_experimental : bool, default=False
Parameter indicating whether the experimental features of the function are enabled.
Returns
-------
cv_obj : :class:`gojo.core.report.CVReport`
Cross validation report. For more information see :class:`gojo.core.report.CVReport`. The HPO history will be
saved in the report metadata (:attr:`gojo.core.report.CVReport.metadata`).
Examples
--------
>>> import optuna
>>> import pandas as pd
>>> from sklearn import datasets
>>> from sklearn.svm import SVC
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.decomposition import PCA
>>>
>>> # GOJO libraries
>>> import gojo
>>> from gojo import core
>>>
>>> N_JOBS = 8
>>>
>>> # load test dataset (Wine)
>>> wine_dt = datasets.load_wine()
>>>
>>> # create the target variable. Classification problem 1 vs rest
>>> # to see the target names you can use wine_dt['target_names']
>>> y = (wine_dt['target'] == 1).astype(int)
>>> X = wine_dt['data']
>>>
>>> # previous model transforms
>>> transforms = [
>>> core.SKLearnTransformWrapper(StandardScaler),
>>> core.SKLearnTransformWrapper(PCA, n_components=5)
>>> ]
>>>
>>> # model hyperparameters
>>> search_space = {
>>> 'degree': ('suggest_int', (1, 10)),
>>> 'class_weight': ('suggest_categorical', [('balanced', None)]),
>>> 'coef0': ('suggest_float', (0.0, 100.0))
>>> }
>>>
>>> # default model
>>> model = core.SklearnModelWrapper(
>>> SVC, kernel='poly', degree=1, coef0=0.0,
>>> cache_size=1000, class_weight=None
>>> )
>>>
>>> # perform the HPO to optimize the model hyperparameters
>>> cv_report = core.evalCrossValNestedHPO(
>>> X=X,
>>> y=y,
>>> model=model,
>>> search_space=search_space,
>>> outer_cv=gojo.util.splitter.getCrossValObj(cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
>>> inner_cv=gojo.util.splitter.getCrossValObj(cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
>>> hpo_sampler=optuna.samplers.TPESampler(n_startup_trials=40),
>>> hpo_n_trials=80,
>>> minimization=False,
>>> transforms=transforms,
>>> metrics=core.getDefaultMetrics('binary_classification', bin_threshold=0.5),
>>> objective_metric='f1_score',
>>> verbose=1,
>>> save_train_preds=True,
>>> save_models=False,
>>> n_jobs=1
>>> )
>>>
>>> scores = cv_report.getScores(core.getDefaultMetrics('binary_classification', bin_threshold=0.5))
>>> results = pd.concat([
>>> pd.DataFrame(scores['train'].mean(axis=0)).round(decimals=3),
>>> pd.DataFrame(scores['test'].mean(axis=0)).round(decimals=3)],
>>> axis=1).drop(index=['n_fold'])
>>> results.columns = ['Train', 'Test']
>>> results
>>>
"""
def _getOptunaStorageTemp():
""" Subroutine used to create a temporary database to store the results of the parallelisation performed with
optuna."""
tmp_dir = os.path.join('.', '.tmp_gojo_optuna_hpo')
curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
if not os.path.exists(tmp_dir):
try:
# Control cases where concurrent attempts are made to create the temporary directory and generate
# errors
Path(tmp_dir).mkdir(parents=True)
except Exception as ex:
time.sleep(1)
if not os.path.exists(tmp_dir):
raise ex
return os.path.abspath(os.path.join(tmp_dir, '%s_gojo_optuna_hpo.db' % curr_time))
def _trialHPO(
_trial,
_X: np.ndarray,
_y: np.ndarray,
_model: Model,
_search_space: dict,
_cv: RepeatedKFold or RepeatedStratifiedKFold or LeaveOneOut or SimpleSplitter,
_metrics: list,
_minimize: bool,
_objective_metric: str = None,
_customAggFunction: callable = None,
_op_instance_args: dict = None,
_n_jobs: int = 1
) -> float:
""" Subroutine used to run a HPO trial. """
# default parameter
_op_instance_args = {} if _op_instance_args is None else _op_instance_args
if _objective_metric is None and _customAggFunction is None:
raise TypeError(
'gojo.core.loops.evalCrossValNestedHPO._trialHPO(). Either "_objective_metric" or "_customAggFunction" '
'should be defined')
# sample parameters from the trial distribution
_optim_params = {}
for _name, _values in _search_space.items():
if len(_values) == 2:
_optim_params[_name] = getattr(_trial, _values[0])(_name, *_values[1])
elif len(_values) == 3:
_optim_params[_name] = getattr(_trial, _values[0])(_name, *_values[1], **_values[2])
else:
raise ValueError(
'INTERNAL ERROR in gojo.core.loops._trialHPO: invalid number of input parameters for param "{}" ({}).'.format(
_name, _values))
_model = _model.copy()  # avoid inplace modifications of the input model
_model.update(**_optim_params) # update model parameters
# perform the nested cross-validation
_cv_report = evalCrossVal(
X=_X,
y=_y,
model=_model,
cv=_cv,
transforms=None,
verbose=0,
n_jobs=_n_jobs, # nested parallel executions
save_train_preds=_customAggFunction is not None,  # save only if a custom aggregation function was provided
save_models=False, # does not save models
save_transforms=False,
op_instance_args=_op_instance_args
)
# compute performance metrics
_scores = _cv_report.getScores(metrics=_metrics, supress_warnings=True)
if _customAggFunction is not None:
# use a custom aggregation function to aggregate the fold results, the input for this
# function will correspond to the scores returned by the gojo.core.report.CVReport.getScores
# function
_objective_score = _customAggFunction(_scores)
else:
# by default consider the average value of the specified function over the test set
assert 'test' in _scores.keys(), 'Internal error in gojo.core.loops.evalCrossValNestedHPO._trialHPO. ' \
'Missing "test" key in CVReport.getScores keys.'
# select the test scores
_test_scores = _scores['test']
# check that the specified metric exists
if _objective_metric not in _test_scores.columns:
raise TypeError('Missing metric "%s". Available metrics are: %r' % (
_objective_metric, _test_scores.columns.tolist()))
_objective_score = _test_scores[_objective_metric].mean()
# by default optuna performs a minimization
if not _minimize:
_objective_score = -1 * _objective_score
if not isinstance(_objective_score, (int, float)):
raise TypeError(
'Returned score used to optimize model hyperparameters should be a scalar. '
'Returned type: {}'.format(type(_objective_score)))
return float(_objective_score)
# check provided input types
checkMultiInputTypes(
('X', X, [np.ndarray, pd.DataFrame]),
('y', y, [np.ndarray, pd.DataFrame, pd.Series]),
('model', model, [Model]),
('search_space', search_space, [dict]),
('outer_cv', outer_cv, [
RepeatedKFold, RepeatedStratifiedKFold, LeaveOneOut, SimpleSplitter, InstanceLevelKFoldSplitter,
PredefinedSplitter]),
('inner_cv', inner_cv, [
RepeatedKFold, RepeatedStratifiedKFold, LeaveOneOut, SimpleSplitter, InstanceLevelKFoldSplitter,
PredefinedSplitter]),
('hpo_sampler', hpo_sampler, [optuna.samplers.BaseSampler]),
('metrics', metrics, [list]),
('objective_metric', objective_metric, [str, type(None)]),
('hpo_n_trials', hpo_n_trials, [int]),
('minimization', minimization, [bool]),
('transforms', transforms, [list, type(None)]),
('verbose', verbose, [int]),
('n_jobs', n_jobs, [int]),
('inner_cv_n_jobs', inner_cv_n_jobs, [int]),
('save_models', save_models, [bool]),
('save_transforms', save_transforms, [bool]),
('save_train_preds', save_train_preds, [bool]),
('op_instance_args', op_instance_args, [dict, type(None)])
)
# check consistency of the search space dictionary
for i, (param_name, hpo_values) in enumerate(search_space.items()):
checkMultiInputTypes(
('search_space (item %d)' % i, param_name, [str]),
('search_space["%s"]' % param_name, hpo_values, [tuple, list]),
('search_space["%s"][0]' % param_name, hpo_values[0], [str]),
('search_space["%s"][1]' % param_name, hpo_values[1], [tuple, list]))
if len(hpo_values) == 3:
checkInputType('search_space["%s"][2]' % param_name, hpo_values[2], [dict])
# check the provided aggregation function
if agg_function is not None:
checkCallable('agg_function', agg_function)
# check number of jobs
if n_jobs == -1:
n_jobs = mp.cpu_count()
if n_jobs <= 0:
raise TypeError(
'Parameter "n_jobs" cannot be less than 1 (only -1 is allowed, indicating that all CPU cores should be used).')
if n_jobs > 1 and not enable_experimental:
raise ValueError(
'Parallelisation of hyperparameter optimisation is an experimental feature. To activate it you '
'will have to use `enable_experimental=True`.')
if (n_jobs > 1) and (platform.system().lower() == 'windows'):
warnings.warn('Parallelization of the HPO in optuna is not optimised for Windows and can lead to a significant '
'loss in performance (can result in slower executions than without using parallelization).')
# create the model datasets
X_dt = Dataset(X)
y_dt = Dataset(y)
# check op_instance_args argument
if op_instance_args is not None:
for var_name, var_values in op_instance_args.items():
checkInputType('op_instance_args["%s"]' % var_name, var_values, [list, np.ndarray])
if len(X_dt) != len(var_values):
raise TypeError(
'Missmatch in X shape (%d) and op_instance_args["%s"] shape (%d).' % (
len(X_dt), var_name, len(var_values)))
else:
op_instance_args = {}
# check data lengths
if len(X_dt) != len(y_dt):
raise TypeError('Missmatch in X shape (%d) and y shape (%d).' % (len(X_dt), len(y_dt)))
# verbose parameters
verbose = np.inf if verbose < 0 else verbose  # negative values activate full verbosity
# levels > 0 should display the number of the current fold
show_fold_number = False
show_best_combinations = False
show_hpo_best_values = False
# verbosity 1 to show pbar
show_pbar = False
if verbose == 1:
show_pbar = True
# verbosity greater than 1
if verbose > 1:
show_fold_number = True
show_best_combinations = True
show_hpo_best_values = True
# verbosity below 2 suppresses the optuna logs
if verbose < 2:
optuna.logging.set_verbosity(optuna.logging.WARNING)  # suppress optuna logs for verbosity levels <= 1
# train the model optimizing its hyperparameters
hpo_trials_history = {}
hpo_trials_best_params = {}
fold_stats = [] # used to init the gojo.core.report.CVReport instance
created_storages = []
try:
for i, (train_idx, test_idx) in tqdm(
enumerate(outer_cv.split(X_dt.array_data, y_dt.array_data)),
desc='Performing cross-validation...',
disable=not show_pbar):
if show_fold_number: # verbose information
pprint('\nFold %d =============================================\n' % (i+1))
# extract train/test data
X_train = X_dt.array_data[train_idx]
y_train = y_dt.array_data[train_idx]
X_test = X_dt.array_data[test_idx]
y_test = y_dt.array_data[test_idx]
# extract instance-level parameters
op_train_instance_args = {}
op_test_instance_args = {}
if len(op_instance_args) > 0:
for var_name, var_values in op_instance_args.items():
op_train_instance_args[var_name] = [var_values[idx] for idx in train_idx]
op_test_instance_args[var_name] = [var_values[idx] for idx in test_idx]
transforms_ = None
if transforms is not None:
# TODO. Another option is to apply the transforms inside the HPO, but
# TODO. it can become a very computationally-intensive alternative...
# apply transformations based on the training data (DESIGN DECISION)
if save_transforms:
# fit a copy of the input transformations
transforms_ = [trans.copy() for trans in transforms]
else:
# reset fit and allow inplace modifications of the input transforms
for transform in transforms:
transform.resetFit()
transforms_ = transforms
# fit and apply the input transformations based on the training data
X_train, X_test = _fitAndApplyTransforms(
transforms=transforms_, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
op_train_instance_args=op_train_instance_args, op_test_instance_args=op_test_instance_args
)
# create a partial initialization of the function to optimize
partial_trialHPO = partial(
_trialHPO,
_X=X_train,
_y=y_train,
_model=model,
_search_space=search_space,
_cv=inner_cv,
_metrics=metrics,
_minimize=minimization,
_objective_metric=objective_metric,
_customAggFunction=agg_function,
_op_instance_args=op_train_instance_args,
_n_jobs=inner_cv_n_jobs
)
# create the optuna study instance
# deepcopy the provided sampler to avoid inplace modifications
if n_jobs > 1:
# create a temporary database
storage_name = _getOptunaStorageTemp()
created_storages.append(storage_name)
study_name = os.path.split(storage_name)[-1].replace('.db', '')
study = optuna.create_study(
study_name=study_name,
storage='sqlite:///{}'.format(storage_name),
sampler=deepcopy(hpo_sampler))
else:
study = optuna.create_study(sampler=deepcopy(hpo_sampler))
study.optimize(partial_trialHPO, n_trials=hpo_n_trials, n_jobs=n_jobs)
# save HPO results
hpo_trials_history[i] = study.trials_dataframe()
hpo_trials_best_params[i] = study.best_params
# display verbosity information
if show_hpo_best_values:
study_df = study.trials_dataframe()
pprint('Best trial: %d' % study_df.iloc[np.argmin(study_df['value'].values)].loc['number'])
pprint('Best value: %.5f' % study_df.iloc[np.argmin(study_df['value'].values)].loc['value'])
pprint()
if show_best_combinations:
pprint('Optimized model hyperparameters: {}\n'.format(study.best_params))
# update input model hyperparameters
optim_model = model.copy()
optim_model.update(**study.best_params)
# train the model and make the predictions on the test data
fold_results = _evalCrossValFold(
_n_fold=i,
_model=optim_model,
_X_train=X_train,
_y_train=y_train,
_X_test=X_test,
_y_test=y_test,
_train_idx=train_idx,
_test_idx=test_idx,
_predict_train=save_train_preds,
_return_model=save_models,
_reset_model_fit=True,
_transforms=None, # transforms were applied at the beginning of the loop
_return_transforms=False,
_reset_transforms=False,
_op_instance_args=op_instance_args
)
# add transforms to the returned fold results
if save_transforms:
fold_results = list(fold_results) # convert to list for inplace modifications
for idx, (name, _) in enumerate(fold_results):
# replace 'transforms' key
if name == 'transforms':
fold_results[idx] = (name, transforms_)
fold_results = tuple(fold_results)
fold_stats.append(fold_results)
if n_jobs > 1:
optuna.delete_study(
study_name=study_name,
storage='sqlite:///{}'.format(storage_name))
del study
except Exception as ex:
print('Exception generated during the execution of gojo.core.evalCrossValNestedHPO. {} "{}"'.format(
type(ex), ex))
raise ex
finally:
# remove optuna storage databases
base_paths = []
for storage in created_storages:
base_paths.append(os.path.split(storage)[0])
try:
os.remove(storage) # remove individual database
except Exception as ex:
print('Exception when removing optuna temporal files: {} - {}'.format(type(ex), ex))
# remove optuna storage directory
base_paths = list(set(base_paths))
for base_path in base_paths:
if len(os.listdir(base_path)) == 0:
try:
os.rmdir(base_path) # remove storage folder
except Exception as ex:
print('Exception when removing optuna temporal directory: {} - {}'.format(type(ex), ex))
# the model should not remain fitted after the execution of the previous subroutines
if model.is_fitted:
warnings.warn(
'Detected a fitted model after cross-validation procedure in "gojo.core.loops.evalCrossValNestedHPO(...)"')
cv_report = _createCVReport(
cv_results=fold_stats,
X_dataset=X_dt,
y_dataset=y_dt)
# add HPO metadata and instance-level parameters
cv_report.addMetadata(
hpo_history=hpo_trials_history,
hpo_best_params=hpo_trials_best_params,
op_instance_args=op_instance_args
)
return cv_report