Source code for gojo.deepl.loading

# Module with data loading utilities
#
# Author: Fernando García Gutiérrez
# Email: ga.gu.fernando.concat@gmail.com
#
# STATUS: completed, functional, and documented.
#

import torch
import numpy as np
import pandas as pd
import warnings
import torch_geometric as geom
from copy import deepcopy
from typing import List, Union
from torch.utils.data import Dataset

from ..interfaces import data as data_interface
from ..util.validation import (
    checkInputType,
    checkMultiInputTypes,
    checkCallable)


class GraphDataset(Dataset):
    """ Class used to generate a dataset adapted to operate with Graph Neural Networks. This class can be passed
    to `torch.utils.data.DataLoader` and subsequently used by the :func:`gojo.deepl.loops.fitNeuralNetwork`
    function.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame or List[np.ndarray]
        Input predictor variables used to adjust the models. If a numpy array or a pandas DataFrame is provided,
        entries along dimension 0 will be interpreted as instances, and the 1-axis will be interpreted as the
        number of nodes in the network. In the case where a list of numpy arrays is provided, each element of the
        list will be interpreted as an instance, the 0-axis as the number of nodes, and the remaining dimensions
        as node features.

    y : np.ndarray or pd.DataFrame or pd.Series, default=None
        Target variables to fit the models (or None).

    adj_matrix : np.ndarray or pd.DataFrame or List[Union[np.ndarray, pd.DataFrame]], default=None
        Adjacency matrix. If a numpy array or a pandas DataFrame is provided, it must have a shape of
        (`n_nodes`, `n_nodes`) and will be shared by all instances. In the case where a list of numpy arrays is
        provided, each element of the list will be interpreted as a graph, and it must have a shape of
        (`n_nodes`, `n_nodes`). One of `adj_matrix` or `edge_index` must be provided.

    edge_index : np.ndarray or pd.DataFrame or List[Union[np.ndarray, pd.DataFrame]], default=None
        Edge index. If a numpy array or a pandas DataFrame is provided, it must have a shape of
        (2, `n_edges`) and will be shared by all instances. In the case where a list of numpy arrays is provided,
        each element of the list will be interpreted as a graph, and it must have a shape of (2, `n_edges`).
        One of `adj_matrix` or `edge_index` must be provided.

    tabular_x : np.ndarray or pd.DataFrame or List[np.ndarray], default=None
        Tabular characteristics that will be stored in the `tabular_x` attribute of the instances
        (`torch_geometric.data.DataBatch`) returned by this dataset.

        .. important::
            Internally, a dimension will be added along axis 1 to prevent `torch_geometric` dataloaders from
            flattening the data to a single dimension.
    Example
    -------
    >>> import numpy as np
    >>> import gojo
    >>>
    >>> n_samples = 10      # number of instances
    >>> n_node_feats = 3    # number of node features
    >>>
    >>> # generate random adjacency matrices, one for each sample
    >>> adj_matrices = []
    >>> for _ in range(n_samples):
    ...     n_nodes = np.random.randint(5, 30)
    ...     adj_matrices.append(np.random.randint(0, 2, size=(n_nodes, n_nodes)))
    >>>
    >>> # generate the node features
    >>> # each sample will be (n_nodes, n_node_features)
    >>> node_feats = [
    ...     np.random.uniform(size=(adj_matrix.shape[0], n_node_feats))
    ...     for adj_matrix in adj_matrices
    ... ]
    >>>
    >>> # generate a target feature
    >>> target = np.random.randint(0, 2, size=n_samples)
    >>>
    >>> # create the dataset
    >>> graph_dt = gojo.deepl.loading.GraphDataset(
    ...     X=node_feats,
    ...     y=target,
    ...     adj_matrix=adj_matrices
    ... )
    >>>
    """
    def __init__(
            self,
            X: np.ndarray or pd.DataFrame or List[np.ndarray],
            y: np.ndarray or pd.DataFrame or pd.Series = None,
            adj_matrix: np.ndarray or pd.DataFrame or List[Union[np.ndarray, pd.DataFrame]] = None,
            edge_index: np.ndarray or pd.DataFrame or List[Union[np.ndarray, pd.DataFrame]] = None,
            tabular_x: np.ndarray or pd.DataFrame or List[np.ndarray] = None
    ):
        super(GraphDataset, self).__init__()

        # check input arguments
        if adj_matrix is not None and edge_index is not None:
            warnings.warn(
                'Both "adj_matrix" and "edge_index" have been provided, "edge_index" will be ignored.')

        if adj_matrix is None and edge_index is None:
            raise TypeError('At least one of "adj_matrix" or "edge_index" must be provided.')

        # process the y variable (if y is None, fill the targets with NaNs)
        y_tensor = torch.from_numpy(np.array([np.nan] * len(X)).astype(np.float32)).float()
        if y is not None:
            # get the y data as a numpy array
            np_y = data_interface.Dataset(y).array_data

            # add an extra dimension to y (n_samples, n_targets)
            if len(np_y.shape) == 1:
                np_y = np_y[:, np.newaxis]

            # convert the y variable to a torch.Tensor
            y_tensor = torch.from_numpy(np_y.astype(np.float32)).float()

        # process the X variable
        x_list_tensor = []
        if isinstance(X, list):
            for i, e in enumerate(X):
                checkInputType('X[%d]' % i, e, [np.ndarray])

            # create a list of tensors
            x_list_tensor = [
                torch.from_numpy(X[i].astype(np.float32)) for i in range(len(X))]

            # if the elements of the tensor are of shape (n_nodes,), add a n_node_features dimension
            for i in range(len(x_list_tensor)):
                if len(x_list_tensor[i].shape) == 1:
                    x_list_tensor[i] = x_list_tensor[i].unsqueeze(-1)
        else:
            # get the X data as a numpy array
            np_X = data_interface.Dataset(X).array_data.astype(np.float32)

            # add an extra dimension to X (n_samples, n_nodes, n_node_features)
            if len(np_X.shape) == 2:
                np_X = np_X[:, :, np.newaxis]

            # create a list of tensors
            x_list_tensor = [
                torch.from_numpy(np_X[i, ...]).float() for i in range(np_X.shape[0])]

        # check y and X shape
        if len(y_tensor) != len(x_list_tensor):
            raise TypeError(
                'Number of samples in "X" (%d) does not match the number of samples in "y" (%d)' % (
                    len(x_list_tensor), len(y_tensor)))

        # process the adjacency matrix / edge index
        edge_index_ = None
        if adj_matrix is not None:
            if isinstance(adj_matrix, list):
                for i, e in enumerate(adj_matrix):
                    checkInputType('adj_matrix[%d]' % i, e, [np.ndarray])

                # convert the adjacency matrices to edge indices
                edge_index_ = [
                    torch.nonzero(torch.from_numpy(adj_matrix[i].astype(int))).t()
                    for i in range(len(adj_matrix))]
            else:
                # create per-instance copies of the adjacency matrix as edge index
                adj_matrix_np = data_interface.Dataset(adj_matrix).array_data.astype(int)
                edge_index_ = [
                    torch.nonzero(torch.from_numpy(adj_matrix_np)).t()
                    for _ in range(len(x_list_tensor))]
        else:
            if isinstance(edge_index, list):
                # convert the edge indices to torch.Tensor
                for i, e in enumerate(edge_index):
                    checkInputType('edge_index[%d]' % i, e, [np.ndarray])
                    edge_index[i] = torch.from_numpy(edge_index[i].astype(int))
                edge_index_ = edge_index
            else:
                # create per-instance copies of the edge index
                np_edge_index = data_interface.Dataset(edge_index).array_data.astype(int)
                edge_index_ = [
                    torch.from_numpy(np_edge_index) for _ in range(len(x_list_tensor))]

        # check the shape of the edge index
        assert len(edge_index_) == len(x_list_tensor), \
            'Mismatch in internal "edge_index_" length (%d) and "x_list_tensor" length (%d).' % (
                len(edge_index_), len(x_list_tensor))

        for i in range(len(edge_index_)):
            if len(edge_index_[i].shape) != 2:
                raise TypeError(
                    'edge_index[%d] number of dimensions different from 2 (%d)' % (i, len(edge_index_[i].shape)))
            if edge_index_[i].shape[0] != 2:
                raise TypeError(
                    'edge_index[%d].shape[0] different from 2 (%d)' % (i, edge_index_[i].shape[0]))

        # check the consistency in the number of nodes
        for i, (nodes_, sample_) in enumerate(zip(edge_index_, x_list_tensor)):
            if nodes_.max() + 1 != sample_.shape[0]:
                raise TypeError(
                    'Different number of nodes in sample %d (edge_index=%d, feature_vector=%d)' % (
                        i, nodes_.max() + 1, sample_.shape[0]))

        # check tabular information
        checkInputType('tabular_x', tabular_x, [pd.DataFrame, np.ndarray, list, type(None)])
        if not (tabular_x is None or len(tabular_x) == len(X)):
            raise TypeError(
                '"tabular_x" shape along the 0 axis mismatch (expected %d, input %d)' % (
                    len(x_list_tensor), len(tabular_x)))
        else:
            if isinstance(tabular_x, list):
                for i, e in enumerate(tabular_x):
                    checkInputType('tabular_x[%d]' % i, e, [np.ndarray])
                tabular_x = np.stack(tabular_x)

            if tabular_x is None:
                tabular_x = [None] * len(X)
            else:
                tabular_x = torch.from_numpy(data_interface.Dataset(tabular_x).array_data.astype(np.float32))
                # add a dimension along axis 1 to prevent dataloaders from flattening the data
                tabular_x = tabular_x.unsqueeze(1)

        # TODO. Implement edge_attr
        # ...

        self.y_tensor = y_tensor
        self.x_list_tensor = x_list_tensor
        self.edge_index = edge_index_
        self.tabular_x = tabular_x

        # create the torch_geometric.data.Data instances
        self.data_list = [
            geom.data.Data(
                x=x,
                edge_index=ei,
                y=y,
                tabular_x=tab_x
            ) for x, ei, y, tab_x in zip(x_list_tensor, edge_index_, y_tensor, tabular_x)
        ]

    def __getitem__(self, idx: int):
        return self.data_list[idx], self.y_tensor[idx]

    def __len__(self):
        return len(self.data_list)
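
# NOTE: the function below is an illustrative sketch, not part of the original module API; its name is
# hypothetical. It shows one way GraphDataset may be used when a single adjacency matrix is shared by
# all instances: "X" is a dense array of shape (n_samples, n_nodes), so a trailing node-feature
# dimension is added internally, and each item returned by the dataset is a
# (torch_geometric.data.Data, target) pair, following the __getitem__ implementation above.
def _graph_dataset_usage_sketch():
    n_samples, n_nodes = 8, 5

    # node-level predictors: one scalar feature per node
    X = np.random.uniform(size=(n_samples, n_nodes))

    # binary targets, one per instance
    y = np.random.randint(0, 2, size=n_samples)

    # ring-like adjacency matrix shared by all instances
    adj_matrix = np.eye(n_nodes, k=1, dtype=int) + np.eye(n_nodes, k=-1, dtype=int)

    dataset = GraphDataset(X=X, y=y, adj_matrix=adj_matrix)

    # each item is a (torch_geometric.data.Data, target) tuple
    graph, target = dataset[0]
    print(graph.x.shape)           # -> torch.Size([5, 1]), one feature per node
    print(graph.edge_index.shape)  # -> torch.Size([2, 8]), one column per non-zero adjacency entry
    print(target)                  # -> tensor with a single target value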
class TorchDataset(Dataset):
    """ Basic dataset class for torch models. This class can be passed to `torch.utils.data.DataLoader` and
    subsequently used by the :func:`gojo.deepl.loops.fitNeuralNetwork` function or the
    :class:`gojo.interfaces.TorchSKInterface` and :class:`gojo.interfaces.ParametrizedTorchSKInterface` classes.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame or pd.Series
        Input predictor variables used to fit the models.

    y : np.ndarray or pd.DataFrame or pd.Series, default=None
        Target variables to fit the models (or None).

    x_transforms : list, default=None
        Transformations to be applied to the data provided in `X`. This parameter must be provided as a list of
        callables which will receive the `X` data as input.

    y_transforms : list, default=None
        Transformations to be applied to the data provided in `y`. This parameter must be provided as a list of
        callables which will receive the `y` data as input.

    x_stream_data : bool, default=False
        Parameter indicating whether the `X` data will be loaded in streaming. In this case, the elements of `X`
        will be passed to `x_loading_fn`, and this function must return the data that will then go through the
        transforms (if provided) and subsequently be returned by the dataset.

    x_loading_fn : callable, default=None
        Function used to load streaming data. This parameter will have no effect if `x_stream_data` is False.

    y_stream_data : bool, default=False
        Same logic as `x_stream_data` but applied to the `y` parameter.

    y_loading_fn : callable, default=None
        Same logic as `x_loading_fn` but applied to the `y` parameter.

    **op_instance_args
        Instance-level optional arguments. This parameter should be a dictionary whose values must be
        `np.ndarray` or lists containing the same number of elements as instances in `X` and `y`.

    Example
    -------
    >>> import numpy as np
    >>> from gojo import deepl
    >>> from torch.utils.data import DataLoader
    >>>
    >>> # dataset loading ...
    >>> X = np.random.uniform(size=(30, 10))
    >>> y = np.random.uniform(size=30)
    >>>
    >>> # use TorchDataset for an easy use of pytorch DataLoaders
    >>> dataloader = DataLoader(
    ...     deepl.loading.TorchDataset(X=X, y=y),
    ...     batch_size=16, shuffle=True)
    >>>
    """
    def __init__(
            self,
            X: list or np.ndarray or pd.DataFrame or pd.Series,
            y: list or np.ndarray or pd.DataFrame or pd.Series = None,
            x_transforms: List[callable] = None,
            y_transforms: List[callable] = None,
            x_stream_data: bool = False,
            x_loading_fn: callable = None,
            y_stream_data: bool = False,
            y_loading_fn: callable = None,
            **op_instance_args):
        super(TorchDataset, self).__init__()

        # check the input arguments
        checkMultiInputTypes(
            ('X', X, [list, np.ndarray, pd.Series, pd.DataFrame]),
            ('y', y, [list, np.ndarray, pd.Series, pd.DataFrame, type(None)]),
            ('op_instance_args', op_instance_args, [dict, type(None)]),
            ('x_transforms', x_transforms, [list, type(None)]),
            ('y_transforms', y_transforms, [list, type(None)]),
            ('x_stream_data', x_stream_data, [bool]),
            ('y_stream_data', y_stream_data, [bool])
        )

        # check the transforms
        if x_transforms is not None:
            for i, transform in enumerate(x_transforms):
                checkCallable('x_transforms[%d]' % i, transform)

        if y_transforms is not None:
            for i, transform in enumerate(y_transforms):
                checkCallable('y_transforms[%d]' % i, transform)

        # save the transforms
        self.x_transforms = x_transforms
        self.y_transforms = y_transforms

        # check op_instance_args
        op_instance_args = deepcopy(op_instance_args)   # avoid inplace modifications
        if op_instance_args is not None:
            for var_name, var_values in op_instance_args.items():
                checkInputType('op_instance_args["%s"]' % var_name, var_values, [np.ndarray, list])
                if len(X) != len(var_values):
                    raise TypeError(
                        'Mismatch in X shape (%d) and op_instance_args["%s"] shape (%d).' % (
                            len(X), var_name, len(var_values)))

        # save the instance-level optional arguments
        self.op_instance_args = op_instance_args

        # check the loading functions (if stream_data == True)
        if x_stream_data:
            if x_loading_fn is not None:
                checkCallable('x_loading_fn', x_loading_fn)
        if y_stream_data:
            if y_loading_fn is not None:
                checkCallable('y_loading_fn', y_loading_fn)

        # process the X-related parameters
        self.X = None
        self.x_loading_fn = None
        self.X_dataset = None
        if x_stream_data:
            # keep the raw X elements and defer the loading to __getitem__
            self.X = X
            self.x_loading_fn = x_loading_fn
        else:
            # load the X data in memory as a float32 tensor
            X_dt = data_interface.Dataset(X)
            np_X = X_dt.array_data
            self.X = torch.from_numpy(np_X.astype(np.float32))
            self.X_dataset = X_dt

        # process the y-related parameters
        self.y = None
        self.y_loading_fn = None
        self.y_dataset = None
        if y is not None:
            if y_stream_data:
                # keep the raw y elements and defer the loading to __getitem__
                self.y = y
                self.y_loading_fn = y_loading_fn
            else:
                y_dt = data_interface.Dataset(y)
                np_y = y_dt.array_data

                # remove the extra dimension from y
                if len(np_y.shape) == 2 and np_y.shape[1] == 1:
                    np_y = np_y.reshape(-1)

                if len(self.X) != np_y.shape[0]:
                    raise TypeError(
                        'Input "X" (shape[0] = %d) and "y" (shape[0] = %d) must contain the same number of '
                        'entries in the first dimension.' % (len(self.X), np_y.shape[0]))

                self.y = torch.from_numpy(np_y)
                self.y_dataset = y_dt

        self.x_stream_data = x_stream_data
        self.y_stream_data = y_stream_data

    def __getitem__(self, idx: int):
        elements_to_return = []

        # load the X data
        if self.x_stream_data:
            X = self.x_loading_fn(self.X[idx])
        else:
            X = self.X[idx]

        # apply the transforms (optionally)
        if self.x_transforms is not None:
            for transform in self.x_transforms:
                X = transform(X)

        # check that X is a torch.Tensor
        if not isinstance(X, torch.Tensor):
            raise TypeError(
                'The load function (Xs) must return tensors. The returned type is {}. To solve it you can '
                'provide transformations or reformulate the load function.'.format(type(X)))

        # add X to the elements that will be returned
        elements_to_return.append(X)

        # add y to the elements that will be returned (if provided)
        if self.y is not None:
            if self.y_stream_data:
                y = self.y_loading_fn(self.y[idx])
            else:
                y = self.y[idx]

            # apply the transforms (optionally)
            if self.y_transforms is not None:
                for transform in self.y_transforms:
                    y = transform(y)

            # check that y is a torch.Tensor
            if not isinstance(y, torch.Tensor):
                raise TypeError(
                    'The load function (Ys) must return tensors. The returned type is {}. To solve it you can '
                    'provide transformations or reformulate the load function.'.format(type(y)))

            elements_to_return.append(y)

        # add the instance-level optional arguments (if provided)
        if self.op_instance_args is not None:
            for values in self.op_instance_args.values():
                elements_to_return.append(values[idx])

        return tuple(elements_to_return)

    def __len__(self):
        return len(self.X)
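
# NOTE: the function below is an illustrative sketch, not part of the original module API; its name, the
# file names, and the "sample_id" keyword are hypothetical. It shows the streaming mode of TorchDataset:
# with x_stream_data=True the elements of "X" are kept as-is and "x_loading_fn" is only called from
# __getitem__, where its result (possibly after "x_transforms") must be a torch.Tensor. Keyword arguments
# such as "sample_id" are stored in op_instance_args and appended to the returned tuple.
def _torch_dataset_streaming_sketch():
    # hypothetical per-instance files stored on disk
    x_paths = ['sample_%d.npy' % i for i in range(30)]
    y = np.random.uniform(size=30)

    dataset = TorchDataset(
        X=x_paths,
        y=y,
        x_stream_data=True,
        # load each file lazily and convert it to a float32 tensor
        x_loading_fn=lambda path: torch.from_numpy(np.load(path).astype(np.float32)),
        # instance-level metadata returned as a third element by __getitem__
        sample_id=np.arange(30)
    )

    # indexing would return (X_tensor, y_value, sample_id); commented out because the files are hypothetical
    # x_tensor, y_value, sample_id = dataset[0]
    return dataset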