#!/usr/bin/env python
import json
from pandas import DataFrame, read_csv
import numpy as np
from olympus import Logger
from olympus.campaigns import ParameterSpace
from olympus.objects import Parameter
import os
from glob import glob
# =========================
# Main Class of This Module
# =========================
class Dataset:
"""
    A ``Dataset`` object stores the data of a dataset by wrapping a ``pandas.DataFrame`` in its ``data`` attribute.
    It provides additional information on the dataset, as well as convenience methods to access features and targets
    and to generate training/validation/test splits.
Args:
kind (str): kind of the Olympus dataset to load.
data (array): custom dataset. Same input as for pandas.DataFrame.
columns (list): column names. Same input as for pandas.DataFrame.
        target_ids (list): list of column indices (or column names, if provided) that identify the prediction targets.
test_frac (float): fraction of the data to be used as test set.
num_folds (int): number of cross validation folds the training set will be split into.
random_seed (int): random seed for numpy. Setting a seed makes the random splits reproducible.
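
    Example:
        A minimal sketch of creating a custom dataset from a NumPy array; the
        column names and values below are purely illustrative:

        >>> import numpy as np
        >>> values = np.random.uniform(low=0.0, high=1.0, size=(20, 3))
        >>> dataset = Dataset(data=values, columns=["x0", "x1", "y"], target_ids=["y"])
        >>> dataset.feature_names
        ['x0', 'x1']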
"""
def __init__(
self,
kind=None,
data=None,
columns=None,
target_ids=None,
test_frac=0.2,
num_folds=5,
random_seed=None,
):
_validate_dataset_args(kind, data, columns, target_ids)
self.kind = kind
self.name = kind
self.test_frac = test_frac
self.num_folds = num_folds
self.random_seed = random_seed
        # we define the param_space as part of the Dataset object. When we
        # load a standard dataset, we build the param_space from the config
        # file. For a custom dataset, the user needs to call set_param_space
self.param_space = None
self.value_space = None
# ------------------------------------------
# Case 1: data directly provided by the user
# ------------------------------------------
if kind is None:
self._description = "Custom Dataset"
if target_ids is None:
self._targets = []
else:
self._targets = target_ids
_data = data
# ------------------------------
# Case 2: Olympus dataset loaded
# ------------------------------
        else:
_data, _config, self._description = load_dataset(kind)
self._targets = [t["name"] for t in _config["measurements"]]
columns = [f["name"] for f in _config["parameters"]] + self._targets
# create param_space from config file
self._create_param_space(_config)
self._create_value_space(_config)
            # define attributes of interest - done here so as to avoid calling load_dataset again
self.constraints = _config["constraints"]
self._goal = _config["default_goal"]
# the numeric data is stored here, wrapping a DataFrame
self.data = DataFrame(data=_data, index=None, columns=columns)
# make sure the targets are the last column(s)
for tgt in self._targets:
assert tgt in list(self.data.columns)[-len(self._targets) :]
# create dataset splits
self.create_train_validate_test_splits()
@property
def goal(self):
if not "_goal" in self.__dict__:
_data, _config, _description = load_dataset(self.kind)
self._goal = _config["default_goal"]
return self._goal
@property
def measurement_name(self):
if not "_measurement_name" in self.__dict__:
_data, _config, _description = load_dataset(self.kind)
self._measurement_name = _config["measurements"][0]["name"]
return self._measurement_name
    def __getattr__(self, attr):
        if attr == "__getstate__":
            return lambda: self.__dict__
        elif attr == "__setstate__":

            def set_state(_dict):
                self.__dict__ = _dict

            return set_state
        elif attr == "kind" and attr not in self.__dict__:
            val = getattr(self, "name")
            setattr(self, attr, val)
            return val
        # the attribute is genuinely missing: raise instead of calling
        # getattr(self, attr), which would recurse into __getattr__ forever
        raise AttributeError(f"'Dataset' object has no attribute '{attr}'")
def __str__(self):
# TODO: add some more info here...
string = f"Dataset(kind={self.kind})"
return string
@property
def size(self):
return self.data.shape[0]
@property
def shape(self):
return self.data.shape
# we could also overwrite the DataFrame.info method
def dataset_info(self):
"""Provide summary info about dataset.
"""
Logger.log(self._description, "INFO")
def set_param_space(self, param_space):
"""Define the parameter space of the dataset.
Args:
param_space (ParameterSpace): ParameterSpace object with information about all variables in the dataset.
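
        Example:
            A minimal sketch, mirroring how this module builds parameters; the
            bounds below are illustrative:

            >>> param_space = ParameterSpace()
            >>> for name in dataset.feature_names:
            ...     param = Parameter().from_dict({"name": name, "low": 0.0, "high": 1.0})
            ...     param_space.add(param)
            >>> dataset.set_param_space(param_space)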
"""
# validate first...
for i, param in enumerate(param_space.parameters):
# check names are matching
if self.feature_names[i] != param.name:
message = f"Parameter name `{self.feature_names[i]}` does not match name `{param.name}` found in dataset!"
Logger.log(message, "WARNING")
# check data provided is within param_space bounds
if param.low > np.min(self.data.iloc[:, i]):
message = f"Lower bound of {param.low} provided for parameter `{param.name}` is higher than minimum found in the data!"
Logger.log(message, "ERROR")
            if param.high < np.max(self.data.iloc[:, i]):
message = f"Upper bound of {param.high} provided for parameter `{param.name}` is lower than maximum found in the data!"
Logger.log(message, "ERROR")
# ...then assign
self.param_space = param_space
def infer_param_space(self):
"""Guess the parameter space from the dataset. The range for all parameters will be define based on the
minimum and maximum values in the dataset for each variable. All variables will be assumed not to be periodic.
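
        Example:
            A minimal sketch; assumes ``dataset`` already holds numeric data:

            >>> dataset.infer_param_space()
            >>> dataset.param_space.parameters[0].low  # min of the first feature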
"""
param_space = ParameterSpace()
for name in self.feature_names:
param_dict = {
"name": name,
"low": np.min(self.data.loc[:, name]),
"high": np.max(self.data.loc[:, name]),
}
param = Parameter().from_dict(param_dict)
param_space.add(param)
# set the param space
self.set_param_space(param_space)
def to_disk(self, folder='custom_dataset'):
"""Save the dataset to disk in the format expected by Olympus for its own datasets. This can be useful if you
plan to upload the dataset to the community datasets available online.
Args:
folder (str): Folder in which to save the dataset files.
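
        Example:
            A minimal sketch; the folder name is arbitrary:

            >>> dataset.to_disk(folder="my_dataset")

            This writes ``my_dataset/data.csv``, ``my_dataset/description.txt``,
            and ``my_dataset/config.json``.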
"""
        # create the folder (do not fail if it already exists)
        os.makedirs(folder, exist_ok=True)
# save the numeric data
self.data.to_csv(f'{folder}/data.csv', header=False, index=False)
# save the description
self._generate_description()
with open(f'{folder}/description.txt', 'w') as f:
f.write(self._description)
# save the config file
_config = {}
_config['constraints'] = {'parameters': 'none', 'measurements': 'none'}
_config['parameters'] = []
for param in self.param_space.parameters:
            d = {'name': param.name, 'type': param.kind, 'lower': param.low, 'upper': param.high}
_config['parameters'].append(d)
_config['measurements'] = []
for target in self._targets:
t = {'name': target, 'type': 'continuous'}
_config['measurements'].append(t)
with open(f'{folder}/config.json', 'w') as f:
f.write(json.dumps(_config, indent=4, sort_keys=True))
def _generate_description(self):
_description = []
if self.kind is None:
_description.append('Custom Dataset\n')
else:
_description.append(f'{self.kind}\n')
_description.append('=========================================')
_description.append(' Summary')
_description.append('-----------------------------------------')
_description.append(f' Number of Samples {self.size:>10}')
_description.append(f' Dimensionality {len(self.feature_names):>10}')
        _description.append(' Features:')
for param in self.param_space.parameters:
_description.append(f' {param.name:<10} {param.kind:>10}')
        _description.append(' Targets:')
for target in self._targets:
_description.append(f' {target:<10} continuous')
self._description = "\n".join(_description)
# ----------------------------------
# Methods about features and targets
# ----------------------------------
@property
def feature_names(self):
# return names of features
return [c for c in self.data.columns if c not in self._targets]
@property
def target_names(self):
# return names of targets
return self._targets
@property
def features(self):
# return data for features
return self.data.loc[:, self.feature_names]
@property
def targets(self):
# return data for targets
        if isinstance(self.target_names[0], str):
            return self.data.loc[:, self.target_names]
        elif isinstance(self.target_names[0], int):
            return self.data.iloc[:, self.target_names]
@property
def features_dim(self):
return len(self.feature_names)
@property
def targets_dim(self):
return len(self.target_names)
# ----------------------------------------------------------------
# Methods for dataset splitting into training/validation/test sets
# ----------------------------------------------------------------
def create_train_validate_test_splits(
self, test_frac=0.2, num_folds=5, test_indices=None
):
"""
Args:
test_frac (float)
num_folds (int)
test_indices (array)
Array with the indices of the samples to be used as test set.
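
        Example:
            A minimal sketch re-creating the splits with different settings
            (the values are illustrative):

            >>> dataset.create_train_validate_test_splits(test_frac=0.1, num_folds=3)
            >>> len(dataset.cross_val_indices)
            3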
"""
# update num_folds and test frac
self.test_frac = test_frac
if test_indices is not None:
self.test_frac = len(test_indices) / len(self.data)
self.num_folds = num_folds
np.random.seed(self.random_seed) # set random seed
nrows = self.data.shape[0]
indices = range(nrows)
        n_test_samples = int(round(nrows * self.test_frac, ndigits=0))
        # test set
        if test_indices is None:
            self.test_indices = np.random.choice(
                indices, size=n_test_samples, replace=False
            )
else:
self.test_indices = np.array(test_indices)
# training set
self.train_indices = np.setdiff1d(indices, self.test_indices)
# cross-validation sets are subsets of the training set
train_indices_shuffled = np.random.permutation(
self.train_indices
) # random shuffle
self.cv_fold_indices = np.array_split(
train_indices_shuffled, self.num_folds
) # split in approx equal sized arrays
# define train/valid indices for each fold
self.cross_val_indices = []
        for i in range(self.num_folds):
            # concatenate all folds except the i-th one; a list comprehension
            # is used because the folds can have unequal lengths
            fold_train = np.concatenate(
                [fold for j, fold in enumerate(self.cv_fold_indices) if j != i]
            )
            fold_valid = self.cv_fold_indices[i]
            self.cross_val_indices.append([fold_train, fold_valid])
# TODO: unfix random seed?
@property
def train_set(self):
# The training set with features+targets
return self.data.loc[self.train_indices, :]
@property
def train_set_features(self):
# The training set with only the features
return self.data.loc[self.train_indices, self.feature_names]
@property
def train_set_targets(self):
# The training set with only the targets
return self.data.loc[self.train_indices, self.target_names]
@property
def test_set(self):
# The test set with features+targets
return self.data.loc[self.test_indices, :]
@property
def test_set_features(self):
# The test set with only the features
return self.data.loc[self.test_indices, self.feature_names]
@property
def test_set_targets(self):
# The test set with only the targets
return self.data.loc[self.test_indices, self.target_names]
@property
def cross_val_sets(self):
# List of cv sets with features+targets
cv_sets = []
for train_id, valid_id in self.cross_val_indices:
train_data = self.data.loc[train_id, :]
valid_data = self.data.loc[valid_id, :]
cv_sets.append((train_data, valid_data))
return cv_sets
@property
def cross_val_sets_features(self):
# List of cv sets with only features
cv_sets = []
for train_id, valid_id in self.cross_val_indices:
train_data = self.data.loc[train_id, self.feature_names]
valid_data = self.data.loc[valid_id, self.feature_names]
cv_sets.append((train_data, valid_data))
return cv_sets
@property
def cross_val_sets_targets(self):
# List of cv sets with only targets
cv_sets = []
for train_id, valid_id in self.cross_val_indices:
train_data = self.data.loc[train_id, self.target_names]
valid_data = self.data.loc[valid_id, self.target_names]
cv_sets.append((train_data, valid_data))
return cv_sets
def get_cv_fold(self, fold):
"""Get the data for a specific cross-validation fold.
Args:
fold (int): fold id.
Returns:
data (DataFrame): data for the chosen fold.
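
        Example:
            A minimal sketch iterating over all folds:

            >>> for fold in range(dataset.num_folds):
            ...     fold_data = dataset.get_cv_fold(fold)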
"""
indices = self.cv_fold_indices[fold]
return self.data.loc[indices, :]
# ---------------
# Private Methods
# ---------------
def _create_param_space(self, config):
self.param_space = ParameterSpace()
self.param_space.add(
[Parameter().from_dict(feature) for feature in config["parameters"]]
)
def _create_value_space(self, config):
self.value_space = ParameterSpace()
self.value_space.add(
[Parameter().from_dict(feature) for feature in config["measurements"]]
)
# ===============
# Other Functions
# ===============
def load_dataset(kind):
"""Loads a dataset from the Olympus dataset library.
Args:
kind (str): kind of Olympus dataset to load.
Returns:
(tuple): tuple containing:
        data (array): numpy array where each row is a sample and each column is a feature/target. The columns are
            sorted such that features come first, followed by the targets.
config (dict): dict containing information on the features and targets present in data.
description (str): string describing the dataset.
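
    Example:
        A minimal sketch; ``"snar"`` is assumed to be one of the datasets
        shipped with this Olympus installation:

        >>> data, config, description = load_dataset("snar")
        >>> targets = [m["name"] for m in config["measurements"]]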
"""
_validate_dataset_args(kind=kind, data=None, columns=None, target_names=None)
datasets_path = os.path.dirname(os.path.abspath(__file__))
# load description
with open("".join(f"{datasets_path}/dataset_{kind}/description.txt")) as txtfile:
description = txtfile.read()
# load info on features/targets
with open("".join(f"{datasets_path}/dataset_{kind}/config.json"), "r") as content:
config = json.loads(content.read())
# load data
csv_file = "".join(f"{datasets_path}/dataset_{kind}/data.csv")
try:
data = read_csv(csv_file, header=None).to_numpy()
except FileNotFoundError:
Logger.log(f"Could not find data.csv for dataset {kind}", "FATAL")
return data, config, description
def _validate_dataset_args(kind, data, columns, target_names):
if kind is not None:
# -----------------------------------
# check that a correct name is passed
# -----------------------------------
# TODO: reduce redundant code by importing the list from where we have it already
module_path = os.path.dirname(os.path.abspath(__file__))
olympus_datasets = []
        for dir_name in glob(f"{module_path}/dataset_*"):
            # use os.path.basename so this also works with Windows path separators
            dataset_name = os.path.basename(dir_name)[len("dataset_"):]
            olympus_datasets.append(dataset_name)
if kind not in olympus_datasets:
message = (
"Could not find dataset `{0}`. Please choose from one of the available "
"datasets: {1}.".format(kind, ", ".join(list(olympus_datasets)))
)
Logger.log(message, "FATAL")
# --------------------------------------------------------------
# we will discard these arguments, so check if they are provided
# --------------------------------------------------------------
if data is not None:
message = (
"One of the Olympus datasets has been loaded via the argument `kind`, argument `data` "
"will be discarded"
)
Logger.log(message, "WARNING")
if columns is not None:
message = (
"One of the Olympus datasets has been loaded via the argument `kind`, argument `columns` "
"will be discarded"
)
Logger.log(message, "WARNING")
if target_names is not None:
message = (
"One of the Olympus datasets has been loaded via the argument `kind`, argument "
"`target_names` will be discarded"
)
Logger.log(message, "WARNING")