"""
Base tools to compute diverse analysis
======================================
"""
import itertools
from functools import reduce
import numpy
import pandas as pd
from sklearn.ensemble import (
AdaBoostRegressor,
BaggingRegressor,
ExtraTreesRegressor,
GradientBoostingRegressor,
RandomForestRegressor,
)
from orion.core.utils import format_trials
from orion.core.worker.transformer import build_required_space
# Registry of the supported scikit-learn regressor classes, keyed by class
# name. ``train_regressor`` looks models up here by name at call time.
_regressors_ = {
    "AdaBoostRegressor": AdaBoostRegressor,
    "BaggingRegressor": BaggingRegressor,
    "ExtraTreesRegressor": ExtraTreesRegressor,
    "GradientBoostingRegressor": GradientBoostingRegressor,
    "RandomForestRegressor": RandomForestRegressor,
}
# pylint:disable=dangerous-default-value
def average(trials, group_by="order", key="best", return_var=False):
    """Compute the average of some trial attribute.

    By default it will compute the average objective at each time step across
    multiple experiments.

    Parameters
    ----------
    trials: DataFrame
        A dataframe of trials containing, at least, the columns 'best' and 'order'.
    group_by: str, optional
        The attribute to use to group trials for the average. By default it groups trials
        by order (ex: all first trials across experiments.)
    key: str, optional
        One attribute or a list of attributes split by ',' to average.
        Defaults to 'best' as returned by ``orion.analysis.regret``.
    return_var: bool, optional
        If True, also compute the variance and add a column '{key}_var' for each
        averaged attribute, where '{key}' is the value of the argument `key`.
        Defaults to False.

    Returns
    -------
    A dataframe with columns 'order', '{key}_mean' and, if ``return_var`` is True,
    '{key}_var'.
    """
    if trials.empty:
        return trials

    group = trials.groupby(group_by)
    means = []
    # Support averaging several attributes at once: "best,loss" -> ["best", "loss"]
    keys = [v.strip() for v in key.split(",")]
    for k in keys:
        mean = group[k].mean().reset_index().rename(columns={k: f"{k}_mean"})
        if return_var:
            mean[f"{k}_var"] = group[k].var().reset_index()[k]
        means.append(mean)

    # Merge the per-attribute frames on their shared group column.
    return reduce(pd.merge, means)
# pylint:disable=unsupported-assignment-operation
def ranking(trials, group_by="order", key="best"):
    """Compute the ranking of some trial attribute.

    By default it will compute the ranking with respect to objectives at each time step across
    multiple experiments.

    Parameters
    ----------
    trials: DataFrame
        A dataframe of trials containing, at least, the columns 'best' and 'order'.
    group_by: str, optional
        The attribute to use to group trials for the ranking. By default it group trials
        by order (ex: all first trials across experiments.)
    key: str, optional
        The attribute to use for the ranking. Defaults to 'best' as returned by
        ``orion.analysis.regret``.

    Returns
    -------
    A copy of the original dataframe with a new column 'rank' for the rankings.
    """
    if trials.empty:
        return trials

    def _assign_ranks(group):
        # ``argsort`` gives the permutation that would sort the values;
        # scattering 0..n-1 through that permutation yields each row's rank
        # within its group.
        permutation = group[key].argsort().to_numpy()
        ranks = numpy.empty_like(permutation)
        ranks[permutation] = numpy.arange(permutation.size)
        group["rank"] = ranks
        return group

    return trials.groupby(group_by).apply(_assign_ranks)
def flatten_params(space, params=None):
    """Return the params of the corresponding flat space

    If no params are passed, returns all flattened params.
    If params are passed, returns the corresponding flattened params.

    Parameters
    ----------
    space: Space object
        A space object from an experiment.
    params: list of str, optional
        The parameters to select from the search space. If the flattened search space
        contains flattened params such as ('y' -> 'y[0]', 'y[1]'), passing 'y' in the list of
        params will returned the flattened version ['y[0]', 'y[1]']

    Examples
    --------
    If space has x~uniform(0, 1) and y~uniform(0, 1, shape=(1, 2)).

    >>> flatten_params(space)
    ['x', 'y[0,0]', 'y[0,1]']
    >>> flatten_params(space, params=['x'])
    ['x']
    >>> flatten_params(space, params=['x', 'y'])
    ['x', 'y[0,0]', 'y[0,1]']
    >>> flatten_params(space, params=['x', 'y[0,1]'])
    ['x', 'y[0,1]']
    >>> flatten_params(space, params=['y[0,1]', 'x'])
    ['x', 'y[0,1]']

    Raises
    ------
    ValueError
        If one of the parameter names passed is not in the flattened space.
    """
    original_keys = set(space.keys())
    flattened_keys = set(
        build_required_space(
            space,
            dist_requirement="linear",
            type_requirement="numerical",
            shape_requirement="flattened",
        ).keys()
    )

    if params is None:
        return sorted(flattened_keys)

    selection = []
    for param in params:
        # Already a flattened name (e.g. 'x' or 'y[0,1]'): keep it as-is.
        if param in flattened_keys:
            selection.append(param)
            continue
        if param not in original_keys:
            raise ValueError(
                f"Parameter {param} not contained in space: {flattened_keys}"
            )
        # A multidimensional param name: expand into its flattened components.
        dim = space[param]
        for index in itertools.product(*map(range, dim.shape)):
            selection.append(f'{dim.name}[{",".join(map(str, index))}]')

    return selection
def to_numpy(trials, space):
    """Convert trials in DataFrame to Numpy array of (params + objective)"""
    # Select the param columns in space order, with the objective last.
    columns = list(space.keys())
    columns.append("objective")
    return trials[columns].to_numpy()
def flatten_numpy(trials_array, flattened_space):
    """Flatten dimensions"""

    def _flatten(point):
        # Rebuild a trial from the param columns (all but the objective),
        # flatten it through the transformed space, and return the flat tuple.
        trial = format_trials.tuple_to_trial(point[:-1], flattened_space.original)
        return format_trials.trial_to_tuple(
            flattened_space.transform(trial), flattened_space
        )

    flattened_points = numpy.array([_flatten(point) for point in trials_array])
    # Re-append the untouched objective column.
    return numpy.concatenate((flattened_points, trials_array[:, -1:]), axis=1)
def train_regressor(regressor_name, data, **kwargs):
    """Train regressor model

    Parameters
    ----------
    regressor_name: str
        Name of the regression model to use. Can be one of
        - AdaBoostRegressor
        - BaggingRegressor
        - ExtraTreesRegressor
        - GradientBoostingRegressor
        - RandomForestRegressor (Default)
    data: numpy.ndarray
        Array of shape (n_trials, n_params + 1); all columns but the last are the
        params and the last column is the objective.
    **kwargs
        Arguments for the regressor model.

    Raises
    ------
    ValueError
        If ``regressor_name`` is not one of the supported regressors.
    """
    if regressor_name not in _regressors_:
        # Interpolate the supported names (the old message printed the literal
        # text ``list(_regressors_.keys())`` because of a missing f-string brace).
        raise ValueError(
            f"{regressor_name} is not a supported regressor. "
            f"Did you mean any of these: {list(_regressors_.keys())}"
        )

    regressor = _regressors_[regressor_name](**kwargs)
    # Fit params (all columns but the last) against the objective (last column).
    return regressor.fit(data[:, :-1], data[:, -1])