# Source code for orion.analysis.base

"""
Base tools to compute diverse analyses
======================================

"""
import itertools
from functools import reduce

import numpy
import pandas as pd
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)

from orion.core.utils import format_trials
from orion.core.worker.transformer import build_required_space

# Registry of the scikit-learn ensemble regressors that can be selected by
# name. Every key is the class name of the estimator it maps to.
_regressors_ = {
    regressor_class.__name__: regressor_class
    for regressor_class in (
        AdaBoostRegressor,
        BaggingRegressor,
        ExtraTreesRegressor,
        GradientBoostingRegressor,
        RandomForestRegressor,
    )
}


# pylint:disable=dangerous-default-value
def average(trials, group_by="order", key="best", return_var=False):
    """Average one or more trial attributes within groups of trials.

    With the default arguments, the column 'best' is averaged over all the
    trials that share the same 'order' (ex: all first trials across
    experiments).

    Parameters
    ----------
    trials: DataFrame
        A dataframe of trials containing, at least, the column named by
        ``group_by`` and every column named in ``key``.
    group_by: str, optional
        The column used to group trials before averaging.
        Defaults to 'order'.
    key: str, optional
        One column name, or several column names separated by ','.
        Whitespace around each name is ignored. Defaults to 'best'.
    return_var: bool, optional
        If True, add a column '{key}_var' for each averaged column, holding
        the variance of that column within each group. Defaults to False.

    Returns
    -------
    A dataframe with the ``group_by`` column, one '{key}_mean' column per
    averaged attribute and, if ``return_var`` is True, one '{key}_var' column
    per averaged attribute. If ``trials`` is empty it is returned unchanged.

    """
    if trials.empty:
        return trials

    grouped = trials.groupby(group_by)

    def summarize(column):
        """Build the per-group statistics frame of a single column."""
        stats = grouped[column].mean().reset_index()
        stats = stats.rename(columns={column: f"{column}_mean"})
        if return_var:
            stats[f"{column}_var"] = grouped[column].var().reset_index()[column]
        return stats

    column_names = [name.strip() for name in key.split(",")]
    per_column_stats = [summarize(column) for column in column_names]

    # Successive merges join the per-column frames on the columns they share.
    return reduce(pd.merge, per_column_stats)
# pylint:disable=unsupported-assignment-operation
def ranking(trials, group_by="order", key="best"):
    """Compute the ranking of some trial attribute.

    By default it will compute the ranking with respect to the column 'best'
    among the trials sharing the same 'order' (ex: all first trials across
    experiments).

    Parameters
    ----------
    trials: DataFrame
        A dataframe of trials containing, at least, the columns named by
        ``group_by`` and ``key``.
    group_by: str, optional
        The attribute to use to group trials for the ranking.
        Defaults to 'order'.
    key: str, optional
        The attribute to use for the ranking. Defaults to 'best'.

    Returns
    -------
    A copy of the original dataframe, with the same index and row order, and
    a new column 'rank' holding the 0-based rank of ``key`` within each group
    (0 for the smallest value). If ``trials`` is empty it is returned
    unchanged.

    """
    if trials.empty:
        return trials

    def rank(values):
        """Return the 0-based rank of each value of one group."""
        # argsort gives, for each rank, the position holding that rank;
        # inverting that permutation gives the rank of each position.
        indices = values.argsort().to_numpy()
        ranks = numpy.empty_like(indices)
        ranks[indices] = numpy.arange(len(ranks))
        return ranks

    # Rank with ``transform`` on the selected column instead of mutating the
    # groups inside ``apply``: the result is aligned on the original index
    # regardless of how ``apply`` handles group keys.
    # NOTE(review): rows whose ``group_by`` value is missing are skipped by
    # groupby; confirm no caller passes such rows.
    ranked = trials.copy()
    ranked["rank"] = trials.groupby(group_by)[key].transform(rank)
    return ranked
def flatten_params(space, params=None):
    """Return the names of parameters as they appear in the flattened space.

    The flattened space is built from ``space`` with a linear distribution,
    numerical type and flattened shape requirement.

    If no params are passed, returns the sorted names of all flattened
    params. If params are passed, returns the corresponding flattened params,
    in the order in which they were passed.

    Parameters
    ----------
    space: Space object
        A space object from an experiment.
    params: list of str, optional
        The parameters to select from the search space. A name that belongs
        to the flattened space is kept as is. A name that only belongs to the
        original space (ex: 'y' flattened into 'y[0,0]' and 'y[0,1]') is
        replaced by the names of all its flattened components.

    Examples
    --------
    If space has x~uniform(0, 1) and y~uniform(0, 1, shape=(1, 2)).

    >>> flatten_params(space)
    ['x', 'y[0,0]', 'y[0,1]']
    >>> flatten_params(space, params=['x'])
    ['x']
    >>> flatten_params(space, params=['x', 'y'])
    ['x', 'y[0,0]', 'y[0,1]']
    >>> flatten_params(space, params=['x', 'y[0,1]'])
    ['x', 'y[0,1]']
    >>> flatten_params(space, params=['y[0,1]', 'x'])
    ['y[0,1]', 'x']

    Raises
    ------
    ValueError
        If one of the parameter names passed is neither in the original space
        nor in the flattened space.

    """
    original_names = set(space.keys())
    flat_space = build_required_space(
        space,
        dist_requirement="linear",
        type_requirement="numerical",
        shape_requirement="flattened",
    )
    flattened_keys = set(flat_space.keys())

    if params is None:
        return sorted(flattened_keys)

    selected = []
    for param in params:
        if param in flattened_keys:
            # Already a name of the flattened space.
            selected.append(param)
        elif param in original_names:
            # Expand the dimension into one name per index of its shape.
            dim = space[param]
            selected.extend(
                f'{dim.name}[{",".join(map(str, index))}]'
                for index in itertools.product(*map(range, dim.shape))
            )
        else:
            raise ValueError(
                f"Parameter {param} not contained in space: {flattened_keys}"
            )

    return selected
def to_numpy(trials, space):
    """Convert trials in DataFrame to Numpy array of (params + objective)

    The columns of the array are the keys of ``space``, in the order given by
    ``space.keys()``, followed by the column 'objective'.
    """
    columns = [*space.keys(), "objective"]
    selected = trials[columns]
    return selected.to_numpy()
def flatten_numpy(trials_array, flattened_space):
    """Flatten dimensions

    Each row of ``trials_array`` is read as (params + objective). The params
    of every row are turned into a trial of ``flattened_space.original``,
    transformed by ``flattened_space`` and converted back to a tuple. The last
    column of ``trials_array`` is appended unchanged to the flattened params.
    """
    rows = []
    for point in trials_array:
        # Everything but the last value of the row is a parameter value.
        trial = format_trials.tuple_to_trial(point[:-1], flattened_space.original)
        flat_trial = flattened_space.transform(trial)
        rows.append(format_trials.trial_to_tuple(flat_trial, flattened_space))
    flattened_points = numpy.array(rows)

    # Slice with -1: to keep the last column 2-dimensional for the concatenation.
    last_column = trials_array[:, -1:]
    return numpy.concatenate((flattened_points, last_column), axis=1)
def train_regressor(regressor_name, data, **kwargs):
    """Train regressor model

    Parameters
    ----------
    regressor_name: str
        Name of the regression model to use. Can be one of
        - AdaBoostRegressor
        - BaggingRegressor
        - ExtraTreesRegressor
        - GradientBoostingRegressor
        - RandomForestRegressor
    data: 2-dimensional array
        Training data. All columns but the last one are used as the inputs of
        the regressor and the last column is used as the target.
    **kwargs
        Arguments for the regressor model.

    Returns
    -------
    The value returned by the ``fit`` method of the regressor.

    Raises
    ------
    ValueError
        If ``regressor_name`` is not the name of a supported regressor.

    """
    if regressor_name not in _regressors_:
        # The supported names are interpolated in the message so that the
        # caller can see the valid choices.
        raise ValueError(
            f"{regressor_name} is not a supported regressor. "
            f"Did you mean any of these: {list(_regressors_.keys())}"
        )

    regressor = _regressors_[regressor_name](**kwargs)
    return regressor.fit(data[:, :-1], data[:, -1])