Source code for evalml.automl.automl_algorithm.iterative_algorithm

import inspect
from operator import itemgetter

import numpy as np
from skopt.space import Categorical, Integer, Real

from .automl_algorithm import AutoMLAlgorithm, AutoMLAlgorithmException

from evalml.model_family import ModelFamily
from evalml.pipelines.utils import _make_stacked_ensemble_pipeline

_ESTIMATOR_FAMILY_ORDER = [
    ModelFamily.LINEAR_MODEL,
    ModelFamily.DECISION_TREE,
    ModelFamily.EXTRA_TREES,
    ModelFamily.RANDOM_FOREST,
    ModelFamily.XGBOOST,
    ModelFamily.LIGHTGBM,
    ModelFamily.CATBOOST,
    ModelFamily.ARIMA,
]


[docs]class IterativeAlgorithm(AutoMLAlgorithm):
    """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance.

    Arguments:
        allowed_pipelines (list(class)): A list of PipelineBase instances indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed.
        max_iterations (int): The maximum number of iterations to be evaluated.
        tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        pipelines_per_batch (int): The number of pipelines to be evaluated in each batch, after the first batch. Defaults to 5.
        n_jobs (int or None): Integer describing level of parallelism used for pipelines; -1 uses all available cores. Defaults to -1.
        number_features (int): The number of columns in the input features. Defaults to None.
        ensembling (boolean): If True, runs ensembling in a separate batch after every allowed pipeline class has been iterated over. Defaults to False.
        text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. Defaults to False.
        pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None.
        custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. Defaults to None.
        _estimator_family_order (list(ModelFamily) or None): Specifies the sort order for the first batch. Defaults to None, which uses _ESTIMATOR_FAMILY_ORDER.
    """

    def __init__(
        self,
        allowed_pipelines=None,
        max_iterations=None,
        tuner_class=None,
        random_seed=0,
        pipelines_per_batch=5,
        n_jobs=-1,  # TODO remove
        number_features=None,  # TODO remove
        ensembling=False,
        text_in_ensembling=False,
        pipeline_params=None,
        custom_hyperparameters=None,
        _estimator_family_order=None,
    ):
        self._estimator_family_order = (
            _estimator_family_order or _ESTIMATOR_FAMILY_ORDER
        )
        # Sort the allowed pipelines by model family order for the first batch;
        # pipelines whose family is not in the order are appended at the end.
        indices = []
        pipelines_to_sort = []
        pipelines_end = []
        for pipeline in allowed_pipelines or []:
            if pipeline.model_family in self._estimator_family_order:
                indices.append(
                    self._estimator_family_order.index(pipeline.model_family)
                )
                pipelines_to_sort.append(pipeline)
            else:
                pipelines_end.append(pipeline)
        pipelines_start = [
            pipeline
            for _, pipeline in sorted(
                zip(indices, pipelines_to_sort), key=lambda pair: pair[0]
            )
        ]
        allowed_pipelines = pipelines_start + pipelines_end

        super().__init__(
            allowed_pipelines=allowed_pipelines,
            custom_hyperparameters=custom_hyperparameters,
            max_iterations=max_iterations,
            tuner_class=tuner_class,
            random_seed=random_seed,
        )
        self.pipelines_per_batch = pipelines_per_batch
        self.n_jobs = n_jobs
        self.number_features = number_features
        self._first_batch_results = []
        self._best_pipeline_info = {}
        # Ensembling only makes sense with more than one pipeline to stack
        self.ensembling = ensembling and len(self.allowed_pipelines) > 1
        self.text_in_ensembling = text_in_ensembling
        self._pipeline_params = pipeline_params or {}
        self._custom_hyperparameters = custom_hyperparameters or {}

        if custom_hyperparameters and not isinstance(custom_hyperparameters, dict):
            raise ValueError(
                f"If custom_hyperparameters provided, must be of type dict. Received {type(custom_hyperparameters)}"
            )

        # Fixed values belong in pipeline_params; search spaces belong in custom_hyperparameters
        for param_name_val in self._pipeline_params.values():
            for _, param_val in param_name_val.items():
                if isinstance(param_val, (Integer, Real, Categorical)):
                    raise ValueError(
                        "Pipeline parameters should not contain skopt.Space variables, please pass them "
                        "to custom_hyperparameters instead!"
                    )
        for hyperparam_name_val in self._custom_hyperparameters.values():
            for _, hyperparam_val in hyperparam_name_val.items():
                if not isinstance(hyperparam_val, (Integer, Real, Categorical)):
                    raise ValueError(
                        "Custom hyperparameters should only contain skopt.Space variables such as Categorical, Integer,"
                        " and Real!"
                    )
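    # --- Illustrative usage (editor's sketch, not part of the module) ---
    # The constructor separates fixed values from search spaces: plain values
    # go in ``pipeline_params`` and skopt ``Space`` objects go in
    # ``custom_hyperparameters``; mixing them up raises the ValueErrors above.
    # ``my_pipelines`` and the component/parameter names are hypothetical.
    #
    #     from skopt.space import Integer
    #
    #     algorithm = IterativeAlgorithm(
    #         allowed_pipelines=my_pipelines,
    #         pipeline_params={"Imputer": {"numeric_impute_strategy": "median"}},
    #         custom_hyperparameters={
    #             "Random Forest Classifier": {"n_estimators": Integer(50, 500)}
    #         },
    #     )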
[docs]    def next_batch(self):
        """Get the next batch of pipelines to evaluate.

        Returns:
            list(PipelineBase): A list of instances of PipelineBase subclasses, ready to be trained and evaluated.
        """
        if self._batch_number == 1:
            if len(self._first_batch_results) == 0:
                raise AutoMLAlgorithmException(
                    "No results were reported from the first batch"
                )
            self._first_batch_results = sorted(
                self._first_batch_results, key=itemgetter(0)
            )

        next_batch = []
        if self._batch_number == 0:
            next_batch = [
                pipeline.new(
                    parameters=self._transform_parameters(pipeline, {}),
                    random_seed=self.random_seed,
                )
                for pipeline in self.allowed_pipelines
            ]
        # Run one ensembling batch after each full round of tuning over all pipelines
        elif (
            self.ensembling
            and self._batch_number != 1
            and self._batch_number % (len(self._first_batch_results) + 1) == 0
        ):
            input_pipelines = []
            for pipeline_dict in self._best_pipeline_info.values():
                pipeline = pipeline_dict["pipeline"]
                pipeline_params = pipeline_dict["parameters"]
                parameters = self._transform_parameters(pipeline, pipeline_params)
                input_pipelines.append(
                    pipeline.new(parameters=parameters, random_seed=self.random_seed)
                )
            n_jobs_ensemble = 1 if self.text_in_ensembling else self.n_jobs
            ensemble = _make_stacked_ensemble_pipeline(
                input_pipelines,
                input_pipelines[0].problem_type,
                random_seed=self.random_seed,
                n_jobs=n_jobs_ensemble,
            )
            next_batch.append(ensemble)
        else:
            num_pipelines = (
                (len(self._first_batch_results) + 1)
                if self.ensembling
                else len(self._first_batch_results)
            )
            idx = (self._batch_number - 1) % num_pipelines
            pipeline = self._first_batch_results[idx][1]
            for i in range(self.pipelines_per_batch):
                proposed_parameters = self._tuners[pipeline.name].propose()
                parameters = self._transform_parameters(pipeline, proposed_parameters)
                next_batch.append(
                    pipeline.new(parameters=parameters, random_seed=self.random_seed)
                )
        self._pipeline_number += len(next_batch)
        self._batch_number += 1
        return next_batch
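    # --- Illustrative search loop (editor's sketch, not part of the module) ---
    # ``next_batch`` is meant to be driven in a loop with ``add_result``:
    # batch 0 proposes every allowed pipeline with default parameters, and
    # each later batch proposes ``pipelines_per_batch`` tuned variants of one
    # pipeline, in order of first-batch performance. ``train_and_score`` is a
    # hypothetical stand-in for the evaluation AutoMLSearch performs.
    #
    #     while algorithm.pipeline_number < max_iterations:
    #         for pipeline in algorithm.next_batch():
    #             score = train_and_score(pipeline)  # lower is better
    #             algorithm.add_result(
    #                 score, pipeline, {"id": algorithm.pipeline_number}
    #             )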
[docs]    def add_result(self, score_to_minimize, pipeline, trained_pipeline_results):
        """Register results from evaluating a pipeline.

        Arguments:
            score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines.
            pipeline (PipelineBase): The trained pipeline object which was used to compute the score.
            trained_pipeline_results (dict): Results from training a pipeline.
        """
        if pipeline.model_family != ModelFamily.ENSEMBLE:
            if self.batch_number == 1:
                try:
                    super().add_result(
                        score_to_minimize, pipeline, trained_pipeline_results
                    )
                except ValueError as e:
                    if "is not within the bounds of the space" in str(e):
                        raise ValueError(
                            "Default parameters for components in pipeline {} not in the hyperparameter ranges: {}".format(
                                pipeline.name, e
                            )
                        )
                    else:
                        raise e
            else:
                super().add_result(
                    score_to_minimize, pipeline, trained_pipeline_results
                )
        if self.batch_number == 1:
            self._first_batch_results.append((score_to_minimize, pipeline))

        current_best_score = self._best_pipeline_info.get(
            pipeline.model_family, {}
        ).get("mean_cv_score", np.inf)
        if (
            score_to_minimize is not None
            and score_to_minimize < current_best_score
            and pipeline.model_family != ModelFamily.ENSEMBLE
        ):
            self._best_pipeline_info.update(
                {
                    pipeline.model_family: {
                        "mean_cv_score": score_to_minimize,
                        "pipeline": pipeline,
                        "parameters": pipeline.parameters,
                        "id": trained_pipeline_results["id"],
                    }
                }
            )
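    # --- Bookkeeping note (editor's sketch) ---
    # After results are registered, ``_best_pipeline_info`` holds the best
    # (lowest-scoring) non-ensemble result per model family, e.g.:
    #
    #     {
    #         ModelFamily.RANDOM_FOREST: {
    #             "mean_cv_score": 0.217,           # hypothetical value
    #             "pipeline": rf_pipeline,          # trained PipelineBase
    #             "parameters": rf_pipeline.parameters,
    #             "id": 7,                          # hypothetical search id
    #         },
    #     }
    #
    # Ensembling batches in ``next_batch`` rebuild their input pipelines from
    # exactly this mapping.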
    def _transform_parameters(self, pipeline, proposed_parameters):
        """Given a pipeline parameters dict, make sure n_jobs and number_features are set."""
        parameters = {}
        if "pipeline" in self._pipeline_params:
            parameters["pipeline"] = self._pipeline_params["pipeline"]

        for (
            name,
            component_instance,
        ) in pipeline.component_graph.component_instances.items():
            component_class = type(component_instance)
            component_parameters = proposed_parameters.get(name, {})
            init_params = inspect.signature(component_class.__init__).parameters
            # For the first batch, sample initial values from any custom hyperparameter spaces
            if name in self._custom_hyperparameters and self._batch_number == 0:
                for param_name, value in self._custom_hyperparameters[name].items():
                    if isinstance(value, (Integer, Real)):
                        # get a random value in the space
                        component_parameters[param_name] = value.rvs(
                            random_state=self.random_seed
                        )[0]
                    # Categorical
                    else:
                        component_parameters[param_name] = value.rvs(
                            random_state=self.random_seed
                        )
            # For the first batch, pass the pipeline params to the components that need them
            if name in self._pipeline_params and self._batch_number == 0:
                for param_name, value in self._pipeline_params[name].items():
                    component_parameters[param_name] = value
            # Inspect each component and add the following parameters when needed
            if "n_jobs" in init_params:
                component_parameters["n_jobs"] = self.n_jobs
            if "number_features" in init_params:
                component_parameters["number_features"] = self.number_features
            if (
                name in self._pipeline_params
                and name == "Drop Columns Transformer"
                and self._batch_number > 0
            ):
                component_parameters["columns"] = self._pipeline_params[name]["columns"]
            if "pipeline" in self._pipeline_params:
                for param_name, value in self._pipeline_params["pipeline"].items():
                    if param_name in init_params:
                        component_parameters[param_name] = value
            parameters[name] = component_parameters
        return parameters
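# --- Sampling note (editor's sketch, not part of the module) ---
# In batch 0, ``_transform_parameters`` draws one value per custom
# hyperparameter space. ``Integer``/``Real`` return a list from ``rvs``
# (hence the ``[0]``), while ``Categorical`` returns a single sample here:
#
#     from skopt.space import Categorical, Integer
#
#     Integer(50, 500).rvs(random_state=0)[0]              # a single int in [50, 500]
#     Categorical(["mean", "median"]).rvs(random_state=0)  # "mean" or "median"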