Source code for evalml.automl.automl_algorithm.automl_algorithm
"""Base class for the AutoML algorithms which power EvalML."""importinspectfromabcimportABC,abstractmethodfromskopt.spaceimportCategorical,Integer,Realfromevalml.exceptionsimportPipelineNotFoundErrorfromevalml.pipelines.utilsimport_make_stacked_ensemble_pipelinefromevalml.problem_typesimportis_multiclassfromevalml.tunersimportSKOptTuner
class AutoMLAlgorithmException(Exception):
    """Exception raised when an error is encountered during the computation of the automl algorithm."""

    pass
class AutoMLAlgorithm(ABC):
    """Base class for the AutoML algorithms which power EvalML.

    This class represents an automated machine learning (AutoML) algorithm. It encapsulates the decision-making logic behind an automl search, by both deciding which pipelines to evaluate next and by deciding what set of parameters to configure the pipeline with.

    To use this interface, you must define a next_batch method which returns the next group of pipelines to evaluate on the training data. That method may access state and results recorded from the previous batches, although that information is not tracked in a general way in this base class. Overriding add_result is a convenient way to record pipeline evaluation info if necessary.

    Args:
        allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed.
        allowed_model_families (list(str, ModelFamily)): The model families enabled in the search. The default of None indicates all model families are allowed. Cannot be set together with excluded_model_families.
        excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the search. Cannot be set together with allowed_model_families or allowed_component_graphs.
        allowed_component_graphs (dict): A dictionary indicating the component graphs allowed in the search.
        search_parameters (dict): Search parameter ranges specified for pipelines to iterate over.
        tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used.
        text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. Defaults to False.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1.
    """

    def __init__(
        self,
        allowed_pipelines=None,
        allowed_model_families=None,
        excluded_model_families=None,
        allowed_component_graphs=None,
        search_parameters=None,
        tuner_class=None,
        text_in_ensembling=False,
        random_seed=0,
        n_jobs=-1,
    ):
        self.random_seed = random_seed
        self._tuner_class = tuner_class or SKOptTuner
        self._tuners = {}
        self._best_pipeline_info = {}
        self.text_in_ensembling = text_in_ensembling
        self.n_jobs = n_jobs
        self._selected_cols = None
        self.search_parameters = search_parameters or {}
        self._hyperparameters = {}
        self._pipeline_parameters = {}
        self.allowed_pipelines = []
        if allowed_pipelines is not None:
            self._set_allowed_pipelines(allowed_pipelines)
        self._pipeline_number = 0
        self._batch_number = 0
        self._default_max_batches = 1
        if allowed_component_graphs is not None:
            if excluded_model_families is not None:
                raise ValueError(
                    "Both `excluded_model_families` and `allowed_component_graphs` cannot be set.",
                )
        self.allowed_component_graphs = allowed_component_graphs
        if allowed_model_families is not None and excluded_model_families is not None:
            raise ValueError(
                "Both `allowed_model_families` and `excluded_model_families` cannot be set.",
            )
        self.allowed_model_families = allowed_model_families
        self.excluded_model_families = excluded_model_families
    @abstractmethod
    def next_batch(self):
        """Get the next batch of pipelines to evaluate.

        Returns:
            list[PipelineBase]: A list of instances of PipelineBase subclasses, ready to be trained and evaluated.
        """
    @abstractmethod
    def num_pipelines_per_batch(self, batch_number):
        """Return the number of pipelines in the nth batch.

        Args:
            batch_number (int): which batch to calculate the number of pipelines for.

        Returns:
            int: number of pipelines in the given batch.
        """
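    # A minimal sketch of a concrete subclass (illustrative only, not part of
    # EvalML): it proposes every allowed pipeline once per batch, letting each
    # pipeline's tuner supply the next parameter proposal. The name
    # `OneBatchAlgorithm` is hypothetical.
    #
    #     class OneBatchAlgorithm(AutoMLAlgorithm):
    #         def next_batch(self):
    #             proposed = [
    #                 pipeline.new(
    #                     parameters=self._transform_parameters(
    #                         pipeline,
    #                         self._tuners[pipeline.name].propose(),
    #                     ),
    #                     random_seed=self.random_seed,
    #                 )
    #                 for pipeline in self.allowed_pipelines
    #             ]
    #             self._pipeline_number += len(proposed)
    #             self._batch_number += 1
    #             return proposed
    #
    #         def num_pipelines_per_batch(self, batch_number):
    #             return len(self.allowed_pipelines)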
    def _set_allowed_pipelines(self, allowed_pipelines):
        """Sets the allowed pipelines and creates the tuners for the input pipelines."""
        self.allowed_pipelines = allowed_pipelines
        for pipeline in self.allowed_pipelines:
            self._create_tuner(pipeline)

    def _create_tuner(self, pipeline):
        """Creates a tuner given the input pipeline."""
        pipeline_hyperparameters = pipeline.get_hyperparameter_ranges(
            self._hyperparameters,
        )
        self._tuners[pipeline.name] = self._tuner_class(
            pipeline_hyperparameters,
            random_seed=self.random_seed,
        )

    def _separate_hyperparameters_from_parameters(self):
        """Separate out the parameter and hyperparameter values from the search parameters dict."""
        for key, value in self.search_parameters.items():
            hyperparam = {}
            param = {}
            for name, parameters in value.items():
                if isinstance(parameters, (Integer, Categorical, Real)):
                    hyperparam[name] = parameters
                else:
                    param[name] = parameters
            if hyperparam:
                self._hyperparameters[key] = hyperparam
            if param:
                self._pipeline_parameters[key] = param

    def _transform_parameters(self, pipeline, proposed_parameters):
        """Given a pipeline parameters dict, make sure pipeline_parameters, custom_hyperparameters, n_jobs are set properly.

        Arguments:
            pipeline (PipelineBase): The pipeline object to update the parameters.
            proposed_parameters (dict): Parameters to use when updating the pipeline.
        """
        parameters = {}
        if "pipeline" in self._pipeline_parameters:
            parameters["pipeline"] = self._pipeline_parameters["pipeline"]

        for (
            name,
            component_instance,
        ) in pipeline.component_graph.component_instances.items():
            component_class = type(component_instance)
            component_parameters = proposed_parameters.get(name, {})
            init_params = inspect.signature(component_class.__init__).parameters
            # Only overwrite the parameters that were passed in as pipeline parameters
            # if they don't exist in the proposed parameters
            if name in self._pipeline_parameters and name not in component_parameters:
                for param_name, value in self._pipeline_parameters[name].items():
                    component_parameters[param_name] = value
            # Inspect each component and add the following parameters when needed
            if "n_jobs" in init_params:
                component_parameters["n_jobs"] = self.n_jobs
            if "number_features" in init_params and hasattr(self, "number_features"):
                component_parameters["number_features"] = self.number_features
            if "pipeline" in self.search_parameters:
                for param_name, value in self.search_parameters["pipeline"].items():
                    if param_name in init_params:
                        component_parameters[param_name] = value
            parameters[name] = component_parameters
        return parameters
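    # A hedged sketch of how _separate_hyperparameters_from_parameters splits
    # search_parameters (component and parameter names below are examples):
    #
    #     search_parameters = {
    #         "Random Forest Classifier": {
    #             "n_estimators": Integer(10, 500),  # skopt space -> hyperparameter
    #             "n_jobs": -1,                      # plain value -> pipeline parameter
    #         },
    #     }
    #
    # After the split, _hyperparameters would hold
    # {"Random Forest Classifier": {"n_estimators": Integer(10, 500)}} and
    # _pipeline_parameters would hold {"Random Forest Classifier": {"n_jobs": -1}}.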
    def add_result(self, score_to_minimize, pipeline, trained_pipeline_results):
        """Register results from evaluating a pipeline.

        Args:
            score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines.
            pipeline (PipelineBase): The trained pipeline object which was used to compute the score.
            trained_pipeline_results (dict): Results from training a pipeline.

        Raises:
            PipelineNotFoundError: If pipeline is not allowed in search.
        """
        if pipeline.name not in self._tuners:
            raise PipelineNotFoundError(
                f"No such pipeline allowed in this AutoML search: {pipeline.name}",
            )
        self._tuners[pipeline.name].add(pipeline.parameters, score_to_minimize)
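    # Usage note (a hedged description of assumed caller behavior): the search
    # loop converts each evaluated score so that lower is better before calling
    # add_result, roughly `score_to_minimize = -score if
    # objective.greater_is_better else score`. The tuner then uses the
    # accumulated (parameters, score) pairs to guide its next proposal.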
    @property
    def pipeline_number(self):
        """Returns the number of pipelines which have been recommended so far."""
        return self._pipeline_number

    @property
    def batch_number(self):
        """Returns the number of batches which have been recommended so far."""
        return self._batch_number

    @property
    def default_max_batches(self):
        """Returns the number of max batches AutoMLSearch should run by default."""
        return 1

    def _create_ensemble(self, label_encoder_params=None):
        next_batch = []
        best_pipelines = list(self._best_pipeline_info.values())
        problem_type = best_pipelines[0]["pipeline"].problem_type
        n_jobs_ensemble = 1 if self.text_in_ensembling else self.n_jobs
        input_pipelines = []
        cached_data = {
            model_family: x["cached_data"]
            for model_family, x in self._best_pipeline_info.items()
        }
        for pipeline_dict in best_pipelines:
            pipeline = pipeline_dict["pipeline"]
            input_pipelines.append(pipeline)
        if label_encoder_params is not None:
            label_encoder_params = {"Label Encoder": label_encoder_params}
        else:
            label_encoder_params = {}
        ensemble = _make_stacked_ensemble_pipeline(
            input_pipelines,
            problem_type,
            random_seed=self.random_seed,
            n_jobs=n_jobs_ensemble,
            cached_data=cached_data,
            label_encoder_params=label_encoder_params,
        )
        next_batch.append(ensemble)
        return next_batch

    def _set_additional_pipeline_params(self):
        drop_columns = (
            self.search_parameters["Drop Columns Transformer"]["columns"]
            if "Drop Columns Transformer" in self.search_parameters
            else None
        )
        index_and_unknown_columns = list(
            self.X.ww.select(["index", "unknown"], return_schema=True).columns,
        )
        unknown_columns = list(self.X.ww.select("unknown", return_schema=True).columns)
        if len(index_and_unknown_columns) > 0 and drop_columns is None:
            self.search_parameters["Drop Columns Transformer"] = {
                "columns": index_and_unknown_columns,
            }
            if len(unknown_columns):
                self.logger.info(
                    f"Removing columns {unknown_columns} because they are of 'Unknown' type",
                )
        kina_columns = self.search_parameters.get("pipeline", {}).get(
            "known_in_advance",
            [],
        )
        if kina_columns:
            no_kin_columns = [c for c in self.X.columns if c not in kina_columns]
            kin_name = "Known In Advance Pipeline - Select Columns Transformer"
            no_kin_name = "Not Known In Advance Pipeline - Select Columns Transformer"
            self.search_parameters[kin_name] = {"columns": kina_columns}
            self.search_parameters[no_kin_name] = {"columns": no_kin_columns}

    def _filter_estimators(
        self,
        estimators,
        problem_type,
        allow_long_running_models,
        allowed_model_families,
        y_unique,
        logger,
    ):
        """Function to remove computationally expensive and long-running estimators from datasets with large numbers of unique classes. Thresholds were determined empirically."""
        estimators_to_drop = []
        if (
            not is_multiclass(problem_type)
            or allow_long_running_models
            or allowed_model_families is not None
        ):
            return estimators
        if y_unique > 75:
            estimators_to_drop.extend(["Elastic Net Classifier", "XGBoost Classifier"])
        if y_unique > 150:
            estimators_to_drop.append("CatBoost Classifier")
        dropped_estimators = [e for e in estimators if e.name in estimators_to_drop]
        if len(dropped_estimators):
            logger.info(
                "Dropping estimators {} because the number of unique targets is {} and `allow_long_running_models` is set to {}".format(
                    ", ".join(sorted([e.name for e in dropped_estimators])),
                    y_unique,
                    allow_long_running_models,
                ),
            )
        estimators = [e for e in estimators if e not in dropped_estimators]
        return estimators
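# Illustrative summary of the class-count thresholds in _filter_estimators,
# for a multiclass problem with allow_long_running_models=False and no
# allowed_model_families pinned (values read directly from the code above):
#
#     y_unique <= 75        -> no estimators dropped
#     75 < y_unique <= 150  -> drops "Elastic Net Classifier" and "XGBoost Classifier"
#     y_unique > 150        -> additionally drops "CatBoost Classifier"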