Source code for evalml.automl.automl_algorithm.default_algorithm
"""An automl algorithm that consists of two modes: fast and long, where fast is a subset of long."""importloggingimportnumpyasnpfromevalml.automl.automl_algorithm.automl_algorithmimportAutoMLAlgorithmfromevalml.model_familyimportModelFamilyfromevalml.pipelines.componentsimport(EmailFeaturizer,OneHotEncoder,RFClassifierSelectFromModel,RFRegressorSelectFromModel,URLFeaturizer,)fromevalml.pipelines.components.transformers.column_selectorsimport(SelectByType,SelectColumns,)fromevalml.pipelines.components.transformers.encoders.ordinal_encoderimport(OrdinalEncoder,)fromevalml.pipelines.components.utilsimportget_estimators,handle_component_classfromevalml.pipelines.utilsimport(_get_sampler,_make_pipeline_from_multiple_graphs,make_pipeline,)fromevalml.problem_typesimportis_multiseries,is_regression,is_time_seriesfromevalml.utilsimportinfer_feature_typesfromevalml.utils.loggerimportget_logger
class DefaultAlgorithm(AutoMLAlgorithm):
    """An automl algorithm that consists of two modes: fast and long, where fast is a subset of long.

    1. Naive pipelines:
        a. run baseline with default preprocessing pipeline
        b. run naive linear model with default preprocessing pipeline
        c. run basic RF pipeline with default preprocessing pipeline
    2. Naive pipelines with feature selection
        a. subsequent pipelines will use the selected features with a SelectedColumns transformer

    At this point we have a single pipeline candidate for preprocessing and feature selection

    3. Pipelines with preprocessing components:
        a. scan rest of estimators (our current batch 1).
    4. First ensembling run

    Fast mode ends here. Begin long mode.

    5. Run top 3 estimators:
        a. Generate 50 random parameter sets. Run all 150 in one batch
    6. Second ensembling run
    7. Repeat these indefinitely until stopping criterion is met:
        a. For each of the previous top 3 estimators, sample 10 parameters from the tuner. Run all 30 in one batch
        b. Run ensembling

    Args:
        X (pd.DataFrame): Training data.
        y (pd.Series): Target data.
        problem_type (ProblemType): Problem type associated with training data.
        sampler_name (BaseSampler): Sampler to use for preprocessing.
        allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all
            model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
            to `multiclass` or `regression` depending on the problem type. For default algorithm, this only applies to estimators in the non-naive batches.
            Cannot be combined with `excluded_model_families`.
        excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines.
            For default algorithm, this only excludes estimators in the non-naive batches. Cannot be combined with `allowed_model_families`.
        tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        search_parameters (dict or None): Pipeline-level parameters and custom hyperparameter ranges specified for pipelines to iterate over. Hyperparameter ranges
            must be passed in as skopt.space objects. Defaults to None.
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to 1.
        text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. Defaults to False.
        top_n (int): top n number of pipelines to use for long mode. Defaults to 3.
        ensembling (boolean): Whether ensembling batches are included in the search. Not available for time series problems. Defaults to False.
        num_long_explore_pipelines (int): number of pipelines to explore for each top n pipeline at the start of long mode. Defaults to 50.
        num_long_pipelines_per_batch (int): number of pipelines per batch for each top n pipeline through long mode. Defaults to 10.
        allow_long_running_models (bool): Whether or not to allow longer-running models for large multiclass problems. If False and no pipelines, component graphs, or model families are provided,
            AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False.
        features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature has not been computed yet.
        run_feature_selection (bool): If True, will run a separate feature selection pipeline and only use selected features in subsequent batches. If False, will use all of the features for every pipeline.
            Only used for default algorithm. Defaults to True.
        verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
        exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by DefaultAlgorithm.
            Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"

    Raises:
        ValueError: If both `allowed_model_families` and `excluded_model_families` are set, if `ensembling` is requested
            for a time series problem, or if `search_parameters` is provided but is not a dict.
    """

    def __init__(
        self,
        X,
        y,
        problem_type,
        sampler_name,
        allowed_model_families=None,
        excluded_model_families=None,
        tuner_class=None,
        random_seed=0,
        search_parameters=None,
        n_jobs=1,
        text_in_ensembling=False,
        top_n=3,
        ensembling=False,
        num_long_explore_pipelines=50,
        num_long_pipelines_per_batch=10,
        allow_long_running_models=False,
        features=None,
        run_feature_selection=True,
        verbose=False,
        exclude_featurizers=None,
    ):
        super().__init__(
            allowed_pipelines=[],
            allowed_model_families=allowed_model_families,
            excluded_model_families=excluded_model_families,
            allowed_component_graphs=None,
            search_parameters=search_parameters,
            # Forward the caller's tuner instead of discarding it; None still
            # lets the base class pick its default tuner.
            tuner_class=tuner_class,
            random_seed=random_seed,
        )

        self.X = infer_feature_types(X)
        self.y = infer_feature_types(y)
        self.problem_type = problem_type
        self.sampler_name = sampler_name

        self.n_jobs = n_jobs
        self._best_pipeline_info = {}
        self.text_in_ensembling = text_in_ensembling
        self.search_parameters = search_parameters or {}
        self._top_n_pipelines = None
        self.num_long_explore_pipelines = num_long_explore_pipelines
        self.num_long_pipelines_per_batch = num_long_pipelines_per_batch
        self.top_n = top_n
        self.verbose = verbose
        self._selected_cat_cols = []
        self._split = False
        self.allow_long_running_models = allow_long_running_models
        # Column subsets cached lazily by _make_split_pipeline.
        self._X_with_cat_cols = None
        self._X_without_cat_cols = None
        self.features = features
        self.run_feature_selection = run_feature_selection
        self.ensembling = ensembling
        self.exclude_featurizers = exclude_featurizers or []

        if allowed_model_families is not None and excluded_model_families is not None:
            raise ValueError(
                "Both `allowed_model_families` and `excluded_model_families` cannot be set.",
            )
        self.allowed_model_families = allowed_model_families
        self.excluded_model_families = excluded_model_families

        # TODO remove on resolution of 3186
        if is_time_series(self.problem_type) and self.ensembling:
            raise ValueError(
                "Ensembling is not available for time series problems in DefaultAlgorithm.",
            )

        if verbose:
            self.logger = get_logger(f"{__name__}.verbose")
        else:
            self.logger = logging.getLogger(__name__)

        if search_parameters and not isinstance(search_parameters, dict):
            raise ValueError(
                f"If search_parameters provided, must be of type dict. Received {type(search_parameters)}",
            )

        self._set_additional_pipeline_params()
        self._separate_hyperparameters_from_parameters()

    @property
    def default_max_batches(self):
        """Returns the number of max batches AutoMLSearch should run by default."""
        if self.ensembling:
            return 3
        elif is_multiseries(self.problem_type):
            # The naive batch is skipped for multiseries problems (see next_batch).
            return 1
        else:
            return 2
def num_pipelines_per_batch(self, batch_number):
    """Report how many pipelines the batch with the given index contains.

    Args:
        batch_number (int): Index of the batch whose size is requested.

    Returns:
        int: Number of pipelines in that batch.
    """
    # The two opening batches are sized by the estimator lists themselves.
    if batch_number == 0:
        return len(self._naive_estimators())
    if batch_number == 1:
        return len(self._non_naive_estimators())

    # With ensembling enabled, every even batch holds a single ensemble pipeline.
    if self.ensembling and batch_number % 2 == 0:
        return 1

    # The long-exploration batch comes one slot later when ensembling is on.
    exploration_batch = 3 if self.ensembling else 2
    if batch_number == exploration_batch:
        return self.num_long_explore_pipelines * self.top_n

    return self.num_long_pipelines_per_batch * self.top_n
def next_batch(self):
    """Build the pipelines for the upcoming batch and advance the batch counters.

    Returns:
        list(PipelineBase): Instances of PipelineBase subclasses, ready to be trained and evaluated.
    """
    if self.ensembling:
        if self._batch_number == 0:
            pipelines = self._create_naive_pipelines(
                use_features=self.run_feature_selection,
            )
        elif self._batch_number == 1:
            pipelines = self._create_fast_final()
        elif self.batch_number == 3:
            pipelines = self._create_long_exploration(n=self.top_n)
        elif self.batch_number % 2 == 0:
            # Batch 2 and every later even batch is an ensembling run.
            label_encoder_params = self._pipeline_parameters.get("Label Encoder", {})
            pipelines = self._create_ensemble(label_encoder_params)
        else:
            pipelines = self._create_n_pipelines(
                self._top_n_pipelines,
                self.num_long_pipelines_per_batch,
            )
    # this logic needs to be updated once time series also supports ensembling
    elif is_time_series(self.problem_type):
        # Skip the naive batch for multiseries time series
        if is_multiseries(self.problem_type):
            effective_batch = self._batch_number + 1
        else:
            effective_batch = self._batch_number

        if effective_batch == 0:
            pipelines = self._create_naive_pipelines()
        elif effective_batch == 1:
            pipelines = self._create_fast_final()
        elif effective_batch == 2:
            pipelines = self._create_long_exploration(n=self.top_n)
        else:
            pipelines = self._create_n_pipelines(
                self._top_n_pipelines,
                self.num_long_pipelines_per_batch,
            )
    else:
        if self._batch_number == 0:
            pipelines = self._create_naive_pipelines(
                use_features=self.run_feature_selection,
            )
        elif self._batch_number == 1:
            pipelines = self._create_fast_final()
        elif self.batch_number == 2:
            pipelines = self._create_long_exploration(n=self.top_n)
        else:
            pipelines = self._create_n_pipelines(
                self._top_n_pipelines,
                self.num_long_pipelines_per_batch,
            )

    # Bookkeeping: count the pipelines handed out and move to the next batch.
    self._pipeline_number += len(pipelines)
    self._batch_number += 1
    return pipelines
def_get_feature_provenance_and_remove_engineered_features(self,pipeline,component_name,to_be_removed,to_be_added,):component=pipeline.get_component(component_name)feature_provenance=component._get_feature_provenance()fororiginal_colinfeature_provenance:selected=Falseforencoded_colinfeature_provenance[original_col]:ifencoded_colinto_be_removed:selected=Trueto_be_removed.remove(encoded_col)ifselected:to_be_added.append(original_col)def_parse_selected_categorical_features(self,pipeline):# Ordinal will always be categorical in nature, but it won't have OneHotEncoded features made for itiflist(self.X.ww.select("Ordinal",return_schema=True).columns):self._get_feature_provenance_and_remove_engineered_features(pipeline,OrdinalEncoder.name,self._selected_cols,self._selected_cat_cols,)eliflist(self.X.ww.select("category",return_schema=True).columns):self._get_feature_provenance_and_remove_engineered_features(pipeline,OneHotEncoder.name,self._selected_cols,self._selected_cat_cols,)if(list(self.X.ww.select("URL",return_schema=True).columns)and"URLFeaturizer"notinself.exclude_featurizers):self._get_feature_provenance_and_remove_engineered_features(pipeline,URLFeaturizer.name,self._selected_cat_cols,self._selected_cat_cols,)if(list(self.X.ww.select("EmailAddress",return_schema=True).columns)and"EmailFeaturizer"notinself.exclude_featurizers):self._get_feature_provenance_and_remove_engineered_features(pipeline,EmailFeaturizer.name,self._selected_cat_cols,self._selected_cat_cols,)
def add_result(
    self,
    score_to_minimize,
    pipeline,
    trained_pipeline_results,
    cached_data=None,
):
    """Record the outcome of evaluating one pipeline.

    While the batch counter equals 1 and no columns have been selected yet
    (non time series problems only), the selected column names are read from the
    pipeline's feature selector so later batches can use them in a column
    selector. The best result seen per model family is tracked here as well.

    Args:
        score_to_minimize (float): The score obtained by this pipeline on the primary objective, converted so that lower values indicate better pipelines.
        pipeline (PipelineBase): The trained pipeline object which was used to compute the score.
        trained_pipeline_results (dict): Results from training a pipeline.
        cached_data (dict): A dictionary of cached data, where the keys are the model family. Expected to be of format
            {model_family: {hash1: trained_component_graph, hash2: trained_component_graph...}...}.
            Defaults to None.
    """
    if not cached_data:
        cached_data = {}

    # Only non-ensemble results from batch counter 2 onwards reach the base class.
    if pipeline.model_family != ModelFamily.ENSEMBLE and self.batch_number >= 2:
        super().add_result(
            score_to_minimize,
            pipeline,
            trained_pipeline_results,
        )

    awaiting_selection = (
        self.batch_number == 1
        and self._selected_cols is None
        and not is_time_series(self.problem_type)
    )
    if awaiting_selection:
        if is_regression(self.problem_type):
            selector_name = "RF Regressor Select From Model"
        else:
            selector_name = "RF Classifier Select From Model"
        self._selected_cols = pipeline.get_component(selector_name).get_names()
        self._parse_selected_categorical_features(pipeline)

    family = pipeline.model_family
    best_so_far = self._best_pipeline_info.get(family, {}).get(
        "mean_cv_score",
        np.inf,
    )
    if (
        score_to_minimize is not None
        and score_to_minimize < best_so_far
        and family != ModelFamily.ENSEMBLE
    ):
        self._best_pipeline_info[family] = {
            "mean_cv_score": score_to_minimize,
            "pipeline": pipeline,
            "parameters": pipeline.parameters,
            "id": trained_pipeline_results["id"],
            "cached_data": cached_data,
        }
def _make_split_pipeline(self, estimator, pipeline_name=None):
    """Build the pipeline for an estimator, splitting preprocessing by column kind when needed.

    Without feature selection a plain pipeline over all of X is returned. With
    feature selection the result depends on which columns were selected:

    - no categorical columns selected: a single pipeline over X followed by a
      column selector for the selected columns;
    - only categorical columns selected: a single pipeline over the
      categorical columns, preceded by a column selector;
    - both kinds selected: separate numeric and categorical sub-pipelines
      merged into one pipeline that feeds the estimator (``self._split`` is set
      to True in this case).

    Args:
        estimator: Estimator class the pipeline is built around.
        pipeline_name (str or None): Custom name for the merged pipeline; only
            used when numeric and categorical sub-pipelines are combined.
            Defaults to None.

    Returns:
        The pipeline returned by ``make_pipeline`` or ``_make_pipeline_from_multiple_graphs``.
    """
    if not self.run_feature_selection:
        return make_pipeline(
            self.X,
            self.y,
            estimator,
            self.problem_type,
            sampler_name=self.sampler_name,
            exclude_featurizers=self.exclude_featurizers,
        )

    # Should be category, not categorical so that we make sure to exclude
    # all logical types with the "category" tag
    numeric_exclude_types = ["category", "EmailAddress", "URL"]

    # Build the column subsets once, and only when feature selection is on.
    # The previous `A and B or C` condition parsed as `(A and B) or C`, so the
    # subsets were also built when feature selection was disabled, even though
    # no branch reads them in that case.
    if self._X_with_cat_cols is None or self._X_without_cat_cols is None:
        self._X_without_cat_cols = self.X.ww.select(
            exclude=numeric_exclude_types,
        )
        self._X_with_cat_cols = self.X.ww[self._selected_cat_cols]

    if not self._selected_cat_cols:
        # No categorical columns survived selection: one pipeline over X.
        numeric_pipeline_parameters = {
            "Select Columns Transformer": {"columns": self._selected_cols},
        }
        return make_pipeline(
            self.X,
            self.y,
            estimator,
            self.problem_type,
            sampler_name=self.sampler_name,
            parameters=numeric_pipeline_parameters,
            extra_components_after=[SelectColumns],
            features=self.features,
            exclude_featurizers=self.exclude_featurizers,
        )

    if not self._selected_cols:
        # Only categorical columns survived selection.
        categorical_pipeline_parameters = {
            "Select Columns Transformer": {"columns": self._selected_cat_cols},
        }
        return make_pipeline(
            self._X_with_cat_cols,
            self.y,
            estimator,
            self.problem_type,
            sampler_name=self.sampler_name,
            parameters=categorical_pipeline_parameters,
            extra_components_before=[SelectColumns],
            features=self.features,
            exclude_featurizers=self.exclude_featurizers,
        )

    # Both kinds of columns were selected: preprocess them separately, then merge.
    self._split = True
    categorical_pipeline_parameters = {
        "Select Columns Transformer": {"columns": self._selected_cat_cols},
    }
    numeric_pipeline_parameters = {
        "Select Columns Transformer": {"columns": self._selected_cols},
        "Select Columns By Type Transformer": {
            "column_types": numeric_exclude_types,
            "exclude": True,
        },
    }

    categorical_pipeline = make_pipeline(
        self._X_with_cat_cols,
        self.y,
        estimator,
        self.problem_type,
        sampler_name=None,
        parameters=categorical_pipeline_parameters,
        extra_components_before=[SelectColumns],
        use_estimator=False,
        exclude_featurizers=self.exclude_featurizers,
    )

    numeric_pipeline = make_pipeline(
        self._X_without_cat_cols,
        self.y,
        estimator,
        self.problem_type,
        sampler_name=None,
        parameters=numeric_pipeline_parameters,
        extra_components_before=[SelectByType],
        extra_components_after=[SelectColumns],
        use_estimator=False,
        exclude_featurizers=self.exclude_featurizers,
    )

    # The DFS transformer, when features were supplied, runs once ahead of both sub-pipelines.
    pre_pipeline_components = (
        {"DFS Transformer": ["DFS Transformer", "X", "y"]} if self.features else {}
    )

    # The sub-pipelines are built with sampler_name=None; the sampler, if any,
    # is attached as a post-pipelines component instead.
    if self.sampler_name:
        sampler = _get_sampler(
            self.X,
            self.y,
            self.problem_type,
            estimator,
            self.sampler_name,
        )[0]
        post_pipelines_components = {sampler.name: [sampler.name, "X", "y"]}
    else:
        post_pipelines_components = None

    input_pipelines = [numeric_pipeline, categorical_pipeline]
    sub_pipeline_names = {
        numeric_pipeline.name: "Numeric",
        categorical_pipeline.name: "Categorical",
    }
    return _make_pipeline_from_multiple_graphs(
        input_pipelines,
        estimator,
        self.problem_type,
        pipeline_name=pipeline_name,
        random_seed=self.random_seed,
        sub_pipeline_names=sub_pipeline_names,
        pre_pipeline_components=pre_pipeline_components,
        post_pipelines_components=post_pipelines_components,
    )