"""Utility methods for EvalML components."""importinspectfromtypingimportDict,Listimportnumpyasnpimportpandasaspdimportscipy.statsasstfromsklearn.baseimportBaseEstimator,ClassifierMixin,RegressorMixinfromsklearn.utils.multiclassimportunique_labelsfromsklearn.utils.validationimportcheck_is_fittedfromevalml.exceptionsimportMissingComponentErrorfromevalml.model_family.utilsimportModelFamily,handle_model_familyfromevalml.pipelines.components.component_baseimportComponentBasefromevalml.pipelines.components.estimators.estimatorimportEstimatorfromevalml.pipelines.components.transformers.transformerimportTransformerfromevalml.problem_typesimportProblemTypes,handle_problem_typesfromevalml.utilsimportget_importable_subclassesdef_all_estimators():returnget_importable_subclasses(Estimator,used_in_automl=False)def_all_estimators_used_in_search():returnget_importable_subclasses(Estimator,used_in_automl=True)def_all_transformers():returnget_importable_subclasses(Transformer,used_in_automl=False)
def all_components():
    """Get all available components.

    Returns:
        The result of ``_all_estimators()`` concatenated with the result of
        ``_all_transformers()``, estimators first.
    """
    estimators = _all_estimators()
    transformers = _all_transformers()
    return estimators + transformers
def allowed_model_families(problem_type):
    """List the model families allowed for a particular problem type.

    Only estimators returned by ``_all_estimators_used_in_search()`` are
    considered.

    Args:
        problem_type (ProblemTypes or str): ProblemTypes enum or string.

    Returns:
        list[ModelFamily]: The distinct model families of every estimator
            that supports ``problem_type``. The list is built from a set, so
            callers should not rely on its order.
    """
    problem_type = handle_problem_types(problem_type)
    families = {
        estimator.model_family
        for estimator in _all_estimators_used_in_search()
        if problem_type
        in {
            handle_problem_types(supported)
            for supported in estimator.supported_problem_types
        }
    }
    return list(families)
def get_estimators(problem_type, model_families=None):
    """Return the estimators allowed for a particular problem type.

    Can also optionally filter by a list of model families.

    Args:
        problem_type (ProblemTypes or str): Problem type to filter for.
        model_families (list[ModelFamily] or list[str]): Model families to
            filter for. If None, every model family allowed for the problem
            type is used.

    Returns:
        list[class]: A list of estimator subclasses.

    Raises:
        TypeError: If the model_families parameter is not a list.
        RuntimeError: If a model family is not valid for the problem type.
    """
    if model_families is not None and not isinstance(model_families, list):
        raise TypeError("model_families parameter is not a list.")
    problem_type = handle_problem_types(problem_type)

    # Computed once: it serves both as the default selection and as the
    # reference set the requested families are validated against.
    allowed_families = allowed_model_families(problem_type)
    if model_families is None:
        model_families = allowed_families
    model_families = [
        handle_model_family(model_family) for model_family in model_families
    ]

    for model_family in model_families:
        if model_family not in allowed_families:
            raise RuntimeError(
                "Unrecognized model type for problem type %s: %s"
                % (problem_type, model_family),
            )

    return [
        estimator_class
        for estimator_class in _all_estimators_used_in_search()
        if problem_type
        in [
            handle_problem_types(supported_pt)
            for supported_pt in estimator_class.supported_problem_types
        ]
        and estimator_class.model_family in model_families
    ]
def estimator_unable_to_handle_nans(estimator_class):
    """Report whether the provided estimator class cannot accept NaN inputs.

    The decision is based solely on the class's ``model_family``: extra
    trees, random forest, linear model and decision tree families are
    treated as unable to handle NaN values.

    Args:
        estimator_class (Estimator): Estimator class.

    Raises:
        ValueError: If ``estimator_class`` has no ``model_family`` attribute.

    Returns:
        bool: True if estimator class is unable to process NaN values,
            False otherwise.
    """
    try:
        family = estimator_class.model_family
    except AttributeError:
        family_missing = True
    else:
        family_missing = False
    if family_missing:
        raise ValueError("`estimator_class` must have a `model_family` attribute.")

    nan_intolerant_families = (
        ModelFamily.EXTRA_TREES,
        ModelFamily.RANDOM_FOREST,
        ModelFamily.LINEAR_MODEL,
        ModelFamily.DECISION_TREE,
    )
    return family in nan_intolerant_families
def handle_component_class(component_class):
    """Standardizes input from a string name to a ComponentBase subclass if necessary.

    A ComponentBase instance or ComponentBase subclass is returned without
    modification. A str is looked up among the names of ``all_components()``
    and the matching component class (not an instance) is returned.

    Args:
        component_class (str, ComponentBase): Input to be standardized.

    Returns:
        ComponentBase

    Raises:
        ValueError: If input is not a valid component class.
        MissingComponentError: If the component cannot be found.

    Examples:
        >>> from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor
        >>> handle_component_class(DecisionTreeRegressor)
        <class 'evalml.pipelines.components.estimators.regressors.decision_tree_regressor.DecisionTreeRegressor'>
        >>> handle_component_class("Random Forest Regressor")
        <class 'evalml.pipelines.components.estimators.regressors.rf_regressor.RandomForestRegressor'>
    """
    # Already a component (instance or class): nothing to resolve.
    if isinstance(component_class, ComponentBase):
        return component_class
    if inspect.isclass(component_class) and issubclass(
        component_class,
        ComponentBase,
    ):
        return component_class

    if not isinstance(component_class, str):
        raise ValueError(
            (
                "component_class may only contain str or ComponentBase subclasses, not '{}'"
            ).format(type(component_class)),
        )

    # Resolve the string against the `name` attribute of every known component.
    components_by_name = {
        component.name: component for component in all_components()
    }
    if component_class not in components_by_name:
        raise MissingComponentError(
            'Component "{}" was not found'.format(component_class),
        )
    return components_by_name[component_class]
def drop_natural_language_columns(X):
    """Drops natural language columns from dataframes for the imputers.

    When no natural language columns are present, ``X`` itself is returned
    (no copy is made).

    Args:
        X (pd.Dataframe): The dataframe that we want to impute on.

    Returns:
        pd.Dataframe: the dataframe with any natural language columns dropped.
        list: list of all the columns that are considered natural language.
    """
    nl_schema = X.ww.select(["NaturalLanguage"], return_schema=True)
    natural_language_columns = list(nl_schema.columns.keys())
    if not natural_language_columns:
        return X, natural_language_columns

    # Work on a Woodwork copy so the caller's dataframe is left untouched.
    X_copy = X.ww.copy()
    X_dropped = X_copy.ww.drop(columns=natural_language_columns)
    return X_dropped, natural_language_columns
def set_boolean_columns_to_integer(X):
    """Sets boolean columns to integer nullable for the imputer.

    Operates on a Woodwork copy of ``X``. The schema of the non-boolean
    columns is kept, and the Boolean / BooleanNullable columns are
    re-initialized with the IntegerNullable logical type.

    Args:
        X (pd.Dataframe): The dataframe that we want to impute on.

    Returns:
        pd.Dataframe: the dataframe with any of its ww columns that are
            boolean set to integer nullable.
    """
    X = X.ww.copy()

    non_boolean_cols = list(
        X.ww.select(
            exclude=["Boolean", "BooleanNullable"],
            return_schema=True,
        ).columns,
    )
    non_boolean_schema = X.ww.schema.get_subset_schema(subset_cols=non_boolean_cols)

    boolean_cols = list(
        X.ww.select(
            include=["Boolean", "BooleanNullable"],
            return_schema=True,
        ).columns,
    )
    integer_ltypes = dict.fromkeys(boolean_cols, "IntegerNullable")

    X.ww.init(schema=non_boolean_schema, logical_types=integer_ltypes)
    return X
def get_prediction_intevals_for_tree_regressors(
    X: pd.DataFrame,
    predictions: pd.Series,
    coverage: List[float],
    estimators: List["Estimator"],
) -> Dict[str, pd.Series]:
    """Find the prediction intervals for tree-based regressors.

    The spread of the sub-estimators' predictions (per-row standard
    deviation) is scaled by the normal quantile for each requested coverage
    and added to ``predictions``.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].
        predictions (pd.Series): Predictions from the regressor.
        coverage (list[float]): A list of floats between the values 0 and 1
            that the upper and lower bounds of the prediction interval should
            be calculated for.
        estimators (list): Collection of fitted sub-estimators.

    Returns:
        dict: Prediction intervals, keys are in the format {coverage}_lower
            or {coverage}_upper.
    """
    # The sub-estimator predictions do not depend on the coverage level, so
    # compute them (and their per-row standard deviation) once up front
    # instead of once per requested coverage.
    preds = np.zeros((len(X), len(estimators)))
    for ind, estimator_ in enumerate(estimators):
        preds[:, ind] = estimator_.predict(X)
    std_preds = np.std(preds, axis=1)

    prediction_interval_result = {}
    for conf_int in coverage:
        # Tail probabilities are rounded to 3 decimals before the quantile
        # lookup.
        lower_quantile = st.norm.ppf(round((1 - conf_int) / 2, 3))
        upper_quantile = st.norm.ppf(round((1 + conf_int) / 2, 3))
        preds_lower = predictions + lower_quantile * std_preds
        preds_upper = predictions + upper_quantile * std_preds

        prediction_interval_result[f"{conf_int}_lower"] = pd.Series(
            preds_lower,
            index=X.index,
            name=None,
        )
        prediction_interval_result[f"{conf_int}_upper"] = pd.Series(
            preds_upper,
            index=X.index,
            name=None,
        )
    return prediction_interval_result
class WrappedSKClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-learn classifier wrapper class."""

    def __init__(self, pipeline):
        """Scikit-learn classifier wrapper class.

        Takes an EvalML pipeline as input and returns a scikit-learn
        classifier class wrapping that pipeline. If the pipeline is already
        fitted, the wrapper is marked as fitted and exposes the pipeline's
        ``classes_``.

        Args:
            pipeline (PipelineBase or subclass obj): EvalML pipeline.
        """
        self.pipeline = pipeline
        self._estimator_type = "classifier"
        if pipeline._is_fitted:
            self._is_fitted = True
            # `predict` checks for `is_fitted_`, so that attribute must be set
            # here as well; otherwise a wrapper around an already-fitted
            # pipeline would raise NotFittedError on `predict`.
            self.is_fitted_ = True
            self.classes_ = pipeline.classes_

    def fit(self, X, y):
        """Fits component to data.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self
        """
        classes = unique_labels(y)
        # Fit first and only then record the fitted state, so a failing
        # pipeline fit does not leave the wrapper claiming to be fitted.
        self.pipeline.fit(X, y)
        self.classes_ = classes
        self.X_ = X
        self.y_ = y
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Make predictions using selected features.

        Args:
            X (pd.DataFrame): Features

        Returns:
            np.ndarray: Predicted values.
        """
        check_is_fitted(self, "is_fitted_")
        return self.pipeline.predict(X).to_numpy()

    def predict_proba(self, X):
        """Make probability estimates for labels.

        Args:
            X (pd.DataFrame): Features.

        Returns:
            np.ndarray: Probability estimates.
        """
        return self.pipeline.predict_proba(X).to_numpy()
class WrappedSKRegressor(BaseEstimator, RegressorMixin):
    """Scikit-learn regressor wrapper class."""

    def __init__(self, pipeline):
        """Scikit-learn regressor wrapper class.

        Takes an EvalML pipeline as input and returns a scikit-learn
        regressor class wrapping that pipeline.

        Args:
            pipeline (PipelineBase or subclass obj): EvalML pipeline.
        """
        # We need an attribute that ends in an underscore for scikit-learn
        # to treat the wrapper as fitted.
        self._is_fitted_ = True
        self._estimator_type = "regressor"
        self.pipeline = pipeline

    def fit(self, X, y):
        """Fits the wrapped pipeline to data.

        Args:
            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
            y (pd.Series, optional): the target training data of length [n_samples]

        Returns:
            self
        """
        wrapped = self.pipeline
        wrapped.fit(X, y)
        return self

    def predict(self, X):
        """Make predictions using selected features.

        Args:
            X (pd.DataFrame): Features.

        Returns:
            np.ndarray: Predicted values.
        """
        predictions = self.pipeline.predict(X)
        return predictions.to_numpy()
def scikit_learn_wrapped_estimator(evalml_obj):
    """Wrap an EvalML pipeline or estimator in a scikit-learn estimator.

    Pipelines are dispatched on their ``problem_type``. Any other object is
    treated as an EvalML estimator and dispatched on an exact match of its
    ``supported_problem_types`` list.

    Args:
        evalml_obj: EvalML pipeline or estimator to wrap.

    Returns:
        WrappedSKRegressor or WrappedSKClassifier: The wrapped object.

    Raises:
        ValueError: If the object matches neither the regression nor the
            classification cases.
    """
    # Imported locally rather than at module level.
    from evalml.pipelines.pipeline_base import PipelineBase

    regression_types = [
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
    ]

    if isinstance(evalml_obj, PipelineBase):
        pipeline_type = evalml_obj.problem_type
        if pipeline_type in regression_types:
            return WrappedSKRegressor(evalml_obj)
        if pipeline_type in (ProblemTypes.BINARY, ProblemTypes.MULTICLASS):
            return WrappedSKClassifier(evalml_obj)
    else:
        # EvalML Estimator
        supported = evalml_obj.supported_problem_types
        if supported == regression_types:
            return WrappedSKRegressor(evalml_obj)
        classification_types = [
            ProblemTypes.BINARY,
            ProblemTypes.MULTICLASS,
            ProblemTypes.TIME_SERIES_BINARY,
            ProblemTypes.TIME_SERIES_MULTICLASS,
        ]
        if supported == classification_types:
            return WrappedSKClassifier(evalml_obj)

    raise ValueError("Could not wrap EvalML object in scikit-learn wrapper.")
def generate_component_code(element):
    r"""Creates and returns a string that contains the Python imports and code required for running the EvalML component.

    An import line is emitted only when the component's class is one of
    ``all_components()``; the instantiation line is always emitted.

    Args:
        element (component instance): The instance of the component to generate string Python code for.

    Returns:
        String representation of Python code that can be run separately in order to recreate the component instance.
        Does not include code for custom component implementation.

    Raises:
        ValueError: If the input element is not a component instance.

    Examples:
        >>> from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor
        >>> assert generate_component_code(DecisionTreeRegressor()) == "from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor\n\ndecisionTreeRegressor = DecisionTreeRegressor(**{'criterion': 'squared_error', 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0})"
        ...
        >>> from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer
        >>> assert generate_component_code(SimpleImputer()) == "from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer\n\nsimpleImputer = SimpleImputer(**{'impute_strategy': 'most_frequent', 'fill_value': None})"
    """
    if not isinstance(element, ComponentBase):
        raise ValueError(
            "Element must be a component instance, received {}".format(
                type(element),
            ),
        )

    component_cls = element.__class__
    lines = []

    # Known components get an import line; its trailing newline, combined
    # with the join below, leaves a blank line before the instantiation.
    if component_cls in all_components():
        lines.append(
            "from {} import {}\n".format(
                component_cls.__module__,
                component_cls.__name__,
            ),
        )

    # Variable name: the component name with its first character lowercased
    # and spaces removed from the remainder.
    display_name = element.name
    variable_name = display_name[0].lower() + display_name[1:].replace(" ", "")
    lines.append(
        "{0} = {1}(**{2})".format(
            variable_name,
            component_cls.__name__,
            element.parameters,
        ),
    )
    return "\n".join(lines)
def make_balancing_dictionary(y, sampling_ratio):
    """Makes dictionary for oversampler components.

    Each class's count is compared to the majority class's count. A class
    whose ratio to the majority is below ``sampling_ratio`` is assigned
    ``int(majority_count * sampling_ratio)`` samples; every other class keeps
    its current count.

    Args:
        y (pd.Series): Target data.
        sampling_ratio (float): The balanced ratio we want the samples to meet.

    Returns:
        dict: Dictionary where keys are the classes, and the corresponding values are the counts of samples
            for each class that will satisfy sampling_ratio.

    Raises:
        ValueError: If sampling ratio is not in the range (0, 1] or the target is empty.

    Examples:
        >>> import pandas as pd
        >>> y = pd.Series([1] * 4 + [2] * 8 + [3])
        >>> assert make_balancing_dictionary(y, 0.5) == {2: 8, 1: 4, 3: 4}
        >>> assert make_balancing_dictionary(y, 0.9) == {2: 8, 1: 7, 3: 7}
        >>> assert make_balancing_dictionary(y, 0.1) == {2: 8, 1: 4, 3: 1}
    """
    if sampling_ratio <= 0 or sampling_ratio > 1:
        raise ValueError(
            "Sampling ratio must be in range (0, 1], received {}".format(
                sampling_ratio,
            ),
        )
    if len(y) == 0:
        raise ValueError("Target data must not be empty")

    # value_counts() is sorted by descending frequency, so the first entry
    # is the majority class.
    counts = y.value_counts()
    majority_count = counts.values[0]
    oversampled_count = int(majority_count * sampling_ratio)
    ratios_to_majority = counts / majority_count

    return {
        label: oversampled_count if ratio < sampling_ratio else counts[label]
        for label, ratio in ratios_to_majority.items()
    }
def handle_float_categories_for_catboost(X):
    """Updates input data to be compatible with CatBoost estimators.

    CatBoost cannot handle data in X that is the Categorical Woodwork logical type with floating point categories.
    This utility determines if the floating point categories can be converted to integers without truncating
    any data, and if they can be, converts them to int64 categories. Will not attempt to use values that are
    truly floating points. If no categorical column has float64 categories, X is returned unchanged.

    Args:
        X (pd.DataFrame): Input data to CatBoost that has Woodwork initialized

    Returns:
        DataFrame: Input data with exact same Woodwork typing info as the original but with any float categories
            converted to be int64 when possible.

    Raises:
        ValueError: if the numeric categories are actual floats that cannot be converted to integers
            without truncating data
    """
    schema = X.ww.schema
    dtypes = X.dtypes

    # Find the categorical columns whose categories are float64, which
    # CatBoost would error on.
    categorical_cols = X.ww.select("category", return_schema=True).columns.keys()
    float_category_cols = [
        name
        for name in categorical_cols
        if dtypes[name].categories.dtype == "float64"
    ]
    if not float_category_cols:
        return X

    int_category_dtypes = {}
    for col in float_category_cols:
        categories = dtypes[col].categories
        if not (categories % 1 == 0).all():
            # CatBoost explanation as to why they don't support float categories: https://catboost.ai/en/docs/concepts/faq#floating-point-values
            # CatBoost bug keeping us from converting to string: https://github.com/catboost/catboost/issues/1965
            # Pandas bug keeping us from converting `.astype("string").astype("object")`: https://github.com/pandas-dev/pandas/issues/51074
            raise ValueError(
                f"Invalid category found in {col}. CatBoost does not support floats as categories.",
            )
        # Every category is a whole number. Non-nullable int64 is safe here
        # because there will not be any nans at this point.
        int_category_dtypes[col] = pd.CategoricalDtype(
            categories=categories.astype("int64"),
            ordered=dtypes[col].ordered,
        )

    X_t = X.astype(int_category_dtypes)
    X_t.ww.init(schema=schema)
    return X_t