def calculate_permutation_importance(
    pipeline,
    X,
    y,
    objective,
    n_repeats=5,
    n_jobs=None,
    random_seed=0,
):
    """Compute the permutation importance of every input feature.

    Args:
        pipeline (PipelineBase or subclass): Fitted pipeline.
        X (pd.DataFrame): The input data used to score and compute permutation importance.
        y (pd.Series): The target data.
        objective (str, ObjectiveBase): Objective to score on.
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used.
            For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        pd.DataFrame: One row per column of X with columns "feature" and
        "importance" (mean importance over the shuffles), ordered from the
        highest importance to the lowest.

    Raises:
        ValueError: If objective cannot be used with the given pipeline.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    objective = get_objective(objective, return_instance=True)

    objective_is_valid = objective.is_defined_for_problem_type(pipeline.problem_type)
    if not objective_is_valid:
        raise ValueError(
            f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'",
        )

    # Options that both the fast and the slow implementation understand.
    shared_options = {
        "n_repeats": n_repeats,
        "n_jobs": n_jobs,
        "random_seed": random_seed,
    }
    if pipeline._supports_fast_permutation_importance:
        # The fast path works on the features fed to the final estimator,
        # so they are computed once up front.
        estimator_features = pipeline.transform_all_but_final(X, y)
        result = _fast_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            estimator_features,
            **shared_options,
        )
    else:
        result = _slow_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            **shared_options,
        )

    # Pair every input column with its mean importance, most important first.
    ranked = sorted(
        zip(list(X.columns), result["importances_mean"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked, columns=["feature", "importance"])
def graph_permutation_importance(pipeline, X, y, objective, importance_threshold=0):
    """Generate a bar graph of the pipeline's permutation importance.

    Args:
        pipeline (PipelineBase or subclass): Fitted pipeline.
        X (pd.DataFrame): The input data used to score and compute permutation importance.
        y (pd.Series): The target data.
        objective (str, ObjectiveBase): Objective to score on.
        importance_threshold (float, optional): If provided, graph features with a permutation
            importance whose absolute value is greater than or equal to importance_threshold.
            Defaults to 0.

    Returns:
        plotly.Figure, a horizontal bar graph showing features and their respective
        permutation importance.

    Raises:
        ValueError: If importance_threshold is not greater than or equal to 0.
    """
    # Validate the cheap argument first so an invalid threshold fails before
    # any optional dependency is loaded or any permutation scoring is run.
    if importance_threshold < 0:
        raise ValueError(
            f"Provided importance threshold of {importance_threshold} must be greater than or equal to 0",
        )

    go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects",
    )
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    perm_importance = calculate_permutation_importance(pipeline, X, y, objective)

    # Keep only the features whose absolute importance reaches the threshold
    perm_importance = perm_importance[
        abs(perm_importance["importance"]) >= importance_threshold
    ]
    # calculate_permutation_importance returns rows sorted by descending
    # importance; reverse them before plotting the horizontal bars.
    perm_importance = perm_importance.iloc[::-1]

    title = "Permutation Importance"
    subtitle = (
        "The relative importance of each input feature's "
        "overall influence on the pipelines' predictions, computed using "
        "the permutation importance algorithm."
    )
    data = [
        go.Bar(
            x=perm_importance["importance"],
            y=perm_importance["feature"],
            orientation="h",
        ),
    ]
    layout = {
        "title": f"{title}<br><sub>{subtitle}</sub>",
        "height": 800,
        "xaxis_title": "Permutation Importance",
        "yaxis_title": "Feature",
        "yaxis": {"type": "category"},
    }

    fig = go.Figure(data=data, layout=layout)
    return fig
def calculate_permutation_importance_one_column(
    pipeline,
    X,
    y,
    col_name,
    objective,
    n_repeats=5,
    fast=True,
    precomputed_features=None,
    random_seed=0,
):
    """Compute the permutation importance of a single column of the original dataframe.

    Args:
        pipeline (PipelineBase or subclass): Fitted pipeline.
        X (pd.DataFrame): The input data used to score and compute permutation importance.
        y (pd.Series): The target data.
        col_name (str, int): The column in X to calculate permutation importance for.
        objective (str, ObjectiveBase): Objective to score on.
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        fast (bool): Whether to use the fast method of calculating the permutation importance or not.
            Defaults to True.
        precomputed_features (pd.DataFrame): Precomputed features necessary to calculate permutation
            importance using the fast method. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        float: Mean feature importance scores over a number of shuffles.

    Raises:
        ValueError: If fast is True and the pipeline does not support fast permutation
            importance calculation.
        ValueError: If fast is True and precomputed_features is None.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    objective = get_objective(objective, return_instance=True)

    if not fast:
        slow_result = _slow_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            col_name=col_name,
            n_repeats=n_repeats,
            random_seed=random_seed,
        )
        return slow_result["importances_mean"]

    # The fast method has two preconditions; check them in this order.
    if not pipeline._supports_fast_permutation_importance:
        raise ValueError(
            "Pipeline does not support fast permutation importance calculation",
        )
    if precomputed_features is None:
        raise ValueError(
            "Fast method of calculating permutation importance requires precomputed_features",
        )

    fast_result = _fast_permutation_importance(
        pipeline,
        X,
        y,
        objective,
        precomputed_features,
        col_name=col_name,
        n_repeats=n_repeats,
        random_seed=random_seed,
    )
    return fast_result["importances_mean"]
def _fast_permutation_importance(
    pipeline,
    X,
    y,
    objective,
    precomputed_features,
    col_name=None,
    n_repeats=5,
    n_jobs=None,
    random_seed=None,
):
    """Calculate permutation importance faster by only computing the estimator features once.

    Only used for pipelines that support this optimization.

    Args:
        pipeline: Fitted pipeline.
        X: The original input data; its columns are the features that get an importance.
        y: The target data. Passed through ``pipeline._encode_targets`` for
            classification problems.
        objective: Objective instance to score on.
        precomputed_features: The features fed to the pipeline's final estimator.
        col_name: If not None, only this column of X is evaluated. Defaults to None.
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        n_jobs (int or None): Forwarded to ``Parallel`` when every column is evaluated.
        random_seed (int or None): Seed for the random number generator.

    Returns:
        dict: ``{"importances_mean": value}`` where value holds one mean per column of X
        when col_name is None, and a single mean otherwise.
    """
    if is_classification(pipeline.problem_type):
        y = pipeline._encode_targets(y)

    baseline_score = _fast_scorer(pipeline, precomputed_features, X, y, objective)

    if col_name is None:
        scores = Parallel(n_jobs=n_jobs)(
            delayed(_calculate_permutation_scores_fast)(
                pipeline,
                precomputed_features,
                y,
                objective,
                column,
                random_seed,
                n_repeats,
                _fast_scorer,
                baseline_score,
            )
            for column in X.columns
        )
        # One row of repeat scores per column: average over the repeats.
        importances = baseline_score - np.array(scores)
        return {"importances_mean": np.mean(importances, axis=1)}

    scores = _calculate_permutation_scores_fast(
        pipeline,
        precomputed_features,
        y,
        objective,
        col_name,
        random_seed,
        n_repeats,
        _fast_scorer,
        baseline_score,
    )
    importances = baseline_score - np.array(scores)
    return {"importances_mean": np.mean(importances)}


def _calculate_permutation_scores_fast(
    pipeline,
    precomputed_features,
    y,
    objective,
    col_name,
    random_seed,
    n_repeats,
    scorer,
    baseline_score,
):
    """Calculate the permutation score when `col_name` is permuted.

    Returns:
        np.ndarray: ``n_repeats`` scores. When `col_name` is neither a precomputed
        feature nor a key of the pipeline's feature provenance, every entry is
        ``baseline_score``.
    """
    random_state = np.random.RandomState(random_seed)

    if col_name in precomputed_features.columns:
        col_idx = precomputed_features.columns.get_loc(col_name)
    else:
        # Look the provenance up once; it is only needed for columns that do
        # not appear among the precomputed features under their own name.
        provenance = pipeline._get_feature_provenance()
        # If column is not in the features or provenance, assume the column was dropped
        if col_name not in provenance:
            return np.zeros(n_repeats) + baseline_score
        # Permute all features derived from this column together.
        col_idx = [
            precomputed_features.columns.get_loc(col) for col in provenance[col_name]
        ]

    return _shuffle_and_score_helper(
        pipeline,
        precomputed_features,
        y,
        objective,
        col_idx,
        n_repeats,
        scorer,
        random_state,
        is_fast=True,
    )


def _slow_permutation_importance(
    pipeline,
    X,
    y,
    objective,
    col_name=None,
    n_repeats=5,
    n_jobs=None,
    random_seed=None,
):
    """Calculate permutation importance by re-scoring the whole pipeline on permuted input.

    If `col_name` is not None, calculates permutation importance for only the column
    with that name. Otherwise, calculates the permutation importance for all columns
    in the input dataframe.

    Returns:
        dict: ``{"importances_mean": value}`` where value holds one mean per column of X
        when col_name is None, and a single mean otherwise.
    """
    # The baseline does not depend on which column is permuted: compute it once.
    baseline_score = _slow_scorer(pipeline, X, y, objective)

    if col_name is None:
        # Pass column labels rather than positions: the per-column helper
        # resolves any value found in X.columns as a label, so a bare position
        # could be mistaken for an integer label of a different column.
        scores = Parallel(n_jobs=n_jobs)(
            delayed(_calculate_permutation_scores_slow)(
                pipeline,
                X,
                y,
                column,
                objective,
                _slow_scorer,
                n_repeats,
                random_seed,
            )
            for column in X.columns
        )
    else:
        scores = _calculate_permutation_scores_slow(
            pipeline,
            X,
            y,
            col_name,
            objective,
            _slow_scorer,
            n_repeats,
            random_seed,
        )

    importances = baseline_score - np.array(scores)
    importances_mean = (
        np.mean(importances, axis=1) if col_name is None else np.mean(importances)
    )
    return {"importances_mean": importances_mean}


def _calculate_permutation_scores_slow(
    estimator,
    X,
    y,
    col_name,
    objective,
    scorer,
    n_repeats,
    random_seed,
):
    """Calculate score when `col_name` is permuted.

    `col_name` is resolved to a position with ``X.columns.get_loc`` when it is
    one of X's column labels; otherwise it is used as given as the position.
    """
    random_state = np.random.RandomState(random_seed)
    col_idx = col_name
    if col_name in X.columns:
        col_idx = X.columns.get_loc(col_name)
    return _shuffle_and_score_helper(
        estimator,
        X,
        y,
        objective,
        col_idx,
        n_repeats,
        scorer,
        random_state,
        is_fast=False,
    )


def _shuffle_and_score_helper(
    pipeline,
    X_features,
    y,
    objective,
    col_idx,
    n_repeats,
    scorer,
    random_state,
    is_fast=True,
):
    """Permute the column(s) at `col_idx` `n_repeats` times and score after each permutation.

    Args:
        pipeline: Fitted pipeline handed to `scorer`.
        X_features: Data whose column(s) are permuted; it is copied, not modified.
        y: The target data.
        objective: Objective instance to score on.
        col_idx: Positional index (or list of positional indices) of the column(s) to permute.
        n_repeats (int): Number of permutations to score.
        scorer: Callable invoked as ``scorer(pipeline, X_permuted, X_features, y, objective)``
            when `is_fast` is True and ``scorer(pipeline, X_permuted, y, objective)`` otherwise.
        random_state (np.random.RandomState): Source of the shuffles.
        is_fast (bool): Selects the scorer call signature. Defaults to True.

    Returns:
        np.ndarray: The `n_repeats` scores, in the order they were computed.
    """
    scores = np.zeros(n_repeats)

    # This is what sk_permutation_importance does. Useful for thread safety
    X_permuted = X_features.copy()
    shuffling_idx = np.arange(X_features.shape[0])
    for n_round in range(n_repeats):
        random_state.shuffle(shuffling_idx)
        col = X_permuted.iloc[shuffling_idx, col_idx]
        # Reset the index so the assignment below is positional instead of
        # being realigned on the original row labels.
        col.index = X_permuted.index
        X_permuted.iloc[:, col_idx] = col
        # Re-initialise the woodwork accessor with the schema of the source data.
        X_permuted.ww.init(schema=X_features.ww.schema)
        if is_fast:
            feature_score = scorer(pipeline, X_permuted, X_features, y, objective)
        else:
            feature_score = scorer(pipeline, X_permuted, y, objective)
        scores[n_round] = feature_score
    return scores


def _slow_scorer(pipeline, X, y, objective):
    """Score the full pipeline on X, negated when a lower objective value is better."""
    scores = pipeline.score(X, y, objectives=[objective])
    return (
        scores[objective.name]
        if objective.greater_is_better
        else -scores[objective.name]
    )


def _fast_scorer(pipeline, features, X, y, objective):
    """Score the pipeline's final estimator on already-transformed features.

    The result is negated when a lower objective value is better, so a larger
    return value always means a better score.
    """
    if objective.score_needs_proba:
        preds = pipeline.estimator.predict_proba(features)
    else:
        preds = pipeline.estimator.predict(features)
        if is_regression(pipeline.problem_type):
            preds = pipeline.inverse_transform(preds)
    score = pipeline._score(X, y, preds, objective)
    return score if objective.greater_is_better else -score