Source code for evalml.model_understanding.permutation_importance

"""Permutation importance methods."""
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from evalml.objectives.utils import get_objective
from evalml.problem_types import is_classification
from evalml.problem_types.utils import is_regression
from evalml.utils import import_or_raise, infer_feature_types, jupyter_check


def calculate_permutation_importance(
    pipeline,
    X,
    y,
    objective,
    n_repeats=5,
    n_jobs=None,
    random_seed=0,
):
    """Calculates permutation importance for features.

    Args:
        pipeline (PipelineBase or subclass): Fitted pipeline.
        X (pd.DataFrame): The input data used to score and compute permutation importance.
        y (pd.Series): The target data.
        objective (str, ObjectiveBase): Objective to score on.
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        n_jobs (int or None): Integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1,
            (n_cpus + 1 + n_jobs) are used. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        pd.DataFrame: Mean feature importance scores over a number of shuffles.

    Raises:
        ValueError: If objective cannot be used with the given pipeline.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    objective = get_objective(objective, return_instance=True)
    if not objective.is_defined_for_problem_type(pipeline.problem_type):
        raise ValueError(
            f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'",
        )

    if pipeline._supports_fast_permutation_importance:
        precomputed_features = pipeline.transform_all_but_final(X, y)
        perm_importance = _fast_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            precomputed_features,
            n_repeats=n_repeats,
            n_jobs=n_jobs,
            random_seed=random_seed,
        )
    else:
        perm_importance = _slow_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            n_repeats=n_repeats,
            n_jobs=n_jobs,
            random_seed=random_seed,
        )

    mean_perm_importance = perm_importance["importances_mean"]
    feature_names = list(X.columns)
    mean_perm_importance = list(zip(feature_names, mean_perm_importance))
    mean_perm_importance.sort(key=lambda x: x[1], reverse=True)
    return pd.DataFrame(mean_perm_importance, columns=["feature", "importance"])
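
# Illustrative usage sketch, added for this write-up and not part of the
# original module. It assumes `pipeline` is an already-fitted evalml pipeline
# and `X`, `y` are pandas objects; "Log Loss Binary" is assumed to be a valid
# objective name for the pipeline's problem type.
def _example_calculate_permutation_importance(pipeline, X, y):
    # Rank features by how much randomly shuffling each one degrades the
    # pipeline's score on the chosen objective.
    importance_df = calculate_permutation_importance(
        pipeline,
        X,
        y,
        objective="Log Loss Binary",
        n_repeats=5,
        random_seed=0,
    )
    # The result is a pd.DataFrame with "feature" and "importance" columns,
    # sorted from most to least important.
    return importance_df
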
def graph_permutation_importance(pipeline, X, y, objective, importance_threshold=0):
    """Generate a bar graph of the pipeline's permutation importance.

    Args:
        pipeline (PipelineBase or subclass): Fitted pipeline.
        X (pd.DataFrame): The input data used to score and compute permutation importance.
        y (pd.Series): The target data.
        objective (str, ObjectiveBase): Objective to score on.
        importance_threshold (float, optional): If provided, graph only the features whose permutation importance has an absolute value greater than or equal to importance_threshold. Defaults to 0.

    Returns:
        plotly.Figure, a bar graph showing features and their respective permutation importance.

    Raises:
        ValueError: If importance_threshold is not greater than or equal to 0.
    """
    go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects",
    )
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    if importance_threshold < 0:
        raise ValueError(
            f"Provided importance threshold of {importance_threshold} must be greater than or equal to 0",
        )

    perm_importance = calculate_permutation_importance(pipeline, X, y, objective)

    # Remove features with close to zero importance
    perm_importance = perm_importance[
        abs(perm_importance["importance"]) >= importance_threshold
    ]
    # List is reversed to go from ascending order to descending order
    perm_importance = perm_importance.iloc[::-1]

    title = "Permutation Importance"
    subtitle = (
        "The relative importance of each input feature's "
        "overall influence on the pipeline's predictions, computed using "
        "the permutation importance algorithm."
    )
    data = [
        go.Bar(
            x=perm_importance["importance"],
            y=perm_importance["feature"],
            orientation="h",
        ),
    ]
    layout = {
        "title": "{0}<br><sub>{1}</sub>".format(title, subtitle),
        "height": 800,
        "xaxis_title": "Permutation Importance",
        "yaxis_title": "Feature",
        "yaxis": {"type": "category"},
    }
    fig = go.Figure(data=data, layout=layout)
    return fig
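
# Illustrative usage sketch, added for this write-up and not part of the
# original module. It assumes the same fitted `pipeline`, `X`, and `y` as the
# example above; the return value is a plotly Figure, so it can be displayed
# with its standard .show() method.
def _example_graph_permutation_importance(pipeline, X, y):
    # Plot only features whose permutation importance has an absolute value of
    # at least 0.01, hiding features with negligible influence.
    fig = graph_permutation_importance(
        pipeline,
        X,
        y,
        objective="Log Loss Binary",
        importance_threshold=0.01,
    )
    fig.show()
    return fig
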
def calculate_permutation_importance_one_column(
    pipeline,
    X,
    y,
    col_name,
    objective,
    n_repeats=5,
    fast=True,
    precomputed_features=None,
    random_seed=0,
):
    """Calculates permutation importance for one column in the original dataframe.

    Args:
        pipeline (PipelineBase or subclass): Fitted pipeline.
        X (pd.DataFrame): The input data used to score and compute permutation importance.
        y (pd.Series): The target data.
        col_name (str, int): The column in X to calculate permutation importance for.
        objective (str, ObjectiveBase): Objective to score on.
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        fast (bool): Whether to use the fast method of calculating the permutation importance or not. Defaults to True.
        precomputed_features (pd.DataFrame): Precomputed features necessary to calculate permutation importance using the fast method. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        float: Mean feature importance score over a number of shuffles.

    Raises:
        ValueError: If pipeline does not support fast permutation importance calculation.
        ValueError: If precomputed_features is None.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    objective = get_objective(objective, return_instance=True)

    if fast:
        if not pipeline._supports_fast_permutation_importance:
            raise ValueError(
                "Pipeline does not support fast permutation importance calculation",
            )
        if precomputed_features is None:
            raise ValueError(
                "Fast method of calculating permutation importance requires precomputed_features",
            )
        permutation_importance = _fast_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            precomputed_features,
            col_name=col_name,
            n_repeats=n_repeats,
            random_seed=random_seed,
        )
    else:
        permutation_importance = _slow_permutation_importance(
            pipeline,
            X,
            y,
            objective,
            col_name=col_name,
            n_repeats=n_repeats,
            random_seed=random_seed,
        )
    return permutation_importance["importances_mean"]
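
# Illustrative usage sketch, added for this write-up and not part of the
# original module. It shows the fast single-column path: the transformed
# features are precomputed once with transform_all_but_final (the same call
# calculate_permutation_importance makes) and reused for the column of
# interest. It assumes the pipeline supports fast permutation importance
# (pipeline._supports_fast_permutation_importance is True) and that "age" is a
# column of X; both are assumptions for illustration only.
def _example_one_column_fast(pipeline, X, y):
    precomputed_features = pipeline.transform_all_but_final(X, y)
    mean_importance = calculate_permutation_importance_one_column(
        pipeline,
        X,
        y,
        col_name="age",
        objective="Log Loss Binary",
        n_repeats=5,
        fast=True,
        precomputed_features=precomputed_features,
        random_seed=0,
    )
    # A single float: the mean drop in the (signed) objective score across shuffles.
    return mean_importance
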
def _fast_permutation_importance(
    pipeline,
    X,
    y,
    objective,
    precomputed_features,
    col_name=None,
    n_repeats=5,
    n_jobs=None,
    random_seed=None,
):
    """Calculate permutation importance faster by only computing the estimator features once.

    Only used for pipelines that support this optimization.
    """
    if is_classification(pipeline.problem_type):
        y = pipeline._encode_targets(y)
    baseline_score = _fast_scorer(pipeline, precomputed_features, X, y, objective)
    if col_name is None:
        scores = Parallel(n_jobs=n_jobs)(
            delayed(_calculate_permutation_scores_fast)(
                pipeline,
                precomputed_features,
                y,
                objective,
                col_name,
                random_seed,
                n_repeats,
                _fast_scorer,
                baseline_score,
            )
            for col_name in X.columns
        )
        importances = baseline_score - np.array(scores)
        return {"importances_mean": np.mean(importances, axis=1)}
    else:
        scores = _calculate_permutation_scores_fast(
            pipeline,
            precomputed_features,
            y,
            objective,
            col_name,
            random_seed,
            n_repeats,
            _fast_scorer,
            baseline_score,
        )
    importances = baseline_score - np.array(scores)
    importances_mean = (
        np.mean(importances, axis=1) if col_name is None else np.mean(importances)
    )
    return {"importances_mean": importances_mean}


def _calculate_permutation_scores_fast(
    pipeline,
    precomputed_features,
    y,
    objective,
    col_name,
    random_seed,
    n_repeats,
    scorer,
    baseline_score,
):
    """Calculate the permutation score when `col_name` is permuted."""
    random_state = np.random.RandomState(random_seed)
    scores = np.zeros(n_repeats)

    # If the column is not in the features or the provenance, assume the column was dropped
    if (
        col_name not in precomputed_features.columns
        and col_name not in pipeline._get_feature_provenance()
    ):
        return scores + baseline_score

    if col_name in precomputed_features.columns:
        col_idx = precomputed_features.columns.get_loc(col_name)
    else:
        col_idx = [
            precomputed_features.columns.get_loc(col)
            for col in pipeline._get_feature_provenance()[col_name]
        ]
    return _shuffle_and_score_helper(
        pipeline,
        precomputed_features,
        y,
        objective,
        col_idx,
        n_repeats,
        scorer,
        random_state,
        is_fast=True,
    )


def _slow_permutation_importance(
    pipeline,
    X,
    y,
    objective,
    col_name=None,
    n_repeats=5,
    n_jobs=None,
    random_seed=None,
):
    """If `col_name` is not None, calculates permutation importance for only the column with that name.

    Otherwise, calculates the permutation importance for all columns in the input dataframe.
    """
    baseline_score = _slow_scorer(pipeline, X, y, objective)
    if col_name is None:
        scores = Parallel(n_jobs=n_jobs)(
            delayed(_calculate_permutation_scores_slow)(
                pipeline,
                X,
                y,
                col_idx,
                objective,
                _slow_scorer,
                n_repeats,
                random_seed,
            )
            for col_idx in range(X.shape[1])
        )
    else:
        scores = _calculate_permutation_scores_slow(
            pipeline,
            X,
            y,
            col_name,
            objective,
            _slow_scorer,
            n_repeats,
            random_seed,
        )
    importances = baseline_score - np.array(scores)
    importances_mean = (
        np.mean(importances, axis=1) if col_name is None else np.mean(importances)
    )
    return {"importances_mean": importances_mean}


def _calculate_permutation_scores_slow(
    estimator,
    X,
    y,
    col_name,
    objective,
    scorer,
    n_repeats,
    random_seed,
):
    """Calculate the permutation score when the column given by `col_name` (a column name or positional index) is permuted."""
    random_state = np.random.RandomState(random_seed)
    col_idx = col_name
    if col_name in X.columns:
        col_idx = X.columns.get_loc(col_name)
    return _shuffle_and_score_helper(
        estimator,
        X,
        y,
        objective,
        col_idx,
        n_repeats,
        scorer,
        random_state,
        is_fast=False,
    )


def _shuffle_and_score_helper(
    pipeline,
    X_features,
    y,
    objective,
    col_idx,
    n_repeats,
    scorer,
    random_state,
    is_fast=True,
):
    scores = np.zeros(n_repeats)

    # Work on a copy, as scikit-learn's permutation_importance does. Useful for thread safety.
    X_permuted = X_features.copy()

    shuffling_idx = np.arange(X_features.shape[0])
    for n_round in range(n_repeats):
        random_state.shuffle(shuffling_idx)
        col = X_permuted.iloc[shuffling_idx, col_idx]
        col.index = X_permuted.index
        X_permuted.iloc[:, col_idx] = col
        X_permuted.ww.init(schema=X_features.ww.schema)
        if is_fast:
            feature_score = scorer(pipeline, X_permuted, X_features, y, objective)
        else:
            feature_score = scorer(pipeline, X_permuted, y, objective)
        scores[n_round] = feature_score
    return scores


def _slow_scorer(pipeline, X, y, objective):
    scores = pipeline.score(X, y, objectives=[objective])
    return (
        scores[objective.name]
        if objective.greater_is_better
        else -scores[objective.name]
    )


def _fast_scorer(pipeline, features, X, y, objective):
    if objective.score_needs_proba:
        preds = pipeline.estimator.predict_proba(features)
    else:
        preds = pipeline.estimator.predict(features)
        if is_regression(pipeline.problem_type):
            preds = pipeline.inverse_transform(preds)
    score = pipeline._score(X, y, preds, objective)
    return score if objective.greater_is_better else -score