def readable_explanation(
    pipeline,
    X=None,
    y=None,
    importance_method="permutation",
    max_features=5,
    min_importance_threshold=0.05,
    objective="auto",
):
    """Logs a plain-language summary of which features drive a fitted pipeline.

    Args:
        pipeline (PipelineBase): The fitted pipeline to explain.
        X (pd.DataFrame): Holdout features. Required when importance_method is
            "permutation"; not used otherwise.
        y (pd.Series): Holdout target. Its ``name`` is used as the target name in
            the explanation, and it is required when importance_method is
            "permutation".
        importance_method (str): How feature importance is determined. One of
            ["permutation", "feature"]. Defaults to "permutation".
        max_features (int): The maximum number of influential features to include
            in the explanation. Detrimental features are not limited by this.
            Defaults to 5.
        min_importance_threshold (float): The minimum share of total importance a
            single feature needs to be reported as important. Must lie in
            [0, 1). Defaults to 0.05.
        objective (str, ObjectiveBase): Objective used to compute permutation
            importance. "auto" selects the default primary search objective for
            the pipeline's problem type. Ignored for the "feature" method.
            Defaults to "auto".

    Raises:
        ValueError: If the pipeline is not fitted, the threshold is outside
            [0, 1), X or y is missing for permutation importance, or the
            importance method is unknown.
    """
    logger = get_logger(f"{__name__}.explain")

    if not pipeline._is_fitted:
        raise ValueError(
            "Pipelines must be fitted in order to run feature explanations.",
        )

    if min_importance_threshold < 0 or min_importance_threshold >= 1:
        raise ValueError(
            f"The minimum importance threshold must be a percentage value in the range [0, 1), not {min_importance_threshold}.",
        )

    if importance_method == "feature":
        # The objective plays no part in model-reported importances.
        objective = None
        importances = pipeline.feature_importance
    elif importance_method == "permutation":
        if objective == "auto":
            objective = evalml.automl.get_default_primary_search_objective(
                pipeline.problem_type,
            )
        if X is None or y is None:
            raise ValueError(
                "X and y are required parameters for explaining pipelines with permutation importance.",
            )
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        importances = calculate_permutation_importance(pipeline, X, y, objective)
    else:
        raise ValueError(f"Unknown importance method {importance_method}.")

    # Only model-reported importances of linear models are flagged as linear,
    # which tells the helper not to treat negative values as detrimental.
    is_linear = bool(
        pipeline.estimator.model_family == ModelFamily.LINEAR_MODEL
        and importance_method == "feature"
    )

    heavy, somewhat, detrimental = get_influential_features(
        importances,
        max_features,
        min_importance_threshold,
        is_linear,
    )

    target_name = None if y is None else y.name
    message = _fill_template(
        pipeline.estimator,
        target_name,
        objective,
        heavy,
        somewhat,
        detrimental,
    )
    logger.info(message)
def get_influential_features(
    imp_df,
    max_features=5,
    min_importance_threshold=0.05,
    linear_importance=False,
):
    """Finds the most influential features as well as any detrimental features from a dataframe of feature importances.

    The input dataframe is left untouched. Rows are examined in the order they
    are given and only the first ``max_features`` non-detrimental rows are
    considered, so ``imp_df`` should already be ordered from most to least
    important.

    Args:
        imp_df (pd.DataFrame): DataFrame with a "feature" column of names and an
            "importance" column of associated importances.
        max_features (int): The maximum number of features to include in an
            explanation. Defaults to 5.
        min_importance_threshold (float): The minimum percent of total importance
            a single feature can have to be considered important. Defaults to 0.05.
        linear_importance (bool): When True, negative feature importances are not
            considered detrimental; their absolute value is used instead.
            Defaults to False.

    Returns:
        (list, list, list): Lists of feature names corresponding to heavily
        influential, somewhat influential, and detrimental features, respectively.
    """
    heavy_importance_threshold = max(0.2, min_importance_threshold + 0.1)

    # Separate negative and positive features, if the situation calls for it.
    # `assign` / boolean filtering build new frames, so the caller's data is
    # never overwritten.
    if linear_importance:
        pos_imp_df = imp_df.assign(importance=imp_df["importance"].abs())
        detrimental = []
    else:
        detrimental = list(imp_df.loc[imp_df["importance"] < 0, "feature"])
        pos_imp_df = imp_df[imp_df["importance"] >= 0]

    # Builtin sum propagates NaN, so a NaN importance yields no influential
    # features rather than silently being skipped.
    total = sum(pos_imp_df["importance"])
    if not total > 0:
        # Nothing (or only zeros / NaN) to normalize: no feature is influential.
        return ([], [], detrimental)

    # Normalize the positive features to sum to 1
    normalized = pos_imp_df.assign(importance=pos_imp_df["importance"] / total)
    top = normalized.iloc[: min(len(normalized), max_features)]

    share = top["importance"]
    is_heavy = share >= heavy_importance_threshold
    is_somewhat = (share < heavy_importance_threshold) & (
        share >= min_importance_threshold
    )
    return (
        list(top.loc[is_heavy, "feature"]),
        list(top.loc[is_somewhat, "feature"]),
        detrimental,
    )
def_fill_template(estimator,target,objective,most_important,somewhat_important,detrimental_feats,):# Get the objective to a printable stringifobjectiveisnotNone:ifisinstance(objective,evalml.objectives.ObjectiveBase):objective=objective.nameifobjective!="R2":# Remove any title case if necessaryobjective=objective.lower()# Beginning of descriptionobjective_str=f" as measured by {objective}"ifobjectiveisnotNoneelse""beginning=(f"{estimator}: The output{objective_str}"iftargetisNoneelsef"{estimator}: The prediction of {target}{objective_str}")defenumerate_features(feature_list):text=""iflen(feature_list)==2else","foriinrange(1,len(feature_list)):ifi==len(feature_list)-1:text=text+f" and {feature_list[i]}"else:text=text+f" {feature_list[i]},"returntext# Heavily influential descriptionheavy=""iflen(most_important)>0:heavy=f" is heavily influenced by {most_important[0]}"iflen(most_important)>1:heavy=heavy+enumerate_features(most_important)iflen(somewhat_important)>0:heavy=heavy+", and"# Somewhat influential descriptionsomewhat=""iflen(somewhat_important)>0:somewhat=f" is somewhat influenced by {somewhat_important[0]}"iflen(somewhat_important)>1:somewhat=somewhat+enumerate_features(somewhat_important)# Neither!neither="."ifnot(len(heavy)orlen(somewhat)):neither=" is not strongly influenced by any single feature. Lower the `min_importance_threshold` to see more."# Detrimental Descriptiondetrimental=""iflen(detrimental_feats)>0:iflen(detrimental_feats)==1:detrimental=f"\nThe feature {detrimental_feats[0]}"tag="this feature."else:detrimental=f"\nThe features {detrimental_feats[0]}"detrimental=detrimental+enumerate_features(detrimental_feats)tag="these features."detrimental=(detrimental+" detracted from model performance. We suggest removing "+tag)returnbeginning+heavy+somewhat+neither+detrimental