from evalml.data_checks import (
    DataCheck,
    DataCheckMessageCode,
    DataCheckWarning
)
from evalml.utils import infer_feature_types


class MulticollinearityDataCheck(DataCheck):
    """Check if any set of features are likely to be multicollinear."""

    def __init__(self, threshold=0.9):
        """Check if any set of features are likely to be multicollinear.

        Arguments:
            threshold (float): The threshold to be considered. Column pairs whose
                mutual information is greater than or equal to this value are
                reported. Must lie in [0, 1]. Defaults to 0.9.

        Raises:
            ValueError: If threshold is outside [0, 1] or is NaN.
        """
        # The chained comparison is False for NaN, so NaN is rejected too.
        # The previous form (`threshold < 0 or threshold > 1`) let NaN through,
        # after which `>= threshold` in validate() could never match anything.
        if not 0 <= threshold <= 1:
            raise ValueError("threshold must be a float between 0 and 1, inclusive.")
        self.threshold = threshold

    def validate(self, X, y=None):
        """Check if any set of features are likely to be multicollinear.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
            y: Ignored. Accepted only so the signature matches other data checks.

        Returns:
            dict: dict with a DataCheckWarning if there are any potentially multicollinear columns.
                Always has the keys "warnings" and "errors"; this check never
                populates "errors".
        """
        messages = {
            "warnings": [],
            "errors": []
        }

        X = infer_feature_types(X)
        # The result is used as a frame with "column_1", "column_2" and
        # "mutual_info" columns (presumably one row per column pair -- confirm
        # against the woodwork version in use).
        mutual_info_df = X.mutual_information()
        if mutual_info_df.empty:
            return messages

        above_threshold = mutual_info_df.loc[mutual_info_df['mutual_info'] >= self.threshold]
        # Pair the two name columns row by row into (column_1, column_2) tuples.
        correlated_cols = list(zip(above_threshold['column_1'], above_threshold['column_2']))
        if correlated_cols:
            warning_msg = "Columns are likely to be correlated: {}"
            messages["warnings"].append(
                DataCheckWarning(
                    message=warning_msg.format(correlated_cols),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
                    details={"columns": correlated_cols},
                ).to_dict()
            )
        return messages