Source code for evalml.utils.woodwork_utils

"""Woodwork utility methods."""
import numpy as np
import pandas as pd
import woodwork as ww
from woodwork.logical_types import Unknown

from evalml.utils.gen_utils import is_all_numeric

numeric_and_boolean_ww = [
    ww.logical_types.Integer.type_string,
    ww.logical_types.Double.type_string,
    ww.logical_types.Boolean.type_string,
]


def _numpy_to_pandas(array):
    if len(array.shape) == 1:
        data = pd.Series(array)
    else:
        data = pd.DataFrame(array)
    return data


def _list_to_pandas(list):
    return _numpy_to_pandas(np.array(list))


_nullable_types = {"Int64", "Float64", "boolean"}


def _raise_value_error_if_nullable_types_detected(data):
    types = {data.name: data.dtype} if isinstance(data, pd.Series) else data.dtypes
    cols_with_nullable_types = {
        col: str(ptype)
        for col, ptype in dict(types).items()
        if str(ptype) in _nullable_types
    }
    if cols_with_nullable_types:
        raise ValueError(
            "Evalml does not support the new pandas nullable types because "
            "our dependencies (sklearn, xgboost, lightgbm) do not support them yet."
            "If your data does not have missing values, please use the non-nullable types (bool, int64, float64). "
            "If your data does have missing values, use float64 for int and float columns and category for boolean columns. "
            f"These are the columns with nullable types: {list(cols_with_nullable_types.items())}"
        )


[docs]def infer_feature_types(data, feature_types=None): """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. If a column's type is not specified, it will be inferred by Woodwork. Args: data (pd.DataFrame, pd.Series): Input data to convert to a Woodwork data structure. feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must be a dictionary mapping column names to the type of data represented in the column. If data is a 1D structure, then feature_types must be a Woodwork logical type or a string representing a Woodwork logical type ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage") Returns: A Woodwork data structure where the data type of each column was either specified or inferred. Raises: ValueError: If there is a mismatch between the dataframe and the woodwork schema. """ if isinstance(data, list): data = _list_to_pandas(data) elif isinstance(data, np.ndarray): data = _numpy_to_pandas(data) _raise_value_error_if_nullable_types_detected(data) def convert_all_nan_unknown_to_double(data): def is_column_pd_na(data, col): return data[col].isna().all() def is_column_unknown(data, col): return isinstance(data.ww.logical_types[col], Unknown) if isinstance(data, pd.DataFrame): all_null_unk_cols = [ col for col in data.columns if (is_column_unknown(data, col) and is_column_pd_na(data, col)) ] if len(all_null_unk_cols): for col in all_null_unk_cols: data.ww.set_types({col: "Double"}) return data if data.ww.schema is not None: if isinstance(data, pd.DataFrame) and not ww.is_schema_valid( data, data.ww.schema ): ww_error = ww.get_invalid_schema_message(data, data.ww.schema) if "dtype mismatch" in ww_error: ww_error = ( "Dataframe types are not consistent with logical types. This usually happens " "when a data transformation does not go through the ww accessor. Call df.ww.init() to " f"get rid of this message. This is a more detailed message about the mismatch: {ww_error}" ) else: ww_error = f"{ww_error}. Please initialize ww with df.ww.init() to get rid of this message." raise ValueError(ww_error) data.ww.init(schema=data.ww.schema) return convert_all_nan_unknown_to_double(data) if isinstance(data, pd.Series): if all(data.isna()): data = data.replace(pd.NA, np.nan) feature_types = "Double" return ww.init_series(data, logical_type=feature_types) else: ww_data = data.copy() ww_data.ww.init(logical_types=feature_types) return convert_all_nan_unknown_to_double(ww_data)
def _convert_numeric_dataset_pandas(X, y): """Convert numeric and non-null data to pandas datatype. Raises ValueError if there is null or non-numeric data. Used with data sampler strategies. Args: X (pd.DataFrame, np.ndarray): Data to transform. y (pd.Series, np.ndarray): Target data. Returns: Tuple(pd.DataFrame, pd.Series): Transformed X and y. """ X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): raise ValueError( "Values not all numeric or there are null values provided in the dataset" ) y_ww = infer_feature_types(y) return X_ww, y_ww