Source code for evalml.utils.woodwork_utils


import numpy as np
import pandas as pd
import woodwork as ww

numeric_and_boolean_ww = [ww.logical_types.Integer, ww.logical_types.Double, ww.logical_types.Boolean]


def infer_feature_types(data, feature_types=None):
    """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns.
    If a column's type is not specified, it will be inferred by Woodwork.

    Arguments:
        data (pd.DataFrame, pd.Series, np.ndarray, list): Input data to convert to a Woodwork data structure.
        feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must
            be a dictionary mapping column names to the type of data represented in the column. If data is a 1D
            structure, then feature_types must be a Woodwork logical type or a string representing a Woodwork
            logical type ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage").

    Returns:
        A Woodwork data structure where the data type of each column was either specified or inferred.
    """
    ww_data = data
    # Data that already carries Woodwork typing information is returned unchanged.
    if isinstance(ww_data, (ww.DataTable, ww.DataColumn)):
        return ww_data
    if isinstance(data, list):
        ww_data = np.array(data)
    # Copy so that Woodwork initialization does not mutate the caller's data.
    ww_data = ww_data.copy()
    # 1D input becomes a DataColumn; 2D input becomes a DataTable.
    if len(ww_data.shape) == 1:
        name = ww_data.name if isinstance(ww_data, pd.Series) else None
        return ww.DataColumn(ww_data, name=name, logical_type=feature_types)
    return ww.DataTable(ww_data, logical_types=feature_types)
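A minimal usage sketch for infer_feature_types, assuming the legacy woodwork DataTable/DataColumn API used above; the example data and column names are illustrative:

    import pandas as pd

    from evalml.utils.woodwork_utils import infer_feature_types

    # 2D input: feature_types maps column names to logical types; unspecified
    # columns ("id" here) are inferred by Woodwork.
    df = pd.DataFrame({"id": [1, 2, 3], "category": ["a", "b", "a"]})
    dt = infer_feature_types(df, feature_types={"category": "Categorical"})
    print(dt.logical_types)

    # 1D input: feature_types is a single logical type (string or ww logical_type).
    series = pd.Series([0.5, 2.5, 3.1], name="score")
    dc = infer_feature_types(series, feature_types="Double")
    print(dc.logical_type)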
def _convert_woodwork_types_wrapper(pd_data):
    """Converts a pandas data structure that may have extension or nullable dtypes to dtypes that numpy can understand and handle.

    Arguments:
        pd_data (pd.Series, pd.DataFrame, pd.ExtensionArray): Pandas data structure.

    Returns:
        Modified pandas data structure (pd.DataFrame or pd.Series) with original data and dtypes that can be handled by numpy.
    """
    nullable_to_numpy_mapping = {pd.Int64Dtype: 'int64',
                                 pd.BooleanDtype: 'bool',
                                 pd.StringDtype: 'object'}
    # When missing values are present, widen to a dtype that can represent np.nan.
    nullable_to_numpy_mapping_nan = {pd.Int64Dtype: 'float64',
                                     pd.BooleanDtype: 'object',
                                     pd.StringDtype: 'object'}
    if isinstance(pd_data, pd.api.extensions.ExtensionArray):
        if pd.isna(pd_data).any():
            return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data.dtype)])
        return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping[type(pd_data.dtype)])
    if isinstance(pd_data, pd.Series) and type(pd_data.dtype) in nullable_to_numpy_mapping:
        if pd.isna(pd_data).any():
            return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping_nan[type(pd_data.dtype)], index=pd_data.index, name=pd_data.name)
        return pd.Series(pd_data.to_numpy(na_value=np.nan), dtype=nullable_to_numpy_mapping[type(pd_data.dtype)], index=pd_data.index, name=pd_data.name)
    if isinstance(pd_data, pd.DataFrame):
        for col_name, col in pd_data.iteritems():
            if type(col.dtype) in nullable_to_numpy_mapping:
                if pd.isna(pd_data[col_name]).any():
                    # Preserve the frame's index so the assignment below stays aligned.
                    pd_data[col_name] = pd.Series(pd_data[col_name].to_numpy(na_value=np.nan),
                                                  dtype=nullable_to_numpy_mapping_nan[type(pd_data[col_name].dtype)],
                                                  index=pd_data.index)
                else:
                    pd_data[col_name] = pd_data[col_name].astype(nullable_to_numpy_mapping[type(col.dtype)])
    return pd_data


def _retain_custom_types_and_initalize_woodwork(old_datatable, new_dataframe, ltypes_to_ignore=None):
    """Helper method which will take an old Woodwork DataTable and a new pandas DataFrame and return a new DataTable
    that tries to retain as many logical types from the old DataTable as exist in the new pandas DataFrame.

    Arguments:
        old_datatable (ww.DataTable): Woodwork DataTable to use.
        new_dataframe (pd.DataFrame): Pandas data structure.
        ltypes_to_ignore (list): List of Woodwork logical types to ignore. Columns from the old DataTable that have
            a logical type specified in this list will not have their logical types carried over to the new
            DataTable returned.

    Returns:
        A new DataTable where any of the columns that exist in both the old input DataTable and the new DataFrame
        retain the original logical type, if possible and not specified to be ignored.
    """
    retained_logical_types = {}
    if ltypes_to_ignore is None:
        ltypes_to_ignore = []
    # Only columns present in both structures are candidates for retaining a logical type.
    col_intersection = set(old_datatable.columns).intersection(set(new_dataframe.columns))
    logical_types = old_datatable.logical_types
    for col in col_intersection:
        if logical_types[col] in ltypes_to_ignore:
            continue
        if str(new_dataframe[col].dtype) != logical_types[col].pandas_dtype:
            try:
                # Attempt the cast as a compatibility check only; ww.DataTable performs the real conversion below.
                new_dataframe[col].astype(logical_types[col].pandas_dtype)
                retained_logical_types[col] = old_datatable[col].logical_type
            except (ValueError, TypeError):
                pass
    return ww.DataTable(new_dataframe, logical_types=retained_logical_types)
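A short sketch of the dtype widening performed by _convert_woodwork_types_wrapper, following the mapping tables above; the example values are illustrative:

    import pandas as pd

    from evalml.utils.woodwork_utils import _convert_woodwork_types_wrapper

    # Without missing values, nullable Int64 converts to plain numpy int64.
    s = pd.Series([1, 2, 3], dtype="Int64")
    print(_convert_woodwork_types_wrapper(s).dtype)  # int64

    # With missing values, Int64 widens to float64 so pd.NA can become np.nan.
    s_na = pd.Series([1, None, 3], dtype="Int64")
    print(_convert_woodwork_types_wrapper(s_na).dtype)  # float64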
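And a hedged sketch of _retain_custom_types_and_initalize_woodwork, assuming the legacy woodwork DataTable API accepts string logical types; the data and column names are hypothetical:

    import pandas as pd
    import woodwork as ww

    from evalml.utils.woodwork_utils import _retain_custom_types_and_initalize_woodwork

    # The original table types "group" as Categorical.
    old_dt = ww.DataTable(pd.DataFrame({"group": ["a", "b", "a"], "value": [1, 2, 3]}),
                          logical_types={"group": "Categorical"})

    # A transformation step returns a plain DataFrame; the helper re-applies the
    # old Categorical logical type wherever the cast still succeeds.
    new_df = pd.DataFrame({"group": ["a", "b", "b"], "value": [4, 5, 6]})
    new_dt = _retain_custom_types_and_initalize_woodwork(old_dt, new_df)
    print(new_dt.logical_types["group"])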