# Source code for evalml.problem_types.utils

"""Utility methods for the ProblemTypes enum in EvalML."""
import pandas as pd
from pandas.api.types import is_numeric_dtype

from evalml.problem_types.problem_types import ProblemTypes


def handle_problem_types(problem_type):
    """Normalize a problem type given as a string or enum member into a ProblemTypes member.

    Strings are upper-cased before the lookup, so matching is case-insensitive
    ("regression", "Regression" and "REGRESSION" are treated alike).

    Args:
        problem_type (str or ProblemTypes): Problem type that needs to be handled.

    Returns:
        ProblemTypes enum

    Raises:
        KeyError: If input is a string that does not name a valid ProblemTypes enum value.
        ValueError: If input is not a string or ProblemTypes object.

    Examples:
        >>> assert handle_problem_types("regression") == ProblemTypes.REGRESSION
        >>> assert handle_problem_types("TIME SERIES BINARY") == ProblemTypes.TIME_SERIES_BINARY
        >>> assert handle_problem_types("Multiclass") == ProblemTypes.MULTICLASS
    """
    if isinstance(problem_type, str):
        # NOTE(review): `_all_values` is presumably a mapping keyed by the
        # upper-cased problem type names -- confirm against ProblemTypes.
        try:
            resolved = ProblemTypes._all_values[problem_type.upper()]
        except KeyError as err:
            # Re-raise with a friendlier message that echoes the caller's
            # original spelling, explicitly chained to the failed lookup.
            raise KeyError(
                "Problem type '{}' does not exist".format(problem_type),
            ) from err
        return resolved
    if isinstance(problem_type, ProblemTypes):
        # Already an enum member; nothing to convert.
        return problem_type
    raise ValueError(
        "`handle_problem_types` was not passed a str or ProblemTypes object",
    )


def detect_problem_type(y):
    """Infer the problem type (binary, multiclass or regression) from the target values.

    Missing/null entries are dropped before the distinct values are counted.
    Exactly two distinct values means binary classification. A numeric target
    with more than 10 distinct values is treated as regression. Everything else
    is treated as multiclass classification.

    Args:
        y (pd.Series): The target labels to predict.

    Returns:
        ProblemType: ProblemType Enum

    Examples:
        >>> y = pd.Series([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1])
        >>> assert detect_problem_type(y) == ProblemTypes.BINARY

        >>> y = pd.Series([1, 2, 3, 2, 1, 1, 1, 2, 2, 3, 3])
        >>> assert detect_problem_type(y) == ProblemTypes.MULTICLASS

        >>> y = pd.Series([1.6, 4.2, 3.3, 2.9, 4, 1, 5.5, 2, -2, -3.2, 3])
        >>> assert detect_problem_type(y) == ProblemTypes.REGRESSION

    Raises:
        ValueError: If the input has less than two classes.
    """
    target = pd.Series(y).dropna()
    distinct_count = target.nunique()

    if distinct_count < 2:
        raise ValueError("Less than 2 classes detected! Target unusable for modeling")
    if distinct_count == 2:
        return ProblemTypes.BINARY

    # Only numeric targets with many distinct values are considered continuous.
    looks_continuous = is_numeric_dtype(target.dtype) and distinct_count > 10
    return ProblemTypes.REGRESSION if looks_continuous else ProblemTypes.MULTICLASS


def is_regression(problem_type):
    """Check whether the given problem type is one of the regression problem types.

    Args:
        problem_type (str or ProblemTypes): Type of supervised learning problem, in any form accepted by handle_problem_types.

    Returns:
        bool: True for REGRESSION and TIME_SERIES_REGRESSION, False otherwise.

    Examples:
        >>> assert is_regression("Regression")
        >>> assert is_regression(ProblemTypes.REGRESSION)
        >>> assert is_regression(ProblemTypes.TIME_SERIES_REGRESSION)
    """
    resolved = handle_problem_types(problem_type)
    regression_types = (
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
    )
    return resolved in regression_types


def is_binary(problem_type):
    """Check whether the given problem type is one of the binary classification problem types.

    Args:
        problem_type (str or ProblemTypes): Type of supervised learning problem, in any form accepted by handle_problem_types.

    Returns:
        bool: True for BINARY and TIME_SERIES_BINARY, False otherwise.

    Examples:
        >>> assert is_binary("Binary")
        >>> assert is_binary(ProblemTypes.BINARY)
        >>> assert is_binary(ProblemTypes.TIME_SERIES_BINARY)
    """
    resolved = handle_problem_types(problem_type)
    binary_types = (
        ProblemTypes.BINARY,
        ProblemTypes.TIME_SERIES_BINARY,
    )
    return resolved in binary_types


def is_multiclass(problem_type):
    """Check whether the given problem type is one of the multiclass classification problem types.

    Args:
        problem_type (str or ProblemTypes): Type of supervised learning problem, in any form accepted by handle_problem_types.

    Returns:
        bool: True for MULTICLASS and TIME_SERIES_MULTICLASS, False otherwise.

    Examples:
        >>> assert is_multiclass("Multiclass")
        >>> assert is_multiclass(ProblemTypes.MULTICLASS)
        >>> assert is_multiclass(ProblemTypes.TIME_SERIES_MULTICLASS)
    """
    resolved = handle_problem_types(problem_type)
    multiclass_types = (
        ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_MULTICLASS,
    )
    return resolved in multiclass_types


def is_classification(problem_type):
    """Check whether the given problem type is a classification problem type (binary or multiclass).

    Args:
        problem_type (str or ProblemTypes): Type of supervised learning problem, in any form accepted by handle_problem_types.

    Returns:
        bool: True when either is_binary or is_multiclass accepts the problem type, False otherwise.

    Examples:
        >>> assert is_classification("Multiclass")
        >>> assert is_classification(ProblemTypes.TIME_SERIES_BINARY)
        >>> assert not is_classification(ProblemTypes.REGRESSION)
    """
    # Binary is checked first; multiclass is only consulted when that fails.
    if is_binary(problem_type):
        return True
    return is_multiclass(problem_type)


def is_time_series(problem_type):
    """Check whether the given problem type is one of the time series problem types.

    Args:
        problem_type (str or ProblemTypes): Type of supervised learning problem, in any form accepted by handle_problem_types.

    Returns:
        bool: True for TIME_SERIES_BINARY, TIME_SERIES_MULTICLASS and TIME_SERIES_REGRESSION, False otherwise.

    Examples:
        >>> assert is_time_series("time series regression")
        >>> assert is_time_series(ProblemTypes.TIME_SERIES_BINARY)
        >>> assert not is_time_series(ProblemTypes.REGRESSION)
    """
    resolved = handle_problem_types(problem_type)
    time_series_types = (
        ProblemTypes.TIME_SERIES_BINARY,
        ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_REGRESSION,
    )
    return resolved in time_series_types