"""A Future-like wrapper around jobs created by the DaskEngine."""importjoblibfromdask.distributedimportClient,LocalClusterfromevalml.automl.engine.engine_baseimport(EngineBase,EngineComputation,evaluate_pipeline,score_pipeline,train_pipeline,)
class DaskComputation(EngineComputation):
    """A Future-like wrapper around jobs created by the DaskEngine.

    Args:
        dask_future (callable): Computation to do.
    """

    def __init__(self, dask_future):
        # The underlying dask.distributed.Future for this job.
        self.work = dask_future
        # Free-form metadata attached by the engine (e.g. pipeline name).
        self.meta_data = {}

    def done(self):
        """Returns whether the computation is done."""
        return self.work.done()

    def get_result(self):
        """Gets the computation result. Will block until the computation is finished.

        Raises:
            Exception: If computation fails. Returns traceback.

        Returns:
            Computation results.
        """
        return self.work.result()

    def cancel(self):
        """Cancel the current computation."""
        return self.work.cancel()

    @property
    def is_cancelled(self):
        """Returns whether computation was cancelled.

        Returns:
            bool: True if the underlying dask future's status is "cancelled".
        """
        # BUG FIX: this previously returned the raw dask status string
        # ("pending", "finished", "cancelled", ...), which is truthy for every
        # state and therefore never actually answered whether the job was
        # cancelled. Compare against dask's "cancelled" status so the property
        # returns the boolean its name and docstring promise.
        return self.work.status == "cancelled"
class DaskEngine(EngineBase):
    """The dask engine.

    Args:
        cluster (None or dd.LocalCluster): If None, creates a local, threaded
            Dask cluster for processing. Defaults to None.
    """

    def __init__(self, cluster=None):
        # The guard checks for a LocalCluster (the value is wrapped by
        # Client(...) below), so the error message must name LocalCluster.
        # BUG FIX: the message previously claimed "Expected
        # dask.distributed.Client", contradicting the isinstance check.
        if cluster is not None and not isinstance(cluster, LocalCluster):
            raise TypeError(
                f"Expected dask.distributed.LocalCluster, received {type(cluster)}",
            )
        elif cluster is None:
            cluster = LocalCluster(processes=False)
        self.cluster = cluster
        self.client = Client(self.cluster)
        # Maps (joblib.hash(X), joblib.hash(y)) -> scattered futures so each
        # dataset is shipped to the cluster at most once.
        self._data_futures_cache = {}

    def __enter__(self):
        """Enter runtime context."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit runtime context."""
        self.close()

    def send_data_to_cluster(self, X, y):
        """Send data to the cluster.

        The implementation uses caching so the data is only sent once. This follows
        dask best practices.

        Args:
            X (pd.DataFrame): Input data for modeling.
            y (pd.Series): Target data for modeling.

        Returns:
            dask.Future: The modeling data.
        """
        data_hash = joblib.hash(X), joblib.hash(y)
        if data_hash in self._data_futures_cache:
            X_future, y_future = self._data_futures_cache[data_hash]
            # Reuse the cached futures only if neither was cancelled;
            # otherwise fall through and re-scatter the data.
            if not (X_future.cancelled() or y_future.cancelled()):
                return X_future, y_future
        # broadcast=True replicates the data to every worker up front.
        self._data_futures_cache[data_hash] = self.client.scatter(
            [X, y],
            broadcast=True,
        )
        return self._data_futures_cache[data_hash]

    def submit_evaluation_job(
        self,
        automl_config,
        pipeline,
        X,
        y,
        X_holdout=None,
        y_holdout=None,
    ):
        """Send evaluation job to cluster.

        Args:
            automl_config: Structure containing data passed from AutoMLSearch instance.
            pipeline (pipeline.PipelineBase): Pipeline to evaluate.
            X (pd.DataFrame): Input data for modeling.
            y (pd.Series): Target data for modeling.
            X_holdout (pd.Series): Holdout input data for holdout scoring.
            y_holdout (pd.Series): Holdout target data for holdout scoring.

        Returns:
            DaskComputation: An object wrapping a reference to a future-like
                computation occurring in the dask cluster.
        """
        logger = self.setup_job_log()
        # Replace X/y with cluster-resident futures (cached scatter).
        X, y = self.send_data_to_cluster(X, y)
        dask_future = self.client.submit(
            evaluate_pipeline,
            pipeline=pipeline,
            automl_config=automl_config,
            X=X,
            y=y,
            X_holdout=X_holdout,
            y_holdout=y_holdout,
            logger=logger,
        )
        return DaskComputation(dask_future)

    def submit_training_job(self, automl_config, pipeline, X, y):
        """Send training job to cluster.

        Args:
            automl_config: Structure containing data passed from AutoMLSearch instance.
            pipeline (pipeline.PipelineBase): Pipeline to train.
            X (pd.DataFrame): Input data for modeling.
            y (pd.Series): Target data for modeling.

        Returns:
            DaskComputation: An object wrapping a reference to a future-like
                computation occurring in the dask cluster.
        """
        X, y = self.send_data_to_cluster(X, y)
        dask_future = self.client.submit(
            train_pipeline,
            pipeline=pipeline,
            X=X,
            y=y,
            automl_config=automl_config,
        )
        return DaskComputation(dask_future)

    def submit_scoring_job(
        self,
        automl_config,
        pipeline,
        X,
        y,
        objectives,
        X_train=None,
        y_train=None,
    ):
        """Send scoring job to cluster.

        Args:
            automl_config: Structure containing data passed from AutoMLSearch instance.
            pipeline (pipeline.PipelineBase): Pipeline to train.
            X (pd.DataFrame): Input data for modeling.
            y (pd.Series): Target data for modeling.
            X_train (pd.DataFrame): Training features. Used for feature engineering in time series.
            y_train (pd.Series): Training target. Used for feature engineering in time series.
            objectives (list[ObjectiveBase]): List of objectives to score on.

        Returns:
            DaskComputation: An object wrapping a reference to a future-like
                computation occurring in the dask cluster.
        """
        # Get the woodwork schema before we lose it to serialization.
        X_schema = X.ww.schema
        y_schema = y.ww.schema
        X, y = self.send_data_to_cluster(X, y)
        # NOTE(review): X_train/y_train may be None here; joblib.hash(None) is
        # valid, so the (None, None) pair is scattered and cached like any
        # other dataset.
        X_train, y_train = self.send_data_to_cluster(X_train, y_train)
        dask_future = self.client.submit(
            score_pipeline,
            pipeline=pipeline,
            X=X,
            y=y,
            objectives=objectives,
            X_schema=X_schema,
            y_schema=y_schema,
            X_train=X_train,
            y_train=y_train,
        )
        computation = DaskComputation(dask_future)
        # Tag the computation so callers can associate results with a pipeline.
        computation.meta_data["pipeline_name"] = pipeline.name
        return computation

    def close(self):
        """Closes the underlying cluster."""
        # TODO: Might want to rethink this if using something other than a LocalCluster.
        self.cluster.close()
        self.client.close()

    @property
    def is_closed(self):
        """Property that determines whether the Engine's Client's resources are shutdown."""
        return self.cluster.status.value == "closed"