Source code for dasf.transforms.base

#!/usr/bin/python3

""" Definition of the generic operators of the pipeline. """

import inspect
from uuid import uuid4

import dask.array as da
import numpy as np

try:
    import cupy as cp
    import dask_cudf as dcudf
    from rmm._cuda.gpu import CUDARuntimeError
except ImportError: # pragma: no cover
    pass
except CUDARuntimeError:
    pass

from dasf.utils.decorators import task_handler
from dasf.utils.funcs import block_chunk_reduce
from dasf.utils.types import (
    is_dask_array,
    is_dask_cpu_dataframe,
    is_dask_dataframe,
    is_dask_gpu_dataframe,
)



[docs]
class Operator:
    """
    Class representing a generic Operator of the pipeline.
    """


[docs]
    def get_uuid(self):
        """
        Return the UUID representation of the Operator.
        """
        if not hasattr(self, "_uuid"):
            self._uuid = uuid4()
        return self._uuid




[docs]
class Fit(Operator):
    """
    Class representing a Fit operation of the pipeline.
    """


[docs]
    def _lazy_fit_cpu(self, X, y=None, **kwargs):
        """
        Respective lazy fit mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_fit_gpu(self, X, y=None, **kwargs):
        """
        Respective lazy fit mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _fit_cpu(self, X, y=None, **kwargs):
        """
        Respective immediate fit mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _fit_gpu(self, X, y=None, **kwargs):
        """
        Respective immediate fit mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def fit(self, X, y, sample_weight=None, **kwargs):
        """
        Generic fit funtion according executor.
        """
        ...



[docs]
    @staticmethod
    def fit_from_model(model, X, y, sample_weight=None, **kwargs):
        """
        Return the model of a previous created object.
        """
        return model.fit(X=X, y=y, sample_weight=sample_weight, **kwargs)





[docs]
class FitPredict(Operator):
    """
    Class representing a Fit with Predict operation of the pipeline.
    """


[docs]
    def _lazy_fit_predict_cpu(self, X, y=None, **kwargs):
        """
        Respective lazy fit with predict mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_fit_predict_gpu(self, X, y=None, **kwargs):
        """
        Respective lazy fit with predict mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _fit_predict_cpu(self, X, y=None, **kwargs):
        """
        Respective immediate fit with predict mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _fit_predict_gpu(self, X, y=None, **kwargs):
        """
        Respective immediate fit with predict mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def fit_predict(self, X, y=None, **kwargs):
        """
        Generic fit with predict funtion according executor.
        """
        ...



[docs]
    @staticmethod
    def fit_predict_from_model(model, X, y, sample_weight=None, **kwargs):
        """
        Return the model of a previous created object.
        """
        return model.fit_predict(X=X, y=y, sample_weight=sample_weight,
                                 **kwargs)





[docs]
class FitTransform(Operator):
    """
    Class representing a Fit with Transform operation of the pipeline.
    """


[docs]
    def _lazy_fit_transform_cpu(self, X, y=None, **kwargs):
        """
        Respective lazy fit with transform mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_fit_transform_gpu(self, X, y=None, **kwargs):
        """
        Respective lazy fit with transform mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _fit_transform_cpu(self, X, y=None, **kwargs):
        """
        Respective immediate fit with transform mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _fit_transform_gpu(self, X, y=None, **kwargs):
        """
        Respective immediate fit with transform mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def fit_transform(self, X, y=None, **kwargs):
        """
        Generic fit with transform funtion according executor.
        """
        ...



[docs]
    @staticmethod
    def fit_transform_from_model(model, X, y, sample_weight=None, **kwargs):
        """
        Return the model of a previous created object.
        """
        return model.fit_transform(X=X, y=y, sample_weight=sample_weight,
                                   **kwargs)





[docs]
class Predict(Operator):
    """
    Class representing a Predict operation of the pipeline.
    """


[docs]
    def _lazy_predict_cpu(self, X, sample_weight=None, **kwargs):
        """
        Respective lazy predict mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_predict_gpu(self, X, sample_weight=None, **kwargs):
        """
        Respective lazy predict mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _predict_cpu(self, X, sample_weight=None, **kwargs):
        """
        Respective immediate predict mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _predict_gpu(self, X, sample_weight=None, **kwargs):
        """
        Respective immediate predict mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def predict(self, X, sample_weight=None, **kwargs):
        """
        Generic predict funtion according executor.
        """
        ...



[docs]
    @staticmethod
    def predict_from_model(model, X, sample_weight=None, **kwargs):
        """
        Return the model of a previous created object.
        """
        if 'sample_weight' not in inspect.signature(model.predict).parameters:
            return model.predict(X=X, **kwargs)
        return model.predict(X=X, sample_weight=sample_weight, **kwargs)





[docs]
class GetParams(Operator):
    """
    Class representing a Get Parameters operation of the pipeline.
    """


[docs]
    def _lazy_get_params_cpu(self, deep=True, **kwargs):
        """
        Respective lazy get_params mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_get_params_gpu(self, deep=True, **kwargs):
        """
        Respective lazy get_params mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _get_params_cpu(self, deep=True, **kwargs):
        """
        Respective immediate get_params mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _get_params_gpu(self, deep=True, **kwargs):
        """
        Respective immediate get_params mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def get_params(self, deep=True, **kwargs):
        """
        Generic get_params funtion according executor.
        """
        ...





[docs]
class SetParams(Operator):
    """
    Class representing a Set Parameters operation of the pipeline.
    """


[docs]
    def _lazy_set_params_cpu(self, **params):
        """
        Respective lazy set_params mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_set_params_gpu(self, **params):
        """
        Respective lazy set_params mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _set_params_cpu(self, **params):
        """
        Respective immediate set_params mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _set_params_gpu(self, **params):
        """
        Respective immediate set_params mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def set_params(self, **params):
        """
        Generic set_params funtion according executor.
        """
        ...





[docs]
class Transform(Operator):
    """
    Class representing a Transform operation of the pipeline.
    """


[docs]
    def _lazy_transform_cpu(self, X, **kwargs):
        """
        Respective lazy transform mocked function for CPUs.
        """
        raise NotImplementedError



[docs]
    def _lazy_transform_gpu(self, X, **kwargs):
        """
        Respective lazy transform mocked function for GPUs.
        """
        raise NotImplementedError



[docs]
    def _transform_cpu(self, X, **kwargs):
        """
        Respective immediate transform mocked function for local CPU(s).
        """
        raise NotImplementedError



[docs]
    def _transform_gpu(self, X, **kwargs):
        """
        Respective immediate transform mocked function for local GPU(s).
        """
        raise NotImplementedError



[docs]
    @task_handler
    def transform(self, X, **kwargs):
        """
        Generic transform funtion according executor.
        """
        ...



[docs]
    @staticmethod
    def transform_from_model(model, X, **kwargs):
        """
        Return the model of a previous created object.
        """
        return model.transform(X=X, **kwargs)





[docs]
class TargeteredTransform(Transform):
    """
    Class representing a Targetered Transform operation of the pipeline.
    
    This specific transform operates according the parameters of the
    constructor.
    
    Parameters
    ----------
    run_local : bool
        Define that the operator will run locally and not distributed.
    run_gpu : bool
        Define if the operator will use GPU(s) or not.

    """

    def __init__(self, run_local=None, run_gpu=None):
        """ Constructor of the class TargeteredTransform. """
        super().__init__()

        self._run_local = run_local
        self._run_gpu = run_gpu




[docs]
class MappedTransform(Transform):
    """Class representing a MappedTransform based on Transform
    object.

    This object refers to any operation that can be done in blocks.
    In special, for Dask chunks. There are several ways of doing
    that. This class tries to simplify how the functions are applied
    into a block.

    Parameters
    ----------
    function : Callable
        A function that will be applied in a block.
    depth : tuple
        The value of the boundary elements per axis (the default is
        None).
    boundary : str
        The type of the boundary. See Dask boundaries for more
        examples (the default is None).
    trim : bool
        Option to trim the data after an overlap (the default is
        True).
    output_chunk : tuple
        New shape of the output after computing the function (the
        default is None).
    drop_axis : tuple
        Which axis should be deleted after computing the function
        (the default is None).
    new_axis : tuple
        Which axis represent a new axis after computing the function
        (the default is None).

    """

    def __init__(
        self,
        function,
        depth=None,
        boundary=None,
        trim=True,
        output_chunk=None,
        drop_axis=None,
        new_axis=None,
    ):

        self.function = function
        self.depth = depth
        self.boundary = boundary
        self.trim = trim
        self.output_chunk = output_chunk
        self.drop_axis = drop_axis
        self.new_axis = new_axis

        if (
            self.boundary is None
            and self.depth is not None
            or self.boundary is not None
            and self.depth is None
        ):
            raise Exception("Both boundary and depth should be passed "
                            "together")

    def __lazy_transform_generic(self, X, xp, **kwargs):
        if self.drop_axis is not None or self.new_axis is not None:
            drop_axis, new_axis = self.drop_axis, self.new_axis
        else:
            drop_axis, new_axis = block_chunk_reduce(X, self.output_chunk)

        if self.output_chunk is None:
            __output_chunk = X.chunks
        else:
            __output_chunk = self.output_chunk

        if self.depth and self.boundary:
            if self.trim:
                new_data = X.map_overlap(
                    self.function,
                    **kwargs,
                    dtype=X.dtype,
                    depth=self.depth,
                    boundary=self.boundary,
                    meta=xp.array(()),
                )
            else:
                data_blocks = da.overlap.overlap(
                    X, depth=self.depth, boundary=self.boundary
                )

                new_data = data_blocks.map_blocks(
                    self.function,
                    dtype=X.dtype,
                    drop_axis=drop_axis,
                    new_axis=new_axis,
                    chunks=__output_chunk,
                    meta=xp.array(()),
                    **kwargs,
                )
        else:
            if is_dask_array(X):
                new_data = X.map_blocks(
                    self.function,
                    dtype=X.dtype,
                    drop_axis=drop_axis,
                    new_axis=new_axis,
                    chunks=__output_chunk,
                    meta=xp.array(()),
                    **kwargs,
                )
            elif is_dask_dataframe(X):
                new_data = X.map_partitions(self.function, **kwargs)

        return new_data


[docs]
    def _lazy_transform_cpu(self, X, **kwargs):
        return self.__lazy_transform_generic(X, xp=np, **kwargs)



[docs]
    def _lazy_transform_gpu(self, X, **kwargs):
        return self.__lazy_transform_generic(X, xp=cp, **kwargs)



[docs]
    def _transform_cpu(self, X, **kwargs):
        return self.function(X, **kwargs)



[docs]
    def _transform_gpu(self, X, **kwargs):
        return self.function(X, **kwargs)



[docs]
    @task_handler
    def transform(self, X, **kwargs):
        ...





[docs]
class ReductionTransform(Transform):
    """Class representing a Reduction based on Transform
    object.

    This is a simple MapReduction operation using Dask.

    Parameters
    ----------
    output_size : tuple
        The size of the new output.
    func_aggregate : Callable
        The function called to aggregate the result of each chunk.
    func_chunk : Callable
        The function applied in each chunk.
    func_combine : Callable
        The function to combine each reduction of aggregate (the
        default is None).

    """
    def __init__(self, output_size, func_aggregate, func_chunk, func_combine=None):
        self.output_size = output_size

        self.func_aggregate = func_aggregate
        self.func_chunk = func_chunk
        self.func_combine = func_combine


[docs]
    def _operation_aggregate_cpu(self, block, axis=None, keepdims=False):
        return self.func_aggregate(block, axis, keepdims, xp=np)



[docs]
    def _operation_aggregate_gpu(self, block, axis=None, keepdims=False):
        return self.func_aggregate(block, axis, keepdims, xp=cp)



[docs]
    def _operation_combine_cpu(self, block, axis=None, keepdims=False):
        return self.func_combine(block, axis, keepdims, xp=np)



[docs]
    def _operation_combine_gpu(self, block, axis=None, keepdims=False):
        return self.func_combine(block, axis, keepdims, xp=cp)



[docs]
    def _operation_chunk_cpu(self, block, axis=None, keepdims=False):
        return self.func_chunk(block, axis, keepdims, xp=np)



[docs]
    def _operation_chunk_gpu(self, block, axis=None, keepdims=False):
        return self.func_chunk(block, axis, keepdims, xp=cp)



[docs]
    def _lazy_transform_cpu(self, X, *args, **kwargs):
        if self.func_combine is not None:
            kwargs['combine'] = self._operation_combine_cpu

        if is_dask_cpu_dataframe(X):
            return X.reduction(chunk=self._operation_chunk_cpu,
                               aggregate=self._operation_aggregate_cpu,
                               meta=self.output_size,
                               *args,
                               **kwargs)
        else:
            return da.reduction(X,
                                chunk=self._operation_chunk_cpu,
                                aggregate=self._operation_aggregate_cpu,
                                dtype=X.dtype,
                                meta=np.array(self.output_size,
                                              dtype=X.dtype),
                                *args,
                                **kwargs)



[docs]
    def _lazy_transform_gpu(self, X, *args, **kwargs):
        if self.func_combine is not None:
            kwargs['combine'] = self._operation_combine_gpu

        if is_dask_gpu_dataframe(X):
            return X.reduction(chunk=self._operation_chunk_gpu,
                               aggregate=self._operation_aggregate_gpu,
                               meta=self.output_size,
                               *args,
                               **kwargs)
        else:
            return da.reduction(X,
                                chunk=self._operation_chunk_gpu,
                                aggregate=self._operation_aggregate_gpu,
                                dtype=X.dtype,
                                meta=cp.array(self.output_size,
                                              dtype=X.dtype),
                                *args,
                                **kwargs)



[docs]
    def _transform_cpu(self, X, *args, **kwargs):
        return self.func_chunk(block=X, xp=np, *args, **kwargs)



[docs]
    def _transform_gpu(self, X, *args, **kwargs):
        return self.func_chunk(block=X, xp=cp, *args, **kwargs)



[docs]
    @task_handler
    def transform(self, X, *args, **kwargs):
        ...