# Source code for adenine.core.pipelines

#!/usr/bin/python
# -*- coding: utf-8 -*-

######################################################################
# Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla
#
# FreeBSD License
######################################################################

import copy
import logging
import numpy as np


def create(pdef):
    """Scikit-learn Pipelines objects creation (deprecated).

    Build one sklearn ``Pipeline`` per pipeline definition. The definitions
    are the list-of-lists-of-tuples produced by the
    ``adenine.core.define_pipeline`` module.

    Parameters
    ----------
    pdef : list of list of tuples
        The specification needed by sklearn in order to create a working
        Pipeline object.

    Returns
    -------
    pipes : list of sklearn.pipeline.Pipeline objects
        The list of Pipelines; each of them can be fitted and transformed
        with some data.
    """
    # Imported lazily so that merely importing this module does not
    # require sklearn.
    from sklearn.pipeline import Pipeline
    pipes = []
    for definition in pdef:
        pipes.append(Pipeline(definition))
    return pipes
def which_level(label):
    """Define the step level according to the input step label [DEPRECATED].

    Map a step label (as written in the ade_config file) to its level:
    ``imputing``, ``preproc``, ``dimred``, ``clustering`` or the string
    ``'None'`` for unrecognized labels.

    Parameters
    ----------
    label : string
        The step label as it is reported in the ade_config file.
        Matching is case-insensitive.

    Returns
    -------
    level : {'imputing', 'preproc', 'dimred', 'clustering', 'None'}
        The appropriate level of the input step. Note that the fallback
        is the *string* ``'None'``, not the ``None`` singleton (callers
        compare against the string).

    Raises
    ------
    ValueError
        If `label` is not a string.
    """
    # BUG FIX: the original checked `isinstance(label, basestring)`, which
    # raises NameError on Python 3; `str` is the Python 3 equivalent.
    if not isinstance(label, str):
        raise ValueError("String expected")

    label = label.lower()
    if label.startswith('impute'):
        level = 'imputing'
    elif label in ('recenter', 'standardize', 'normalize', 'minmax'):
        level = 'preproc'
    elif label in ('pca', 'incrementalpca', 'randomizedpca', 'kernelpca',
                   'isomap', 'lle', 'se', 'mds', 'tsne', 'rbm'):
        level = 'dimred'
    elif label in ('kmeans', 'ap', 'ms', 'spectral', 'hierarchical'):
        level = 'clustering'
    else:
        level = 'None'
    return level
def evaluate(level, step, X):
    """Transform or predict according to the input level.

    Apply the transform or the predict method of the given sklearn-like
    step, depending on its level (i.e. imputing, preproc, dimred,
    clustering, none).

    Parameters
    ----------
    level : {'imputing', 'preproc', 'dimred', 'clustering', 'None'}
        The step level.
    step : sklearn-like object
        This might be an Imputer, or a PCA, or a KMeans (and so on...)
        sklearn-like object.
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    Returns
    -------
    res : array of float
        A matrix projection in case of dimred, a label vector in case of
        clustering, and so on.
    """
    if level == 'clustering':
        if hasattr(step, 'labels_'):
            # Estimators such as spectral clustering expose labels_ directly.
            res = step.labels_
        elif hasattr(step, 'affinity') and step.affinity == 'precomputed':
            # Precomputed-affinity wrappers delegate to an inner estimator;
            # fit it on demand if it has not produced labels yet.
            inner = step.estimator
            if not hasattr(inner, 'labels_'):
                inner.fit(X)
            res = inner.labels_
        else:
            res = step.predict(X)
    elif level in ('imputing', 'preproc', 'dimred', 'None'):
        # Manifold learners keep their output in embedding_; everything
        # else supports transform().
        res = step.embedding_ if hasattr(step, 'embedding_') else step.transform(X)
    return res
def pipe_worker(pipe_id, pipe, pipes_dump, X):
    """Parallel pipelines execution.

    Fit each step of the pipeline in sequence, feeding the output of one
    step into the next, and collect per-step results in a dictionary.

    Parameters
    ----------
    pipe_id : string
        Pipeline identifier.
    pipe : list of tuples
        Tuple containing a label and a sklearn Pipeline object.
        Each step is (label, model, level) — the level string is the
        last element of the tuple.
    pipes_dump : multiprocessing.Manager.dict
        Dictionary containing the results of the parallel execution.
        If None, the per-step results are returned instead.
    X : array of float, shape : n_samples x n_features, default : ()
        The input data matrix.

    Returns
    -------
    step_dump : dict, only when `pipes_dump` is None
        Mapping 'step<j>' -> [label, level, params, out, data, model,
        voronoi model].
    """
    step_dump = dict()
    # COPY X as X_curr (to avoid that the next pipeline
    # works on the results of the previous one)
    X_curr = np.array(X)
    for j, step in enumerate(pipe):
        # step[0] -> step_label | step[1] -> model, sklearn (or
        # sklearn-like) object
        step_id = 'step' + str(j)

        # 1. define which level of step is this (i.e.: imputing, preproc,
        # dimred, clustering, none)
        level = step[-1]

        # 2. fit the model (whatever it is)
        if step[1].get_params().get('method') == 'hessian':
            # check hessian lle constraints:
            # n_neighbors > n_components * (n_components + 3) / 2
            n_components = step[1].get_params().get('n_components')
            # BUG FIX: use floor division — on Python 3 `/` yields a
            # float, and sklearn requires an integer n_neighbors.
            n_neighbors = 1 + (n_components * (n_components + 3) // 2)
            step[1].set_params(n_neighbors=n_neighbors)
        try:
            step[1].fit(X_curr)

            # 3. evaluate (i.e. transform or predict according to the
            # level)
            X_next = evaluate(level, step[1], X_curr)

            # 3.1 if the model is suitable for voronoi tessellation: fit
            # also on 2D
            mdl_voronoi = None
            if hasattr(step[1], 'cluster_centers_'):
                mdl_voronoi = copy.copy(step[1].best_estimator_ if hasattr(
                    step[1], 'best_estimator_') else step[1])
                if not hasattr(step[1], 'affinity') or \
                        step[1].affinity != 'precomputed':
                    # fit on the first two features only for 2D plotting
                    mdl_voronoi.fit(X_curr[:, :2])
                else:
                    mdl_voronoi.fit(X_curr)

            # 4. save the results in a dictionary of dictionaries of the
            # form: save memory and do not dump data after preprocessing
            # (unused in analysis)
            if level in ('preproc', 'imputing'):
                result = [step[0], level, step[1].get_params(), np.empty(0),
                          np.empty(0), step[1], mdl_voronoi]
                X_curr = np.array(X_next)  # update the matrix
            # save memory dumping X_curr only in case of clustering
            elif level == 'dimred':
                result = [step[0], level, step[1].get_params(), X_next,
                          np.empty(0), step[1], mdl_voronoi]
                X_curr = X_next  # update the matrix
            # clustering
            elif level == 'clustering':
                result = [step[0], level, step[1].get_params(), X_next,
                          X_curr, step[1], mdl_voronoi]

            if level != 'None':
                step_dump[step_id] = result
        except (AssertionError, ValueError) as e:
            logging.critical("Pipeline %s failed at step %s. "
                             "Traceback: %s", pipe_id, step[0], e)

    # Monkey-patch, see:
    # https://github.com/scikit-learn/scikit-learn/issues/7562
    # and wait for the next numpy update
    # step_dump['step2'][-2] = None
    if pipes_dump is None:
        return step_dump
    pipes_dump[pipe_id] = step_dump