# Source code for adenine.core.pipelines

#!/usr/bin/python
# -*- coding: utf-8 -*-

######################################################################
# Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla
#
# FreeBSD License
######################################################################

import copy
import logging
import numpy as np


def create(pdef):
    """Scikit-learn Pipelines objects creation (deprecated).

    Build one sklearn ``Pipeline`` per pipeline definition. The definitions
    are the list-of-lists-of-tuples produced by the
    ``adenine.core.define_pipeline`` module.

    Parameters
    ----------
    pdef : list of list of tuples
        The specification needed by sklearn in order to create a working
        Pipeline object.

    Returns
    -------
    pipes : list of sklearn.pipeline.Pipeline objects
        The list of Pipelines; each of them can be fitted and transformed
        with some data.
    """
    # Imported lazily so that merely importing this module does not
    # require sklearn.
    from sklearn.pipeline import Pipeline
    pipes = []
    for definition in pdef:
        pipes.append(Pipeline(definition))
    return pipes
def which_level(label):
    """Define the step level according to the input step label [DEPRECATED].

    Map a step label (as written in the ade_config file) to its level:
    ``imputing``, ``preproc``, ``dimred``, ``clustering`` or the string
    ``'None'`` for unrecognized labels.

    Parameters
    ----------
    label : string
        The step label as it is reported in the ade_config file.
        Matching is case-insensitive.

    Returns
    -------
    level : {'imputing', 'preproc', 'dimred', 'clustering', 'None'}
        The appropriate level of the input step. Note that the fallback
        is the *string* ``'None'``, not the ``None`` singleton (callers
        compare against the string).

    Raises
    ------
    ValueError
        If `label` is not a string.
    """
    # BUG FIX: the original checked `isinstance(label, basestring)`, which
    # raises NameError on Python 3; `str` is the Python 3 equivalent.
    if not isinstance(label, str):
        raise ValueError("String expected")

    label = label.lower()
    if label.startswith('impute'):
        level = 'imputing'
    elif label in ('recenter', 'standardize', 'normalize', 'minmax'):
        level = 'preproc'
    elif label in ('pca', 'incrementalpca', 'randomizedpca', 'kernelpca',
                   'isomap', 'lle', 'se', 'mds', 'tsne', 'rbm'):
        level = 'dimred'
    elif label in ('kmeans', 'ap', 'ms', 'spectral', 'hierarchical'):
        level = 'clustering'
    else:
        level = 'None'
    return level
def evaluate(level, step, X):
    """Transform or predict according to the input level.

    Apply the transform or the predict method of the given sklearn-like
    step, depending on its level (i.e. imputing, preproc, dimred,
    clustering, none).

    Parameters
    ----------
    level : {'imputing', 'preproc', 'dimred', 'clustering', 'None'}
        The step level.
    step : sklearn-like object
        This might be an Imputer, or a PCA, or a KMeans (and so on...)
        sklearn-like object.
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    Returns
    -------
    res : array of float
        A matrix projection in case of dimred, a label vector in case of
        clustering, and so on.
    """
    if level == 'clustering':
        if hasattr(step, 'labels_'):
            # Estimators such as spectral clustering expose labels_ directly.
            res = step.labels_
        elif hasattr(step, 'affinity') and step.affinity == 'precomputed':
            # Precomputed-affinity wrappers delegate to an inner estimator;
            # fit it on demand if it has not produced labels yet.
            inner = step.estimator
            if not hasattr(inner, 'labels_'):
                inner.fit(X)
            res = inner.labels_
        else:
            res = step.predict(X)
    elif level in ('imputing', 'preproc', 'dimred', 'None'):
        # Manifold learners keep their output in embedding_; everything
        # else supports transform().
        res = step.embedding_ if hasattr(step, 'embedding_') else step.transform(X)
    return res
def pipe_worker(pipe_id, pipe, pipes_dump, X):
    """Parallel pipelines execution.

    Fit each step of the pipeline in sequence, feeding the output of one
    step into the next, and collect per-step results in a dictionary.

    Parameters
    ----------
    pipe_id : string
        Pipeline identifier.
    pipe : list of tuples
        Tuple containing a label and a sklearn Pipeline object.
        Each step is (label, model, level) — the level string is the
        last element of the tuple.
    pipes_dump : multiprocessing.Manager.dict
        Dictionary containing the results of the parallel execution.
        If None, the per-step results are returned instead.
    X : array of float, shape : n_samples x n_features, default : ()
        The input data matrix.

    Returns
    -------
    step_dump : dict, only when `pipes_dump` is None
        Mapping 'step<j>' -> [label, level, params, out, data, model,
        voronoi model].
    """
    step_dump = dict()
    # COPY X as X_curr (to avoid that the next pipeline
    # works on the results of the previous one)
    X_curr = np.array(X)
    for j, step in enumerate(pipe):
        # step[0] -> step_label | step[1] -> model, sklearn (or
        # sklearn-like) object
        step_id = 'step' + str(j)

        # 1. define which level of step is this (i.e.: imputing, preproc,
        # dimred, clustering, none)
        level = step[-1]

        # 2. fit the model (whatever it is)
        if step[1].get_params().get('method') == 'hessian':
            # check hessian lle constraints:
            # n_neighbors > n_components * (n_components + 3) / 2
            n_components = step[1].get_params().get('n_components')
            # BUG FIX: use floor division — on Python 3 `/` yields a
            # float, and sklearn requires an integer n_neighbors.
            n_neighbors = 1 + (n_components * (n_components + 3) // 2)
            step[1].set_params(n_neighbors=n_neighbors)
        try:
            step[1].fit(X_curr)

            # 3. evaluate (i.e. transform or predict according to the
            # level)
            X_next = evaluate(level, step[1], X_curr)

            # 3.1 if the model is suitable for voronoi tessellation: fit
            # also on 2D
            mdl_voronoi = None
            if hasattr(step[1], 'cluster_centers_'):
                mdl_voronoi = copy.copy(step[1].best_estimator_ if hasattr(
                    step[1], 'best_estimator_') else step[1])
                if not hasattr(step[1], 'affinity') or \
                        step[1].affinity != 'precomputed':
                    # fit on the first two features only for 2D plotting
                    mdl_voronoi.fit(X_curr[:, :2])
                else:
                    mdl_voronoi.fit(X_curr)

            # 4. save the results in a dictionary of dictionaries of the
            # form: save memory and do not dump data after preprocessing
            # (unused in analysis)
            if level in ('preproc', 'imputing'):
                result = [step[0], level, step[1].get_params(), np.empty(0),
                          np.empty(0), step[1], mdl_voronoi]
                X_curr = np.array(X_next)  # update the matrix
            # save memory dumping X_curr only in case of clustering
            elif level == 'dimred':
                result = [step[0], level, step[1].get_params(), X_next,
                          np.empty(0), step[1], mdl_voronoi]
                X_curr = X_next  # update the matrix
            # clustering
            elif level == 'clustering':
                result = [step[0], level, step[1].get_params(), X_next,
                          X_curr, step[1], mdl_voronoi]

            if level != 'None':
                step_dump[step_id] = result
        except (AssertionError, ValueError) as e:
            logging.critical("Pipeline %s failed at step %s. "
                             "Traceback: %s", pipe_id, step[0], e)

    # Monkey-patch, see:
    # https://github.com/scikit-learn/scikit-learn/issues/7562
    # and wait for the next numpy update
    # step_dump['step2'][-2] = None
    if pipes_dump is None:
        return step_dump
    pipes_dump[pipe_id] = step_dump