# Source code for adenine.core.plotting

#!/usr/bin/python -W ignore::DeprecationWarning
# -*- coding: utf-8 -*-
"""Adenine plotting module."""

######################################################################
# Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla
#
# FreeBSD License
######################################################################

import os
import logging
# import cPickle as pkl
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font="monospace")
from sklearn import metrics
# Legacy import
try:
    from sklearn.model_selection import StratifiedShuffleSplit
except ImportError:
    from sklearn.cross_validation import StratifiedShuffleSplit

from adenine.utils.extra import title_from_filename, Palette

__all__ = ("silhouette", "scatter", "voronoi", "tree",
           "dendrogram", "pcmagnitude", "eigs")

DEFAULT_EXT = 'png'


def silhouette(root, data_in, labels, model=None):
    """Generate and save the silhouette plot of data_in w.r.t labels.

    This function generates the silhouette plot representing how data are
    correctly clustered, based on labels. The plots will be saved into the
    root folder in a tree-like structure.

    Parameters
    -----------
    root : string
        The root path for the output creation

    data_in : array of float, shape : (n_samples, n_dimensions)
        The low space embedding estimated by the dimensionality reduction
        and manifold learning algorithm.

    labels : array of float, shape : n_samples
        The label vector. It can contain true or estimated labels.

    model : sklearn or sklearn-like object
        An instance of the class that evaluates a step.
    """
    if labels is None:
        logging.warning('Cannot make silhouette plot with no real labels.')
        return

    labels = np.asarray(labels)
    n_samples = data_in.shape[0]
    n_clusters = np.unique(labels).shape[0]
    # The silhouette is defined only for 2 <= n_clusters <= n_samples - 1.
    # BUG FIX: the previous check tested len(labels) (== n_samples), which
    # is always greater than n_samples - 1, so the plot was never produced.
    if n_clusters < 2 or n_clusters > n_samples - 1:
        logging.warning('Cannot make silhouette if number of labels is %d. '
                        'Valid values are 2 to n_samples - 1 '
                        '(inclusive).', n_clusters)
        return

    # Single axes; large canvas because one horizontal band is drawn
    # per cluster.
    fig, (ax1) = plt.subplots(1, 1)
    fig.set_size_inches(20, 15)

    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters.
    ax1.set_ylim([0, len(data_in) + (n_clusters + 1) * 10])

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the
    # formed clusters.
    metric = model.affinity if hasattr(model, 'affinity') else 'euclidean'

    # catch exceptions of Spectral Embedding affinity
    if metric == 'rbf':
        def metric(x, y):
            return metrics.pairwise.rbf_kernel(x.reshape(1, -1),
                                               y.reshape(1, -1))
    if metric == 'nearest_neighbors':
        metric = 'euclidean'

    sample_silhouette_values = metrics.silhouette_samples(data_in, labels,
                                                          metric=metric)
    sil = np.mean(sample_silhouette_values)

    y_lower = 10
    palette = Palette()
    for label in np.unique(labels):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[labels == label]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = palette.next()
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
                          ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(label))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_xlabel("silhouette coefficient values")
    ax1.set_ylabel("cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=sil, color="red", linestyle="--")
    ax1.set_yticks([])

    plt.suptitle("Silhouette analysis. "
                 "{0} clusters for {2} samples, average score {1:.4f}"
                 .format(n_clusters, sil, data_in.shape[0]))
    filename = os.path.join(
        root, os.path.basename(root) + "_silhouette." + DEFAULT_EXT)
    fig.savefig(filename)
    logging.info('Figure saved %s', filename)
    plt.close()
def scatter(root, data_in, labels=None, true_labels=False, model=None):
    """Generate the scatter plot of the dimensionality reduced data set.

    This function generates the scatter plot representing the dimensionality
    reduced data set. The plots will be saved into the root folder in a
    tree-like structure.

    Parameters
    ----------
    root : string
        The root path for the output creation

    data_in : array of float, shape : (n_samples, n_dimensions)
        The low space embedding estimated by the dimensionality reduction
        and manifold learning algorithm.

    labels : array of float, shape : n_samples
        The label vector. It can contain true or estimated labels.

    true_labels : boolean
        Identify if labels contains true or estimated labels.

    model : sklearn or sklearn-like object
        An instance of the class that evaluates a step. In particular this
        must be a clustering model provided with the clusters_centers_
        attribute (e.g. KMeans).
    """
    if hasattr(model, 'affinity') and model.affinity == 'precomputed':
        logging.info("Scatter cannot be performed with precomputed distances.")
        return

    n_samples, n_dim = data_in.shape

    # Define plot color
    if labels is None:
        labels = np.zeros(n_samples, dtype=np.short)
        hue = ' '
    else:
        labels = np.asarray(labels)
        hue = 'Classes' if true_labels else 'Estimated Labels'

    title = title_from_filename(root)

    # Seaborn scatter plot
    # 2D plot
    X = data_in[:, :2]
    idx = np.argsort(labels)
    df = pd.DataFrame(
        data=np.hstack((X[idx, :2], labels[idx][:, np.newaxis])),
        columns=["$x_1$", "$x_2$", hue]).astype(
            {"$x_1$": float, "$x_2$": float})
    if df.dtypes[hue] != 'O':
        df[hue] = df[hue].astype('int64')

    # Generate seaborn plot
    g = sns.FacetGrid(df, hue=hue, palette="Set1", size=5, legend_out=False)
    g.map(plt.scatter, "$x_1$", "$x_2$", s=100, lw=.5, edgecolor="white")
    if hue != ' ':
        g.add_legend()  # customize legend
    g.ax.autoscale_view(True, True, True)
    plt.title(title)
    filename = os.path.join(
        root, os.path.basename(root) + "_scatter2D." + DEFAULT_EXT)
    plt.savefig(filename)
    logging.info('Figure saved %s', filename)
    plt.close()

    # 3D plot
    filename = os.path.join(
        root, os.path.basename(root) + "_scatter3D." + DEFAULT_EXT)
    if n_dim < 3:
        logging.warning(
            '%s not generated (data have less than 3 dimensions)', filename)
    else:
        try:
            # Import kept for its side effect: it registers the '3d'
            # projection with matplotlib.
            from mpl_toolkits.mplot3d import Axes3D  # noqa
            # BUG FIX: Figure.gca(projection='3d') was removed in
            # matplotlib >= 3.6; add_subplot works on old and new versions.
            ax = plt.figure().add_subplot(111, projection='3d')
            palette = Palette(n_colors=len(np.unique(labels)))
            for label in np.unique(labels):
                idx = np.where(labels == label)[0]
                ax.plot(
                    data_in[:, 0][idx], data_in[:, 1][idx],
                    data_in[:, 2][idx], 'o',
                    c=palette.next(), label=str(label),
                    mew=.5, mec="white")
            ax.set_xlabel(r'$x_1$')
            ax.set_ylabel(r'$x_2$')
            ax.set_zlabel(r'$x_3$')
            ax.autoscale_view(True, True, True)
            ax.set_title(title)
            ax.legend(loc='upper left', numpoints=1, ncol=10, fontsize=8,
                      bbox_to_anchor=(0, 0))
            plt.savefig(filename)
            logging.info('Figure saved %s', filename)
            plt.close()
        # BUG FIX: StandardError does not exist in Python 3; the old
        # handler raised NameError instead of logging the plot error.
        except Exception as e:
            logging.error('Error in 3D plot: %s', e)

    # seaborn pairplot
    n_cols = min(n_dim, 3)
    cols = ["$x_{}$".format(i + 1) for i in range(n_cols)]
    X = data_in[:, :3]
    idx = np.argsort(labels)
    df = pd.DataFrame(data=np.hstack((X[idx, :], labels[idx, np.newaxis])),
                      columns=cols + [hue]).astype(
                          dict(zip(cols, [float] * n_cols)))
    if df.dtypes[hue] != 'O':
        df[hue] = df[hue].astype('int64')
    g = sns.PairGrid(df, hue=hue, palette="Set1", vars=cols)
    g = g.map_diag(plt.hist)
    g = g.map_offdiag(plt.scatter, s=100, linewidth=.5, edgecolor="white")
    if hue != ' ':
        plt.legend(title=hue, bbox_to_anchor=(1.05, 1), loc=2,
                   borderaxespad=0., fontsize="large")
    plt.suptitle(title, x=0.6, y=1.01, fontsize="large")
    filename = os.path.join(
        root, os.path.basename(root) + "_pairgrid." + DEFAULT_EXT)
    g.savefig(filename)
    logging.info('Figure saved %s', filename)
    plt.close()
def voronoi(root, data_in, labels=None, true_labels=False, model=None):
    """Generate the Voronoi tessellation obtained from the clustering algorithm.

    This function generates the Voronoi tessellation obtained from the
    clustering algorithm applied on the data projected on a two-dimensional
    embedding. The plots will be saved into the appropriate folder of the
    tree-like structure created into the root folder.

    Parameters
    -----------
    root : string
        The root path for the output creation

    data_in : array of float, shape : (n_samples, n_dimensions)
        The low space embedding estimated by the dimensionality reduction
        and manifold learning algorithm.

    labels : array of int, shape : n_samples
        The result of the clustering step.

    true_labels : boolean [deprecated]
        Identify if labels contains true or estimated labels.

    model : sklearn or sklearn-like object
        An instance of the class that evaluates a step. In particular this
        must be a clustering model provided with the clusters_centers_
        attribute (e.g. KMeans).
    """
    n_samples = data_in.shape[0]

    # Pick the hue column name; a constant dummy label when none is given.
    if labels is None:
        labels = np.zeros(n_samples, dtype=np.short)
        hue = ' '
    else:
        labels = np.asarray(labels)
        hue = 'Classes'

    title = title_from_filename(root)

    # Sort samples by label so each class forms one contiguous group.
    order = np.argsort(labels)
    X = data_in[order, :2]
    labels = labels[order, np.newaxis]

    frame = pd.DataFrame(
        data=np.hstack((X, labels)),
        columns=["$x_1$", "$x_2$", hue]).astype(
            {"$x_1$": float, "$x_2$": float})
    if frame.dtypes[hue] != 'O':
        frame[hue] = frame[hue].astype('int64')

    # Scatter of the 2D embedding, one colour per class.
    grid = sns.FacetGrid(frame, hue=hue, palette="Set1", size=5,
                         legend_out=False)
    grid.map(plt.scatter, "$x_1$", "$x_2$", s=100, lw=.5, edgecolor="white")
    if hue != ' ':
        grid.add_legend()  # customize legend
    grid.ax.autoscale_view(True, True, True)
    plt.title(title)

    # Overlay the cluster centroids when the model exposes them.
    if hasattr(model, 'cluster_centers_'):
        plt.scatter(model.cluster_centers_[:, 0],
                    model.cluster_centers_[:, 1],
                    s=100, marker='h', c='w')

    # Paint the decision regions on a dense mesh; reducing the resolution
    # would lower the quality of the Voronoi background.
    npoints = 1000
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    # Enlarge the mesh beyond the data range to zoom the plot out.
    margin = (x_max - x_min) / 5. + (y_max - y_min) / 5.
    xx, yy = np.meshgrid(
        np.linspace(x_min - margin, x_max + margin, npoints),
        np.linspace(y_min - margin, y_max + margin, npoints))

    # Label every mesh point with the trained model and colour the regions.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.get_cmap('Pastel1'),
               aspect='auto', origin='lower')

    plt.xlim([xx.min(), xx.max()])
    plt.ylim([yy.min(), yy.max()])
    filename = os.path.join(root, os.path.basename(root) + "." + DEFAULT_EXT)
    plt.savefig(filename)
    logging.info('Figure saved %s', filename)
    plt.close()
def tree(root, data_in, labels=None, index=None, model=None):
    """Generate the tree structure obtained from the clustering algorithm.

    This function generates the tree obtained from the clustering algorithm
    applied on the data. The plots will be saved into the appropriate folder
    of the tree-like structure created into the root folder.

    Parameters
    -----------
    root : string
        The root path for the output creation

    data_in : array of float, shape : (n_samples, n_dimensions)
        The low space embedding estimated by the dimensionality reduction
        and manifold learning algorithm.

    labels : array of int, shape : n_samples
        The result of the clustering step.

    index : list of integers (or strings)
        This is the samples identifier, if provided as first column (or row)
        of the input file. Otherwise it is just an incremental range of size
        n_samples.

    model : sklearn or sklearn-like object
        An instance of the class that evaluates a step. In particular this
        must be a hierarchical clustering model provided with the children_
        attribute (e.g. AgglomerativeClustering).
    """
    filename = os.path.join(root, os.path.basename(root) + '_tree.pdf')
    try:
        import itertools
        import pydot
        graph = pydot.Dot(graph_type='graph')

        # ROBUSTNESS FIX: honour the documented fallback — when no sample
        # identifiers are provided, use an incremental range (previously a
        # None index made len(index) raise and the plot was silently lost).
        if index is None:
            index = np.arange(data_in.shape[0])

        if labels is None:
            labels = np.zeros(1, dtype=np.short)
            palette = Palette('hls', n_colors=np.unique(labels).shape[0])
            # White fill for the single dummy label.
            palette.palette[0] = (1.0, 1.0, 1.0)
        else:
            palette = Palette('hls', n_colors=np.unique(labels).shape[0])

        # Map each distinct label to a palette position.
        colors = dict(
            zip(np.unique(labels), np.arange(np.unique(labels).shape[0])))

        # Internal (merged) nodes get ids starting after the leaf samples.
        ii = itertools.count(data_in.shape[0])
        for x in model.children_:
            root_node = next(ii)
            # Leaves (id < n_samples) are coloured by their label;
            # internal nodes stay white.
            fillcolor = (
                lambda _: palette.palette.as_hex()[colors[labels[x[_]]]]
                if x[_] < labels.shape[0] else 'white')
            left_node = pydot.Node(
                str(index[x[0]] if x[0] < len(index) else x[0]),
                style="filled", fillcolor=fillcolor(0))
            right_node = pydot.Node(
                str(index[x[1]] if x[1] < len(index) else x[1]),
                style="filled", fillcolor=fillcolor(1))
            graph.add_node(left_node)
            graph.add_node(right_node)
            graph.add_edge(pydot.Edge(root_node, left_node))
            graph.add_edge(pydot.Edge(root_node, right_node))

        graph.write_pdf(filename)
        logging.info('Figure saved %s', filename)
    # BUG FIX: StandardError does not exist in Python 3; the old handler
    # raised NameError instead of logging the failure.
    except Exception as e:
        logging.critical('Cannot create %s. tb: %s', filename, e)
def dendrogram(root, data_in, labels=None, index=None, model=None, n_max=150):
    """Generate and save the dendrogram obtained from the clustering algorithm.

    This function generates the dendrogram obtained from the clustering
    algorithm applied on the data. The plots will be saved into the
    appropriate folder of the tree-like structure created into the root
    folder. The row colors of the heatmap are the either true or estimated
    data labels.

    Parameters
    -----------
    root : string
        The root path for the output creation

    data_in : array of float, shape : (n_samples, n_dimensions)
        The low space embedding estimated by the dimensionality reduction
        and manifold learning algorithm.

    labels : array of int, shape : n_samples
        The result of the clustering step.

    index : list of integers (or strings)
        This is the samples identifier, if provided as first column (or row)
        of the input file. Otherwise it is just an incremental range of size
        n_samples.

    model : sklearn or sklearn-like object
        An instance of the class that evaluates a step. In particular this
        must be a hierarchical clustering model provided with the affinity
        and linkage attributes (e.g. AgglomerativeClustering).

    n_max : int, (INACTIVE)
        The maximum number of samples to include in the dendrogram.
        When the number of samples is bigger than n_max, only n_max samples
        randomly extracted from the dataset are represented. The random
        extraction is performed using
        sklearn.model_selection.StratifiedShuffleSplit
        (or sklearn.cross_validation.StratifiedShuffleSplit for legacy
        reasons).
    """
    # define col names
    col = ["$x_{" + str(i) + "}$" for i in np.arange(0, data_in.shape[1], 1)]
    df = pd.DataFrame(data=data_in, columns=col, index=index)

    # -- Code for row colors adapted from:
    # https://stanford.edu/~mwaskom/software/seaborn/examples/structured_heatmap.html

    # Create a custom palette to identify the classes
    if labels is None:
        labels = np.zeros(df.shape[0], dtype=np.short)
    else:
        # Remap arbitrary label values onto 0..n_classes-1.
        mapping = dict(
            zip(np.unique(labels), np.arange(np.unique(labels).shape[0])))
        labels = np.vectorize(mapping.get)(labels)

    n_colors = np.unique(labels).shape[0]
    custom_pal = sns.color_palette("hls", n_colors)
    custom_lut = dict(zip(map(str, range(n_colors)), custom_pal))

    # Convert the palette to vectors that will be drawn on the matrix side
    custom_colors = pd.Series(map(str, labels)).map(custom_lut)

    # Create a custom colormap for the heatmap values
    cmap = sns.diverging_palette(220, 20, n=7, as_cmap=True)

    if model.affinity == 'precomputed':
        import scipy.spatial.distance as ssd
        from scipy.cluster.hierarchy import linkage
        # convert the redundant square matrix into a condensed one.
        # Even if the docs of scipy said so, linkage function does not
        # understand that the matrix is precomputed, unless it is
        # 1-dimensional
        Z = linkage(ssd.squareform(data_in), method=model.linkage,
                    metric='euclidean')
        g = sns.clustermap(df, method=model.linkage, row_linkage=Z,
                           col_linkage=Z, linewidths=.5, cmap=cmap)
    else:
        # workaround to a different name used for manhattan/cityblock
        # distance.
        # BUG FIX: use a local variable instead of mutating the caller's
        # model in place (the old code rewrote model.affinity).
        affinity = model.affinity
        if affinity == 'manhattan':
            affinity = 'cityblock'
        g = sns.clustermap(df, method=model.linkage, metric=affinity,
                           row_colors=custom_colors, linewidths=.5,
                           cmap=cmap)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=5)
    filename = os.path.join(root, os.path.basename(root) +
                            '_dendrogram.' + DEFAULT_EXT)
    g.savefig(filename)
    logging.info('Figure saved %s', filename)
    plt.close()
def pcmagnitude(root, points, title='', ylabel=''):
    """Plot the trend of principal components magnitude.

    Parameters
    -----------
    root : string
        The root path for the output creation.

    points : array of float, shape : n_components
        This could be the explained variance ratio or the eigenvalues of
        the centered matrix, according to the PCA algorithm of choice,
        respectively PCA or KernelPCA.

    title : string
        Plot title.

    ylabel : string
        Y-axis label.
    """
    n_points = len(points)
    component_ids = np.arange(1, n_points + 1)
    plt.plot(component_ids, points, '-o')
    plt.title(title)
    plt.grid('on')
    plt.ylabel(ylabel)
    # Show at most the first 20 components.
    plt.xlim([1, min(20, n_points + 1)])
    plt.ylim([0, 1])
    out_name = os.path.basename(root) + "_magnitude." + DEFAULT_EXT
    plt.savefig(os.path.join(root, out_name))
    plt.close()
def eigs(root, affinity, n_clusters=0, title='', ylabel='', normalised=True,
         n_components=20, filename=None, ylim='auto', rw=False):
    """Plot eigenvalues of the Laplacian associated to data affinity matrix.

    Parameters
    -----------
    root : string
        The root path for the output creation.

    affinity : array of float, shape : (n_samples, n_samples)
        The affinity matrix.

    n_clusters : int, optional
        The number of clusters.

    title : string, optional
        Plot title.

    ylabel : string, optional
        Y-axis label.

    normalised : boolean, optional, default True
        Choose whether to normalise the Laplacian matrix.

    n_components : int, optional, default 20
        Number of components to show in the plot.

    filename : None or str, optional, default None
        If not None, overrides default filename for saving the plot.

    ylim : 'auto', None, tuple or list, optional, default 'auto'
        If 'auto', choose the highest eigenvalue for the height of the
        plot. If None, plt.ylim is not called (matplotlib default is used).
        Otherwise, specify manually the desired ylim.

    rw : boolean, optional, default False
        Normalise the Laplacian matrix as the random walks point of view.
        This should be better suited with unclear data distributions.
    """
    # Efficient way to extract the main diagonal from a sparse matrix
    if isinstance(affinity, sp.sparse.csr.csr_matrix):
        dd = affinity.diagonal()
    else:  # or from a dense one
        dd = np.diag(affinity)
    # W: affinity with the self-similarity (diagonal) removed;
    # D: diagonal degree matrix built from W's row sums.
    W = affinity - np.diag(dd)
    D = np.diag([np.sum(x) for x in W])
    # Unnormalised graph Laplacian.
    laplacian = D - W
    if rw:
        # compute normalised laplacian as random walks. Better with unclear
        # distributions: L_rw = I - D^-1 W
        aux = np.diag(1. / np.array([np.sum(x) for x in W]))
        laplacian = np.eye(laplacian.shape[0]) - np.dot(aux, W)
    elif normalised:
        # Compute the normalised Laplacian: L_sym = I - D^-1/2 W D^-1/2
        # aux = np.linalg.inv(np.diag([np.sqrt(np.sum(x)) for x in W]))
        aux = np.diag(1. / np.array([np.sqrt(np.sum(x)) for x in W]))
        laplacian = np.eye(laplacian.shape[0]) - (np.dot(np.dot(aux, W), aux))

    # TODO: replace the previous manual laplacian creation with sklearn utils.
    # Performance gain and np.allclose checks already performed with success.
    # from sklearn.utils import graph
    # laplacian = graph.graph_laplacian(affinity, normed=normalised)
    # warn: RW is not implemented in sklearn
    try:
        # Eigenvalues sorted by magnitude; rw Laplacian is not symmetric,
        # hence eigvals (not eigvalsh) and the abs().
        w = np.linalg.eigvals(laplacian)
        w = np.array(sorted(np.abs(w)))

        plt.plot(np.arange(1, len(w) + 1), w, '-o',
                 label='eigenvalues (sorted' + (
                     ' and normalised rw)' if rw else (
                         ' and normalised)' if normalised else ')')))
        plt.title(title)
        plt.grid('on')
        plt.ylabel(ylabel)
        plt.xlim([1, min(n_components, len(w) + 1)])  # Show max n_components
        if ylim == 'auto':
            # Top of the plot: last shown eigenvalue plus twice the gap to
            # the previous one, to leave headroom above the curve.
            # NOTE(review): assumes at least two eigenvalues are available
            # within the shown window — confirm for tiny inputs.
            plt.ylim((0, w[:min(n_components, len(w) + 1)][-1] +
                      2 * (w[:min(n_components, len(w) + 1)][-1] -
                           w[:min(n_components, len(w) + 1)][-2])))
        elif ylim is not None:
            plt.ylim(ylim)
        if n_clusters > 0:
            # Mark the eigengap position chosen for the clustering.
            plt.axvline(x=n_clusters + .5, linestyle='--', color='r',
                        label='selected clusters')
        plt.legend(loc='upper left', numpoints=1, ncol=10, fontsize=8)
        # , bbox_to_anchor=(1, 1))
        if filename is None:
            filename = os.path.join(root, os.path.basename(root) +
                                    "_eigenvals." + DEFAULT_EXT)
        plt.savefig(filename)
    except np.linalg.LinAlgError:
        logging.critical("Error in plot_eigs: Affinity matrix contained "
                         "negative values. You can try by specifying "
                         "normalised=False")
    plt.close()