"""Utilities functions and classes."""
import numpy as np
from types import ModuleType
from six import iteritems
def save_signature(filename, selected, threshold=0.75):
    """Save signature summary.

    Writes one ``name : percentage`` line per entry of ``selected``, from
    the highest to the lowest value; a separator line of ``=`` characters
    is drawn right before the first entry whose value drops below
    ``threshold``.

    Parameters
    ----------
    filename : str
        Path of the output file (overwritten if it exists).
    selected : dict
        Mapping from feature name to selection frequency (0-1 range).
    threshold : float
        Frequency below which entries fall under the separator line.
    """
    ordered_keys = reversed(sorted(selected, key=selected.__getitem__))
    with open(filename, 'w') as out:
        separator_written = False
        for key in ordered_keys:
            if not separator_written and float(selected[key]) < threshold:
                separator_written = True
                out.write("=" * 40)
                out.write("\n")
            out.write("{} : {}\n".format(key, selected[key] * 100.))
def retrieve_features(best_estimator):
    """Retrieve selected features from any estimator.

    In case it has the 'get_support' method, use it.
    Else, if it has a 'coef_' attribute, assume it's a linear model and the
    features correspond to the indices of the coefficients != 0.

    Parameters
    ----------
    best_estimator : object
        Fitted estimator exposing either ``get_support()`` or ``coef_``.

    Returns
    -------
    numpy.ndarray
        Indices of the selected features.

    Raises
    ------
    AttributeError
        If the estimator exposes neither ``get_support`` nor ``coef_``.
    """
    if hasattr(best_estimator, 'get_support'):
        return np.nonzero(best_estimator.get_support())[0]
    if hasattr(best_estimator, 'coef_'):
        coef = best_estimator.coef_
        if coef.ndim > 1 and 1 not in coef.shape:
            # Multiclass linear model: one row of coefficients per class.
            # BUGFIX: iterate over the rows (shape[0]), not range(ndim) --
            # ndim is always 2 here, which silently skipped every class
            # beyond the second one.
            sel_feats = []
            for row in range(coef.shape[0]):
                sel_feats += np.nonzero(coef[row])[0].ravel().tolist()
            return np.unique(sel_feats)
        # Binary / single-row case: any nonzero coefficient is selected.
        return np.nonzero(coef.flatten())[0]
    raise AttributeError('The best_estimator object does not have '
                         'neither the `coef_` attribute nor the '
                         '`get_support` method')
def get_selected_list(grid_search, vs_analysis=True):
    """Retrieve the list of selected features.

    Retrieves the list of selected features automatically identifying the
    type of object.

    Parameters
    ----------
    grid_search : object
        Fitted search object exposing ``best_estimator_``.
    vs_analysis : bool or str
        When a string, it is the name of the Pipeline step from which the
        selected features must be taken; otherwise the best estimator
        itself is used.

    Returns
    -------
    index : numpy.array
        The indices of the selected features.
    """
    # A string means the list of features must be taken from a step of a
    # Pipeline object. isinstance (not `type(...) == str`) also accepts
    # str subclasses.
    if isinstance(vs_analysis, str):
        estimator = grid_search.best_estimator_.named_steps[vs_analysis]
    else:
        estimator = grid_search.best_estimator_
    return retrieve_features(estimator)
def build_cv_results(dictionary, **results):
    """Build the final cv_results_ dictionary from partial results.

    Each non-None value in ``results`` is appended to the list stored
    under its key in ``dictionary`` (the list is created on first use).

    Parameters
    ----------
    dictionary : dict
        Accumulator mapping result names to lists of values; modified
        in place.
    **results : dict
        Partial results for one iteration; ``None`` values are skipped.
    """
    # dict.items() is equivalent to six.iteritems here and drops the
    # third-party dependency.
    for key, value in results.items():
        if value is not None:
            dictionary.setdefault(key, []).append(value)
def signatures(splits_results, frequency_threshold=0.0):
    """Return (almost) nested signatures for each correlation value.

    The function returns 3 lists where each item refers to a signature
    (for increasing value of linear correlation).
    Each signature is ordered from the most to the least selected variable
    across KCV splits results.

    Parameters
    ----------
    splits_results : iterable
        List of results from L1L2Py module, one for each external split.
    frequency_threshold : float
        Only the variables selected more (or equal) than this threshold are
        included into the signature.

    Returns
    -------
    sign_totals : list of :class:`numpy.ndarray`.
        Counts the number of times each variable in the signature is
        selected.
    sign_freqs : list of :class:`numpy.ndarray`.
        Frequencies calculated from ``sign_totals``.
    sign_idxs : list of :class:`numpy.ndarray`.
        Indexes of the signatures variables.

    Examples
    --------
    >>> from palladio.utils import signatures
    >>> splits_results = [{'selected_list': [[True, False], [True, True]]},
    ...                   {'selected_list': [[True, False], [False, True]]}]
    >>> sign_totals, sign_freqs, sign_idxs = signatures(splits_results)
    >>> print(sign_totals)
    [array([ 2.,  0.]), array([ 2.,  1.])]
    >>> print(sign_freqs)
    [array([ 1. ,  0. ]), array([ 1. ,  0.5])]
    >>> print(sign_idxs)
    [array([0, 1]), array([1, 0])]
    """
    # Computing totals and frequencies
    selection_totals = selection_summary(splits_results)
    selection_freqs = selection_totals / len(splits_results)

    # Variables are ordered and filtered by frequency threshold
    sorted_idxs = np.argsort(selection_freqs, axis=1)
    sorted_idxs = (sorted_idxs.T)[::-1].T  # Reverse order

    # ... ordering (each row is permuted in place by its own sort order)
    for i, si in enumerate(sorted_idxs):
        selection_freqs[i] = selection_freqs[i][si]
        selection_totals[i] = selection_totals[i][si]

    # ... filtering
    threshold_mask = (selection_freqs >= frequency_threshold)

    # Signatures ordered and filtered
    sign_totals = list()
    sign_freqs = list()
    sign_idxs = list()
    for i, mask in enumerate(threshold_mask):
        sign_totals.append(selection_totals[i][mask])
        sign_freqs.append(selection_freqs[i][mask])
        sign_idxs.append(sorted_idxs[i][mask])

    return sign_totals, sign_freqs, sign_idxs
def selection_summary(splits_results):
    """Count how many times each variable was selected.

    Parameters
    ----------
    splits_results : iterable
        List of results from L1L2Py module, one for each external split.

    Returns
    -------
    summary : :class:`numpy.ndarray`
        Selection summary. ``# mu_values X # variables`` matrix.
    """
    # Sum selection lists by mu values (mu_num x num_var).
    # Use the builtin sum for the element-wise array reduction:
    # ``np.sum`` over a generator is deprecated and does not perform the
    # intended array-by-array accumulation.
    return sum(np.asarray(sr['selected_list'], dtype=float)
               for sr in splits_results)
def confusion_matrix(labels, predictions):
    """Calculate a confusion matrix.

    From given real and predicted labels, the function calculates a
    confusion matrix as a double nested dictionary. The external one
    contains two keys, ``'T'`` and ``'F'``; both internal dictionaries
    contain a key for each class label. The ``['T']['C1']`` entry counts
    the correctly predicted ``'C1'`` labels, while ``['F']['C2']`` counts
    the incorrectly predicted ``'C2'`` labels. Each external key thus
    corresponds to one diagonal of the confusion matrix. The function
    works only on two-class labels.

    Parameters
    ----------
    labels : iterable
        Real labels.
    predictions : iterable
        Predicted labels.

    Returns
    -------
    cm : dict
        Dictionary containing the confusion matrix values.

    Raises
    ------
    ValueError
        If the two label sets do not contain the same two classes.
    """
    real_classes, real_c1, real_c2 = _check_unique_labels(labels)
    pred_classes, pred_c1, pred_c2 = _check_unique_labels(predictions)
    if not np.all(real_classes == pred_classes):
        raise ValueError('real and predicted labels differ.')

    first, second = real_classes[0], real_classes[1]
    return {
        'T': {
            first: (real_c1 & pred_c1).sum(),   # correctly predicted C1
            second: (real_c2 & pred_c2).sum(),  # correctly predicted C2
        },
        'F': {
            first: (real_c2 & pred_c1).sum(),   # C2 mispredicted as C1
            second: (real_c1 & pred_c2).sum(),  # C1 mispredicted as C2
        },
    }
def classification_measures(confusion_matrix, positive_label=None):
    """Calculate some classification measures.

    Measures are calculated from a given confusion matrix
    (see :func:`confusion_matrix` for a detailed description of the
    required structure).

    The ``positive_label`` argument specifies which label is the positive
    class; it is needed to calculate measures like the F-measure and to
    set some aliases (precision and recall are respectively the
    'predictive value' and the 'true rate' of the positive class).
    If ``positive_label`` is None, the resulting dictionary will not
    contain all the measures. With classes 'C1' (positive) and 'C2' the
    returned dictionary has the structure::

        {
            'C1': {'predictive_value': --,  # TP / (TP + FP)
                   'true_rate':        --}, # TP / (TP + FN)
            'C2': {'predictive_value': --,  # TN / (TN + FN)
                   'true_rate':        --}, # TN / (TN + FP)
            'accuracy':          --,  # (TP + TN) / (TP + FP + FN + TN)
            'balanced_accuracy': --,  # 0.5 * ((TP / (TP + FN)) +
                                      #        (TN / (TN + FP)))
            'MCC': --,  # ((TP * TN) - (FP * FN)) /
                        # sqrt((TP + FP) * (TP + FN) *
                        #      (TN + FP) * (TN + FN))

            # Following keys only when positive_label is not None:
            'sensitivity': --,  # P true rate: TP / (TP + FN)
            'specificity': --,  # N true rate: TN / (TN + FP)
            'precision':   --,  # P predictive value: TP / (TP + FP)
            'recall':      --,  # P true rate: TP / (TP + FN)
            'F_measure':   --   # 2 * (Precision * Recall) /
                                #     (Precision + Recall)
        }

    Parameters
    ----------
    confusion_matrix : dict
        Confusion matrix (as the one returned by :func:`confusion_matrix`).
    positive_label : str
        Positive class label.

    Returns
    -------
    summary : dict
        Dictionary containing calculated measures.
    """
    # Confusion matrix layout:
    #              True P      True N
    #   Pred P       TP          FP     -> P predictive value
    #   Pred N       FN          TN     -> N predictive value
    #           sensitivity specificity
    class_labels = confusion_matrix['T'].keys()
    if positive_label is None:
        pos, neg = sorted(class_labels)
    else:
        pos = positive_label
        if pos not in class_labels:
            raise ValueError('label %s not found.' % positive_label)
        neg = set(class_labels).difference([positive_label]).pop()

    # shortcuts ------------------------------------
    tp = confusion_matrix['T'][pos]
    tn = confusion_matrix['T'][neg]
    fp = confusion_matrix['F'][pos]
    fn = confusion_matrix['F'][neg]
    # ----------------------------------------------

    summary = {
        pos: {'predictive_value': tp / float(tp + fp),
              'true_rate': tp / float(tp + fn)},   # sensitivity
        neg: {'predictive_value': tn / float(tn + fn),
              'true_rate': tn / float(tn + fp)},   # specificity
    }
    summary['accuracy'] = (tp + tn) / float(tp + fp + fn + tn)
    summary['balanced_accuracy'] = 0.5 * (summary[pos]['true_rate'] +
                                          summary[neg]['true_rate'])

    # Guard against a zero denominator in Matthews correlation coefficient.
    mcc_den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    summary['MCC'] = ((tp * tn) - (fp * fn)) / (
        1.0 if mcc_den == 0 else np.sqrt(mcc_den))

    if positive_label is not None:
        summary['sensitivity'] = summary[pos]['true_rate']
        summary['specificity'] = summary[neg]['true_rate']
        summary['precision'] = summary[pos]['predictive_value']
        summary['recall'] = summary['sensitivity']
        summary['F_measure'] = 2. * (
            (summary['precision'] * summary['recall']) /
            (summary['precision'] + summary['recall']))

    return summary
def _check_unique_labels(labels):
labels = np.array([str(s).strip() for s in labels])
unique_labels = np.unique(labels)
if len(unique_labels) != 2:
raise ValueError('more than 2 classes in labels.')
unique_labels.sort(kind='mergesort')
class1 = (labels == unique_labels[0])
class2 = (labels == unique_labels[1])
return unique_labels, class1, class2
def set_module_defaults(module, dictionary):
    """Set default variables of a module, given a dictionary.

    Used after the loading of the configuration file to set some defaults.
    Attributes already defined on ``module`` are left untouched; missing
    ones are created with the value found in ``dictionary``.

    Parameters
    ----------
    module : module
        Target whose missing attributes are filled in.
    dictionary : dict
        Mapping from attribute name to default value.
    """
    # dict.items() is equivalent to six.iteritems here and drops the
    # third-party dependency.
    for name, default in dictionary.items():
        try:
            getattr(module, name)
        except AttributeError:
            # Attribute missing: install the default.
            setattr(module, name, default)
def sec_to_timestring(seconds):
    """Transform seconds into a formatted time string.

    Parameters
    ----------
    seconds : int
        Seconds to be transformed.

    Returns
    -------
    time : string
        A well formatted ``HH:MM:SS`` time string.
    """
    secs = seconds % 60
    total_minutes = seconds // 60
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d" % (hours, minutes, secs)
def safe_run(function):
    """Decorator that runs a function and prints an error when it fails.

    The wrapped call never propagates the exception: the error is
    reported on stdout and the wrapper returns ``None`` in that case;
    otherwise the wrapped function's return value is forwarded.

    Parameters
    ----------
    function : callable
        The function to wrap.

    Returns
    -------
    callable
        The wrapping function.
    """
    def safe_run_function(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception as error:
            # StandardError existed only in Python 2 and raises a
            # NameError under Python 3; Exception is the portable base.
            print('Function {} failed: plot not '
                  'created. Exception raised: {}'.format(
                      function.__name__, error))
    return safe_run_function