Source code for adenine.utils.data_source

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""This module is just a wrapper for some sklearn.datasets functions."""

######################################################################
# Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla
#
# FreeBSD License
######################################################################
import sys
import numpy as np
import pandas as pd
import logging
from sklearn import datasets
from sklearn.preprocessing import Binarizer

# Legacy import
try:
    from sklearn.model_selection import StratifiedShuffleSplit
except ImportError:
    from sklearn.cross_validation import StratifiedShuffleSplit


def generate_gauss(mu=None, std=None, n_sample=None):
    """Create a Gaussian dataset.

    Generates a dataset with n_sample * n_class examples and n_dim
    dimensions.

    Parameters
    ----------
    mu : array of float, shape : n_class x n_dim
        The mean of each class.

    std : array of float, shape : n_class
        The standard deviation of each Gaussian distribution.

    n_sample : int
        Number of points per class.
    """
    n_class, n_var = mu.shape

    X = np.zeros((n_sample * n_class, n_var))
    y = np.zeros(n_sample * n_class, dtype=int)

    start = 0
    for i, s, m in zip(range(n_class), std, mu):
        end = start + n_sample
        X[start:end, :] = s * np.random.randn(n_sample, n_var) + m
        y[start:end] = i
        start = end

    return X, y
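
# A minimal usage sketch for generate_gauss (not part of the original module;
# the means and standard deviations below are made up for illustration and
# rely only on the numpy import above):
#
#     >>> mu = np.array([[0., 0.], [3., 3.], [-3., 3.]])  # 3 classes, 2 dims
#     >>> std = np.array([0.5, 0.5, 0.5])
#     >>> X, y = generate_gauss(mu=mu, std=std, n_sample=100)
#     >>> X.shape, y.shape
#     ((300, 2), (300,))
#     >>> np.unique(y)
#     array([0, 1, 2])
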
def load_custom(x_filename, y_filename, samples_on='rows', **kwargs):
    """Load a custom dataset.

    This function loads the data matrix and the label vector, returning a
    single sklearn-like Bunch object.

    Parameters
    ----------
    x_filename : string
        The data matrix file name.

    y_filename : string
        The label vector file name.

    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the rows
        of the input data matrix, or in ['col', 'cols'] if they lie on the
        columns.

    kwargs : dict
        Arguments of the pandas.read_csv function.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        An instance of the sklearn.datasets.base.Bunch class; the meaningful
        attributes are .data, the data matrix, and .target, the label vector.
    """
    if x_filename is None:
        raise IOError("Filename for X must be specified with mode 'custom'.")

    if x_filename.endswith('.npy'):  # if an .npy file is provided
        try:  # labels are not mandatory
            y = np.load(y_filename)
        except IOError as e:
            y = None
            e.strerror = "No labels file provided"
            logging.error("I/O error({0}): {1}".format(e.errno, e.strerror))
        X = np.load(x_filename)
        if samples_on not in ['row', 'rows']:
            # data matrix must be n_samples x n_features
            X = X.T
        return datasets.base.Bunch(data=X, target=y,
                                   index=np.arange(X.shape[0]))

    elif x_filename.endswith('.csv') or x_filename.endswith('.txt'):
        y = None
        kwargs.setdefault('header', 0)     # header on the first row
        kwargs.setdefault('index_col', 0)  # sample index on the first column
        try:
            dfx = pd.read_csv(x_filename, **kwargs)
            if samples_on not in ['row', 'rows']:
                # data matrix must be n_samples x n_features
                dfx = dfx.transpose()
            if y_filename is not None:
                # Before loading labels, remove parameters that were likely
                # specified for the data only.
                kwargs.pop('usecols', None)
                y = pd.read_csv(y_filename, **kwargs).as_matrix().ravel()
        except IOError as e:
            e.strerror = "Can't open {} or {}".format(x_filename, y_filename)
            logging.error("I/O error({0}): {1}".format(e.errno, e.strerror))
            sys.exit(-1)

        return datasets.base.Bunch(data=dfx.as_matrix(),
                                   feature_names=dfx.columns.tolist(),
                                   target=y,
                                   index=dfx.index.tolist())
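
# A minimal usage sketch for load_custom (not part of the original module; the
# file names 'X.csv' and 'y.csv' are hypothetical, and the .as_matrix() calls
# above assume a legacy pandas version). Extra keyword arguments are forwarded
# to pandas.read_csv, e.g. the column separator:
#
#     >>> bunch = load_custom('X.csv', 'y.csv', samples_on='rows', sep=',')
#     >>> bunch.data.shape         # (n_samples, n_features)
#     >>> bunch.target.shape       # (n_samples,)
#     >>> bunch.feature_names[:3]  # column names taken from the CSV header
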
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy Y.npy in the working
    directory.

    Parameters
    ----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'gauss', 'circles',
           'moons', 'custom'}, default: 'custom'
        Name of a predefined dataset to be loaded.

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes in handy when dealing
        with large datasets. When n_samples is less than the actual size of
        the dataset this function performs a random subsampling that is
        stratified w.r.t. the labels (if provided).

    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the rows
        of the input data matrix, or in ['col', 'cols'] if they lie on the
        columns.

    data_sep : string
        The data separator. For instance comma, tab, blank space, etc.

    Returns
    -------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of numbers if missing.

    index : list of integers (or strings)
        This is the samples identifier, if provided as the first column (or
        row) of the input file. Otherwise it is just an incremental range of
        size n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            # binarize the regression target to get a two-class problem
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            # binarize the regression target to get a two-class problem
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # legacy sklearn API (cross_validation module)
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:  # sklearn >= 0.18 API (model_selection module)
                sss = StratifiedShuffleSplit(test_size=n_samples) \
                    .split(X, y)
            _, idx = list(sss)[0]
        else:
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            idx = idx[:n_samples]
        X = X[idx, :]
        if y is not None:
            y = y[idx]
    else:
        # The length of index must be consistent with the number of samples
        idx = np.arange(X.shape[0])

    feat_names = data.feature_names if hasattr(data, 'feature_names') \
        else np.arange(X.shape[1])
    index = np.array(data.index)[idx] if hasattr(data, 'index') \
        else np.arange(X.shape[0])

    return X, y, feat_names, index
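
# A minimal usage sketch for load (not part of the original module; it assumes
# the legacy scikit-learn/pandas environment this module targets). Asking for
# fewer samples than the dataset size triggers the stratified subsampling:
#
#     >>> X, y, feature_names, index = load(opt='iris', n_samples=90)
#     >>> X.shape, y.shape
#     ((90, 4), (90,))
#     >>> len(feature_names), len(index)
#     (4, 90)
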