#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""This module is just a wrapper for some sklearn.datasets functions."""
######################################################################
# Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla
#
# FreeBSD License
######################################################################
import sys
import numpy as np
import pandas as pd
import logging
from sklearn import datasets
from sklearn.preprocessing import Binarizer
# Legacy import
try:
from sklearn.model_selection import StratifiedShuffleSplit
except ImportError:
from sklearn.cross_validation import StratifiedShuffleSplit
def generate_gauss(mu=None, std=None, n_sample=None):
    """Create a Gaussian dataset.

    Generates a dataset with n_sample * n_class examples and n_dim
    dimensions, where class i is drawn from a Gaussian with mean mu[i]
    and standard deviation std[i] (isotropic).

    Parameters
    -----------
    mu : array of float, shape : n_class x n_dim
        The mean of each class.
    std : array of float, shape : n_class
        The standard deviation of each Gaussian distribution.
    n_sample : int
        Number of points per class.

    Returns
    -----------
    X : array of float, shape : (n_sample * n_class) x n_dim
        The data matrix, with the samples of each class laid out in one
        contiguous chunk of n_sample rows.
    y : array of int, shape : n_sample * n_class
        The class label (0 .. n_class - 1) of each row of X.
    """
    n_class, n_var = mu.shape
    X = np.zeros((n_sample * n_class, n_var))
    y = np.zeros(n_sample * n_class, dtype=int)
    start = 0
    # Fill one contiguous block of n_sample rows per class.
    for i, (s, m) in enumerate(zip(std, mu)):
        end = start + n_sample
        X[start:end, :] = s * np.random.randn(n_sample, n_var) + m
        y[start:end] = i
        start = end
    return X, y
def load_custom(x_filename, y_filename, samples_on='rows', **kwargs):
    """Load a custom dataset.

    This function loads the data matrix and the label vector returning a
    unique sklearn-like Bunch object.

    Parameters
    -----------
    x_filename : string
        The data matrix file name (.npy, .csv or .txt).
    y_filename : string
        The label vector file name; may be None (labels are optional).
    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the row
        of the input data matrix, or viceversa in ['col', 'cols'] the other
        way around.
    kwargs : dict
        Arguments of pandas.read_csv function.

    Returns
    -----------
    data : sklearn.datasets.base.Bunch
        An instance of the sklearn.datasets.base.Bunch class, the meaningful
        attributes are .data, the data matrix, and .target, the label vector
        (None if no labels file was given).
    """
    if x_filename is None:
        raise IOError("Filename for X must be specified with mode 'custom'.")

    if x_filename.endswith('.npy'):  # if an .npy file is provided
        y = None
        if y_filename is not None:  # labels are not mandatory
            try:
                y = np.load(y_filename)
            except IOError as e:
                e.strerror = "No labels file provided"
                logging.error("I/O error({0}): {1}".format(e.errno,
                                                           e.strerror))
        X = np.load(x_filename)
        if samples_on not in ['row', 'rows']:
            # data matrix must be n_samples x n_features
            X = X.T
        return datasets.base.Bunch(data=X, target=y,
                                   index=np.arange(X.shape[0]))
    elif x_filename.endswith('.csv') or x_filename.endswith('.txt'):
        y = None
        kwargs.setdefault('header', 0)     # header on first row
        kwargs.setdefault('index_col', 0)  # indexes on first column
        try:
            dfx = pd.read_csv(x_filename, **kwargs)
            if samples_on not in ['row', 'rows']:
                # data matrix must be n_samples x n_features
                dfx = dfx.transpose()
            if y_filename is not None:
                # Before loading labels, remove parameters that were likely
                # specified for data only.
                kwargs.pop('usecols', None)
                # NOTE: .values instead of the long-removed .as_matrix()
                # (dropped in pandas 1.0).
                y = pd.read_csv(y_filename, **kwargs).values.ravel()
        except IOError as e:
            e.strerror = "Can't open {} or {}".format(x_filename, y_filename)
            logging.error("I/O error({0}): {1}".format(e.errno, e.strerror))
            sys.exit(-1)
        return datasets.base.Bunch(data=dfx.values,
                                   feature_names=dfx.columns.tolist(),
                                   target=y, index=dfx.index.tolist())
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard
    scikit-learn datasets or a different dataset saved as X.npy Y.npy in
    the working directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'gauss', 'circles',
        'moons', 'custom'}, default: 'custom'
        Name of a predefined dataset to be loaded.
    x_filename : string, default : None
        The data matrix file name.
    y_filename : string, default : None
        The label vector file name.
    n_samples : int
        The number of samples to be loaded. This comes handy when dealing
        with large datasets. When n_samples is less than the actual size of
        the dataset this function performs a random subsampling that is
        stratified w.r.t. the labels (if provided).
    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the row
        of the input data matrix, or viceversa in ['col', 'cols'] the other
        way around.
    kwargs : dict
        Arguments of pandas.read_csv function (only used with
        opt='custom').

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.
    y : array of float, shape : n_samples
        The label vector; None if missing.
    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of numbers if missing.
    index : list of integers (or strings)
        This is the samples identifier, if provided as first column (or
        row) of the input file. Otherwise it is just an incremental range
        of size n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            # Turn the continuous regression target into binary labels:
            # 1 above the mean target value, 0 below. (Binarizer needs a
            # 2D input, hence the reshape/ravel round-trip.)
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(
                data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'boston':
            # NOTE(review): load_boston was removed in scikit-learn 1.2;
            # this branch only works with older sklearn versions.
            data = datasets.load_boston()
            # Same mean-thresholded binarization of the regression target
            # as for 'diabetes'.
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(
                data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    if data is None:
        # Previously this fell through to an obscure AttributeError; fail
        # with an explicit message instead.
        raise ValueError("Unknown dataset option: {0!r}".format(opt))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy for sklearn < 0.18 (cross_validation module)
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
            except TypeError:
                # Modern API: the splitter is configured first, then split.
                sss = StratifiedShuffleSplit(test_size=n_samples) \
                    .split(X, y)
            # Keep the (stratified) test side of the first split.
            _, idx = list(sss)[0]
        else:
            # No labels: plain random subsampling.
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            idx = idx[:n_samples]
        X, y = X[idx, :], y[idx]
    else:
        # The length of index must be consistent with the number of samples
        idx = np.arange(X.shape[0])

    feat_names = data.feature_names if hasattr(data, 'feature_names') \
        else np.arange(X.shape[1])
    index = np.array(data.index)[idx] if hasattr(data, 'index') \
        else np.arange(X.shape[0])

    return X, y, feat_names, index