Submit
Path:
~
/
/
opt
/
alt
/
python35
/
share
/
doc
/
alt-python35-scikit-learn-0.18.1
/
examples
/
File Content:
plot_compare_reduction.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
=================================================================
Selecting dimensionality reduction with Pipeline and GridSearchCV
=================================================================

This example constructs a pipeline that does dimensionality reduction
followed by prediction with a support vector classifier. It demonstrates
the use of GridSearchCV and Pipeline to optimize over different classes
of estimators in a single CV run -- unsupervised PCA and NMF
dimensionality reductions are compared to univariate feature selection
during the grid search.
"""
# Authors: Robert McGibbon, Joel Nothman

from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2

print(__doc__)

# Two-step pipeline: the 'reduce_dim' stage is a placeholder here; the
# grid search below substitutes the actual reducers to compare.
pipeline = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC()),
])

N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]

# Two sub-grids are needed because the size parameter is spelled
# `n_components` on PCA/NMF but `k` on SelectKBest.
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

search = GridSearchCV(pipeline, cv=3, n_jobs=2, param_grid=param_grid)
digits = load_digits()
search.fit(digits.data, digits.target)

scores = np.array(search.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
scores = scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# select score for best C
scores = scores.max(axis=0)

# One cluster of bars per n_components value, with a one-slot gap
# between clusters.
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS))
               * (len(reducer_labels) + 1) + .5)

plt.figure()
COLORS = 'bgrcmyk'
for shift, (label, reducer_scores, color) in enumerate(
        zip(reducer_labels, scores, COLORS)):
    plt.bar(bar_offsets + shift, reducer_scores, label=label, color=color)

plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Digit classification accuracy')
plt.ylim((0, 1))
plt.legend(loc='upper left')
plt.show()
Submit
FILE
FOLDER
Name
Size
Permission
Action
applications
---
0755
bicluster
---
0755
calibration
---
0755
classification
---
0755
cluster
---
0755
covariance
---
0755
cross_decomposition
---
0755
datasets
---
0755
decomposition
---
0755
ensemble
---
0755
exercises
---
0755
feature_selection
---
0755
gaussian_process
---
0755
linear_model
---
0755
manifold
---
0755
mixture
---
0755
model_selection
---
0755
neighbors
---
0755
neural_networks
---
0755
preprocessing
---
0755
semi_supervised
---
0755
svm
---
0755
text
---
0755
tree
---
0755
README.txt
116 bytes
0644
feature_stacker.py
1911 bytes
0644
hetero_feature_union.py
6241 bytes
0644
missing_values.py
3055 bytes
0644
plot_compare_reduction.py
2489 bytes
0644
plot_cv_predict.py
799 bytes
0644
plot_digits_pipe.py
1813 bytes
0644
plot_isotonic_regression.py
1767 bytes
0644
plot_johnson_lindenstrauss_bound.py
7474 bytes
0644
plot_kernel_approximation.py
8004 bytes
0644
plot_kernel_ridge_regression.py
6269 bytes
0644
plot_multilabel.py
4157 bytes
0644
plot_multioutput_face_completion.py
3019 bytes
0644
N4ST4R_ID | Naxtarrr