Path: /opt/alt/python35/share/doc/alt-python35-scikit-learn-0.18.1/examples/
File Content:
missing_values.py
""" ====================================================== Imputing missing values before building an estimator ====================================================== This example shows that imputing the missing values can give better results than discarding the samples containing any missing value. Imputing does not always improve the predictions, so please check via cross-validation. Sometimes dropping rows or using marker values is more effective. Missing values can be replaced by the mean, the median or the most frequent value using the ``strategy`` hyper-parameter. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). Script output:: Score with the entire dataset = 0.56 Score without the samples containing missing values = 0.48 Score after imputation of the missing values = 0.55 In this case, imputing helps the classifier get close to the original score. """ import numpy as np from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline from sklearn.preprocessing import Imputer from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) dataset = load_boston() X_full, y_full = dataset.data, dataset.target n_samples = X_full.shape[0] n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values estimator = RandomForestRegressor(random_state=0, n_estimators=100) score = cross_val_score(estimator, X_full, y_full).mean() print("Score with the entire dataset = %.2f" % score) # Add missing values in 75% of the lines missing_rate = 0.75 n_missing_samples = np.floor(n_samples * missing_rate) missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) # Estimate the score without the lines 
containing missing values X_filtered = X_full[~missing_samples, :] y_filtered = y_full[~missing_samples] estimator = RandomForestRegressor(random_state=0, n_estimators=100) score = cross_val_score(estimator, X_filtered, y_filtered).mean() print("Score without the samples containing missing values = %.2f" % score) # Estimate the score after imputation of the missing values X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() estimator = Pipeline([("imputer", Imputer(missing_values=0, strategy="mean", axis=0)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) score = cross_val_score(estimator, X_missing, y_missing).mean() print("Score after imputation of the missing values = %.2f" % score)
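As a quick illustration of what the ``mean`` strategy computes, below is a minimal NumPy-only sketch of column-wise mean imputation with 0 as the missing-value marker, mirroring what ``Imputer(missing_values=0, strategy="mean", axis=0)`` fits on the training data. The toy matrix here is invented for illustration; in the script above the imputer sits inside the pipeline, so its statistics are re-fit on each cross-validation fold.

```python
import numpy as np

# Toy matrix where 0 marks a missing entry, as in the script above.
X = np.array([[1.0, 2.0],
              [0.0, 4.0],
              [3.0, 0.0]])

# Mask the missing entries, then take the column-wise mean over the
# observed values only (NaN-aware mean ignores the masked cells).
mask = X == 0
col_means = np.nanmean(np.where(mask, np.nan, X), axis=0)  # [2.0, 3.0]

# Fill each missing entry with its column's mean.
X_imputed = np.where(mask, col_means, X)
# strategy="median" would use np.nanmedian instead, which is less
# sensitive to long-tailed, high-magnitude columns.
```

After imputation, row 1 gets the column-0 mean (2.0) and row 2 gets the column-1 mean (3.0), while observed entries are left untouched.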
Name                                  Size        Permission
applications/                         ---         0755
bicluster/                            ---         0755
calibration/                          ---         0755
classification/                       ---         0755
cluster/                              ---         0755
covariance/                           ---         0755
cross_decomposition/                  ---         0755
datasets/                             ---         0755
decomposition/                        ---         0755
ensemble/                             ---         0755
exercises/                            ---         0755
feature_selection/                    ---         0755
gaussian_process/                     ---         0755
linear_model/                         ---         0755
manifold/                             ---         0755
mixture/                              ---         0755
model_selection/                      ---         0755
neighbors/                            ---         0755
neural_networks/                      ---         0755
preprocessing/                        ---         0755
semi_supervised/                      ---         0755
svm/                                  ---         0755
text/                                 ---         0755
tree/                                 ---         0755
README.txt                            116 bytes   0644
feature_stacker.py                    1911 bytes  0644
hetero_feature_union.py               6241 bytes  0644
missing_values.py                     3055 bytes  0644
plot_compare_reduction.py             2489 bytes  0644
plot_cv_predict.py                    799 bytes   0644
plot_digits_pipe.py                   1813 bytes  0644
plot_isotonic_regression.py           1767 bytes  0644
plot_johnson_lindenstrauss_bound.py   7474 bytes  0644
plot_kernel_approximation.py          8004 bytes  0644
plot_kernel_ridge_regression.py       6269 bytes  0644
plot_multilabel.py                    4157 bytes  0644
plot_multioutput_face_completion.py   3019 bytes  0644