AnalysisWorkFlowModel.py.html


# coding: utf-8

# # Data science analysis: ttH(bb) dilepton channel
# 
# **Notebook by Christian Contreras-Campana, PhD**

# ## Introduction
# 
# Developing a data analytic report scheme for ttH(bb) multivariate analysis study using machine larning technologies.
# 
# The columns in the file are:
# - mass_tag_tag_min_deltaR: Mass for b-tag jet pair with minimum $\Delta$R
# - median_mass_jet_jet: Median invariant mass of all combinations of jet pairs
# - maxDeltaEta_tag_tag:  The $\Delta\eta$ between the two furthest b-tagged jets
# - mass_higgsLikeDijet:  The invariant mass of a jet pair ordered in closeness to a Higgs mass
# - HT_tags: Scalar sum of transverse momentum for all jets
# - btagDiscriminatorAverage_tagged:  Average CSV b-tag discriminant value for b-tagged jets
# - mass_jet_tag_min_deltaR:  Invariant mass of jet pair (with at least one b-tagged) $\Delta$R
# - mass_jet_jet_min_deltaR:  Invariant mass of jet pair $\Delta$R
# - mass_tag_tag_max_mass:  Mass for b-tagged jet pair with maximum invariant mass combination
# - centrality_jets_leps:  The ratio of the sum of the transverse momentum of all jets and leptons
# - maxDeltaEta_jet_jet:  Invariant mass of jet pair $\Delta$R
# - centrality_tags:  The ratio of the sum of the transverse momentum of all b-tagged jets
# 
# While we have some grasp on the matter, we're not experts, so the following might contain inaccuracies or even outright errors. Feel free to point them out, either in the comments or privately.

# ## Load Libraries
# 
# We load all the necessary python libraries that will permit us to load the data files, pre-process and clean the data, perform data validation, produce statistical summaries, conduct exploratory data analysis, as well as feature transformation, feature ranking, and feature selection. Python libraries will also be needed for model selection, evaluating overfitting, executing standard nested k-fold cross validation for hyper-parameter optimization and model evaluation.  

# In[1]:

## Import common python libraries
import sys
import time
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Import from root_numpy library
import root_numpy
from root_numpy import root2array, rec2array

# Import panda library
from pandas.tools import plotting
from pandas.tools.plotting import scatter_matrix
from pandas.core.index import Index
import pandas.core.common as com

# Import scipy
import scipy
from scipy.stats import ks_2samp
import scipy as sp

# Import itertools
import itertools
from itertools import cycle

# Import Jupyter
from IPython.core.interactiveshell import InteractiveShell

# Import scikit-learn
import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RandomizedLasso

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_selection import RFECV
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import (confusion_matrix, roc_auc_score, roc_curve, 
                             auc, average_precision_score, precision_score, 
                             brier_score_loss, recall_score, f1_score, log_loss, 
                             classification_report, precision_recall_curve)
from sklearn.dummy import DummyClassifier

from sklearn.externals import joblib
from sklearn import feature_selection

# Import imblearn
import imblearn
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from collections import defaultdict, Counter

import re

# Check the versions of libraries/packages
print("Python version " + sys.version)
print("Sklearn version " + sklearn.__version__)
print("Root_numpy version " + root_numpy.__version__)
print("Numpy version " + np.__version__)
print("Scipy version " + scipy.__version__)
print("Pandas version " + pd.__version__)
print("Matplotlib version " + matplotlib.__version__)
print("Seaborn version " + sns.__version__)
print("Imblance version " +imblearn.__version__)

# Fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Specifying which nodes should be run interactively
InteractiveShell.ast_node_interactivity = "all"
print(__doc__)


# ## Load Data Files
# 
# Most data files contain approximately 15K events. There are a total of 4 files totaling 80K data events. We list the features and response names. We store the data in a Pandas DataFrame for greater ease of data manipulation.
# 
# **Note: To reduce running time of the program we use our most signal-like category which is statistically limited**

# In[2]:

## Define data load function

def load(sig_filename, bkg_filename, category, features):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    sig_filename : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    bkg_filename : array, shape = [n_samples, n_classes]
    category: string
    features: array, shape = [n_features]

    Returns
    -------
    data : pandas.DataFrame
    """

    signal = root2array(sig_filename, category, features)
    signal = rec2array(signal)

    backgr = root2array(bkg_filename, category, features)
    backgr = rec2array(backgr)

    # for sklearn data is usually organised
    # into one 2D array of shape (n_samples x n_features)
    # containing all the data and one array of categories
    # of length n_samples
    X = np.concatenate((signal, backgr))
    y = np.concatenate((np.ones(signal.shape[0]), np.zeros(backgr.shape[0])))

    # convert to numpy ndarray into pandas dataframe
    dataframe_X = pd.DataFrame(data=X, columns=features)
    dataframe_y = pd.DataFrame(data=y, columns=['y'])

    data = pd.concat([dataframe_X, dataframe_y], axis=1)

    return data


# In[3]:

## Load data files

# Feature names
branch_names = """mass_tag_tag_min_deltaR,median_mass_jet_jet,
    maxDeltaEta_tag_tag,mass_higgsLikeDijet,HT_tags,
    btagDiscriminatorAverage_tagged,mass_jet_tag_min_deltaR,
    mass_jet_jet_min_deltaR,mass_tag_tag_max_mass,maxDeltaEta_jet_jet,
    centrality_jets_leps,centrality_tags""".split(",")

features = [c.strip() for c in branch_names]
features = (b.replace(" ", "_") for b in features)
features = list(b.replace("-", "_") for b in features)

wall = time.time()
process = time.clock()

# Load dataset
signal_sample = "combined/signalData.root"
background_sample = "combined/backgroundData.root"
tree_category = "event_mvaVariables_step7_cate4"

data = load(signal_sample, background_sample, tree_category, features)

print "Total number of events: {}\nNumber of features: {}".format(data.shape[0], data.shape[1])

# Store a copy for later use
df_archived = data.copy(deep=True)

print "\nWall time to read in file input: ", time.time()-wall
print "Elapsed time to read in file input: ", time.clock()-process


# ## Statistical Summary
# 
# We give a statistical summary below to make sure the data makes sense and that nothing anomolous is present. As we can see values look promising and have acceptable variances.

# In[4]:

## Print statistical summary of dataset

# To print out all rows and columns to the terminal
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

wall = time.time()
process = time.clock()

print "Head:"
data.head()
print "Information:" 
data.info()
print "Describe:"
data.describe()

print "\nWall time to print statistical summary: ", time.time()-wall
print "Elapsed time to print statistical summary: ", time.clock()-process


# In[5]:

## Define class label counts and percentages

def class_info(classes):
    # Store the number of signal and background events
    class_count = {}
    counts = Counter(classes)
    total = sum(counts.values())

    for cls in counts.keys():
        class_count[class_label[cls]] = counts[cls]
        print("%6s: % 7d  =  % 5.1f%%" % (class_label[cls], counts[cls], float(counts[cls])/float((total))*100.0))

    return (class_count["signal"], class_count["background"])


# In[6]:

# Determine class label counts and percentages
class_label = {0.0: "background", 1.0: "signal"}
class_info(data.y);


# ## Feature Visualization: Basic Exploratory Data Analyses¶
# We conduct a basic exploratory data analyses by producing correlaiton matrices between all variables of interest. In addition, we visually depict the relationship signal and background rate and feature variables.

# In[7]:

## Define linear correlation matrix

def correlations(data, **kwds):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    data : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    kwds : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
        
    
    """To calculate pairwise correlation between features.
    
    Extra arguments are passed on to DataFrame.corr()
    """
    
    # Select signal or background label for plot title
    if (data["y"] > 0.5).all(axis=0):
        label = "signal"
    elif (data["y"] < 0.5).all(axis=0):
        label = "background"
    
    # simply call df.corr() to get a table of
    # correlation values if you do not need
    # the fancy plotting
    data = data.drop("y", axis=1) 
 
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    labels = data.corr(**kwds).columns.values
    
    fig, ax1 = plt.subplots(ncols=1, figsize=(8,7))
    
    opts = {"annot" : True,
            "ax" : ax1,
            "vmin": 0, "vmax": 1*100,
            "annot_kws" : {"size": 8}, 
            "cmap": plt.get_cmap("Blues", 20),
            }
    
    ax1.set_title("Correlations: " + label)

    sns.heatmap(data.corr(method="spearman").iloc[::-1]*100, **opts) 
    
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)
    
    for ax in (ax1,):
        # shift location of ticks to center of the bins
        ax.set_xticks(np.arange(len(labels))+0.5, minor=False)
        ax.set_yticks(np.arange(len(labels))+0.5, minor=False)
        ax.set_xticklabels(labels[::-1], minor=False, ha="right", rotation=70)
        ax.set_yticklabels(np.flipud(labels), minor=False)
        
    plt.tight_layout()
    
    return plt.show()


# In[8]:

## Plot feature correlations (assuming linear correlations)

wall = time.time()
process = time.clock()

# Remove the y column from the correlation matrix
# after using it to select background and signal
sig = data[data["y"] > 0.5]
bg = data[data["y"] < 0.5]

# Correlation Matrix
correlations(sig)
correlations(bg)

print "Wall time to plot correlation matrix: ", time.time()-wall
print "Elapsed time to plot correlation matrix: ", time.clock()-process


# In[9]:

## Scatter Plot
sns.set(style="ticks", color_codes=True)
wall = time.time()
process = time.clock()

random.seed(a=seed)

#_ = sns.pairplot(data.drop(data.y, axis=0), size=2.5, hue="y")
_ = sns.pairplot(data.drop(data.y, axis=0), size=2.5, hue="y", 
                 markers=["o", "s"], plot_kws={ "s":5,"alpha":0.7 })
sns.plt.show()

print "Wall time to plot scatter distribution: ", time.time()-wall
print "Elapsed time to plot scatter distribution: ", time.clock()-process


# In[10]:

## Plot signal and background distributions for some variables

# The first two arguments select what is "signal"
# and what is "background". This means you can
# use it for more general comparisons of two
# subsets as well.

def signal_background(data1, data2, column=None, grid=True,
                      xlabelsize=None, xrot=None, ylabelsize=None,
                      yrot=None, ax=None, sharex=False,
                      sharey=False, figsize=None,
                      layout=None, bins=10, **kwds):
   """Draw histogram of the DataFrame's series comparing the distribution
   in `data1` to `data2`.

   data1: DataFrame
   data2: DataFrame
   column: string or sequence
       If passed, will be used to limit data to a subset of columns
   grid : boolean, default True
       Whether to show axis grid lines
   xlabelsize : int, default None
       If specified changes the x-axis label size
   xrot : float, default None
       rotation of x axis labels
   ylabelsize : int, default None
       If specified changes the y-axis label size
   yrot : float, default None
       rotation of y axis labels
   ax : matplotlib axes object, default None
   sharex : bool, if True, the X axis will be shared amongst all subplots.
   sharey : bool, if True, the Y axis will be shared amongst all subplots.
   figsize : tuple
       The size of the figure to create in inches by default
   layout: (optional) a tuple (rows, columns) for the layout of the histograms
   bins: integer, default 10
       Number of histogram bins to be used
   kwds : other plotting keyword arguments
       To be passed to hist function
   """        

   if "alpha" not in kwds:
       kwds["alpha"] = 0.5

   w, h = (12, 8)
   figsize = (w, h)

   if column is not None:
       if not isinstance(column, (list, np.ndarray, Index)):
           column = [column]
       data1 = data1[column]
       data2 = data2[column]

   data1 = data1._get_numeric_data()
   data2 = data2._get_numeric_data()
   naxes = len(data1.columns)


   fig, axes = plotting._subplots(naxes=naxes, 
                                  ax=ax, 
                                  squeeze=False,
                                  sharex=sharex,
                                  sharey=sharey,
                                  figsize=figsize,
                                  layout=layout)
   xs = plotting._flatten(axes)

   for i, col in enumerate(com._try_sort(data1.columns)):
       ax = xs[i]
       low = min(data1[col].min(), data2[col].min())
       high = max(data1[col].max(), data2[col].max())
       ax.hist(data1[col].dropna().values,
               bins=bins, histtype='stepfilled', range=(low,high), **kwds)
       ax.hist(data2[col].dropna().values,
               bins=bins, histtype='stepfilled', range=(low,high), **kwds)
       ax.set_title(col)
       ax.legend(['background', 'signal'], loc='best')
 
       ax.set_axis_bgcolor('white')
    
       # Customize the major grid
       ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
       ax.set_axis_bgcolor('white')
    

   plotting._set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                             ylabelsize=ylabelsize, yrot=yrot)
   fig.subplots_adjust(wspace=0.5, hspace=0.8)

   return plt.show()


# In[11]:

## Plot feature hitograms

wall = time.time()
process = time.clock()

signal_background(data[data["y"] < 0.5], data[data["y"] > 0.5],
                  column=features, bins=40);

print "\nWall time to plot exploratory data features: ", time.time()-wall
print "Elapsed time to plot exploratory data features: ", time.clock()-process


# In[12]:

## Create features dataframe and target array

df_X = data.drop("y", axis=1, inplace=False)
df_y = data["y"]


# ## Model performance measure
# 
# We investigate several machine learning models in order to establish which algorithm may be the most promising for the discrimination modeling of signal and background processes. Two performance measures wil be used to help select our model, namely, accuracy and the area under the receiver operating characteristic (ROC) curve. Receiver Operating Characteristic (ROC) curve number is equal to the probability that a random positive example will be ranked above a random negative example.

# In[13]:

## Compute ROC curve and area under the curve

def roc_plot(models, X, y):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    models : dictionary, shape = [n_models]
    X : DataFrame, shape = [n_samples, n_classes]
    y : DataFrame, shape = [n_classes]

    Returns
    -------
    roc : matplotlib plot
    """
     
    # Split data into a development and evaluation set
    X_dev,X_eval, y_dev,y_eval = train_test_split(X, y, test_size=0.33, random_state=42)
    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, 
                                                        test_size=0.33, random_state=seed)
    
    # contains rates for ML classifiers
    fpr = {}
    tpr = {}
    roc_auc = {}
    
    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')
    
    # Include random by chance 'luck' curve
    plt.plot([1, 0], [0, 1], '--', color=(0.1, 0.1, 0.1), label='Luck')
        
    # Loop through classifiers
    for (name, model) in models.items():
        
        print "\n\x1b[1;31mBuilding model ...\x1b[0m"
        process = time.clock()
        model.fit(X_train, y_train)

        print "\t%s fit time: %.3f"%(name, time.clock()-process)
        
        y_predicted = model.predict(X_test)
        
        if hasattr(model, "predict_proba"):
            decisions = model.predict_proba(X_test)[:, 1] # probability estimates of the positive class(as needed in the roc_curve function)
        else:  # use decision function
            decisions = model.decision_function(X_test)
        
        print "\tArea under ROC curve for %s: %.4f"%(name, roc_auc_score(y_test,decisions))
        
        process = time.clock()
        scores = cross_val_score(model, X_test, y_test, scoring="roc_auc",
                                 n_jobs=1, cv=3) #n_jobs=-1
        print "\tAUC ROC accuracy: %0.5f (+/- %0.5f)"%(scores.mean(), scores.std())

        print classification_report(y_test, y_predicted, target_names=['signal', 'background'])
        print "\tDuration of cross-validation score: ", time.clock()-process
        print("\tScore of test dataset: {:.5f}".format(model.score(X_test, y_test)))
        
        process = time.clock()
        fpr[name], tpr[name], thresholds = roc_curve(y_test, decisions)
        print "\tArea under ROC time: ", time.clock()-process
        
        roc_auc[name] = auc(fpr[name], tpr[name])
    
    # color choices: https://css-tricks.com/snippets/css/named-colors-and-hex-equivalents/
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 
                    'green', 'yellow', 'SlateBlue', 'DarkSlateGrey',
                    'CadetBlue', 'Chocolate', 'darkred', 'GoldenRod'])
  
    for (name, model), color in zip(models.items(), colors):

        plt.plot(tpr[name], 1-fpr[name], color=color, lw=2,
                 label='%s (AUC = %0.3f)'%(name, roc_auc[name]))                 
    
    # Plot all ROC curves
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title("Receiver operating characteristic ({} events)".format(X.shape[0]))
    leg = plt.legend(loc="lower left", frameon=True, fancybox=True, fontsize=8) # loc='best'
    leg.get_frame().set_edgecolor('w')
    frame = leg.get_frame()
    frame.set_facecolor('White')
    
    return plt.show()


# In[14]:

# Plot AUC for ROC curve for several classifiers out-of-the-box

wall = time.time()
process = time.clock()

# Set feature scaling type
scaler = RobustScaler()

# NOTE: When using scikit-learn's DecisionTreeClassifier, 
# always set min_samples_leaf to something like 5 or 10. 
# Its default value of 1 is useless and is guaranteed to overfit. 
# (This is why every example of DecisionTreeClassifier in their docs shows overfitting.)

# prepare models: create a mapping of ML classifier name to algorithm
pipe_classifiers = {
    'SVM':  make_pipeline(scaler, SVC()),
    'NB' :  make_pipeline(scaler, GaussianNB()), 
    'MLP':  make_pipeline(scaler, MLPClassifier()),
    'LR' :  make_pipeline(scaler, LogisticRegression()),
    'ADA':  make_pipeline(None,   AdaBoostClassifier()),
    'KNN':  make_pipeline(scaler, KNeighborsClassifier()),
    'RFC':  make_pipeline(None,   RandomForestClassifier()),
    'CART': make_pipeline(None,   DecisionTreeClassifier(min_samples_leaf=10)),
    'LDA':  make_pipeline(scaler, LinearDiscriminantAnalysis()),
    'GRAD': make_pipeline(None,   GradientBoostingClassifier()),
    'BAGG': make_pipeline(None,   BaggingClassifier())
}

# Assessing a Classifier's Performance
roc_plot(pipe_classifiers, df_X, df_y)
                   
print "\nWall time to generate ROC plots: ", time.time()-wall
print "Elapsed time to generate ROC plots: ", time.clock()-process


# In[15]:

# Plot AUC for ROC curve for several classifiers out-of-the-box

wall = time.time()
process = time.clock()

# Set feature scaling type
scaler = RobustScaler()

# NOTE: When using scikit-learn's DecisionTreeClassifier, 
# always set min_samples_leaf to something like 5 or 10. 
# Its default value of 1 is useless and is guaranteed to overfit. 
# (This is why every example of DecisionTreeClassifier in their docs shows overfitting.)

# prepare models: create a mapping of ML classifier name to algorithm
pipe_classifiers = {
    'SVM':  make_pipeline(scaler, SVC(class_weight="balanced")),
    'NB' :  make_pipeline(scaler, GaussianNB()), 
    'MLP':  make_pipeline(scaler, MLPClassifier()),
    'LR' :  make_pipeline(scaler, LogisticRegression(class_weight="balanced")),
    'ADA':  make_pipeline(None,   AdaBoostClassifier()),
    'KNN':  make_pipeline(scaler, KNeighborsClassifier()),
    'RFC':  make_pipeline(None,   RandomForestClassifier()),
    'CART': make_pipeline(None,   DecisionTreeClassifier(min_samples_leaf=10, class_weight="balanced")),
    'LDA':  make_pipeline(scaler, LinearDiscriminantAnalysis()),
    'GRAD': make_pipeline(None,   GradientBoostingClassifier()),
    'BAGG': make_pipeline(None,   BaggingClassifier())
}

# Assessing a Classifier's Performance
roc_plot(pipe_classifiers, df_X, df_y)
                   
print "\nWall time to generate ROC plots: ", time.time()-wall
print "Elapsed time to generate ROC plots: ", time.clock()-process


# ## Precision-Recall Plots
# 
# Precision-Recall metric to evaluate classifier output quality.
# 
# Recall is a performance measure of the whole positive part of a dataset, whereas precision is a performance measure of positive predictions.
# 
# In information retrieval, precision is a measure of result relevancy, while recall is a measure of how many truly relevant results are returned. A high area under the curve represents both high recall and high precision, where high precision relates to a low false positive rate, and high recall relates to a low false negative rate. High scores for both show that the classifier is returning accurate results (high precision), as well as returning a majority of all positive results (high recall).
# 
# A system with high recall but low precision returns many results, but most of its predicted labels are incorrect when compared to the training labels. A system with high precision but low recall is just the opposite, returning very few results, but most of its predicted labels are correct when compared to the training labels. An ideal system with high precision and high recall will return many results, with all results labeled correctly.
# 
# It is important to note that the precision may not decrease with recall. The definition of precision ($\frac{T_p}{T_p + F_p}$) shows that lowering the threshold of a classifier may increase the denominator, by increasing the number of results returned. If the threshold was previously set too high, the new results may all be true positives, which will increase precision. If the previous threshold was about right or too low, further lowering the threshold will introduce false positives, decreasing precision.
# 
# Recall is defined as $\frac{T_p}{T_p+F_n}$, where $T_p+F_n$ does not depend on the classifier threshold. This means that lowering the classifier threshold may increase recall, by increasing the number of true positive results. It is also possible that lowering the threshold may leave recall unchanged, while the precision fluctuates.
# 
# SOURCE: http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
# 
# "it has been shown by Davis & Goadrich that an algorithm that optimizes the area under the ROC curve is not guaranteed to optimize the area under the PR curve."

# In[16]:

## Define precision-recall curve

def plot_PR_curve(classifier, X, y, n_folds=5):
    """
    Plot a basic precision/recall curve.
    """
    
    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')
    
    # Calculate the random luck for PR 
    # (above the constant line is a classifier that is well modeled)
    signal_count, background_count = class_info(y)
    ratio = float(signal_count)/float(signal_count + background_count)
    
    # store average precision calculation
    avg_scores = []
    
    # Loop through classifiers
    for (name, model) in classifier.items():
        
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        
        for i, (train, test) in enumerate(skf.split(X,y)):
            model.fit(X[train], y[train])
                   
            if hasattr(model, "predict_proba"):
                probas_ = model.predict_proba(X[test])[:, 1]
            else:  # use decision function
                probas_ = model.decision_function(X[test])
            
            # Compute ROC curve and area the curve 
            precision, recall, thresholds = precision_recall_curve(y[test],
                                                                   probas_, pos_label=1)
            average_precision = average_precision_score(y[test], probas_)
            avg_scores.append(average_precision)
        
        plt.plot(recall, precision, lw=1, 
                 label='{0} (auc = {1:0.2f})'.format(name,np.mean(avg_scores, axis=0)))
    
    plt.plot([ratio,ratio], '--', color=(0.1, 0.1, 0.1), 
             label='Luck (auc = {0:0.2f})'.format(ratio))

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-recall curve')
    plt.legend(loc="lower left")

    return plt.show()


# In[17]:

# Plot precision-recall curve for several classifiers out-of-the-box

wall = time.time()
process = time.clock()

plot_PR_curve(pipe_classifiers, df_X.values, df_y.values, n_folds=5)

print "\nWall time to generate Precision-Recall plots: ", time.time()-wall
print "Elapsed time to generate Precision-Recall plots: ", time.clock()-process


# Among the models with best performance on the test set:
# 
# - Random forests
# - Gradient boosting 
# - Boosting decision tree
# 
# We observe that the GradientBoostingClassifier has relatively the best accuracy and area under the ROC curve value. Therefore, we select this predictive model and proceed to evaluate whether it is overfitting the model to the noise (e.g. statistical fluctuation) of the data.

# ## Overfitting Evaluation
# 
# Comparing the ML classifier output distribution for the training and testing set to check for overtraining. By comparing the ML classifier's decision function for each class, as well as overlaying it with the shape of the decision function in the training set.
# 
# 
# Using the default parameters for the ML Classifiers we study whether the model is over-fitting the test data.

# In[18]:

## Defined overfitting plot

def compare_train_test(clf, X, y, bins=30):
    """Multi class version of Logarithmic Loss metric.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    
    # Split data into a development and evaluation set
    X_dev,X_eval, y_dev,y_eval = train_test_split(X, y, test_size=0.33, random_state=42)
    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, 
                                                        test_size=0.33, random_state=seed)
    
    # use subplot to extract axis to add ks and p-value to plot
    fig, ax = plt.subplots()
    
    # Customize the major grid
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')
    
    decisions = []
    for X, y in ((X_train, y_train), (X_test, y_test)):

        if hasattr(clf,"decision_function"):
            d1 = clf.decision_function(X[y>0.5]).ravel()
            d2 = clf.decision_function(X[y<0.5]).ravel()
        else:
            d1 = clf.predict_proba(X[y>0.5])[:, 1]
            d2 = clf.predict_proba(X[y<0.5])[:, 1]
        
        decisions += [d1, d2]
        
    low = min(np.min(d) for d in decisions)
    high = max(np.max(d) for d in decisions)
    low_high = (low,high)
    
    plt.hist(decisions[0],
             color='r', alpha=0.5, range=low_high, bins=bins,
             histtype='stepfilled', normed=True,
             label='signal (train)')
    plt.hist(decisions[1],
             color='b', alpha=0.5, range=low_high, bins=bins,
             histtype='stepfilled', normed=True,
             label='background (train)')
    
    hist, bins = np.histogram(decisions[2],
                              bins=bins, range=low_high, normed=True)
    scale = len(decisions[2]) / sum(hist)
    err = np.sqrt(hist * scale) / scale
    
    width = (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    plt.errorbar(center, hist, yerr=err, fmt='o', c='r', label='signal (test)')
    
    hist, bins = np.histogram(decisions[3],
                              bins=bins, range=low_high, normed=True)
    scale = len(decisions[2]) / sum(hist)
    err = np.sqrt(hist * scale) / scale
    
    plt.errorbar(center, hist, yerr=err, fmt='o', c='b', label='background (test)')
    
    # Define signal and background histograms for training & testing 
    hist_sig_train, bins = np.histogram(decisions[0], bins=bins, range=low_high, normed=True)
    hist_bkg_train, bins = np.histogram(decisions[1], bins=bins, range=low_high, normed=True)
    
    hist_sig_test, bins = np.histogram(decisions[2], bins=bins, range=low_high, normed=True)
    hist_bkg_test, bins = np.histogram(decisions[3], bins=bins, range=low_high, normed=True)
    
    # Estimate ks-test and p-values as an indicator of overtraining of fit model
    s_ks, s_pv = ks_2samp(hist_sig_train, hist_sig_test)
    b_ks, b_pv = ks_2samp(hist_bkg_train, hist_bkg_test)
    
    if hasattr(clf, "steps"):
        name = clf.steps[1][0]
    else:
        name = clf.__class__.__name__
    
    ax.set_title("Classifier: %s\nsignal(background) ks: %f(%f), p-value: %f (%f)" 
                 % (name, s_ks, b_ks, s_pv, b_pv))

    plt.xlabel("Decision output")
    plt.ylabel("Arbitrary units")
    plt.legend(loc='best')
    
    return plt.show()


# In[19]:

## Overfitting evaluation

wall = time.time()
process = time.clock()

# Uncalibrated model predictions
model = pipe_classifiers["GRAD"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["ADA"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["SVM"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["LDA"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["KNN"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["LR"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["CART"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["RFC"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)

# Uncalibrated model predictions
model = pipe_classifiers["NB"]
compare_train_test(model, df_X, df_y, bins=40)

# Calibrated with isotonic calibration
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
model_isotonic.fit(df_X, df_y)
compare_train_test(model_isotonic, df_X, df_y, bins=40)


print "\nWall time to generate over-training plots: ", time.time()-wall
print "Elapsed time to generate over-traing plots: ", time.clock()-process


# ## Probability calibration
# 
# When performing classification you often want not only to predict the class label, but also obtain a probability of the respective label. This probability gives you some kind of confidence on the prediction. Some models can give you poor estimates of the class probabilities and some even do not not support probability prediction. Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level.
# 
# Two approaches for performing calibration of probabilistic predictions are provided: 
# - a parametric approach based on Platt's sigmoid model and a non-parametric approach based on isotonic regression. 
# 
# Probability calibration should be done on new data not used for model fitting. The modelue uses a cross-validation generator and estimates for each split the model parameter on the train samples and the calibration of the test samples. The probabilities predicted for the folds are then averaged. Already fitted classifiers can be calibrated by CalibratedClassifierCV via the paramter cv="prefit". In this case, the user has to take care manually that data for model fitting and calibration are disjoint.

# In[20]:

## Define calibration curve (reliability curve)

def plot_calibration_curve(est, X, y, fig_index):
    """Plot calibration curve for est w/o and with calibration. """

    # Split data into a development and evaluation set
    X_dev,X_eval, y_dev,y_eval = train_test_split(X, y,
                                                  test_size=0.33, random_state=42)
    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                        random_state=seed)
    
    name = est.steps[1][0]
    
    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

    # We take the no calibration as baseline
    fig = plt.figure(fig_index, figsize=(6, 6))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "--", label="Perfectly calibrated")
    
    for clf, name in [(est, name,)
                      (isotonic, name + ' + Isotonic'),
                      (sigmoid, name + ' + Sigmoid')]: # Also called Platt Scaling
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        if hasattr(clf, "predict_proba"):
            prob_pos = clf.predict_proba(X_test)[:, 1]
        else:  # use decision function
            prob_pos = clf.decision_function(X_test)
            prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

        clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max())
        print("\n\x1b[1;31mclassifier %s:\x1b[0m" % name)
        print("\tBrier: %1.3f" % (clf_score))
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))

        fraction_of_positives, mean_predicted_value =             calibration_curve(y_test, prob_pos, n_bins=10)

        ax1.plot(mean_predicted_value, fraction_of_positives, "o-",
                 label="%s (%1.3f)" % (name, clf_score))

        ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
                 histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots (reliability curve)')
 
    # Customize the major grid
    ax1.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax1.set_axis_bgcolor('white')

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="best", ncol=1)
    
    # Customize the major grid
    ax2.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax2.set_axis_bgcolor('white')
    
    plt.tight_layout()
    plt.show()


# In[21]:

## Plot reliability curve (i.e. calibration curve)

wall = time.time()
process = time.clock()

plot_calibration_curve(make_pipeline(None, SVC()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["SVM"],   df_X, df_y, 2)

plot_calibration_curve(make_pipeline(None, GaussianNB()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["NB"],   df_X, df_y, 2)

plot_calibration_curve(make_pipeline(None, LogisticRegression()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["LR"],   df_X, df_y, 2)

plot_calibration_curve(make_pipeline(None, LinearDiscriminantAnalysis()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["LDA"],  df_X, df_y, 2)

plot_calibration_curve(make_pipeline(None, KNeighborsClassifier()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["KNN"],  df_X, df_y, 2)

# Tree-based classifier
plot_calibration_curve(pipe_classifiers["CART"], df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["RFC"],  df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["ADA"],  df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["GRAD"], df_X, df_y, 2)

#plot_calibration_curve(pipe_classifiers["MLP"],  df_X, df_y, 2)

print "\nWall time to generate Calibration (reliability) plots: ", time.time()-wall
print "Elapsed time to generate Calibration (reliability) plots: ", time.clock()-process


# In[22]:

## Define confusion matrix plot

def plot_confusion_matrix(clf, X, y, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # Split data into a development and evaluation set
    X_dev,X_eval, y_dev,y_eval = train_test_split(X, y,
                                              test_size=0.33, random_state=42)
    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                        random_state=seed)
    
    classifier = clf.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap);
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
 
    name = clf.steps[1][0]

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
 
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False, which='both')
    
    return plt.show()


# In[24]:

## Generate confusion matrix plot

wall = time.time()
process = time.clock()
    
np.set_printoptions(precision=2)
class_names = ['Background', 'Signal']

# Plot non-normalized confusion matrix
clf = pipe_classifiers["GRAD"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Gradient-Boosting"))
# Plot normalized confusion matrix
clf = pipe_classifiers["GRAD"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Gradient-Boosting"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["ADA"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Adaptive-Boosting"))
# Plot normalized confusion matrix
clf = pipe_classifiers["ADA"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Adaptive-Boosting"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["SVM"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Support Vector"))
# Plot normalized confusion matrix
clf = pipe_classifiers["SVM"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Super Vector"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["LDA"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Linear Discriminant Analysis"))
# Plot normalized confusion matrix
clf = pipe_classifiers["LDA"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Linear Discriminant Analysis"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["LR"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Logistic Regression"))
# Plot normalized confusion matrix
clf = pipe_classifiers["LR"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Logistic Regression"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["RFC"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Random Forest"))
# Plot normalized confusion matrix
clf = pipe_classifiers["RFC"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Random Forest"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["NB"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Guassian Naive Bayes"))
# Plot normalized confusion matrix
clf = pipe_classifiers["NB"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Gaussian Naive Bayes"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["CART"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Decision Tree"))
# Plot normalized confusion matrix
clf = pipe_classifiers["CART"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Decision Tree"))

# Plot non-normalized confusion matrix
clf = pipe_classifiers["KNN"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("K-Nearest Neighbor"))
# Plot normalized confusion matrix
clf = pipe_classifiers["KNN"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("K-Nearest Neighbor"))

print "\nWall time to generate confusion matrix plots: ", time.time()-wall
print "Elapsed time to generate confusion matrix plots: ", time.clock()-process


# ## Early stopping: Validation Plots
# 
# To validate a model we need a scoring function, for example accuracy for classifiers. The proper way of choosing multiple hyperparameters of an estimator are of course grid search or similar methods that select the hyperparameter with the maximum score on a validation set or multiple validation sets. Note that if we optimized the hyperparameters based on a validation score the validation score is biased and not a good estimate of the generalization any longer. To get a proper estimate of the generalization we have to compute the score on another test set.
# 
# However, it is sometimes helpful to plot the influence of a single hyperparameter on the training score and the validation score to find out whether the estimator is overfitting or underfitting for some hyperparameter values.
# 
# measure the performance of our ensemble as we go along and stop adding trees 
# once we think we have reached the minimum (min. test error).
# 
# It will repeatedly add one more base estimator to the ensemble, measure the performance, and check if we reached minimum. If we reached the minimum it stops, otherwise it keeps adding base estimators until it reaches the maximum number of iterations.
# 
# - There is a minimum number of trees required to skip over the noisier part of the score function
# - Early stopping does not actually stop at the minimum, instead it continues on until the score has increased by scale above the current minimum. This is a simple solution to the problem that we only know we reached the minimum by seeing the score increase again.

# In[25]:

## Validation curve definition

def validation_curve(clf, X, y):
    """Validation curve.

    Parameters
    ----------
    clf : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    X : array, shape = [n_samples, n_classes]
    y : array, shape = [n_samples, n_classes]
    Returns
    -------
    plt : matplotlib
    """
    
    # Split data into a development and evaluation set
    X_dev,X_eval, y_dev, y_eval = train_test_split(X, y, test_size=.33,
                                                   random_state=seed)
    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                        random_state=seed+31415)

    clf.fit(X_train, y_train)

    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')
    
    test_score = np.empty(len(clf.estimators_))
    train_score = np.empty(len(clf.estimators_))

    for i, pred in enumerate(clf.staged_predict_proba(X_test)):
        test_score[i] = 1-roc_auc_score(y_test, pred[:,1])

    for i, pred in enumerate(clf.staged_predict_proba(X_train)):
        train_score[i] = 1-roc_auc_score(y_train, pred[:,1])

    best_iter = np.argmin(test_score)
    learn = clf.get_params()['learning_rate']
    depth = clf.get_params()['max_depth']
        
    test_line = plt.plot(test_score, label='test (1-roc_auc=%.3f)'%(test_score[best_iter]))

    colour = test_line[-1].get_color()
    plt.plot(train_score, '--', color=colour, label='train (1-roc_auc=%.3f)\nlearn=%.1f depth=%i'
             %(train_score[best_iter],learn,depth))

    plt.title("Validation curve")
    plt.xlabel("Number of boosting iterations")
    plt.ylabel("1 - area under ROC")
    plt.legend(loc='best')
    plt.axvline(x=best_iter, color=colour)
    
    return plt.show()


# In[26]:

##  plot the validation curve for our fitted classifier
# and check with the test set at which number of n_estimators we reach the minimum test error.

wall = time.time()
process = time.clock()

# Set of hyper-parameter selected
opts = dict(max_depth=2, learning_rate=0.1, n_estimators=200)

clf = GradientBoostingClassifier(**opts)

validation_curve(clf, df_X, df_y)

print "\nWall time to generate Validation plots: ", time.time()-wall
print "Elapsed time to generate Validation plots: ", time.clock()-process


# ## Threshold optimisation
# 
# what happens if we use a different hyper-parameter: the threshold applied to decide which class a sample falls in during prediction time, that is normally defaulted to 0.5 threshold.

# In[27]:

# Split data into a development and evaluation set
X_dev,X_eval, y_dev,y_eval = train_test_split(df_X, df_y,
                                              test_size=0.33, random_state=seed)
# Split development set into a train and test set
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                    random_state=seed)

# Removing features with low variance
select = VarianceThreshold()
#pipe_param_grid = {'variancethreshold__threshold': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]}
pipe_param_grid = {'randomforestclassifier__max_depth': [100],#[70, 100],
                   'randomforestclassifier__max_features': [7], #[4, 6],
                   'variancethreshold__threshold': [0.9] #[0.4, 0.6, 0.8, 1.0]
                  }

grids = [GridSearchCV(make_pipeline(select, RandomForestClassifier(random_state=seed+n_iter)),
                      param_grid=pipe_param_grid, n_jobs=1, verbose=0).fit(X_train, y_train)
         for n_iter in range(10)]

scores = [grid.best_score_ for grid in grids]
print("Average score: %.4f+-%.4f" %(np.mean(scores), sp.stats.sem(scores)))


# ##  Learning curve
# 
# A learning curve shows the validation and training score of an estimator for varying numbers of training samples. It is a tool to find out how much we benefit from adding more training data and whether the estimator suffers more from a variance error or a bias error. If both the validation score and the training score converge to a value that is too low with increasing size of the training set, we will not benefit much from more training data. 

# In[28]:

# Define learning curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_val_score>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    
    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')

    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, 
                                                            n_jobs=n_jobs, 
                                                            train_sizes=train_sizes)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")

    plt.legend(loc="best")

    
    return plt.show()


# In[29]:

## Plot learning curve

wall = time.time()
process = time.clock()

# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.33, random_state=0)

estimator = GradientBoostingClassifier()
plot_learning_curve(estimator, "Learning Curves (Gradient Boosting)", df_X, df_y, ylim=(0.4, 1.01), cv=cv, n_jobs=-1);

print "\nWall time to generate Learning curve plots: ", time.time()-wall
print "Elapsed time to generate Learning curve plots: ", time.clock()-process


# ## Feature Ranking and Feature Selection
# 
# There are various methods for determining feature importance, namely, model-based selection, variance based selection, univariate statistics selections, recursive feature elemination, iterative selection.

# In[30]:

## Define feature ranking 

def feature_ranking_plot(X, indices, title):

    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')
    
    print(title)
    for i in range(X.shape[1]):
        print("%d. %s (%f)" % (i + 1, X.columns[indices[i]], importances[indices[i]]))
    
    #std = np.std([tree.feature_importances_ for tree in forest.estimators_],
    #         axis=0)
    
    # Plot the feature importances of the model
    plt.title(title)
    plt.bar(range(X.shape[1]), importances[indices],
            color="r", align="center") #yerr=std[indices]
    plt.xticks(range(X.shape[1]), X.columns[indices], rotation='vertical')
    plt.xlim([-1, X.shape[1]])
    plt.tight_layout()

    return plt.show()


# In[31]:

## Feature ranking study

process = time.time()
process = time.clock()

# Univariate Statistics
select = VarianceThreshold(threshold=(.8 * (1 - .8)))

# Features selected by variance threshold
_= select.fit_transform(df_X);

importances = select.variances_
indices = np.argsort(importances)[::-1]

feature_ranking_plot(df_X, indices, "Feature importances based on variance ranking")

# The SelectFromModel class selects all features that have an importance measure 
# of the feature (as provided by the supervised model) greater than the provided 
# threshold.
select_model = SelectFromModel(RandomForestClassifier(n_estimators=10, random_state=seed), 
                         threshold="median") #threshold=0.25

# Features selected by SelectFromModel using the RandomForestClassifier
_= select_model.fit_transform(df_X, df_y)

importances = select_model.estimator_.feature_importances_
std = np.std([tree.feature_importances_ for tree in select_model.estimator_], axis=0)
indices = np.argsort(importances)[::-1]

feature_ranking_plot(df_X, indices, "Model-based feature importances ranking")

# Build a forest and compute the feature importances
extra_forest = ExtraTreesClassifier(random_state=0)
_= extra_forest.fit(df_X, df_y)

importances = extra_forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in extra_forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

feature_ranking_plot(df_X, indices, "Feature importances based on Extra-Tree classification ranking")

# Univariate Statistics
# Features selected k-highest scores
#score_func=f_classif, k=6
#select = SelectKBest(score_func=f_classif, k=12)

#_= select.fit_transform(df_X);

#importances = select.importances_
#indices = np.argsort(importances)[::-1]

#feature_ranking_plot(df_X, indices, "Feature importances based on variance ranking")

#  Recursive feature elimination (RFE-model base method)
rfe = RFE(estimator=AdaBoostClassifier(), n_features_to_select=11, step=1) #n_features_to_select=8  
_= rfe.fit(df_X, df_y)

importances = rfe.ranking_
indices = np.argsort(importances)[::-1]

feature_ranking_plot(df_X, indices,"Feature importances based on RFE model ranking")

print "Wall time to produce feature ranking plots: ", time.time()-wall
print "Elapsed time to prodduce feature ranking plots: ", time.clock()-process


# In[32]:

## define extract feature selection

def extract_feature_selected(clf, X, y):
    
    # Split data into a development and evaluation set
    X_dev,X_eval, y_dev, y_eval = train_test_split(df_X, df_y, test_size=.33, 
                                                   random_state=seed)
    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                        random_state=seed+31415)

    clf.fit(X_train, y_train)
    select_indices = clf.named_steps['SELECT'].transform(
    np.arange(len(X_train.columns)).reshape(1, -1))

    feature_names = X_train.columns[select_indices]
    
    return feature_names


# In[33]:

## Feature selection study

def features_selection_model_performance(clf, X, y, parameter_set):

    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')
    
    this_scores = list()
    score_means = list()
    score_stds = list()

    params = {'SELECT__k': 'top k features', 
              'SELECT__threshold': 'feature threshold',
              'SELECT__n_features_to_select': 'n features to select',
              'SELECT__percentile': 'percentile',
              'SELECT__cv': 'k-fold',
              'SELECT__selection_threshold':'selection threshold'}
    
    label = [keyname for keyname in clf.get_params().keys() if keyname in params.keys()][0]
    
    for k in parameter_set:

        param = {label: k}
        clf.set_params(**param) 
        
        # Compute cross-validation score using 1 CPU
        this_scores = cross_val_score(clf, X, y, cv=3, n_jobs=1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(parameter_set, score_means, np.array(score_stds))

    model = clf.steps[1][0]

    title = 'Performance of the {}-{} varying for features selected'.format(model,
                                                                            clf.get_params().keys()[1])
    
    plt.title(title)
    plt.xlabel(params[label])
    plt.ylabel('Prediction rate')

    print  extract_feature_selected(clf, X, y)
    
    return plt.show()


# In[34]:

## Feature selection

process = time.time()
process = time.clock()

# Removing features with low variance
select = VarianceThreshold()
p = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]

clf = Pipeline([('SELECT', select), ('rf', RandomForestClassifier())])
features_selection_model_performance(clf, df_X, df_y, p)

#  Univariate feature selection based on the k highest scores
select = SelectKBest(score_func=f_classif, k=6)
p = [2, 4, 6, 8, 10, 12]

clf = Pipeline([('SELECT', select), ('rf', RandomForestClassifier())])
features_selection_model_performance(clf, df_X, df_y, p)

# Univariate feature selection according to a percentile of the highest scores
select = feature_selection.SelectPercentile(f_classif)
p = (6, 10, 15, 20, 30, 40, 60, 80, 100)

clf = Pipeline([('SELECT', select), ('rf', RandomForestClassifier())])
features_selection_model_performance(clf, df_X, df_y, p)

# Recursive feature elimination  by recursively removing attributes 
# and building a model on those attributes that remain.
select = RFE(estimator=RandomForestClassifier(), n_features_to_select=8, step=1)
p = [2, 4, 6, 8, 10, 12]

clf = Pipeline([('SELECT', select), ('rf', RandomForestClassifier())])
features_selection_model_performance(clf, df_X, df_y, p)

# Recursive feature elimination with cross-validation
select = RFECV(estimator=RandomForestClassifier(), step=1, 
               cv=StratifiedKFold(3), scoring='accuracy')
p = [2, 3, 5, 7]

clf = Pipeline([('SELECT', select), ('rf', RandomForestClassifier())])
features_selection_model_performance(clf, df_X, df_y, p)

# Stability selection method for feature selection, based on subsampling in combination with selection algorithms
#select = RandomizedLasso(alpha=0.025) ## problematic one
#p = [0.025, 0.05, 0.07]

#clf = Pipeline([('SELECT', select), ('rf', RandomForestClassifier())])
#features_selection_model_performance(clf, df_X, df_y, p)

# Feature selection using SelectFromModel (## problematic one)
#select = SelectFromModel(RandomForestClassifier(min_samples_leaf=0.0001), threshold=0.5)
#p = [0.1, 0.3, 0.6]
#p = ["mean", "median"]

#clf = Pipeline([('SELECT', select), ('rfe', RandomForestClassifier())])
#features_selection_model_performance(clf, df_X, df_y, p)

print "\nWall time to generate features selection performance plots: ", time.time()-wall
print "Elapsed time to generate features selection performance plots: ", time.clock()-process


# In[ ]:

#fig, ax = plt.subplots(figsize=(10, 10))

#sns.heatmap(np.c_[df_X.values, df_y.values],names=features+['y'])

#sns.corrplot(data, ax=ax,names=features+['y'])


# In[ ]:

## Overly inblanced datat set between signal and background composition

# Apply the random over-sampling
ros = RandomOverSampler()
X_overresampled, y_overresampled = ros.fit_sample(df_X, df_y)

# Apply the random under-sampling
rus = RandomUnderSampler()
X_underresampled, y_underresampled = rus.fit_sample(df_X, df_y)

# Apply SMOTE SVM
sm = SMOTE(kind='svm')
X_resampled, y_resampled = sm.fit_sample(df_X, df_y)


# ## Machine Learning Algorithms for Model Building
# 
# ### Hyper-parameter Optimization and Model Evaluation
# 
# We employ a nested k-fold cross-validation utilizaiton a grid search for hyper-parameter optimization to avoid leaking information from the training dataset used to validate the hyper-parameters into the model evaluation which uses testing datasets.
# 
# We preform a hyper-parameter optimization to improve the accuary of our Gradient Boosting Classifier model then we evaluate the best cross-validated model in a hold out dataset that was not used during hyper-parameter validation.

# In[35]:

# Default pipeline setup with dummy place holder steps
pipe = Pipeline([('feature_scaling', None), 
                 ('feature_selection', None), 
                 ('classifier', DummyClassifier())])

# preprocessing using 0-1 scaling byremoving the mean and scaling to unit variance 
scaler = RobustScaler()

# feature selection
select = SelectKBest(k=8)

# create classifier for use in scikit-learn
model = GradientBoostingClassifier()

# prepare models: create a mapping of ML classifier name to algorithm
param_grid = [
    {'classifier': [model],
     'classifier__n_estimators': [100],
     'classifier__learning_rate': [0.1],
     'feature_selection': [select],
     'feature_selection__k': [12],
     'feature_scaling': [scaler]
    }
]

pcv = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, n_jobs=1)
pcv.fit(X_train, y_train)

y_pred = pcv.predict(X_test)
report = classification_report( y_test, y_pred )
print report


# In[36]:

print("Pipeline steps:\n{}".format(pipe.steps))
# extract the first step 
components = pipe.named_steps["feature_scaling"]
print("components: {}".format(components))
classifier = pcv.best_estimator_.named_steps["classifier"]
print("Keras DNN classifier step:\n{}".format(classifier))
print("Best cross-validation accuracy: {:.2f}".format(pcv.best_score_)) 
print("Test set score: {:.2f}".format(pcv.score(X_test, y_test))) 
print("Best parameters: {}".format(pcv.best_params_))


# In[37]:

# now you can save it to a file
with open('gradboost_model.pkl', 'wb') as f:
    joblib.dump(clf, f,compress=9)

# and later you can load it
clf_load = joblib.load('gradboost_model.pkl')


# In[38]:

# Load dataset
rec_np_data = root2array("combined/run2016Data.root", 
                         "event_mvaVariables_step7_cate4", features)
np_data = rec2array(rec_np_data)


# convert to numpy ndarray into pandas dataframe
df_raw_data = pd.DataFrame(data=np_data, columns=features)

df_raw_data.describe()
df_raw_data.info()

X_data = df_raw_data.values


# In[39]:

predict_pos = pcv.predict_proba(X_data)[:,1]
print predict_pos[0]

df_loaded_model =pd.DataFrame(predict_pos, columns=['GradBoosting distribution'])
df_loaded_model.hist(bins=30)

plt.show()


# In[40]:

# Plot a mva distribution

def over_training_curve(model, mc_X, mc_y, data_X):
    # use subplot to extract axis to add ks and p-value to plot
    fig, ax = plt.subplots()
    
    # Customize the major grid
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_axis_bgcolor('white')

    decisions = []
    
    prob_pos = model.predict_proba(data_X)[:, 1] # [:, 1]-->[:][1]
    d1 = model.predict_proba(mc_X[mc_y>0.5])[:, 1]
    d2 = model.predict_proba(mc_X[mc_y<0.5])[:, 1]
        
    decisions += [d1, d2]
        
    low = min(np.min(d) for d in decisions)
    high = max(np.max(d) for d in decisions)
    low_high = (low,high)  

    plt.hist([decisions[0], decisions[1]], color=['r','b'], alpha=0.5, histtype='stepfilled', 
             normed=True, label=['signal','background'], bins=30, stacked=True)
    
    bins = 30
    hist, bins = np.histogram(prob_pos, bins=bins, range=low_high, normed=True)
    
    scale = len(decisions) / sum(hist)
    err = np.sqrt(hist * scale) / scale
    #err = np.sqrt(hist)
    
    width = (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    plt.errorbar(center, hist, yerr=None, fmt='o', c='black', label='data')

    plt.xlabel("Decision output")
    plt.ylabel("Arbitrary units")
    plt.legend(loc='best')
    
    return plt.show()


# In[41]:

over_training_curve(pcv, df_X, df_y, X_data)


# In[42]:

## Features scaling, feature selection, model seleciton, hyper-parameter tuning and evaluation

process = time.time()
process = time.clock()

# Split data into a development and evaluation set
X_dev,X_eval, y_dev,y_eval = train_test_split(df_X, df_y, test_size=0.33, random_state=42)
# Split development set into a train and test set
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=.33, random_state=0)

# Number of folds to partition data set
k_fold = 3 #(study suggest to use 7)

# feature scaling (robust to outliners)
scaler = RobustScaler()

# Percentile-based selection
select_percentile = SelectPercentile(score_func=f_classif, percentile=100)

# has removed the first column, which has a probability p = 5/6 > .8 
# of containing a zero.
select_variance_threshold = VarianceThreshold(threshold=0.6)

# The SelectFromModel class selects all features that have an importance measure 
# of the feature (as provided by the supervised model) greater than the provided 
# threshold.
select_select_model = SelectFromModel(RandomForestClassifier(n_estimators=10,
                                                             random_state=seed),
                                      threshold="median") #threshold=0.25

# Build a forest and compute the feature importances
select_extra_trees = ExtraTreesClassifier(random_state=0)

# feature extraction
select_kbest = SelectKBest(score_func=f_classif, k=12)

#  Recursive feature elimination (RFE-model base method)
select_rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=10, step=1)

# Standard K-Fold cross-validation technique
standard_kfold_cv = KFold(n_splits=k_fold, shuffle=True, random_state=seed)

# Stratified K-fold cross-validation technique
select_stratified_kfold = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=seed)

# Default pipeline setup with dummy place holder steps
pipe = Pipeline([('preprocessing', None), ('feature_selection', None), 
                 ('classifier', DummyClassifier())])

# select a selection method from the above list
select = select_variance_threshold

param_grid = [
    #{'classifier': [LogisticRegression()],
    # 'preprocessing': [scaler, None],
    # 'feature_selection': [select]
    #},
    #{'classifier': [SVC()], 
    # 'preprocessing': [scaler, None],
    # 'feature_selection': [select]
    #},
    #{'classifier': [GradientBoostingClassifier()],
    # 'preprocessing': [None],
    # 'feature_selection': [select]
    #},
    #{'classifier': [AdaBoostClassifier()],
    # 'preprocessing': [None],
    # 'feature_selection': [select]
    #},
    #{'classifier': [RandomForestClassifier(n_estimators=10)],
    # 'preprocessing': [None],
    # 'feature_selection': [select]
    #},
    #{'classifier': [DecisionTreeClassifier(min_samples_leaf=10)],
    # 'preprocessing': [None],
    # 'feature_selection': [select]
    #},                                 
    #{'classifier': [GaussianNB()],
    # 'preprocessing': [scaler, None],
    # 'feature_selection': [select]
    #},
    #{'classifier': [LinearDiscriminantAnalysis()],
    # 'preprocessing': [scaler, None],
    # 'feature_selection': [select]
    #},
    {'classifier': [KNeighborsClassifier()],
     'preprocessing': [scaler, None],
     'feature_selection': [select],
     'classifier__n_neighbors': [1, 5]
    }
]

cv = standard_kfold_cv
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=-1) 
grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_)) 
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Cross-validation accuracy (pipeline): {:.2f}".format(
np.mean(cross_val_score(pipe, df_X, df_y, cv=5))))
print("Pipeline steps:\n{}".format(pipe.steps))

# fit to full dataset
#grid.fit(df_X, df_y)

# Calibrated with sigmoid calibration
#sigmoid = CalibratedClassifierCV(grid, cv=3, method='sigmoid')


print "\nWall time to generate classifier models: ", time.time()-wall
print "Elapsed time to generate classifier models: ", time.clock()-process


# ## Implementing a Weighted Majority Rule Ensemble Classifier
# 
# In order to trained a set of equally well performing models and balance out their individual weaknesses.

# In[43]:

## Esemble classifier

clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = GaussianNB()

estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)]
voting='hard' #'soft'

eclf = VotingClassifier(estimators=estimators, voting=voting, weights=[1,1,1])

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):

    scores = cross_val_score(clf, df_X, df_y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))


# In[44]:

from subprocess import call
import subprocess

def visualise_tree(tree):
    """Draw the decision tree"""
    if hasattr(tree,'tree_'):
        export_graphviz(tree, out_file="tree_plot.dot", feature_names=features)
        
        get_ipython().system(u'dot -Tpng -o tree_plot.png tree_plot.dot')
        from IPython.display import Image
        return Image("tree_plot.png")
    else:       
        for i_tree, tree_in_forest in enumerate(tree.estimators_):
            print i_tree
            with open('tree_' + str(i_tree) + '.dot', 'w') as my_file:
                #my_file = export_graphviz(tree_in_forest, feature_names=features, out_file = my_file)
                my_file = export_graphviz(tree_in_forest, feature_names=features, out_file = my_file)
                file_out = 'tree_' + str(i_tree) + '.png'
                file_in = 'tree_' + str(i_tree) + '.dot'
                #!dot -Tpng -o file_out file_in
                #call(["dot", "-Tpng", "-o", file_out, file_in])
                subprocess.Popen("dot -Tpng -o "+" "+file_out+" "+file_in, shell=True, stdout=subprocess.PIPE).stdout.read()
                from IPython.display import Image
                #return Image('tree_' + str(i_tree) + '.png') 
                Image('tree_plot.png')
                #return Image('tree_plot.png')


# In[45]:

from sklearn.tree import export_graphviz

dt = DecisionTreeClassifier(max_depth=2)
dt.fit(df_X, df_y)

visualise_tree(dt)

#rf = RandomForestClassifier(n_estimators=1)
#rf.fit(df_X, df_y)

#visualise_tree(rf)


# In[ ]:

def draw_decision_regions(X, y, estimator, resolution=0.01):
    """Draw samples and decision regions
    
    The true label is indicated by the colour of each
    marker. The decision tree's predicted label is
    shown by the colour of each region.
    """
    plot_step = resolution
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    fig, axis = plot_data(X,y)
    cs = axis.contourf(xx, yy, Z, alpha=0.5, cmap=plt.cm.RdBu)
    
with sns.axes_style('white'):
    draw_decision_regions(df_X.values, df_y.values, dt)


# In[ ]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier

X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)

def plot_forest(max_depth=1):
    plt.figure()
    ax = plt.gca()
    h = 0.02

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    if max_depth != 0:
        forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth,
                                        random_state=1).fit(X, y)
        Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, alpha=.4)
        ax.set_title("max_depth = %d" % max_depth)
    else:
        ax.set_title("data set")
    ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())


def plot_forest_interactive():
    from IPython.html.widgets import interactive, IntSlider
    slider = IntSlider(min=0, max=8, step=1, value=0)
    return interactive(plot_forest, max_depth=slider)


# In[ ]:

def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")


# In[ ]:

visualize_tree(dt, features)


# 
# ## Test with permutations the significance of a classification score
# 
# In order to test if a classification score is significative a technique in repeating the classification procedure after randomizing, permuting, the labels. The p-value is then given by the percentage of runs for which the score obtained is greater than the classification score obtained in the first place.
# 

# In[ ]:

from sklearn.model_selection import permutation_test_score


# In[ ]:

## Loading a dataset

n_classes = np.unique(y).size

# Some noisy data not correlated
random = np.random.RandomState(seed=0)
E = random.normal(size=(len(X), 500))

# Add noisy data to the informative features for make the task harder
X = np.c_[X, E]

svm = SVC(kernel='linear')
cv = StratifiedKFold(2)

score, permutation_scores, pvalue = permutation_test_score(
    svm, X, y, scoring="accuracy", cv=cv, n_permutations=10, n_jobs=1)

print("Classification score %s (pvalue : %s)" % (score, pvalue))


# In[ ]:

## View histogram of permutation scores

plt.hist(permutation_scores, 20, label='Permutation scores')
ylim = plt.ylim()

plt.plot(2 * [score], ylim, '--g', linewidth=3,
         label='Classification Score'
         ' (pvalue %s)' % pvalue)
plt.plot(2 * [1. / n_classes], ylim, '--k', linewidth=3, label='Luck')

plt.ylim(ylim)
plt.legend()
plt.xlabel('Score')
plt.show()


# ## Training a BDT with TMVA
# 
# First we will train the default Boosted Decision Tree classifier in TMVA. After this we will try and replicate this with scikit-learn.
# 
# In order to use TMVA we have to create a Factory object, to which we add the individual samples, labelling them as either "signal" or "background" and as "testing" or "training".

# In[ ]:

import ROOT as R
from ROOT import TMVA
from array import array

# ROOT and TMVA require an open file to store things
# as they go along
# Load dataset

# Split data into a development and evaluation set
X_dev,X_eval, y_dev,y_eval = train_test_split(df_X, df_y, test_size=0.50, random_state=seed)
# Split development set into a train and test set
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=.50, random_state=seed)

outfile = R.TFile('tmva_output.root', 'recreate')

factory = TMVA.Factory(tree_category, outfile, "AnalysisType=Classification")

print "Step 0"
for n in range(12):
    factory.AddVariable("f%i"%n, "F")
    print "good "

print "Step 1"
for y,row in zip(y_train.values, X_train.values):
    a = R.std.vector(R.Double)()
    for r in row:
        print r
        a.push_back(r)

    if y > 0:
        factory.AddSignalTrainingEvent(a)
    else:        
        factory.AddBackgroundTrainingEvent(a)

print "Step 2"       
for y,row in zip(y_test.values, X_test.values):
    print y, row
    a = R.std.vector(R.Double)() # instantiate a std::vector<double>
    for r in row:
        a.push_back(r)
        
    if y > 0:
        print "signal"
        factory.AddSignalTestEvent(a)
    else:
        print "background"
        factory.AddBackgroundTestEvent(a)

print "Step 3"
#factory.PrepareTrainingAndTestTree(R.TCut("1"), "SplitMode=Random:NormMode=NumEvents")
factory.PrepareTrainingAndTestTree(R.TCut("1.0"), "SplitMode=Block:SplitSeed=100:NormMode=EqualNumEvents:!V")


# In[ ]:

# After adding each sample we can now set what kind of classifier 
# we would like to train as well as the hyperparameters for that classifier. 
# After that we tell TMVA to go and train the classifier.
factory.BookMethod(TMVA.Types.kBDT, "BDT", 
                   "NTrees=800:nEventsMin=100:MaxDepth=3:"
                   "BoostType=AdaBoost:AdaBoostBeta=0.5:SeparationType=GiniIndex:"
                   "nCuts=-1"
                   )
print "Step 4"
factory.TrainAllMethods()
print "Step 5"


# In[ ]:

reader = TMVA.Reader()

print "Step 6"
for n in range(10):
    a = array("f", [0.])
    reader.AddVariable("f%i"%n, a)
 
print "Step 7"
reader.BookMVA("BDT", "weights/TMVAClassification_BDT.weights.xml")

print "Step 8"
y_predicted = []
decision_val = []
for row in X_test:
    a = R.std.vector(R.Double)()
    for r in row:
        a.push_back(r)

    print "Step 9"
    value = reader.EvaluateMVA(a, "BDT")
    decision_val.append(value)
    if value > 0:
        y_predicted.append(+1)
    else:
        y_predicted.append(-1)

print "Step 10"
print classification_report(y_test, y_predicted, target_names=["background", "signal"])
print "Area under ROC curve: %.4f"%(roc_auc_score(y_test, y_predicted))


# In[ ]:

D = np.array(decision_val)

print "Step 11"
plt.hist(D[y_test>0.],
         color='r', alpha=0.5, range=(-0.4,0.4), bins=30)
plt.hist(D[y_test<0.],
         color='b', alpha=0.5, range=(-0.4,0.4), bins=30)
plt.xlabel("TMVA BDT output")
plt.show()


# In[ ]: