From f612e094ed733d001de196df9a4f92f8cd0dc158 Mon Sep 17 00:00:00 2001
From: Bernhard Schlegel
Date: Tue, 27 Mar 2018 22:38:57 +0200
Subject: [PATCH 1/6] Added class-sensitive-scaling

---
 PCA.py                             | 2468 ++++++++++++++++++++++++++++
 imblearn/scaling/__init__.py       |    8 +
 imblearn/scaling/base.py           |   18 +
 imblearn/scaling/css.py            |  249 +++
 imblearn/scaling/tests/__init__.py |    0
 imblearn/scaling/tests/test_css.py |  167 ++
 imblearn/utils/validation.py       |    2 +-
 7 files changed, 2911 insertions(+), 1 deletion(-)
 create mode 100644 PCA.py
 create mode 100644 imblearn/scaling/__init__.py
 create mode 100644 imblearn/scaling/base.py
 create mode 100644 imblearn/scaling/css.py
 create mode 100644 imblearn/scaling/tests/__init__.py
 create mode 100644 imblearn/scaling/tests/test_css.py

diff --git a/PCA.py b/PCA.py
new file mode 100644
index 000000000..34310fbff
--- /dev/null
+++ b/PCA.py
@@ -0,0 +1,2468 @@
+
+# coding: utf-8
+
+# ### Do a PCA transformation
+# https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/
+
+# In[ ]:
+
+import sys, os
+dir = os.path.dirname(os.path.abspath(os.path.realpath('.')))
+libRoot = os.path.join(dir, 'imbalanced-learn')
+sys.path.insert(0, libRoot)
+
+
+# In[ ]:
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from numpy import genfromtxt
+from sklearn.decomposition import PCA
+import time
+
+import re                  # re.sub() for replacing using regexps
+import datetime            # ping/pong timing and current time
+import multiprocessing     # count CPUs
+from pprint import pprint  # pretty-print arrays
+import math as math
+
+
+# ### Setup on new machine
+# http://stackoverflow.com/questions/29329667/ipython-notebook-script-deprecated-how-to-replace-with-post-save-hook
+#
+
+# ## Settings
+
+# In[ ]:
+
+n_folds = 5
+
+
+# In[ ]:
+
+use_ubi = False
+if use_ubi:
+    import os
+    import datetime
+    from ubidots import ApiClient
+
+    # set proxy
+    os.environ['http_proxy'] = 'http://proxy.muc:8080'
+    os.environ['https_proxy'] = 'http://proxy.muc:8080'
+
+    # get api and variable
+    api = ApiClient(token='O32sAiO8tw4VOTxz24Wmf1IRY7ZoeY')
+    ubi_last_timestamp = api.get_variable('5910aee076254222ee1d9d3f')
+
+    new_value = ubi_last_timestamp.save_value({'value': 10, 'context': {'lastTimestamp': "'" + str(datetime.datetime.now()) + "'"}})
+
+
+# In[ ]:
+
+logpath = "log.log"
+
+class Report():
+    @staticmethod
+    def getHeader():
+        # column names of the CSV log, kept in the same order as getData() below
+        columns = ["TS", "TARGET", "DATASET", "MODEL_TYPE",
+                   "MODEL_TRAIN_TIME", "MODEL_TRAIN_EVAL_TIME", "MODEL_TEST_TIME",
+                   "MODEL_TRAIN_ACCURACY", "MODEL_TRAIN_AUROC", "MODEL_TRAIN_AUPRC",
+                   "MODEL_TRAIN_F1", "MODEL_TRAIN_GPERFORMANCE",
+                   "MODEL_ACCURACY", "MODEL_AUROC", "MODEL_AUPRC",
+                   "MODEL_F1", "MODEL_GPERFORMANCE",
+                   "NUM_FEATURES",
+                   "NUM_SAMPLE_DATASET", "NUM_SAMPLE_DATASET_POS", "NUM_SAMPLE_DATASET_NEG",
+                   "NUM_SAMPLE_TRAIN_BEFORE", "NUM_SAMPLE_TRAIN_BEFORE_POS", "NUM_SAMPLE_TRAIN_BEFORE_NEG",
+                   "NUM_SAMPLE_TRAIN_AFTER", "NUM_SAMPLE_TRAIN_AFTER_POS", "NUM_SAMPLE_TRAIN_AFTER_NEG",
+                   "BS2", "PROCESS_NAME", "PROCESS_TIME", "PROCESS_NAIVE",
+                   "PROCESS_SAMPLING_UP_SMOTE", "PROCESS_SAMPLING_UP_ADASYN",
+                   "PROCESS_SAMPLING_DOWN_OSS", "PROCESS_SAMPLING_DOWN_CNN", "PROCESS_SAMPLING_DOWN_TOMEK",
+                   "PROCESS_WEIGHT", "PROCESS_SCALE_MINORITY", "PROCESS_SCALE_MODE",
+                   "PROCESS_SCALE_TARGET", "PROCESS_SCALE_C"]
+        return ",".join('"' + c + '"' for c in columns) + "\r"
+
+    @staticmethod
+    def logToFile(target,
+                  dataset,
+                  model_type,
+                  model_train_time,
+                  model_train_eval_time,
+                  model_test_time,
+                  model_train_accuracy,
+                  model_train_auroc,
+                  model_train_auprc,
+                  model_train_f1,
+                  model_train_gmean,
+                  model_accuracy,
+                  model_auroc,
+                  model_auprc,
+                  model_f1,
+                  model_gmean,
+                  num_features,
+                  num_sample_dataset,
+                  num_sample_dataset_pos,
+                  num_sample_dataset_neg,
+                  num_sample_train_before,
+                  num_sample_train_before_pos,
+                  num_sample_train_before_neg,
+                  num_sample_train_after,
+                  num_sample_train_after_pos,
+                  num_sample_train_after_neg,
+                  bs2,
+                  process_name,
+                  process_time,
+                  process_naive = 0,
+                  process_sampling_up_smote = 0,
+                  process_sampling_up_adasyn = 0,
+                  process_sampling_down_oss = 0,
+                  process_sampling_down_cnn = 0,
+                  process_sampling_down_tomek = 0,
+                  process_weight = 0,
+                  process_scale_minority = 0,
+                  process_scale_mode = 0,
+                  process_scale_target = 0,
+                  process_scale_c = 0):
+        global logpath
+        pth = logpath
+        import os.path
+        if (not os.path.isfile(pth)):
+            with open(pth, "a") as myfile:
+                myfile.write(Report.getHeader())
+
+        with open(pth, "a") as myfile:
+            myfile.write(Report.getData(target, dataset, model_type, model_train_time, model_train_eval_time, model_test_time,
+                                        model_train_accuracy, model_train_auroc, model_train_auprc, model_train_f1, model_train_gmean,
+                                        model_accuracy, model_auroc, model_auprc, model_f1, model_gmean, num_features,
+                                        num_sample_dataset, num_sample_dataset_pos, num_sample_dataset_neg,
+                                        num_sample_train_before, num_sample_train_before_pos, num_sample_train_before_neg,
+                                        num_sample_train_after, num_sample_train_after_pos, num_sample_train_after_neg, bs2,
+                                        process_name, process_time, process_naive,
+                                        process_sampling_up_smote, process_sampling_up_adasyn, process_sampling_down_oss,
+                                        process_sampling_down_cnn, process_sampling_down_tomek, process_weight,
+                                        process_scale_minority, process_scale_mode, process_scale_target, process_scale_c)
+                         )
+
+    @staticmethod
+    def getData(target,
+                dataset,
+                model_type,
+                model_train_time,
+                model_train_eval_time,
+                model_test_time,
+                model_train_accuracy,
+                model_train_auroc,
+                model_train_auprc,
+                model_train_f1,
+                model_train_gmean,
+                model_accuracy,
+                model_auroc,
+                model_auprc,
+                model_f1,
+                model_gmean,
+                num_features,
+                num_sample_dataset,
+                num_sample_dataset_pos,
+                num_sample_dataset_neg,
+                num_sample_train_before,
+                num_sample_train_before_pos,
+                num_sample_train_before_neg,
+                num_sample_train_after,
+                num_sample_train_after_pos,
+                num_sample_train_after_neg,
+                bs2,
+                process_name,
+                process_time,
+                process_naive = 0,
+                process_sampling_up_smote = 0,
+                process_sampling_up_adasyn = 0,
+                process_sampling_down_oss = 0,
+                process_sampling_down_cnn = 0,
+                process_sampling_down_tomek = 0,
+                process_weight = 0,
+                process_scale_minority = 0,
+                process_scale_mode = 0,
+                process_scale_target = 0,
+                process_scale_c = 0):
+        def quoted(v):
+            return '"' + str(v) + '"'
+        # values in the same order as the columns in getHeader()
+        values = [quoted(datetime.datetime.now()), quoted(target), quoted(dataset), quoted(model_type),
+                  model_train_time, model_train_eval_time, model_test_time,
+                  model_train_accuracy, model_train_auroc, model_train_auprc,
+                  model_train_f1, model_train_gmean,
+                  model_accuracy, model_auroc, model_auprc, model_f1, model_gmean,
+                  num_features,
+                  num_sample_dataset, num_sample_dataset_pos, num_sample_dataset_neg,
+                  num_sample_train_before, num_sample_train_before_pos, num_sample_train_before_neg,
+                  num_sample_train_after, num_sample_train_after_pos, num_sample_train_after_neg,
+                  bs2, quoted(process_name), process_time, process_naive,
+                  process_sampling_up_smote, process_sampling_up_adasyn,
+                  process_sampling_down_oss, process_sampling_down_cnn, process_sampling_down_tomek,
+                  process_weight, process_scale_minority,
+                  quoted(process_scale_mode), quoted(process_scale_target), process_scale_c]
+        return ",".join(str(v) for v in values) + "\r"
+
+
+# In[ ]:
+
+# define Log function
+def log(text, silent=True, force=False):
+    if not silent or force:
+        print(time.strftime('%Y.%m.%d, %H:%M:%S') + ': ' + text)
+
+def ping():
+    return datetime.datetime.now()
+
+def pong(dt):
+    now = datetime.datetime.now()
+    diff = now - dt
+    ms = round(diff.total_seconds()*1000)
+    return ms
+
+log('init finished', force=True)
+
+
+# In[ ]:
+
+def getBS2(X, y):
+    from imblearn.under_sampling import TomekLinks  # imported locally, the under-sampling import cell comes later
+    tl = TomekLinks()
+    X_tl, y_tl = tl.fit_sample(X, y)
+
+    num_pos_samples = sum(y)
+    num_tomek_links = len(y) - len(y_tl)
+
+    return num_tomek_links / num_pos_samples
+
+
+# In[ ]:
+
+def indexCategorical(df, columnName):
+    df[columnName] = pd.Categorical(df[columnName]).codes
+    return df
+
+def renameTargetDropSamePrefix(df, target):
+
+    rows, colsBefore = df.shape
+
+    prefixToDrop = re.sub(r"(.*?)___.*", r"\1___", target)
+    log("renaming " + target + " to \"TARGET\" and dropping all other columns prefixed with " + prefixToDrop)
+    df.rename(columns={target: "TARGET"}, inplace = True)
+
+    dfReturn = dropPrefix(df, prefixToDrop)
+
+    rows, colsAfter = dfReturn.shape
+    log("reduced number of columns from {} to {}.".format(colsBefore, colsAfter))
+    assert colsAfter < colsBefore
+
+    return dfReturn
+
+def dropPrefix(df, prefix):
+    prefix = prefix + ".*"
+    log("dropping " + prefix)
+    return df.select(lambda x: not re.search(prefix, x), axis = 1)
+
+def testDropPrefix():
+    df = pd.DataFrame([
+        [1,3,1,0],
+        [1,4,1,1],
+        [1,5,1,0],
+        [1.5,6,1,0],
+        [1.7,7,1,0],
+        [1,4,1,0],
+        [1,6,1,0],
+        [1,5,1,1],
+        [1,12,1,1],
+        [1,9,1,1],
+        [1,2,1,1],
+        [1,3,1,1],
+        [1,5,1,1],
+        [2,8,1,0],
+        [3,1,0,1],
+        [3,2,0,1],
+        [4,2,0,0],
+        [5,3,0,0]], columns=['PRE1___1', 'PRE1___2', 'PRE2___1','PRE3___1'])
+
+    row, nCol13 = dropPrefix(df, "PRE2___").shape
+    row, nCol23 = dropPrefix(df, "PRE1___").shape
+
+    assert nCol13 == 3, "wrong number of cols dropped"
+    assert nCol23 == 2, "wrong number of cols dropped"
+
+def pcaFeatureGroup(dfIn, featureGroupPrefix, numberOfDimensionsWhole,
+                    numberOfDimensionsTarget, minResultingFeatures = 2):
+    """
+    e.g. featureGroupPrefix = "RO___"
+    """
+    rows, cBefore = dfIn.shape
+    npMatrix = dfIn.filter(regex = featureGroupPrefix + ".*").as_matrix()
+
+    rows, cBeforeOfGroup = npMatrix.shape
+
+    if(cBeforeOfGroup == 0):
+        log("There are no features belonging to group {}. Returning unmodified DataFrame.".format(featureGroupPrefix))
+        return dfIn
+
+    # scale
+    npMatrix = scale(npMatrix)
+
+    # holds the number of the whole dataset.
To be in scale, we first calculate the proportion of the current feature group + proportion = numberOfDimensionsTarget / numberOfDimensionsWhole + n_component_target = int(cBeforeOfGroup * proportion) + + if (n_component_target < minResultingFeatures): + n_component_target = minResultingFeatures + if (cBeforeOfGroup != 0): + # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA.fit_transform + pca = PCA(copy=True, + iterated_power='auto', + n_components=n_component_target, + random_state=None, + svd_solver='auto', + tol=0.0, + whiten=False) + dfB = dropPrefix(dfIn, featureGroupPrefix) + rB, cB = dfB.shape + + + npMatrixTransformed = pca.fit_transform(npMatrix) + dfA = pd.DataFrame(data=npMatrixTransformed[0:,0:], + columns=[featureGroupPrefix + str(num) for num in np.arange(1,npMatrixTransformed.shape[1]+1,1)])#, + #index = dfB.index ) + rIn, cIn = dfIn.shape + rA, cA = dfA.shape + + dfReturn = np.concatenate([dfA, dfB], axis = 1) # axis 1 is columns, so this direction -> + dfReturn = pd.DataFrame(data=dfReturn[0:,0:], # values + index=dfA.index, # 1st column as index + columns=dfA.columns.append(dfB.columns)) # 1st row as the column names + + rReturn, cReturn = dfReturn.shape + + + log("feature group has been compressed from {} to {} columns".format(cBeforeOfGroup, cA)) + + assert rReturn == rIn, "num rows of inputted and outputted dataframe do not match" + assert rIn == rA, "num rows of inputted and PCAd dataframe do not match" + assert rIn == rB, "num rows of inputted and non PCAd part of initial dataframe do not match" + assert cReturn == (cB + cA), "concatenating PCAd and non PCAd df into returned df resulted to wrong number of cols" + assert cBefore == (cB + cBeforeOfGroup), "number of cols from non PCAd and PCAd dataframe before transformation" + "should add up to initial number of columns" + + return dfReturn + else: + log("no columns found matching prefix " + featureGroupPrefix + ". 
Skipping...") + return dfIn + +def print_full(df): + pd.set_option('display.max_columns', df.shape[1]) + print(df) + pd.reset_option('display.max_rows') + + +# ## Read in the different datasources + +# In[ ]: + + +ts = ping() +dfAutomotive = pd.read_csv("in.csv") +nr, ncAutomotive = dfAutomotive.shape +ms = pong(ts) +log("read in dataframe with " + str(nr) + " columns and " + str(ncAutomotive) + " rows in " + str(ms) + "ms", force=True) + + +# In[ ]: + + +ts = ping() +dfForest = pd.read_csv("data/forestfires_id.csv") +nr, ncForest = dfForest.shape +ms = pong(ts) +log("read in forest dataframe with " + str(nr) + " columns and " + str(ncForest) + " rows in " + str(ms) + "ms") +#dfForest['area'] = np.log(1 + dfForest['area']) +dfForest['area'] = (dfForest['area'] > 50).astype(bool).astype(int) + +nPositive = sum(dfForest['area']) +nNegative = nr-nPositive +log("ratio of forest is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) + +import matplotlib.pyplot as plt + +plt.hist(dfForest['area'], bins=30) +plt.ylabel('Probability') + + +# In[ ]: + + +ts = ping() +dfVowel = pd.read_csv("data/vowel-context.csv") +nr, ncVowel = dfVowel.shape +ms = pong(ts) + +nPositive = sum(dfVowel['Class'] == 1) +nNegative = nr-nPositive +log("ratio of vowel is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) + +log("read in vowel dataframe with " + str(nr) + " columns and " + str(ncVowel) + " rows in " + str(ms) + "ms") + + +# In[ ]: + + +ts = ping() +dfGlass = pd.read_csv("data/glass.csv") +nr, ncGlass = dfGlass.shape +ms = pong(ts) + +sums = [] + +for i in [1, 2, 3, 5, 6, 7]: + sums.append(sum(dfGlass['Type'] == i)) + +nPositive = np.round(np.mean(sums)) +nNegative = nr-nPositive +log("average ratio of vowel is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) + +log("read in glass dataframe with " + str(nr) + " columns and " + str(ncGlass) + " rows in " + str(ms) + "ms", force=True) + + +# In[ ]: + +dfPima = pd.read_csv("data/pima.csv") +nrPima, ncPima = dfPima.shape +nPositive = sum(dfPima['Class']) +nNegative = nrPima-nPositive +log("average ratio of pima is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) +log("features={}".format(ncPima)) + +dfPhoneme = pd.read_csv("data/phoneme.csv") +nrPhoneme, ncPhoneme = dfPhoneme.shape +nPositive = sum(dfPhoneme['class']) +nNegative = nrPhoneme-nPositive +log("average ratio of phoneme is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) + +dfVehicle = pd.read_csv("data/vehicle.csv") +nrVehicle, ncVehicle = dfVehicle.shape + +sums = [] +for i in [1, 2, 3]: + sums.append(sum(dfVehicle['TARGET'] == i)) + +nPositive = np.round(np.mean(sums)) +nNegative = nrVehicle-nPositive +log("average ratio of vehicle is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) + +dfAbalone = pd.read_csv("data/abalone_9_18.csv") +nrAbalone, ncAbalone = dfAbalone.shape +nPositive = sum(dfAbalone['Rings']) +nNegative = nrAbalone-nPositive +log("ratio of abalone is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) + +dfSatimage = pd.read_csv("data/satimage.csv") +nrSatimage, ncSatimage = dfSatimage.shape + +sums = [] +for i in [1, 2, 3, 4, 5, 7]: + 
    sums.append(sum(dfSatimage['CLASS'] == i))
+nPositive = np.round(np.mean(sums))
+nNegative = nrSatimage-nPositive
+log("average ratio of satimage is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative))
+
+dfMammography = pd.read_csv("data/mammography.csv")
+nrMammography, ncMammography = dfMammography.shape
+nPositive = sum(dfMammography['target'])
+nNegative = nrMammography-nPositive
+log("ratio of mammography is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative))
+
+
+# In[ ]:
+
+from sklearn.preprocessing import scale
+
+def train_test_split_index(X, y, index):
+    """
+    split X and y into training and testing based on index
+    """
+    X_scaled = scale(X)
+
+    if not isinstance(y, np.ndarray):
+        y = y.as_matrix()
+    if not isinstance(X_scaled, np.ndarray):
+        X_scaled = X_scaled.as_matrix()
+    if not isinstance(index, np.ndarray):
+        index = index.as_matrix()
+
+    X_train = X_scaled[index == 0]
+    X_test = X_scaled[index == 1]
+    y_train = y[index == 0]
+    y_test = y[index == 1]
+
+    return X_train, X_test, y_train, y_test
+
+def train_test_split_scaled(X, y):
+    """
+    randomly split X and y into training and testing, after scaling X
+    """
+    X_scaled = scale(X)
+    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)
+
+    if not isinstance(y_train, np.ndarray):
+        y_train = y_train.as_matrix()
+    if not isinstance(y_test, np.ndarray):
+        y_test = y_test.as_matrix()
+    if not isinstance(X_train, np.ndarray):
+        X_train = X_train.as_matrix()
+    if not isinstance(X_test, np.ndarray):
+        X_test = X_test.as_matrix()
+
+    return X_train, X_test, y_train, y_test
+
+
+# In[ ]:
+
+def getDataFrameForTarget(df, target = "DTC___1196802", prefixesToDrop = ["BEFUND___", "DK___"]):
+    log("getting dataframe for target " + target + " while dropping " + str(prefixesToDrop))
+    # select target and all other columns except columns with the same prefix
+    dfTemp = renameTargetDropSamePrefix(df, target)
+
+    # get DTCs
+    #dfDTC2 = df.filter(regex=("(META|CP|RO|DTC|EE|SC|MV)___.*")) # doesn't work
+    for prefix in prefixesToDrop:
+        dfTemp = dropPrefix(dfTemp, prefix)
+
+    # convert Categories to Indexes
+    dfTemp = indexCategorical(dfTemp, "META___CARID")
+
+    return dfTemp
+
+# do PCA for every featuregroup separately
+def doPCA(df, featureGroupsPCA = ["CP", "RO", "EE", "MV", "SC", "DTC"], numberOfDimensionsTarget = 100):
+    ## no meta
+    assert isinstance(df, pd.DataFrame), "dataframe needs to be a pandas.DataFrame to allow filtering for different feature groups."
+    dfPCA = df
+    dummy, colsBefore = dfPCA.shape
+    for group in featureGroupsPCA:
+        log("Working group " + group + "...")
+        dfPCA = pcaFeatureGroup(dfPCA, featureGroupPrefix = group + "___", numberOfDimensionsWhole = colsBefore,
+                                numberOfDimensionsTarget = numberOfDimensionsTarget)
+
+    dummy, colsAfter = dfPCA.shape
+    log("reduced dimensions from {} to {} using PCA.".format(colsBefore, colsAfter))
+
+    return dfPCA
+
+
+# # create the datasets according to the sampling strategy
+#
+# Variants include:
+# 1. a naive approach to serve as baseline
+# 2. one-sided selection
+# 3. condensed nearest neighbour
+# 4. SMOTE
+# 5. assigning costs
+# 6. preferably sample same cars
+# 7. use heuristic to identify most valuable majority-class observations
+
+# In[ ]:
+
+
+from sklearn.model_selection import train_test_split
+from collections import Counter
+from sklearn.datasets import make_classification
+from imblearn.over_sampling import SMOTE
+from imblearn.scaling import CSS
+from imblearn.under_sampling import TomekLinks, OneSidedSelection, CondensedNearestNeighbour
+from imblearn.under_sampling import RandomUnderSampler
+
+def createDatasetXY(df, indexFeatureStart = 0, indexFeatureEnd = -1, targetColumnName = "TARGET"):
+    """
+    creates X and y from the given dataframe
+
+    :param df: The dataframe to be used
+    :param indexFeatureStart: Index to start selecting features
+    :param indexFeatureEnd: Index to stop selecting features
+    :return: X, y
+    """
+
+    if(indexFeatureEnd == -1):
+        indexFeatureEnd = len(df.columns) - 4
+    X = df.ix[:, indexFeatureStart:indexFeatureEnd]  # skip the 3 meta cols (META___{RANDOM, CARID, PLANNED}) and the target column
+    y = df[targetColumnName].astype(bool).astype(int)
+
+    return X, y
+
+def shuffleTwo(a, b):
+    a1 = pd.DataFrame(a)
+    indexes = a1.index  # get the indices, the first magical column
+    assert len(a) == len(b), "length of a ({}) doesn't match length of b ({})".format(len(a), len(b))
+    p = np.random.permutation(len(indexes))
+    indexesShuffled = indexes[p]
+    a2, b2 = a1.ix[indexesShuffled], b[indexesShuffled]
+
+    return a2, b2
+
+def createDatasetUsingMetaXy(df, indexFeatureStart = 0, indexFeatureEnd = -1,
+                             targetColumnName = "TARGET",
+                             metaColumnName = "META___PLANNED"):
+    # majority observations are only kept where the metaColumnName is 1
+
+    if(indexFeatureEnd == -1):
+        indexFeatureEnd = len(df.columns) - 4
+    X = df.ix[:, indexFeatureStart:indexFeatureEnd]  # skip the 3 meta cols (META___{RANDOM, CARID, PLANNED}) and the target column
+    prefix = re.sub(r'___.*', r'', metaColumnName)
+    y = df[targetColumnName].astype(bool).astype(int)
+
+    # select only the "META == 1" or "TARGET == 1" rows.
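+    # A tiny worked sketch of the selection below (hypothetical values, not
+    # from a real dataset): with y = [1, 0, 0, 1] and the meta flag column
+    # equal to [0, 1, 0, 0], the kept row indices are {0, 3} (all minority
+    # rows) union {1} (flagged majority rows) = {0, 1, 3}. Every minority
+    # observation survives; majority observations survive only when flagged.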
+ indices = y[y > 0].index + indices = indices.append(X[X[metaColumnName] > 0].index) + indices = np.unique(indices) + X = X.loc[indices] + X = dropPrefix(X, prefix) + y = y.loc[indices] + + return X, y + +def createDatasetUsingRandomXy(df, indexFeatureStart = 0, indexFeatureEnd = -1, + targetColumnName = "TARGET", ratio = 0.1): + # ratio defines the ratio between majority:minority (10 means: 10 times as much majority) + # as majority observations such examples will be selected, where the metaColumnName is 1 + + if(indexFeatureEnd == -1): + indexFeatureEnd = len(df.columns) - 4 + X = df.ix[:,indexFeatureStart:indexFeatureEnd] # 3 meta cols (META___{RANDOM, CARID, PLANNED}) and the target row + y = df[targetColumnName].astype(bool).astype(int) + + + log('Original dataset shape {}'.format(Counter(y))) + + targetRatio = 1/ratio + num_pos = sum(y) + currentRatio = num_pos / (len(y)-num_pos) + log("Current ratio={}, targetRatio={}".format(currentRatio, targetRatio)) + if(targetRatio < currentRatio): + targetRatio = currentRatio + + rus = RandomUnderSampler(random_state=42, ratio = targetRatio) + X_res, y_res = rus.fit_sample(X, y) + log('Resampled dataset shape {}'.format(Counter(y_res))) + + X_res = pd.DataFrame(data=X_res[0:,0:], # values + #index=X.index, # 1st column as index + columns=X.columns) + + log("class returned by RandomXy for X is {}".format(str(type(X_res)))) + + return X_res, y_res + +def createNaiveDataset(X_train, y_train): + """ + creates train / test splits for X and y using the given dataframe + + :param df: The dataframe to be used + :param indexFeatureStart: Index to start selecting features + :param indexFeatureEnd: Index to stop selecting features + :return: X_train, X_test, y_train, y_test used for training + """ + log("creating dataset [naive mode]...") + + return X_train, y_train + +def createSMOTEDataset(X_train, y_train): + log("creating dataset [SMOTE]...") + + log('Original dataset shape {}'.format(Counter(y_train))) + + # SMOTE expects n_neighbors <= n_samples + n_neighbors = 5 + n_samples = sum(y_train) + n_samples_total, dummy = X_train.shape + # bug in sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance) + # that causes the number of samples to be 1 smaller. 
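+    # A numeric example of the guards below (hypothetical counts): with 4
+    # minority samples, k_neighbors can be at most 3, since SMOTE
+    # interpolates each synthetic point between a minority sample and one
+    # of its k nearest minority neighbours, excluding the sample itself.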
+    if ((n_samples-1) < n_neighbors):
+        log("reducing n_neighbors ({}) to number of samples ({})".format(n_neighbors, n_samples))
+        n_neighbors = n_samples - 1
+    if ((n_samples_total-1) < n_neighbors):
+        log("reducing n_neighbors ({}) to total number of samples ({})".format(n_neighbors, n_samples_total))
+        n_neighbors = n_samples_total - 1
+
+    sm = SMOTE(random_state=42, k_neighbors=n_neighbors)
+    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
+    log('Resampled dataset shape {}'.format(Counter(y_train_res)))
+
+    # shuffle
+    X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res)
+
+    return X_train_res, y_train_res
+
+def createADASYNDataset(X_train, y_train):
+    from imblearn.over_sampling import ADASYN
+    ada = ADASYN()
+    X_train_res, y_train_res = ada.fit_sample(X_train, y_train)
+
+    # shuffle
+    X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res)
+
+    return X_train_res, y_train_res
+
+def createTomekDataset(X_train, y_train):
+    tl = TomekLinks(return_indices=True)
+    X_train_res, y_train_res, idx_resampled = tl.fit_sample(X_train, y_train)
+
+    # shuffle
+    X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res)
+
+    return X_train_res, y_train_res
+
+def createOSSDataset(X_train, y_train):
+    oss = OneSidedSelection(return_indices=True)
+    X_train_res, y_train_res, idx_resampled = oss.fit_sample(X_train, y_train)
+
+    # shuffle
+    X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res)
+#    X_train_res = pd.DataFrame(X_train_res)
+#    assert len(X_train_res) == len(y_train_res)
+#    p = np.random.permutation(len(X_train_res))
+#    X_train_res.reset_index(drop=True)
+#    X_train_res, y_train_res = X_train_res.ix[p], y_train_res[p]
+
+    return X_train_res, y_train_res
+
+def createCNNDataset(X_train, y_train):
+    cnn = CondensedNearestNeighbour(return_indices=True)
+    X_train_res, y_train_res, idx_resampled = cnn.fit_sample(X_train, y_train)
+
+    # shuffle
+    X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res)
+
+    return X_train_res, y_train_res
+
+def createScaledDataset(X_train, y_train, targetClass = "majority", c = 0.2, mode = "constant",
+                        verbose = False):
+
+    css = CSS(mode=mode, target=targetClass, c=c, shuffle=True)
+    return css.fit_sample(X_train, y_train)  # X_s, y_s
+
+def testVisualCreateScaledDataset():
+    iVowel = 0
+    dfVowelSub = dfVowel.copy()
+    dfVowelSub['Class'] = (dfVowelSub['Class'] == iVowel).astype(bool)
+    dfVowelSub['Class'] = dfVowelSub['Class'].astype(int)
+
+    XVowel, yVowel = createDatasetXY(df = dfVowelSub, indexFeatureStart = 1,
+                                     indexFeatureEnd = ncVowel-2, targetColumnName = "Class")
+    zVowel = dfVowel['Train or Test']
+    X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_index(XVowel, yVowel, zVowel)
+
+    X_train, y_train = createNaiveDataset(X_train_pre, y_train_pre)
+    X_test, y_test = X_test_pre, y_test_pre
+    retVal = trainLR(X_train, X_test, y_train, y_test, balanced=None, scoring="auROC")
+
+    pca = PCA(n_components=2)
+    pcaFitted = pca.fit(X_train)
+    X_r = pcaFitted.transform(X_train)
+    plt.scatter(X_r[:,0], X_r[:,1], c=y_train, alpha=0.5)
+    #np.savetxt("out/visualize/vowel_class_1/no_scale.csv", np.column_stack([X_r[:,0], X_r[:,1], y_train]), delimiter=",")
+    plt.show()
+
+    X_train, y_train = createScaledDataset(X_train_pre, y_train_pre, c = 0.3)
+    retVal = trainLR(X_train, X_test, y_train, y_test, balanced=None, scoring="auROC")
+    X_r = pcaFitted.transform(X_train)  # reuse the PCA fitted on the unscaled data so both plots share axes
+    plt.scatter(X_r[:,0], X_r[:,1], c=y_train, alpha=0.5)
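+    # Expected picture (based on the "single"-mode identities asserted in
+    # testCreateScaledDataset below; "constant" mode presumably behaves
+    # analogously): CSS replaces x by c * classMean + (1 - c) * x, so the
+    # scaled scatter should show the targeted class contracting toward its
+    # centroid relative to the unscaled plot above.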
#np.savetxt("out/visualize/vowel_class_1/scale_0.2.csv", np.column_stack([X_r[:,0], X_r[:,1], y_train]), delimiter=",") + plt.show() + +def testCreateScaledDataset(): + df = pd.DataFrame([ + [1,3,1,0], + [1,4,1,1], + [1,5,1,1], + [2,8,1,0], + [3,1,0,1], + [3,2,0,1], + [4,2,0,0], + [5,3,0,0]], columns=['a', 'b', 'target','train']) + dfTrainTest = df['train'] + X, y = createDatasetXY(df, indexFeatureStart = 0, indexFeatureEnd = 2, targetColumnName = "target") + X_train, X_test, y_train, y_test = train_test_split_index(X, y, dfTrainTest) + X_train + c = 0.3 + X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = createScaledDataset(X_train, X_test, y_train, + y_test, mode = "single", c = c) + + + (X_train.ix[6,0] + X_train.ix[7,0])/2*c + (1-c) * X_train.ix[6,0] == X_train_scaled.ix[6,0] + (X_train.ix[6,1] + X_train.ix[7,1])/2*c + (1-c) * X_train.ix[7,1] == X_train_scaled.ix[7,1] + X_train.ix[0,1] == X_train_scaled.ix[0,1] + X_train.ix[3,1] == X_train_scaled.ix[3,1] + + +# In[ ]: + +from sklearn import datasets, neighbors, linear_model, svm +from sklearn.ensemble import RandomForestClassifier +import numpy as np +from sklearn import metrics +from sklearn.metrics import roc_auc_score, make_scorer, auc, precision_recall_curve +from sklearn.ensemble import BaggingClassifier +from sklearn.model_selection import KFold, cross_val_score +from sklearn.neural_network import MLPClassifier + +lastModel = 0 +lastY = 0 +lastPred = 0 + +class trainResult(): + auROC = -1 + auPRC = -1 + accuracy = -1 + f1 = -1 + gmean = -1 + train_time = -1 + test_time = -1 + gmean = -1 + train_eval_time = -1 + train_accuracy = -1 + train_auroc = -1 + train_auprc = -1 + train_f1 = -1 + train_gmean = -1 + ms_process = -1 + +def get_au_prc(real, preds, pos_label=1): + precision, recall, _ = precision_recall_curve(real, preds, pos_label=pos_label) + auPRC = metrics.auc(precision, recall, reorder=True) + + return auPRC + +def getMetrics(y = np.array([1, 1, 2, 2]), pred = np.array([0.1, 0.4, 0.35, 0.8]), threshold = 0.5): + + if (len(np.unique(pred)) < 2) : + #log("all predictions the same. 
setting auc to 0 and f1 to 0.") + accuracy = metrics.accuracy_score(y, pred > threshold) + return (0, 0, accuracy, 0, 0) + + else: + fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1) + auc = metrics.auc(fpr, tpr) # AU ROC + f1 = metrics.f1_score(y, pred > threshold) + accuracy = metrics.accuracy_score(y, pred > threshold) + precision = metrics.precision_score(y, pred > threshold) + recall = metrics.recall_score(y, pred > threshold) + auPRC = get_au_prc(y, pred, pos_label=1) + + g_mean = math.sqrt(precision * recall) + return (auc, f1, accuracy, g_mean, auPRC) + +def getNumberOfTomekLinks(X, y): + from imblearn.under_sampling import TomekLinks + tl = TomekLinks(return_indices=True) + n_row_before, dummy = X.shape + X_resampled, y_resampled, idx_resampled = tl.fit_sample(X, y) + n_row_after, dummy = X_resampled.shape + tlFound = n_row_before - n_row_after + log(str(tlFound) + " tomek links found") + return tlFound + +from sklearn.model_selection import cross_val_score +def getCVPerformanceOld(clf, X_train, y_train, scoring = "roc_auc"): + + if len(np.unique(y_train)) != 2: + return 0 + + np.set_printoptions(threshold=np.inf) + pprint(y_train) + + # for more scorers see http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter + scores = cross_val_score(clf, X_train, y_train, cv = n_folds, scoring = scoring) + + return np.mean(scores) + +from sklearn.model_selection import cross_val_predict +from sklearn.metrics import roc_auc_score +def getCVPerformance(clf, X_train, y_train, scoring = "roc_auc"): + + if len(np.unique(y_train)) != 2: + return 0 + + pred = cross_val_predict(clf, X_train, y_train, cv=n_folds) + + return roc_auc_score(y_train, pred) + +def trainNNScale(X_train, X_test, y_train, y_test, + c_scale = 0, mode = "constant", targetClass = "minority"): + log("training NN") + retVal = trainResult() + ms_process_total = 0 + + + # train + pTrain = ping() + X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used + X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used + + + if (str(type(y_train)) != ""): + y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used + else: + y_train_pre = y_train + + if (str(type(y_test)) != ""): + y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used + from sklearn.model_selection import KFold + bestAuROC = 0 + bestAuPRC = 0 + bestAccuracy = 0 + bestGmean = 0 + bestF1 = 0 + bestSolver = 'lbfgs' + bestActivation = 'relu' + solverz = ['lbfgs', 'sgd', 'adam'] + activationz = ['identity', 'logistic', 'tanh', 'relu'] + layerz = [(2,2), (5,2), (5,5), (10,5), (10,10), (2,2,2), (5,5,5), (10,10,10)] + for solver in solverz: + for activation in activationz: + for layer in layerz: + kf = KFold(n_splits=n_folds) + kf.get_n_splits(X_train_pre) + + scoresAuROC = [] + scoresF1 = [] + scoresAccuracy = [] + scoresGmean = [] + scoresAuPRC = [] + + for train_index, test_index in kf.split(X_train): + X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] + y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] + + clf = MLPClassifier(activation = activation, solver=solver, alpha=1e-5, + hidden_layer_sizes=layer, random_state=1) + + # tests will be unaffected + if (c_scale > 0): + pProcess = ping() + X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, targetClass=targetClass) + 
ms_process_total += pong(pProcess) + + if ((np.isnan(X_train_cv)).any): + X_train_cv = np.nan_to_num(X_train_cv) + + # train + clf.fit(X_train_cv, y_train_cv) + pred = clf.predict(X_test_cv) + + # eval + auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) + + scoresAuROC.append(auROC) + scoresF1.append(f1) + scoresAccuracy.append(accuracy) + scoresGmean.append(gmean) + scoresAuPRC.append(auPRC) + + meanScoreAuROC = np.mean(scoresAuROC) + meanScoreF1 = np.mean(scoresF1) + meanScoreAccuracy = np.mean(scoresAccuracy) + meanScoreGmean = np.mean(scoresGmean) + meanScoreAuPRC = np.mean(scoresAuPRC) + + if(meanScoreAuPRC > bestAuPRC): + bestSolver = solver + bestActivation = activation + bestAuROC = auROC + bestAccuracy = meanScoreAccuracy + bestGmean = meanScoreGmean + bestF1 = meanScoreF1 + bestAuPRC = meanScoreAuPRC + + if (c_scale > 0): + log("scaling final train data...") + pProcess = ping() + X_train, y_train = createScaledDataset(X_train, y_train, + c=c_scale, mode=mode, targetClass=targetClass) + ms_process_total /= len(solverz) + ms_process_total /= len(activationz) + ms_process_total = pong(pProcess) + + if((np.isnan(X_train)).any): + X_train= np.nan_to_num(X_train) + + + clf = MLPClassifier(activation = bestActivation, solver=bestSolver, alpha=1e-5, + hidden_layer_sizes=(5, 2), random_state=1) + clf.fit(X_train_cv, y_train_cv) + retVal.train_time = pong(pTrain) + + # get CV train metrics + pTrainCV = ping() + bestAuROC = bestAuROC + bestAuPRC = bestAuPRC + bestAccuracy, bestF1, bestGmean = -1, -1, -1 + retVal.train_eval_time = pong(pTrainCV) + retVal.train_accuracy = bestAccuracy + retVal.train_auroc = bestAuROC + retVal.train_auprc = bestAuPRC + retVal.train_f1 = bestF1 + retVal.train_gmean = bestGmean + if(c_scale > 0): + retVal.ms_process = ms_process_total + + pred = clf.predict(X_test) + + pTest = ping() + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('NN score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) + + return retVal + + +def trainOCC(X_train, X_test, y_train, y_test): + + #TODO : Implement cross validation http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html + + log("training OCC") + retVal = trainResult() + + pTrain = ping() + + X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used + X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used + + + if (str(type(y_train)) != ""): + y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used + else: + y_train_pre = y_train + + if (str(type(y_test)) != ""): + y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used + + bestAuROC = 0 + bestAuPRC = 0 + bestNu = 0.1 + bestKernel = 'linear' + bestGamma = 0.1 + for gamma in [0.001, 0.01, 0.1, 1]: + for nu in [0.01, 0.1, 0.5, 0.75, 1]: + for kernel in ['linear', 'poly', 'sigmoid']: # 'rbf' + #log("CV for gamma={}, nu={}, kernel={}".format(gamma, nu, kernel)) + + kf = KFold(n_splits=n_folds) + kf.get_n_splits(X_train_pre) + # print(kf) # print info about folds + + scoresAuROC = [] + scoresF1 = [] + scoresAccuracy = [] + scoresGmean = [] + scoresAuPRC = [] + + for train_index, test_index in kf.split(X_train): + X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] + y_train_cv, 
y_test_cv = y_train_pre[train_index], y_train_pre[test_index]
+
+                clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma, tol = 0.01)
+                #n_estimators = 10
+                #clf = BaggingClassifier(svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma),
+                #                        max_samples=1.0 / n_estimators, n_estimators=n_estimators)
+
+                #pprint(X_train_cv[y_train_cv == 1])
+                #np.savetxt("occ_data.csv", X_train_cv, delimiter=",")
+
+                nrow, ncol = X_train_cv[y_train_cv == 1].shape
+
+                if (nrow == 0):
+                    log("no samples (CV for gamma={}, nu={}, kernel={}), continuing...".format(gamma, nu, kernel))
+                    continue
+
+                clf.fit(X_train_cv[y_train_cv == 1])
+                pred = clf.predict(X_test_cv)
+                pred[pred < 0] = 0  # SVM outputs -1 for the "0" class
+
+                # eval
+                auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred)
+
+                scoresAuROC.append(auROC)
+                scoresF1.append(f1)
+                scoresAccuracy.append(accuracy)
+                scoresGmean.append(gmean)
+                scoresAuPRC.append(auPRC)
+
+            meanScoreAuROC = np.mean(scoresAuROC)
+            meanScoreAuPRC = np.mean(scoresAuPRC)
+            meanScoreF1 = np.mean(scoresF1)
+            meanScoreAccuracy = np.mean(scoresAccuracy)
+            meanScoreGmean = np.mean(scoresGmean)
+
+            if(meanScoreAuPRC > bestAuPRC):
+                bestNu = nu
+                bestGamma = gamma
+                bestKernel = kernel
+                bestAuROC = meanScoreAuROC
+                bestAuPRC = meanScoreAuPRC
+                bestAccuracy = meanScoreAccuracy
+                bestGmean = meanScoreGmean
+                bestF1 = meanScoreF1
+
+#    retVal.train_eval_time = 0
+#    retVal.train_accuracy = bestAccuracy
+#    retVal.train_auc = bestAuROC
+#    retVal.train_f1 = bestF1
+#    retVal.train_gmean = bestGmean
+
+    log('CV finished. Achieved best auROC={} using nu={}, gamma={} and kernel={}'.format(bestAuROC, bestNu,
+                                                                                         bestGamma, bestKernel))
+    clf = svm.OneClassSVM(nu=bestNu, kernel=bestKernel, gamma=bestGamma)
+    clf.fit(X_train[y_train == 1])  # final training using all data
+    retVal.train_time = pong(pTrain)
+
+    # get CV train metrics
+    pTrainCV = ping()
+    bestAuROC = getCVPerformance(clf, X_train, y_train)
+    bestAccuracy, bestF1, bestGmean = -1, -1, -1
+    retVal.train_eval_time = pong(pTrainCV)
+    retVal.train_accuracy = bestAccuracy
+    retVal.train_auroc = bestAuROC
+    retVal.train_auprc = bestAuPRC
+    retVal.train_f1 = bestF1
+    retVal.train_gmean = bestGmean
+
+    pTest = ping()
+    pred = clf.predict(X_test)
+    pred[pred < 0] = 0  # SVM outputs -1 for the "0" class
+
+    retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred)
+    retVal.test_time = pong(pTest)
+
+    log('OCC score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean))
+    #log('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
+
+    return retVal
+
+
+def testTrainOCC():
+    df = pd.DataFrame([
+        [1,3,1,0],
+        [1,4,1,1],
+        [1,5,1,0],
+        [1.5,6,1,0],
+        [1.7,7,1,0],
+        [1,4,1,0],
+        [1,6,1,0],
+        [1,5,1,1],
+        [1,12,1,1],
+        [1,9,1,1],
+        [1,2,1,1],
+        [1,3,1,1],
+        [1,5,1,1],
+        [2,8,1,0],
+        [3,1,0,1],
+        [3,2,0,1],
+        [4,2,0,0],
+        [5,3,0,0]], columns=['a', 'b', 'target','train'])
+    dfTrainTest = df['train']
+    X, y = createDatasetXY(df, indexFeatureStart = 0, indexFeatureEnd = 2, targetColumnName = "target")
+    X_train, X_test, y_train, y_test = train_test_split_index(X, y, dfTrainTest)
+
+    trainOCC(X_train, X_test, y_train, y_test)
+
+def trainOCCScale(X_train, X_test, y_train, y_test,
+                  c_scale = 0, mode = "constant", targetClass = "minority"):
+    log("training OCC")
+    retVal = trainResult()
+
+    pTrain = ping()
+
+    X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix()  # otherwise indices from X will be used
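+    # Note: the one-class SVM below is fitted on the minority ("1") rows
+    # only, selected via boolean masks over plain arrays; the reset_index /
+    # as_matrix round-trip keeps KFold's positional indices valid.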
X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype( + float) # otherwise indices from X will be used + + if (str(type(y_train)) != ""): + y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used + else: + y_train_pre = y_train + + if (str(type(y_test)) != ""): + y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used + + ms_process_total = 0 + bestAuROC = 0 + bestAuPRC = 0 + bestNu = 0.1 + bestKernel = 'linear' + bestGamma = 0.1 + gammaz = [0.001, 0.01, 0.1, 1] + nuz = [0.01, 0.1, 0.5, 0.75, 1] + kernelz = ['linear', 'poly', 'sigmoid'] + for gamma in gammaz: + for nu in nuz: + for kernel in kernelz: # 'rbf' + # log("CV for gamma={}, nu={}, kernel={}".format(gamma, nu, kernel)) + + kf = KFold(n_splits=n_folds) + kf.get_n_splits(X_train_pre) + # print(kf) # print info about folds + + scoresAuROC = [] + scoresAuPRC = [] + scoresF1 = [] + scoresAccuracy = [] + scoresGmean = [] + scoresAuPRC = [] + + for train_index, test_index in kf.split(X_train): + X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] + y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] + + # tests will be unaffected + if (c_scale > 0): + pProcess = ping() + X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, + targetClass=targetClass) + ms_process_total += pong(pProcess) + + if ((np.isnan(X_train_cv)).any): + X_train_cv = np.nan_to_num(X_train_cv) + + clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma, tol=0.01) + # n_estimators = 10 + # clf = BaggingClassifier(svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma), + # max_samples=1.0 / n_estimators, n_estimators=n_estimators) + + # pprint(X_train_cv[y_train_cv == 1]) + # np.savetxt("occ_data.csv", X_train_cv, delimiter=",") + + nrow, ncol = X_train_cv[y_train_cv == 1].shape + + if (nrow == 0): + log("no samples (CV for gamma={}, nu={}, kernel={}), continueing...".format(gamma, nu, kernel)) + continue + + clf.fit(X_train_cv[y_train_cv == 1]) + pred = clf.predict(X_test_cv) + pred[pred < 0] = 0 # SVM outputs -1 for the "0" class + + # eval + auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) + + scoresAuROC.append(auROC) + scoresF1.append(f1) + scoresAccuracy.append(accuracy) + scoresGmean.append(gmean) + scoresAuPRC.append(auPRC) + + meanScoreAuROC = np.mean(scoresAuROC) + meanScoreF1 = np.mean(scoresF1) + meanScoreAccuracy = np.mean(scoresAccuracy) + meanScoreGmean = np.mean(scoresGmean) + meanScoreAuPRC = np.mean(scoresAuPRC) + + if(meanScoreAuPRC > bestAuPRC): + bestAuROC = auROC + bestAccuracy = meanScoreAccuracy + bestGmean = meanScoreGmean + bestF1 = meanScoreF1 + bestAuPRC = meanScoreAuPRC + bestNu = nu + bestKernel = kernel + bestGamma = gamma + + # retVal.train_eval_time = 0 + # retVal.train_accuracy = bestAccuracy + # retVal.train_auc = bestAuROC + # retVal.train_f1 = bestF1 + # retVal.train_gmean = bestGmean + + log('CV finished. Achieved best auROC={} using nu={}, gamma={} and kernel={}'.format(bestAuROC, bestNu, + bestGamma, bestKernel)) + + if (c_scale > 0): + log("scaling final train data...") + pProcess = ping() + X_train, y_train = createScaledDataset(X_train, y_train, + c = c_scale, mode = mode, targetClass = targetClass) + ms_process_total /= len(gammaz) # allowed, because: Folds can be calculated out of the CV loop. + ms_process_total /= len(nuz) # allowed, because: Folds can be calculated out of the CV loop. 
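+        # (The divisions here amortise the scaling time accumulated inside
+        #  the CV grid: one scaling pass ran per gamma x nu x kernel
+        #  combination, so dividing the total by each grid dimension
+        #  approximates the cost of a single pass.)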
+ ms_process_total /= len(kernelz) # allowed, because: Folds can be calculated out of the CV loop. + ms_process_total = pong(pProcess) + + if((np.isnan(X_train)).any): + X_train= np.nan_to_num(X_train) + + clf = svm.OneClassSVM(nu=bestNu, kernel=bestKernel, gamma=bestGamma) + clf.fit(X_train[y_train == 1]) # final training using all data + retVal.train_time = pong(pTrain) + + # get CV train metrics + pTrainCV = ping() + bestAuROC = bestAuROC + bestAccuracy, bestF1, bestGmean = -1, -1, -1 + retVal.train_eval_time = pong(pTrainCV) + retVal.train_accuracy = bestAccuracy + retVal.train_auc = bestAuROC + retVal.train_f1 = bestF1 + retVal.train_gmean = bestGmean + if(c_scale > 0): + retVal.ms_process = ms_process_total + + pTest = ping() + pred = clf.predict(X_test) + pred[pred < 0] = 0 # SVM outputs -1 for the "0" class + + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('OCC score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, + retVal.f1, retVal.accuracy, retVal.gmean)) + + + return retVal + +def trainKNN(X_train, X_test, y_train, y_test): + log("training KNN") + retVal = trainResult() + + knn = neighbors.KNeighborsClassifier() + + pTrain = ping() + model = knn.fit(X_train, y_train) + retVal.train_time = pong(pTrain) + + # get CV train metrics + pTrainCV = ping() + bestAuROC = getCVPerformance(model, X_train, y_train) + bestAccuracy, bestF1, bestGmean = -1, -1, -1 + retVal.train_eval_time = pong(pTrainCV) + retVal.train_accuracy = bestAccuracy + retVal.train_auc = bestAuROC + retVal.train_f1 = bestF1 + retVal.train_gmean = bestGmean + + pred = model.predict(X_test) + + lastModel = model + lastY = y_test + lastPred = pred + + pTest = ping() + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('KNN score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) + + return retVal + +from sklearn import datasets, neighbors, linear_model, svm +from sklearn.ensemble import RandomForestClassifier +import numpy as np +from sklearn import metrics +from sklearn.metrics import roc_auc_score, make_scorer +from sklearn.ensemble import BaggingClassifier +from sklearn.model_selection import KFold, cross_val_score + +def trainKNNScale(X_train, X_test, y_train, y_test, + c_scale = 0, mode = "constant", targetClass = "minority"): + log("training KNN") + retVal = trainResult() + + # train + ms_process_total = 0 + pTrain = ping() + X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used + X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used + + if (str(type(y_train)) != ""): + y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used + else: + y_train_pre = y_train + + if (str(type(y_test)) != ""): + y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used + from sklearn.model_selection import KFold + + bestAuROC = 0 + bestAuPRC = 0 + bestNN = 3 + neighborz = [3, 5, 10] + + number_of_folds = n_folds + num_of_cpus = multiprocessing.cpu_count() + + for nn in neighborz: + kf = KFold(n_splits=n_folds) + kf.get_n_splits(X_train_pre) + + scoresAuROC = [] + scoresF1 = [] + scoresAccuracy = [] + scoresGmean = [] + scoresAuPRC = [] + + for train_index, 
test_index in kf.split(X_train): + X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] + y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] + + # tests will be unaffected + if (c_scale > 0): + pProcess = ping() + X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, targetClass=targetClass) + ms_process_total += pong(pProcess) + + clf = neighbors.KNeighborsClassifier(n_neighbors = nn, n_jobs = num_of_cpus) + + # train + if ((np.isnan(X_train_cv)).any): + X_train_cv = np.nan_to_num(X_train_cv) + + clf.fit(X_train_cv, y_train_cv) + pred = clf.predict(X_test_cv) + + # eval + auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) + + scoresAuROC.append(auROC) + scoresF1.append(f1) + scoresAccuracy.append(accuracy) + scoresGmean.append(gmean) + scoresAuPRC.append(auPRC) + + meanScoreAuROC = np.mean(scoresAuROC) + meanScoreF1 = np.mean(scoresF1) + meanScoreAccuracy = np.mean(scoresAccuracy) + meanScoreGmean = np.mean(scoresGmean) + meanScoreAuPRC = np.mean(scoresAuPRC) + + if(meanScoreAuPRC > bestAuPRC): + bestNN = nn + bestAuROC = auROC + bestAccuracy = meanScoreAccuracy + bestGmean = meanScoreGmean + bestF1 = meanScoreF1 + bestAuPRC = meanScoreAuPRC + + + if (c_scale > 0): + log("scaling final train data...") + pProcess = ping() + X_train, y_train = createScaledDataset(X_train, y_train, + c=c_scale, mode=mode, targetClass=targetClass) + ms_process_total /= len(neighborz) + ms_process_total = pong(pProcess) + + if((np.isnan(X_train)).any): + X_train= np.nan_to_num(X_train) + + clf = neighbors.KNeighborsClassifier(n_neighbors = bestNN, n_jobs = num_of_cpus) + model = clf.fit(X_train, y_train) + retVal.train_time = pong(pTrain) + + # get CV train metrics + pTrainCV = ping() + bestAccuracy, bestF1, bestGmean = -1, -1, -1 + retVal.train_eval_time = pong(pTrainCV) + retVal.train_accuracy = bestAccuracy + retVal.train_auc = bestAuROC + retVal.train_f1 = bestF1 + retVal.train_gmean = bestGmean + if(c_scale > 0): + retVal.ms_process = ms_process_total + + pred = model.predict(X_test) + + lastModel = model + lastY = y_test + lastPred = pred + + pTest = ping() + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('KNN score: auc={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.f1, retVal.accuracy, + retVal.gmean)) + + return retVal + +def trainRF(X_train, X_test, y_train, y_test): + log("training RF") + retVal = trainResult() + + + # train + pTrain = ping() + X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used + X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used + + + if (str(type(y_train)) != ""): + y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used + else: + y_train_pre = y_train + + if (str(type(y_test)) != ""): + y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used + from sklearn.model_selection import KFold + bestAuROC = 0 + bestAuPRC = 0 + bestAccuracy = 0 + bestGmean = 0 + bestF1 = 0 + bestEstimators = 10 + bestCriterion = 'gini' + estimatorz = [5,10,20] + criterionz = ['gini', 'entropy'] + for estimators in estimatorz: + for criterion in criterionz: + kf = KFold(n_splits=n_folds) + kf.get_n_splits(X_train_pre) + + scoresAuROC = [] + scoresF1 = [] + scoresAccuracy = [] + scoresGmean = [] + scoresAuPRC = [] + + for 
train_index, test_index in kf.split(X_train): + X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] + y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] + + clf = RandomForestClassifier(n_estimators=estimators, criterion=criterion) + + # train + clf.fit(X_train_cv, y_train_cv) + pred = clf.predict(X_test_cv) + + # eval + auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) + + scoresAuROC.append(auROC) + scoresF1.append(f1) + scoresAccuracy.append(accuracy) + scoresGmean.append(gmean) + scoresAuPRC.append(auPRC) + + meanScoreAuROC = np.mean(scoresAuROC) + meanScoreF1 = np.mean(scoresF1) + meanScoreAccuracy = np.mean(scoresAccuracy) + meanScoreGmean = np.mean(scoresGmean) + meanScoreAuPRC = np.mean(scoresAuPRC) + + if(meanScoreAuPRC > bestAuPRC): + bestEstimators = estimators + bestCriterion = criterion + bestAuROC = auROC + bestAccuracy = meanScoreAccuracy + bestGmean = meanScoreGmean + bestF1 = meanScoreF1 + bestAuPRC = meanScoreAuPRC + +# retVal.train_eval_time = 0 +# retVal.train_accuracy = bestAccuracy +# retVal.train_auc = bestAuROC +# retVal.train_f1 = bestF1 +# retVal.train_gmean = bestGmean + + clf = RandomForestClassifier(n_estimators=bestEstimators, criterion=bestCriterion) + clf.fit(X_train_cv, y_train_cv) + retVal.train_time = pong(pTrain) + + # get CV train metrics + pTrainCV = ping() + bestAuROC = getCVPerformance(clf, X_train, y_train) + bestAccuracy, bestF1, bestGmean = -1, -1, -1 + retVal.train_eval_time = pong(pTrainCV) + retVal.train_accuracy = bestAccuracy + retVal.train_auc = bestAuROC + retVal.train_f1 = bestF1 + retVal.train_gmean = bestGmean + + pred = clf.predict(X_test) + + pTest = ping() + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('RF score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) + + return retVal + + + +def trainRFScale(X_train, X_test, y_train, y_test, balanced = None, + c_scale = 0, mode = "constant", targetClass = "minority"): + log("training RF") + retVal = trainResult() + ms_process_total = 0 + + num_of_cpus = multiprocessing.cpu_count() + + + # train + pTrain = ping() + X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used + X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used + + + if (str(type(y_train)) != ""): + y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used + else: + y_train_pre = y_train + + if (str(type(y_test)) != ""): + y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used + from sklearn.model_selection import KFold + bestAuROC = 0 + bestAuPRC = 0 + bestAccuracy = 0 + bestGmean = 0 + bestF1 = 0 + bestEstimators = 10 + bestCriterion = 'gini' + estimatorz = [5,10,20] + criterionz = ['gini', 'entropy'] + for estimators in estimatorz: + for criterion in criterionz: + kf = KFold(n_splits=n_folds) + kf.get_n_splits(X_train_pre) + + scoresAuROC = [] + scoresF1 = [] + scoresAccuracy = [] + scoresGmean = [] + scoresAuPRC = [] + + for train_index, test_index in kf.split(X_train): + X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] + y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] + + clf = RandomForestClassifier(n_estimators=estimators, criterion=criterion, + class_weight = 
balanced, n_jobs = num_of_cpus) + + # tests will be unaffected + if (c_scale > 0): + pProcess = ping() + X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, targetClass=targetClass) + ms_process_total += pong(pProcess) + + if ((np.isnan(X_train_cv)).any): + X_train_cv = np.nan_to_num(X_train_cv) + + # train + clf.fit(X_train_cv, y_train_cv) + pred = clf.predict(X_test_cv) + + # eval + auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) + + scoresAuROC.append(auROC) + scoresF1.append(f1) + scoresAccuracy.append(accuracy) + scoresGmean.append(gmean) + scoresAuPRC.append(auPRC) + + meanScoreAuROC = np.mean(scoresAuROC) + meanScoreF1 = np.mean(scoresF1) + meanScoreAccuracy = np.mean(scoresAccuracy) + meanScoreGmean = np.mean(scoresGmean) + meanScoreAuPRC = np.mean(scoresAuPRC) + + if(meanScoreAuPRC > bestAuPRC): + bestEstimators = estimators + bestCriterion = criterion + bestAuROC = auROC + bestAccuracy = meanScoreAccuracy + bestGmean = meanScoreGmean + bestF1 = meanScoreF1 + bestAuPRC = meanScoreAuPRC + + if (c_scale > 0): + log("scaling final train data...") + pProcess = ping() + X_train, y_train = createScaledDataset(X_train, y_train, + c=c_scale, mode=mode, targetClass=targetClass) + ms_process_total /= len(estimatorz) + ms_process_total /= len(criterionz) + ms_process_total = pong(pProcess) + + if((np.isnan(X_train)).any): + X_train= np.nan_to_num(X_train) + + clf = RandomForestClassifier(n_estimators=bestEstimators, criterion=bestCriterion) + clf.fit(X_train_cv, y_train_cv) + retVal.train_time = pong(pTrain) + + # get CV train metrics + pTrainCV = ping() + bestAuROC = bestAuROC + bestAccuracy, bestF1, bestGmean = -1, -1, -1 + retVal.train_eval_time = pong(pTrainCV) + retVal.train_accuracy = bestAccuracy + retVal.train_auc = bestAuROC + retVal.train_f1 = bestF1 + retVal.train_gmean = bestGmean + if(c_scale > 0): + retVal.ms_process = ms_process_total + + pred = clf.predict(X_test) + + pTest = ping() + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('RF score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) + + return retVal + + + +def trainLR(X_train, X_test, y_train, y_test, balanced = None, scoring = "none"): + """ + Trains a logistic regression based on cross validation. + :param balance: Use 'balanced' or None + Weights associated with classes in the form {class_label: weight}. + If not given, all classes are supposed to have weight one. + + The “balanced” mode uses the values of y to automatically adjust weights inversely + proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). + + Note that these weights will be multiplied with sample_weight (passed through the + fit method) if sample_weight is specified. + From http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html + :param scoring: Use "none" for default, "auROC" for AUC of ROC curve + """ + log("training LR ({})".format(scoring)) + + retVal = trainResult() + + number_of_folds = n_folds + tolerance = 0.01 + num_of_cpus = multiprocessing.cpu_count() + + + if (scoring == "auROCWeighted"): + log("\"auROCWeighted\" set. 
+def trainLR(X_train, X_test, y_train, y_test, balanced = None, scoring = "none"):
+    """
+    Trains a logistic regression based on cross-validation.
+
+    :param balanced: Use 'balanced' or None.
+        Weights associated with classes in the form {class_label: weight}.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y)).
+
+        Note that these weights will be multiplied with sample_weight (passed
+        through the fit method) if sample_weight is specified.
+        From http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
+    :param scoring: Use "none" for the default (accuracy), "auROC" for the area
+        under the ROC curve, or "auROCWeighted" for the sample-weighted variant.
+    """
+    log("training LR ({})".format(scoring))
+
+    retVal = trainResult()
+
+    number_of_folds = n_folds
+    tolerance = 0.01
+    num_of_cpus = multiprocessing.cpu_count()
+
+    if (scoring == "auROCWeighted"):
+        log("\"auROCWeighted\" set. Using area under ROC curve with weighted samples...")
+        n_samples, n_features = X_train.shape
+        n_classes = len(np.unique(y_train))
+        # bincount returns the number of instances per non-negative integer label
+        # (0, 1, ...), so w holds the inverse class frequencies
+        w = n_samples / (n_classes * np.bincount(y_train))
+        w_array = w[y_train]  # pick each sample's weight based on its label
+        # additional parameters can be specified, see
+        # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
+        # and http://scikit-learn.org/dev/modules/model_evaluation.html
+        scorer = auc_scorer = make_scorer(roc_auc_score,
+                                          average = "weighted",
+                                          sample_weight = w_array)
+    elif (scoring == "auROC"):
+        log("\"auROC\" set. Using area under ROC curve...")
+        scorer = auc_scorer = make_scorer(roc_auc_score)
+    else:
+        log("Scoring method \"" + scoring + "\" not recognized or set. Using default (accuracy)...")
+        scorer = None
+
+    lr = linear_model.LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1],
+                                           class_weight = balanced,
+                                           cv = number_of_folds,
+                                           penalty = 'l1',
+                                           scoring = scorer,
+                                           solver = 'liblinear',
+                                           tol = tolerance,
+                                           n_jobs = num_of_cpus)
+
+    pTrain = ping()
+    model = lr.fit(X_train, y_train)
+    retVal.train_time = pong(pTrain)
+
+    # for LogisticRegressionCV the CV is already built in, so we can use clf.scores_[1]:
+    # clf.scores_[1].shape is (n_folds, n_Cs); take the mean per C, then the maximum
+    pTrainCV = ping()
+    aucMeanCspecific = max(np.mean(model.scores_[1], axis = 0))  # get CV train metrics
+    bestAccuracy, bestF1, bestGmean = -1, -1, -1
+    retVal.train_eval_time = pong(pTrainCV)
+    retVal.train_accuracy = bestAccuracy
+    retVal.train_auc = aucMeanCspecific
+    retVal.train_f1 = bestF1
+    retVal.train_gmean = bestGmean
+    log("best AUC during training was " + str(aucMeanCspecific))
+
+    pred_all = model.predict_proba(X_test)
+    pred = pred_all[:, 1]  # probability of the positive class (assumes classes_ == [0, 1])
+
+    # time the metric computation
+    pTest = ping()
+    retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred)
+    retVal.test_time = pong(pTest)
+
+    log('LR score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean))
+
+    return retVal
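+
+
+# Worked example for the inverse class-frequency weights used above (illustrative):
+# for y = [0, 0, 0, 1], np.bincount(y) is [3, 1]; with n_samples = 4 and
+# n_classes = 2 this gives w = 4 / (2 * [3, 1]) = [0.667, 2.0], and
+# w_array = w[y] = [0.667, 0.667, 0.667, 2.0] -- the single minority sample
+# carries three times the weight of each majority sample.
+if False:
+    y_demo = np.array([0, 0, 0, 1])
+    w_demo = len(y_demo) / (len(np.unique(y_demo)) * np.bincount(y_demo))
+    print(w_demo[y_demo])
+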
+def trainLRScale(X_train, X_test, y_train, y_test, balanced = None,
+                 c_scale = 0, mode = "constant", targetClass = "minority"):
+    """
+    Trains a logistic regression based on cross-validation, optionally applying
+    class-specific scaling (CSS) to the training folds.
+
+    :param balanced: Use 'balanced' or None.
+        Weights associated with classes in the form {class_label: weight}.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y)).
+
+        Note that these weights will be multiplied with sample_weight (passed
+        through the fit method) if sample_weight is specified.
+        From http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
+    :param c_scale: Scaling constant c; 0 disables CSS.
+    :param mode: "constant" or "linear" scaling mode.
+    :param targetClass: "minority", "majority" or "both".
+    """
+
+    retVal = trainResult()
+
+    number_of_folds = n_folds
+    tolerance = 0.01
+    num_of_cpus = multiprocessing.cpu_count()
+    ms_process_total = 0
+
+    # train
+    pTrain = ping()
+    X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix()  # otherwise indices from X will be used
+    X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float)  # otherwise indices from X will be used
+
+    # reset pandas indices if needed (the original check str(type(y)) != "" was always true)
+    if hasattr(y_train, "reset_index"):
+        y_train_pre = y_train.reset_index(drop=True)
+    else:
+        y_train_pre = y_train
+
+    if hasattr(y_test, "reset_index"):
+        y_test = y_test.reset_index(drop=True).as_matrix()
+
+    from sklearn.model_selection import KFold
+    bestAuROC = 0
+    bestAuPRC = 0
+    bestC = 0.001
+    Cs = [0.001, 0.01, 0.1, 1]
+    for c in Cs:
+        kf = KFold(n_splits=n_folds)
+        kf.get_n_splits(X_train_pre)
+
+        scoresAuROC = []
+        scoresF1 = []
+        scoresAccuracy = []
+        scoresGmean = []
+        scoresAuPRC = []
+
+        for train_index, test_index in kf.split(X_train_pre):
+            X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index]
+            y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index]
+
+            # tests will be unaffected
+            if (c_scale > 0):
+                pProcess = ping()
+                X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode = mode, c = c_scale, targetClass = targetClass)
+                ms_process_total += pong(pProcess)
+
+            clf = linear_model.LogisticRegression(C=c,
+                                                  class_weight = balanced,
+                                                  penalty = 'l1',
+                                                  solver = 'liblinear',
+                                                  tol = tolerance,
+                                                  n_jobs = num_of_cpus)
+
+            # train
+            if np.isnan(X_train_cv).any():  # note the call: bare .any is always truthy
+                X_train_cv = np.nan_to_num(X_train_cv)
+
+            clf.fit(X_train_cv, y_train_cv)
+            pred = clf.predict(X_test_cv)
+
+            # eval
+            auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred)
+
+            scoresAuROC.append(auROC)
+            scoresF1.append(f1)
+            scoresAccuracy.append(accuracy)
+            scoresGmean.append(gmean)
+            scoresAuPRC.append(auPRC)
+
+        meanScoreAuROC = np.mean(scoresAuROC)
+        meanScoreF1 = np.mean(scoresF1)
+        meanScoreAccuracy = np.mean(scoresAccuracy)
+        meanScoreGmean = np.mean(scoresGmean)
+        meanScoreAuPRC = np.mean(scoresAuPRC)
+
+        if(meanScoreAuPRC > bestAuPRC):
+            bestC = c
+            bestAuROC = meanScoreAuROC  # use the fold mean, not the last fold's score
+            bestAccuracy = meanScoreAccuracy
+            bestGmean = meanScoreGmean
+            bestF1 = meanScoreF1
+            bestAuPRC = meanScoreAuPRC
+
+    lr = linear_model.LogisticRegression(C=bestC,
+                                         class_weight = balanced,
+                                         penalty = 'l1',
+                                         solver = 'liblinear',
+                                         tol = tolerance,
+                                         n_jobs = num_of_cpus)
+    if (c_scale > 0):
+        log("scaling final train data...")
+        # average the per-fold scaling time over the grid, then add the final pass
+        ms_process_total /= len(Cs)
+        pProcess = ping()
+        X_train, y_train = createScaledDataset(X_train, y_train,
+                                               c = c_scale, mode = mode, targetClass = targetClass)
+        ms_process_total += pong(pProcess)  # '=' here used to overwrite the CV share
+
+    if np.isnan(X_train).any():
+        X_train = np.nan_to_num(X_train)
+    model = lr.fit(X_train, y_train)
+    retVal.train_time = pong(pTrain)
+
+    pTrainCV = ping()
+    aucMeanCspecific = bestAuROC
+    bestAccuracy, bestF1, bestGmean = -1, -1, -1
+    retVal.train_eval_time = pong(pTrainCV)
+    retVal.train_accuracy = bestAccuracy
+    retVal.train_auc = aucMeanCspecific
+    retVal.train_f1 = bestF1
+    retVal.train_gmean = bestGmean
+    log("best AUC during training was " + str(aucMeanCspecific))
+    if(c_scale > 0):
+        retVal.ms_process = ms_process_total
+
+    pred_all = model.predict_proba(X_test)
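+    # NOTE: taking column 1 below assumes model.classes_ == [0, 1]. A more
+    # robust variant (sketch) would look the column up explicitly:
+    #     pred = pred_all[:, list(model.classes_).index(1)]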
pred = pred_all[:,1] + + #pprint(np.column_stack((y_test, pred.round(3)))) + + pTest = ping() + retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) + retVal.test_time = pong(pTest) + + log('LR score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) + + return retVal + + +# ## Main loop, where all targets will be called subsequently + +# In[ ]: + +def evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset, target, bs2_measure, + modeltypes = ['RF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth', 'ADASYN']): + global X_train, X_test, y_train, y_test + + nAllSample = len(X_train_pre) + len(X_test_pre) + nAllSamplePos = sum(y_train_pre) + sum(y_test_pre) + nAllSampleNeg = nAllSample - nAllSamplePos + + nTrainSampleBefore = len(X_train_pre) + nTrainSamplePosBefore = sum(y_train_pre) + nTrainSampleNegBefore = nTrainSampleBefore - nTrainSamplePosBefore + + for approach in approaches: + c_opt_cv = -1 + if target == "BEFUND___TA___61_14_535": + c_manual_values = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + else: + c_manual_values = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] + + for c_manual in c_manual_values: + for scalingMode in ["linear", "constant"]: + msProcess = 0 # reset + approachMod = approach + if("Scale" in approach): + log("trying approach " + approach + ", using c=" + str(c_manual) + "...") + elif(c_manual == 0.0 and scalingMode == "constant"): + log("trying approach " + approach + "...") + c_opt_cv = -1 + else: + continue # only scaling makes sense to be analysed with various c's + + approachMod = approach + if('naive' in approach): + pProcess = ping() + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = createNaiveDataset(X_train_pre, y_train_pre) + msProcess = pong(pProcess) + + if('SMOTE' in approach): + pProcess = ping() + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = createSMOTEDataset(X_train_pre, y_train_pre) + msProcess = pong(pProcess) + + if('tomek' in approach): + pProcess = ping() + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = createTomekDataset(X_train_pre, y_train_pre) + msProcess = pong(pProcess) + + if('ADASYN' in approach): + pProcess = ping() + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = createADASYNDataset(X_train_pre, y_train_pre) + msProcess = pong(pProcess) + + if('OSS' in approach): + pProcess = ping() + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = createOSSDataset(X_train_pre, y_train_pre) + msProcess = pong(pProcess) + + if('CNN' in approach): + pProcess = ping() + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = createCNNDataset(X_train_pre, y_train_pre) + msProcess = pong(pProcess) + + targetClass = "" + if('ScaleMajority' in approach): + #pProcess = ping() + targetClass = "majority" + approachMod = "CSS" + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = X_train_pre, y_train_pre + #X_train, y_train = createScaledDataset(X_train_pre, y_train_pre,c = c_manual, mode = scalingMode, + # targetClass = targetClass + # ) + #msProcess = pong(pProcess) + + if('ScaleMinority' in approach): + #pProcess = ping() + targetClass = "minority" + approachMod = "CSS" + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = X_train_pre, y_train_pre + #X_train, y_train = createScaledDataset(X_train_pre, 
y_train_pre, + # c = c_manual, mode = scalingMode, + # targetClass = targetClass) + #msProcess = pong(pProcess) + + if('ScaleBoth' in approach): + #pProcess = ping() + targetClass = "both" + approachMod = "CSS" + X_test, y_test = X_test_pre, y_test_pre + X_train, y_train = X_train_pre, y_train_pre + #X_train, y_train = createScaledDataset(X_train_pre, y_train_pre, + # c = c_manual, mode = scalingMode, + # targetClass = targetClass) + #msProcess = pong(pProcess) + + for modeltype in modeltypes: + log("evaluating approach {} using model {}...".format(approach, modeltype)) + + #try: + if(modeltype == 'LR'): + log("debug: {}, {}, {}".format(c_manual, scalingMode, targetClass)) + retVal = trainLRScale(X_train, X_test, y_train, y_test, balanced=None, + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + + if(modeltype == 'KNN'): + retVal = trainKNNScale(X_train, X_test, y_train, y_test, + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + + if(modeltype == 'WLR'): + retVal = trainLRScale(X_train, X_test, y_train, y_test, balanced='balanced', + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + + if(modeltype == 'OCC'): + retVal = trainOCCScale(X_train, X_test, y_train, y_test, + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + + if(modeltype == 'RF'): + retVal = trainRFScale(X_train, X_test, y_train, y_test, + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + + if(modeltype == 'WRF'): + retVal = trainRFScale(X_train, X_test, y_train, y_test, balanced='balanced', + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + + if(modeltype == 'NN'): + retVal = trainNNScale(X_train, X_test, y_train, y_test, + c_scale = c_manual, mode = scalingMode, targetClass = targetClass) + #except Exception as e: + # log("building model failed:" + str(e)) + + dummy, nFeature = X_train.shape + nTrainSample = len(X_train) + nTrainSamplePos = sum(y_train) + nTrainSampleNeg = nTrainSample - nTrainSamplePos + + if(retVal.ms_process != -1): + log("ret val != -1. 
Using ms process from model ({}).".format(retVal.ms_process)) + msProcess = retVal.ms_process + else: + log("preprocessing took {}ms".format(msProcess)) + + Report.logToFile(target = target, dataset = dataset, model_type=modeltype, + model_train_time=retVal.train_time, + model_train_eval_time=retVal.train_eval_time, + model_test_time=retVal.test_time, + model_accuracy= retVal.accuracy, + model_auroc = retVal.auROC, + model_auprc = retVal.auPRC, + model_f1 = retVal.f1, + model_gmean = retVal.gmean, + model_train_accuracy= retVal.train_accuracy, + model_train_auroc = retVal.train_auroc, + model_train_auprc = retVal.train_auprc, + model_train_f1 = retVal.train_f1, + model_train_gmean = retVal.train_gmean, + num_features = nFeature, + num_sample_dataset = nAllSample, + num_sample_dataset_pos = nAllSamplePos, + num_sample_dataset_neg = nAllSampleNeg, + num_sample_train_before = nTrainSampleBefore, + num_sample_train_before_pos = nTrainSamplePosBefore, + num_sample_train_before_meg = nTrainSampleNegBefore, + num_sample_train_after = nTrainSample, + num_sample_train_after_pos = nTrainSamplePos, + num_sample_train_after_neg = nTrainSampleNeg, + bs2 = bs2_measure, + process_time = msProcess, + process_name = approachMod, + process_naive = 1 if ('naive' in approach) else 0, + process_sampling_up_smote = 1 if ('SMOTE' in approach) else 0, + process_sampling_up_adasyn = 1 if ('ADASYN' in approach) else 0, + process_sampling_down_oss = 1 if ('OSS' in approach) else 0, + process_sampling_down_cnn = 1 if ('CNN' in approach) else 0, + process_sampling_down_tomek = 1 if ('tomek' in approach) else 0, + process_weight = 1 if (modeltype == 'WLR') else 0, + process_scale_minority = 1 if ('CSS' in approachMod) else 0, + process_scale_mode = scalingMode if ('CSS' in approachMod) else "", + process_scale_target = targetClass if ('CSS' in approachMod) else "", + process_scale_c = c_manual if ('CSS' in approachMod) else 0) + + # notify ubidots + if use_ubi: + try: + new_value = ubi_last_timestamp.save_value({'value': 10, 'context':{'lastTimestamp': "'" + str(datetime.datetime.now()) + "'"}}) + except Exception as e: + log("ubidots failed." 
+ str(e)) + + + +# In[ ]: + +def evalSubSC(dfIn, name, classColumnName, modeltypes, approaches, + indexFeatureStart, indexFeatureEnd, fixedSplit = "no"): + X, y = createDatasetXY(df = dfIn, indexFeatureStart = indexFeatureStart, + indexFeatureEnd= indexFeatureEnd, targetColumnName=classColumnName) + bs2_measure = getBS2(X = X, y = y) + + nSample, nFeature = X.shape + nSampleNeg = nSample - sum(y) + + if (fixedSplit == "no"): + X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(X, y) + else: + X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(X, y, dfIn[fixedSplit]) + + + evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset = name, target = name, bs2_measure = bs2_measure, + modeltypes = modeltypes, approaches = approaches) + + log("completed " + name) + + +# In[ ]: + +def evalSubMC(dfIn, name, classColumnName, indices, modeltypes, approaches, + indexFeatureStart, indexFeatureEnd, fixedSplit = "no"): + classColumnName + for i in indices: + log("working " + name + " " + str(i)) + dfSub = dfIn.copy() + dfSub[classColumnName] = (dfSub[classColumnName] == i).astype(bool) + dfSub[classColumnName] = dfSub[classColumnName].astype(int) + + X, y = createDatasetXY(df = dfSub, indexFeatureStart = indexFeatureStart, + indexFeatureEnd= indexFeatureEnd, targetColumnName=classColumnName) + bs2_measure = getBS2(X = X, y = y) + + nSample, nFeature = X.shape + nSampleNeg = len(y) - sum(y) + log("{} samples total, {} negative and {} positive".format(nSample, nSampleNeg, (nSample-nSampleNeg))) + if(fixedSplit == "no"): + X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(X, y) + else: + X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_index(X, y, dfSub[fixedSplit]) + evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset = name, target = name + str(i), + bs2_measure = bs2_measure, modeltypes = modeltypes, approaches = approaches) + + log("completed " + name) + + +# In[ ]: + +def evalSubGlas(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting glas", force=True) + evalSubMC(dfGlass, "Glass", "Type", [1,2,3,5,6,7], modeltypes, approaches, 0, ncGlass-1) + + +# In[ ]: + +def evalSubGlas67(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + evalSubMC(dfGlass, "Glass", "Type", [6,7], modeltypes, approaches, 0, ncGlass-1) + + +# In[ ]: + +def evalSubVowel(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = [ 'OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting Vowel", force=True) + evalSubMC(dfVowel, "Vowel", "Class", range(0,11), modeltypes, approaches, 1, ncVowel - 2, fixedSplit = "Train or Test") + + +# In[ ]: + +def evalSubForest(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting forest", force=True) + evalSubSC(dfForest, "Forest", "area", modeltypes, approaches, 0, ncForest-1) + + +# In[ ]: + +def evalSubPima(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', + 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting pima", force=True) + evalSubSC(dfPima, 
"Pima", "Class", modeltypes, approaches, 0, 8) + + +# In[ ]: + +def evalSubPhoneme(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', + 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting phoneme", force=True) + evalSubSC(dfPhoneme, "Phoneme", "class", modeltypes, approaches, 0, 5) + + +# In[ ]: + +def evalSubVehicle(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', + 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting vehicle", force=True) + evalSubMC(dfVehicle, "Vehicle", "TARGET", [1, 2, 3, 4], modeltypes, approaches, 0, 18) + + +# In[ ]: + +def evalSubAbalone(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', + 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting abalone", force=True) + evalSubSC(dfAbalone, "Abalone", "Rings", modeltypes, approaches, 0, 8) + + +# In[ ]: + +def evalSubSatimage(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', + 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting satimage", force=True) + evalSubMC(dfSatimage, "Satimage", "CLASS", [1,2,3,4,5,7], modeltypes, approaches, 0, 18, fixedSplit = "TRAIN_TEST") + + +# In[ ]: + +def evalSubMammography(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', + 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): + log("starting mammography", force=True) + evalSubSC(dfMammography, "Mammography", "target", modeltypes, approaches, 0, 6) + + +# In[ ]: + +def evalSubAutomotive(modeltypes = ['LR', 'KNN', 'WLR', 'RF', 'WRF', 'OCC'], + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth'], + strategies = ['Random', 'Planned']): + # read in targets + targets = pd.DataFrame.from_csv("data/targets_w_lambda_sub_sub.csv").sort_values(by = ['CNT']) + rows, cols = targets.shape + i = 1 + + # eval each + for target in targets['TARGET']: + #target = "DTC___1196802" # 0x 124302 = 1196802 is the lambda rex thing, 1250 + #target = "DTC___1257473" # 21 instances, first random entry + log("starting automotive (target= {})".format(target), force=True) + + # determine prefixes to drop (aside from prefix of target, which will be dropped automatically) + prefixesToDrop = [] + if (target.startswith("DTC___")): + prefixesToDrop = ["BEFUND___", "DK___"] + elif (target.startswith("BEFUND___")): + prefixesToDrop = ["DK___"] + + + dfTemp = getDataFrameForTarget(dfAutomotive.copy(), target, prefixesToDrop) + # move "TARGET" to start + target_col = dfTemp['TARGET'] + dfTemp.drop(labels=['TARGET'], axis=1,inplace = True) + dfTemp.insert(0, 'TARGET', target_col) + + for strat in strategies: + XAutomotive, yAutomotive = 0, 0 + if strat == 'Planned': + XAutomotive, yAutomotive = createDatasetUsingMetaXy(df = dfTemp.copy(), indexFeatureStart = 1, + indexFeatureEnd= ncAutomotive, targetColumnName="TARGET", + metaColumnName = "META___PLANNED") + + if strat == 'Random': + XAutomotive, yAutomotive = createDatasetUsingRandomXy(df = dfTemp.copy(), indexFeatureStart = 1, + indexFeatureEnd= ncAutomotive, ratio = 100) + + + + XAutomotive = dropPrefix(XAutomotive, "META___") + + # do PCA in any case + XAutomotive = doPCA(XAutomotive, numberOfDimensionsTarget = 100) + bs2_measure = getBS2(X =XAutomotive, y = 
yAutomotive) + + + X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(XAutomotive, yAutomotive) + + log("types: X_train_pre={}, X_test_pre={}, y_train_pre={}, y_test_pre={}".format(str(type(X_train_pre)), + str(type(X_test_pre)), + str(type(y_train_pre)), + str(type(y_test_pre)))) + + nRowsBasic, nColsBasic = X_train_pre.shape + log("basic shape of train dataset is {} rows and {} features.".format(nRowsBasic, nColsBasic)) + + evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset = "Automotive_" + strat , target = target, + bs2_measure = bs2_measure, modeltypes = modeltypes, approaches = approaches) + log("completed automotive") + + +# In[ ]: + +def doIt_stepwise(): + global logpath + smoothing = 5 + base_log_path = "log/" + + modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'] # no NN, painfully slow + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth'] + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "mammography" + ".csv" + evalSubMammography(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "satimage" + ".csv" + evalSubSatimage(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "vowel" + ".csv" + evalSubVowel(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "forest" + ".csv" + evalSubForest(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "glass" + ".csv" + evalSubGlas(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "pima" + ".csv" + evalSubPima(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "phoneme" + ".csv" + evalSubPhoneme(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "vehicle" + ".csv" + evalSubVehicle(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "abalone" + ".csv" + evalSubAbalone(approaches = approaches, modeltypes = modeltypes) + + for i in range(0,smoothing): + logpath = base_log_path + str(i) + "_" + "automotive" + ".csv" + evalSubAutomotive(approaches = approaches, modeltypes = modeltypes) + +doIt_stepwise() + + +# In[ ]: + +def doIt(): + smoothing = 5 + + for i in range(0,smoothing): + log("starting smoothing iteration {}".format(i), force=True) + + #approaches = ['ScaleBoth'] + #modeltypes = ['OCC'] + modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR', 'NN'] + approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth'] + + evalSubMammography(approaches = approaches, modeltypes = modeltypes) + evalSubSatimage(approaches = approaches, modeltypes = modeltypes) + evalSubVowel(approaches = approaches, modeltypes = modeltypes) + evalSubForest(approaches = approaches, modeltypes = modeltypes) + evalSubGlas(approaches = approaches, modeltypes = modeltypes) + evalSubPima(approaches = approaches, modeltypes = modeltypes) + evalSubPhoneme(approaches = approaches, modeltypes = modeltypes) + evalSubVehicle(approaches = approaches, modeltypes = modeltypes) + evalSubAbalone(approaches = approaches, 
modeltypes = modeltypes)
+        evalSubAutomotive(strategies = ['Random'], approaches = approaches, modeltypes = modeltypes)
+
+#doIt()
+
+
+# In[ ]:
+
+if False:
+    def doIt():
+        smoothing = 2
+
+        for i in range(0, smoothing):
+            log("starting smoothing iteration {}".format(i))
+
+            modeltypes = ['NN']
+
+            evalSubMammography(modeltypes = modeltypes)
+            evalSubSatimage(modeltypes = modeltypes)
+            evalSubVowel(modeltypes = modeltypes)
+            evalSubForest(modeltypes = modeltypes)
+            evalSubGlas(modeltypes = modeltypes)
+            evalSubPima(modeltypes = modeltypes)
+            evalSubPhoneme(modeltypes = modeltypes)
+            evalSubVehicle(modeltypes = modeltypes)
+            evalSubAbalone(modeltypes = modeltypes)
+            evalSubAutomotive(strategies = ['Random'], modeltypes = modeltypes)
+
+    doIt()
+
+
+# In[ ]:
+
+if False:
+    evalSubAutomotive(modeltypes = ['LR', 'KNN', 'WLR', 'RF'],
+                      strategies = ['Planned'])
+    evalSubAutomotive(modeltypes = ['OCC'],
+                      strategies = ['Planned'])
+
+
+# The following results are stored to CSV:
+# - TARGET: Name of the target, e.g. "DTC\_\_\_12345" or "Vowel1" or "class0"
+# - DATASET: Dataset, e.g. "automotive"
+# - MODEL_TYPE: e.g. "KNN", "LR" or "OCC" (one-class classifier)
+# - MODEL_TRAIN_TIME: Training time of the model in ms
+# - MODEL_ACCURACY: Accuracy achieved by the classifier
+# - MODEL_AUROC / MODEL_AUPRC: Area under the ROC / precision-recall curve
+# - MODEL_F1: F1 achieved by the classifier
+# - MODEL_GPERFORMANCE: The g performance measure (G-mean)
+# - NUM_FEATURES: The number of features (after being compressed by PCA in the automotive set)
+# - NUM_SAMPLE_DATASET_POS: The number of observations where the (minority) target class was present
+# - NUM_SAMPLE_DATASET_NEG: The number of negative observations, where the target class was not present
+# - PROCESS_TIME: Time in ms it took to complete all processing steps
+# - PROCESS_NAIVE: 0/1, whether the dataset has been processed the naive way
+# - PROCESS_SAMPLING_UP_SMOTE: 0/1, whether the minority class has been upsampled using SMOTE
+# - PROCESS_SAMPLING_UP_ADASYN: 0/1, whether the minority class has been upsampled using ADASYN
+# - PROCESS_SAMPLING_DOWN_OSS: 0/1, whether borderline samples have been dropped using one-sided selection
+# - PROCESS_SAMPLING_DOWN_CNN: 0/1, whether CNN has been used to downsample
+# - PROCESS_SAMPLING_DOWN_TOMEK: 0/1, whether samples that are part of Tomek links have been dropped
+# - PROCESS_WEIGHT: 0/1, whether training samples are weighted inversely to their class frequency
+# - PROCESS_SCALE_MINORITY: 0/1, whether class-specific scaling (CSS) has been applied
+# - PROCESS_SCALE_MODE: "constant" (fixed scaling towards the class center) or "linear" (distance-dependent scaling); empty if no CSS
+# - PROCESS_SCALE_TARGET: "minority", "majority" or "both"; empty if no CSS
+# - PROCESS_SCALE_C: The scaling constant c (0 if no CSS was applied)
diff --git a/imblearn/scaling/__init__.py b/imblearn/scaling/__init__.py
new file mode 100644
index 000000000..805bb83e2
--- /dev/null
+++ b/imblearn/scaling/__init__.py
@@ -0,0 +1,8 @@
+"""
+The :mod:`imblearn.scaling` module provides a set of methods to
+perform class-specific scaling.
+"""
+
+from .css import CSS
+
+__all__ = ['CSS']
diff --git a/imblearn/scaling/base.py b/imblearn/scaling/base.py
new file mode 100644
index 000000000..e8d77fe2f
--- /dev/null
+++ b/imblearn/scaling/base.py
@@ -0,0 +1,18 @@
+"""
+Base class for the scaling method.
+"""
+# Authors: Bernhard Schlegel
+# License: MIT
+
+
+from ..base import BaseSampler
+
+
+class BaseScaler(BaseSampler):
+    """Base class for scaling algorithms.
+
+    Warning: This class should not be used directly. Use the derived classes
+    instead.
+ """ + + _sampling_type = 'scaling' diff --git a/imblearn/scaling/css.py b/imblearn/scaling/css.py new file mode 100644 index 000000000..5e523be4c --- /dev/null +++ b/imblearn/scaling/css.py @@ -0,0 +1,249 @@ +"""Class to perform sample scaling using class specific scaling (CSS).""" +# Authors: Bernhard Schlegel +# License: MIT + + +from __future__ import division, print_function +from collections import Counter +import random +import numpy as np +from .base import BaseScaler + +CSS_MODE = ('linear', 'constant') +CSS_TARGET = ('minority', 'majority', 'both') + + +class CSS(BaseScaler): + """Class to perform sample scaling using class specific scaling (CSS). + + Parameters + ---------- + mode : str (default = 'constant') + Defines the scaling mode. Currently, two modes are implemented: `'constant'` + and `'linear'`. + + In `'constant'` mode, all samples of the `'target'` class will be scaled + by the same amount `c` to their class specific center. The following + formula will be applied to calculate the new feature (`X`) values: + `X[y==0] * (1-c) + col_means * c` + + In `'linear'` mode, all samples will be scaled in depedence on their + distance and `c` to their class specific center. Samples, that are + one/unit standard deviation away from the class center will be scaled + with `c`. The following formula will be applied to calculate the new + feature (`X`) values: + `norm = distances * c + (1-c)` + `X[y==0] * (1-c) / norm + col_means * (distances * c) / norm + + + target : str (default = 'minority') + defines which class to scale. Possible values are 'minority', 'majority', + and 'both'. Note that all sample are scaled to their corresponding class + center. + + c : float (default = 0.25) + Defines the amount of the scaling. + + target_class_value: int (default = None) + class level indicating the minority class. By default (`None`) the minority + class will be automatically determined. Use any integer number (e.g. `0`, + `1` or `-1`) to force the minority class. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by np.random. + + Attributes + ---------- + mode_ : str + CSS mode ('constant' or 'linear') + + target_ : str or int + Name of the target class ('majority', 'minority', 'both') + + target_class_value: int + class level indicating the minority class + + c_ : dict of str/int : int + A dictionary in which the number of occurences of each class is + reported. + + shuffle : Boolean + If True, results will be shuffled. + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.utils import shuffle + >>> from imblearn.scaling import CSS + + >>> rng = np.random.RandomState(42) + >>> n_samples_1 = 50 + >>> n_samples_2 = 5 + >>> X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2), + 0.5 * rng.randn(n_samples_2, 2) + [2, 2]] + >>> y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2)) + >>> X_syn, y_syn = shuffle(X_syn, y_syn) + >>> css = CSS(mode="linear", target="both", c=0.1, shuffle=True) + >>> X_train_res, y_train_res = css.fit_sample(X_syn, y_syn) + + References + ---------- + .. [1] B. Schlegel, and B. Sick. "Dealing with class imbalance the scalable way: + Evaluation of various techniques based on classification grade and computational + complexity." 2017 IEEE International Conference on Data Mining Workshops, 2017. 
+ """ + + def __init__(self, + mode='linear', + target='minority', + c=0.25, + minority_class_value=None, + shuffle=True, + random_state=None): + super(CSS, self).__init__(ratio=1) + self.mode = mode + self.target = target + self.c = c + self.minority_class_value = minority_class_value + self.shuffle = shuffle + + def _validate_estimator(self): + i = 1 + # nothing to do + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be scaled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(CSS, self).fit(X, y) + + self._validate_estimator() + + return self + + def _shuffleTwo(self, a, b): + #if len(a) != len(b): + # raise ValueError("lenth of a ({}) doesn't match length of b ({})".format(len(a), len(b))) + + indexes = np.array(range(0, len(a))) + random.shuffle(indexes) + a2, b2 = a[indexes], b[indexes] + + return a2, b2, indexes + + def _sample(self, X, y): + """scales the dataset. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + X_scaled : ndarray, shape (n_samples, n_features) + The array containing the resampled data. + + y_scaled : ndarray, shape (n_samples) + The corresponding label of `X_scaled` + + """ + + if self.mode not in CSS_MODE: + raise ValueError('Unknown kind for CSS mode.' + ' Choices are {}. Got \'{}\' instead.'.format( + CSS_MODE, self.mode)) + + if self.target not in CSS_TARGET: + raise ValueError('Unknown kind for CSS target.' + ' Choices are {}. Got \'{}\' instead.'.format( + CSS_TARGET, self.target)) + + if self.c < 0 or self.c > 1: + raise ValueError('Received scaling factor c={}, which' + ' is outside the allowed range ' + '(0-1].'.format(self.c)) + if self.c is 0: + raise ValueError('Received scaling factor c={}, which is' + ' equal to no CSS at.'.format(self.c)) + + if self.minority_class_value is not None and \ + not isinstance(self.minority_class_value, int): + raise ValueError('Unallowed target class value \'{}\'.' 
+        mcv = self.minority_class_value
+        if mcv is None:
+            # infer the minority class value: most_common() sorts by frequency,
+            # so the last entry is the least frequent label
+            counts = Counter(y)
+            mcv = counts.most_common()[-1][0]
+
+        # in the following, _a denotes the majority class, _i the minority class;
+        # note: 'is' was replaced by '==' throughout -- identity comparison of
+        # strings only works by accident of interning
+        if self.target == "majority" or self.target == "both":
+            col_means_a = np.mean(X[(y != mcv)], axis=0)
+            if self.mode == "linear":
+                distances_a = abs(np.subtract(X[y != mcv], col_means_a))
+        if self.target == "minority" or self.target == "both":
+            col_means_i = np.mean(X[(y == mcv)], axis=0)
+            if self.mode == "linear":
+                distances_i = abs(np.subtract(X[y == mcv], col_means_i))
+
+        if self.target == "majority" or self.target == "both":
+            if self.mode == "constant":
+                X_scaled_a = X[y != mcv] * (1 - self.c) + col_means_a * self.c
+            elif self.mode == "linear":
+                scale_factors_mean = (distances_a * self.c)
+                scale_factors_values = (1 - self.c * distances_a)
+                X_scaled_a = X[y != mcv] * scale_factors_values + col_means_a * scale_factors_mean
+        if self.target == "minority" or self.target == "both":
+            if self.mode == "constant":
+                X_scaled_i = X[y == mcv] * (1 - self.c) + col_means_i * self.c
+            elif self.mode == "linear":
+                scale_factors_mean = (distances_i * self.c)
+                scale_factors_values = (1 - self.c * distances_i)
+                X_scaled_i = X[y == mcv] * scale_factors_values + col_means_i * scale_factors_mean
+
+        # merge scaled and unscaled parts
+        if self.target == "majority":
+            X_scaled = np.concatenate([X_scaled_a, X[y == mcv]], axis=0)
+        elif self.target == "minority":
+            X_scaled = np.concatenate([X[y != mcv], X_scaled_i], axis=0)
+        else:  # "both"
+            X_scaled = np.concatenate([X_scaled_a, X_scaled_i], axis=0)
+
+        # make sure that y is in the same order as X
+        y_assembled = np.concatenate([y[y != mcv], y[y == mcv]], axis=0)
+
+        # shuffle only when requested (previously the permutation was computed
+        # even when shuffle was False)
+        if self.shuffle:
+            X_scaled, y_assembled, indices = self._shuffleTwo(X_scaled, y_assembled)
+
+        return X_scaled, y_assembled
diff --git a/imblearn/scaling/tests/__init__.py b/imblearn/scaling/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/imblearn/scaling/tests/test_css.py b/imblearn/scaling/tests/test_css.py
new file mode 100644
index 000000000..55987d4fc
--- /dev/null
+++ b/imblearn/scaling/tests/test_css.py
@@ -0,0 +1,167 @@
+"""Test the module CSS."""
+# Authors: Bernhard Schlegel
+# License: MIT
+
+from __future__ import print_function
+
+
+import numpy as np
+from numpy.testing import (assert_allclose, assert_array_equal,
+                           assert_raises_regex,
+                           assert_raises)
+
+from imblearn.scaling import CSS
+
+# Generate a global dataset to use
+RND_SEED = 0
+X = np.array([[0.11622591, -0.0317206],
+              [0.77481731, 0.60935141],
+              [1.25192108, -0.22367336],
+              [0.53366841, -0.30312976],
+              [1.52091956, -0.49283504],
+              [-0.28162401, -2.10400981],
+              [0.83680821, 1.72827342],
+              [0.3084254, 0.33299982],
+              [0.70472253, -0.73309052],
+              [0.28893132, -0.38761769],
+              [1.15514042, 0.0129463],
+              [0.88407872, 0.35454207],
+              [1.31301027, -0.92648734],
+              [-1.11515198, -0.93689695],
+              [-0.18410027, -0.45194484],
+              [0.9281014, 0.53085498],
+              [-0.14374509, 0.27370049],
+              [-0.41635887, -0.38299653],
+              [0.08711622, 0.93259929],
+              [1.70580611, -0.11219234]])
+y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])
+R_TOL = 1e-4
+
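+# The fixture is imbalanced on purpose: 16 majority (label 0) and
+# 4 minority (label 1) samples.
+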
+def test_css_mode():
+    # these two should fail (illegal values for mode)
+    css = CSS(mode='constant2', target='minority', c=1.01, shuffle=False)
+    assert_raises(ValueError, css.fit_sample, X, y)
+
+    css = CSS(mode='no mode', target='minority', c=0, shuffle=False)
+    assert_raises(ValueError, css.fit_sample, X, y)
+
+    # these two should not fail
+    try:
+        css = CSS(mode='constant', target='minority', c=0.25, shuffle=False)
+        css.fit_sample(X, y)
+        css = CSS(mode='linear', target='minority', c=0.25, shuffle=False)
+        css.fit_sample(X, y)
+    except Exception as e:
+        raise ValueError('CSS raised an Exception unexpectedly! ({})'.format(e))
+
+def test_css_target():
+    # this one should fail (illegal value for target)
+    css = CSS(mode='constant', target='abc', c=0.5, shuffle=False)
+    assert_raises(ValueError, css.fit_sample, X, y)
+
+    # these three should not fail
+    try:
+        css = CSS(mode='constant', target='minority', c=0.05, shuffle=False)
+        css.fit_sample(X, y)
+        css = CSS(mode='constant', target='majority', c=0.05, shuffle=False)
+        css.fit_sample(X, y)
+        css = CSS(mode='constant', target='both', c=0.05, shuffle=False)
+        css.fit_sample(X, y)
+    except Exception as e:
+        raise ValueError('CSS raised an Exception unexpectedly! ({})'.format(e))
+
+def test_css_c():
+    # these two should fail (illegal values for c)
+    css = CSS(mode='constant', target='minority', c=1.01, shuffle=False)
+    assert_raises(ValueError, css.fit_sample, X, y)
+
+    css = CSS(mode='constant', target='minority', c=0, shuffle=False)
+    assert_raises(ValueError, css.fit_sample, X, y)
+
+    # these two should not fail
+    try:
+        css = CSS(mode='constant', target='minority', c=0.01, shuffle=False)
+        css.fit_sample(X, y)
+        css = CSS(mode='linear', target='minority', c=0.99, shuffle=False)
+        css.fit_sample(X, y)
+    except Exception as e:
+        raise ValueError('CSS raised an Exception unexpectedly! ({})'.format(e))
+
+
+def test_sample_regular():
+    # minority samples are unaffected when target is majority
+    css = CSS(mode='constant', target='majority', c=1, shuffle=False)
+    X_s, y_s = css.fit_sample(X, y)
+    assert_allclose(X[y == 1], X_s[y_s == 1], rtol=R_TOL)
+
+    # majority samples are unaffected when target is minority
+    css = CSS(mode='constant', target='minority', c=1, shuffle=False)
+    X_s, y_s = css.fit_sample(X, y)
+    assert_allclose(X[y == 0], X_s[y_s == 0], rtol=R_TOL)
+
+    # both are affected if target is both
+    css = CSS(mode='constant', target='both', c=1, shuffle=False)
+    X_s, y_s = css.fit_sample(X, y)
+    if np.allclose(X[y == 0], X_s[y_s == 0], rtol=R_TOL):
+        raise ValueError('np arrays should not be close!')
+
+    # mathematical correctness of constant scaling majority (coarse):
+    # with c=1 every majority sample collapses onto the class mean
+    css = CSS(mode='constant', target='majority', c=1, shuffle=False)
+    X_s, y_s = css.fit_sample(X, y)
+    X_s_sub = X_s[y_s == 0]
+    for i in range(2, len(X_s_sub)):
+        if not abs(X_s_sub[0, 1] - X_s_sub[i, 1]) <= R_TOL:
+            raise ValueError('numbers dont match')
+        if not abs(X_s_sub[0, 0] - X_s_sub[i, 0]) <= R_TOL:
+            raise ValueError('numbers dont match')
+
+    # mathematical correctness of constant scaling majority (fine)
+    c_test = 0.25
+    css = CSS(mode='constant', target='majority', c=c_test, shuffle=False)
+    X_s, y_s = css.fit_sample(X, y)
+    X_sub = X[y == 0]
+    X_s_sub = X_s[y_s == 0]
+    mu = np.mean(X_s_sub, axis=0)  # scaling preserves the class mean
+    for i in range(0, len(X_s_sub)):
+        if not abs(X_s_sub[i, 0] - (X_sub[i, 0] * (1 - c_test) + mu[0] * c_test)) <= R_TOL:
+            raise ValueError('numbers dont match')
+        if not abs(X_s_sub[i, 1] - (X_sub[i, 1] * (1 - c_test) + mu[1] * c_test)) <= R_TOL:
+            raise ValueError('numbers dont match')
+    # minority class should remain unaffected
+    X_sub = X[y == 1]
+    X_s_sub =
X_s[y_s == 1] + assert_allclose(X_sub, X_s_sub, rtol=R_TOL) + + # mathematical correctness of constant scaling minority (fine) + c_test = 0.25 + css = CSS(mode='constant', target='minority', c=c_test, shuffle=False) + X_s, y_s = css.fit_sample(X, y) + X_sub = X[y==1] + X_s_sub = X_s[y_s==1] + mu = np.mean(X_s_sub, axis = 0) + for i in range(0, len(X_s_sub)): + if not abs(X_s_sub[i, 0] - (X_sub[i, 0] * (1 - c_test) + mu[0] * c_test)) <= R_TOL: + raise ValueError('numbers dont match') + if not abs(X_s_sub[i, 1] - (X_sub[i, 1] * (1 - c_test) + mu[1] * c_test)) <= R_TOL: + raise ValueError('numbers dont match') + # majority class should remain unaffected + X_sub = X[y == 0] + X_s_sub = X_s[y_s == 0] + assert_allclose(X_sub, X_s_sub, rtol=R_TOL) + + # mathematical correctness of linear scaling both + c_test = 0.1 + css = CSS(mode='linear', target='both', c=c_test, shuffle=False) + X_s, y_s = css.fit_sample(X, y) + for lvl in [0,1]: + X_sub = X[y == lvl] + X_s_sub = X_s[y_s == lvl] + mu = np.mean(X_sub, axis=0) + dists = abs(np.subtract(X_sub, mu)) + for i in range(0, len(X_s_sub)): + for j in [0, 1]: + norm = dists[i, j] * c_test + (1 - dists[i, j] * c_test) + val_returned = X_s_sub[i, j] + val_expected = X_sub[i, j] * (1 - dists[i, j] * c_test) / norm + mu[j] * dists[i, j] * c_test / norm + if not abs(val_returned - val_expected) < R_TOL: + raise ValueError('numbers dont match') \ No newline at end of file diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index 58488463a..4989850bb 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -18,7 +18,7 @@ from ..exceptions import raise_isinstance_error SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling', - 'ensemble') + 'ensemble', 'scaling') TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator') From 2658dc73aa5a09bfd4c43eed9c0db7cb25aae1d0 Mon Sep 17 00:00:00 2001 From: Bernhard Schlegel Date: Wed, 28 Mar 2018 09:13:28 +0200 Subject: [PATCH 2/6] solved "comma" misspelling in docu --- imblearn/scaling/css.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/imblearn/scaling/css.py b/imblearn/scaling/css.py index 5e523be4c..d54fce42b 100644 --- a/imblearn/scaling/css.py +++ b/imblearn/scaling/css.py @@ -79,12 +79,10 @@ class level indicating the minority class >>> import numpy as np >>> from sklearn.utils import shuffle >>> from imblearn.scaling import CSS - >>> rng = np.random.RandomState(42) >>> n_samples_1 = 50 >>> n_samples_2 = 5 - >>> X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2), - 0.5 * rng.randn(n_samples_2, 2) + [2, 2]] + >>> X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2), 0.5 * rng.randn(n_samples_2, 2) + [2, 2]] >>> y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2)) >>> X_syn, y_syn = shuffle(X_syn, y_syn) >>> css = CSS(mode="linear", target="both", c=0.1, shuffle=True) From 1243bfc90e67e6d562baaae0d6519afceb19514e Mon Sep 17 00:00:00 2001 From: Bernhard Schlegel Date: Wed, 28 Mar 2018 09:21:05 +0200 Subject: [PATCH 3/6] removed unnecessary file --- PCA.py | 2468 -------------------------------------------------------- 1 file changed, 2468 deletions(-) delete mode 100644 PCA.py diff --git a/PCA.py b/PCA.py deleted file mode 100644 index 34310fbff..000000000 --- a/PCA.py +++ /dev/null @@ -1,2468 +0,0 @@ - -# coding: utf-8 - -# ### Do a PCA transformation -# https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/ - -# In[ ]: - -import sys, os -dir = os.path.dirname(os.path.abspath(os.path.realpath('.'))) -libRoot = 
os.path.join(dir, 'imbalanced-learn') -sys.path.insert(0,libRoot) - - -# In[ ]: - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from numpy import genfromtxt -from sklearn.decomposition import PCA -import time - -import re # re.sub() for replacing using regexps -import datetime # ping pong -import multiprocessing # count cpus -from pprint import pprint # beautifully print arrays -import datetime # get current time -import math as math - - -# ### Setup on new machine -# http://stackoverflow.com/questions/29329667/ipython-notebook-script-deprecated-how-to-replace-with-post-save-hook -# - -# ## Settings - -# In[ ]: - -n_folds = 5 - - -# In[ ]: - -use_ubi = False -if use_ubi: - import os - import datetime - from ubidots import ApiClient - - # set proxy - os.environ['http_proxy'] = 'http://proxy.muc:8080' - os.environ['https_proxy'] = 'http://proxy.muc:8080' - - # get api and variable - api = ApiClient(token='O32sAiO8tw4VOTxz24Wmf1IRY7ZoeY') - ubi_last_timestamp = api.get_variable('5910aee076254222ee1d9d3f') - - new_value = ubi_last_timestamp.save_value({'value': 10, 'context':{'lastTimestamp': "'" + str(datetime.datetime.now()) + "'"}}) - - -# In[ ]: - -logpath = "log.log" - -class Report(): - @staticmethod - def getHeader(): - return "\"TS\"" + "," + "\"TARGET\"" + "," + "\"DATASET\"" + "," + "\"MODEL_TYPE\"" + "," + "\"MODEL_TRAIN_TIME\"" + "," + "\"MODEL_TRAIN_EVAL_TIME\"" + "," + "\"MODEL_TEST_TIME\"" + "," + "\"MODEL_TRAIN_ACCURACY\"" + "," + "\"MODEL_TRAIN_AUROC\"" + "," + "\"MODEL_TRAIN_AUPRC\"" + "," + "\"MODEL_TRAIN_F1\"" + "," + "\"MODEL_TRAIN_GPERFORMANCE\"" + "," + "\"MODEL_ACCURACY\"" + "," + "\"MODEL_AUROC\"" + "," + "\"MODEL_AUPRC\"" + "," + "\"MODEL_F1\"" + "," + "\"MODEL_GPERFORMANCE\"" + "," + "\"NUM_FEATURES\"" + "," + "\"NUM_SAMPLE_DATASET\"" + "," + "\"NUM_SAMPLE_DATASET_POS\"" + "," + "\"NUM_SAMPLE_DATASET_NEG\"" + "," + "\"NUM_SAMPLE_TRAIN_BEFORE\"" + "," + "\"NUM_SAMPLE_TRAIN_BEFORE_POS\"" + "," + "\"NUM_SAMPLE_TRAIN_BEFORE_NEG\"" + "," + "\"NUM_SAMPLE_TRAIN_AFTER\"" + "," + "\"NUM_SAMPLE_TRAIN_AFTER_POS\"" + "," + "\"NUM_SAMPLE_TRAIN_AFTER_NEG\"" + "," + "\"BS2\"" + "," + "\"PROCESS_NAME\"" + "," + "\"PROCESS_TIME\"" + "," + "\"PROCESS_NAIVE\"" + "," + "\"PROCESS_SAMPLING_UP_SMOTE\"" + "," + "\"PROCESS_SAMPLING_UP_ADASYN\"" + "," + "\"PROCESS_SAMPLING_DOWN_OSS\"" + "," + "\"PROCESS_SAMPLING_DOWN_CNN\"" + "," + "\"PROCESS_SAMPLING_DOWN_TOMEK\"" + "," + "\"PROCESS_WEIGHT\"" + "," + "\"PROCESS_SCALE_MINORITY\"" + "," + "\"PROCESS_SCALE_MODE\"" + "," + "\"PROCESS_SCALE_TARGET\"" + "," + "\"PROCESS_SCALE_C\"" + "\r" - - @staticmethod - def logToFile(target, - dataset, - model_type, - model_train_time, - model_train_eval_time, - model_test_time, - model_train_accuracy, - model_train_auroc, - model_train_auprc, - model_train_f1, - model_train_gmean, - model_accuracy, - model_auroc, - model_auprc, - model_f1, - model_gmean, - num_features, - num_sample_dataset, - num_sample_dataset_pos, - num_sample_dataset_neg, - num_sample_train_before, - num_sample_train_before_pos, - num_sample_train_before_meg, - num_sample_train_after, - num_sample_train_after_pos, - num_sample_train_after_neg, - bs2, - process_name, - process_time, - process_naive = 0, - process_sampling_up_smote = 0, - process_sampling_up_adasyn = 0, - process_sampling_down_oss = 0, - process_sampling_down_cnn = 0, - process_sampling_down_tomek = 0, - process_weight = 0, - process_scale_minority = 0, - process_scale_mode = 0, - process_scale_target = 0, - process_scale_c = 0): - global 
logpath - pth = logpath - import os.path - if (not os.path.isfile(pth)): - with open(pth, "a") as myfile: - myfile.write(Report.getHeader()) - - with open(pth, "a") as myfile: - myfile.write(Report.getData(target, dataset, model_type, model_train_time, model_train_eval_time, model_test_time, - model_train_accuracy, model_train_auroc, model_train_auprc, model_train_f1, model_train_gmean, - model_accuracy, model_auroc, model_auprc, model_f1, model_gmean, num_features, - num_sample_dataset, num_sample_dataset_pos, num_sample_dataset_neg, - num_sample_train_before, num_sample_train_before_pos, num_sample_train_before_meg, - num_sample_train_after, num_sample_train_after_pos, num_sample_train_after_neg, bs2, - process_name, process_time, process_naive, - process_sampling_up_smote, process_sampling_up_adasyn, process_sampling_down_oss, - process_sampling_down_cnn, process_sampling_down_tomek, process_weight, - process_scale_minority, process_scale_mode, process_scale_target, process_scale_c) - ) - - @staticmethod - def getData(target, - dataset, - model_type, - model_train_time, - model_train_eval_time, - model_test_time, - model_train_accuracy, - model_train_auprc, - model_train_auroc, - model_train_f1, - model_train_gmean, - model_accuracy, - model_auroc, - model_auprc, - model_f1, - model_gmean, - num_features, - num_sample_dataset, - num_sample_dataset_pos, - num_sample_dataset_neg, - num_sample_train_before, - num_sample_train_before_pos, - num_sample_train_before_meg, - num_sample_train_after, - num_sample_train_after_pos, - num_sample_train_after_neg, - bs2, - process_name, - process_time, - process_naive = 0, - process_sampling_up_smote = 0, - process_sampling_up_adasyn = 0, - process_sampling_down_oss = 0, - process_sampling_down_cnn = 0, - process_sampling_down_tomek = 0, - process_weight = 0, - process_scale_minority = 0, - process_scale_mode = 0, - process_scale_target = 0, - process_scale_c = 0): - return "\"" + str(datetime.datetime.now()) + "\"" + "," + "\"" + str(target) + "\"" + "," + "\"" + str(dataset) + "\"" + "," + "\"" + str(model_type) + "\"" + "," + str(model_train_time) + "," + str(model_train_eval_time) + "," + str(model_test_time) + "," + str(model_train_accuracy) + "," + str(model_train_auroc) + "," + str(model_train_auprc) + "," + str(model_train_f1) + "," + str(model_train_gmean) + "," + str(model_accuracy) + "," + str(model_auroc) + "," + str(model_auprc) + "," + str(model_f1) + "," + str(model_gmean) + "," + str(num_features) + "," + str(num_sample_dataset) + "," + str(num_sample_dataset_pos) + "," + str(num_sample_dataset_neg) + "," + str(num_sample_train_before) + "," + str(num_sample_train_before_pos) + "," + str(num_sample_train_before_meg) + "," + str(num_sample_train_after) + "," + str(num_sample_train_after_pos) + "," + str(num_sample_train_after_neg) + "," + str(bs2) + "," + "\"" + process_name + "\"" + "," + str(process_time) + "," + str(process_naive) + "," + str(process_sampling_up_smote) + "," + str(process_sampling_up_adasyn) + "," + str(process_sampling_down_oss) + "," + str(process_sampling_down_cnn) + "," + str(process_sampling_down_tomek) + "," + str(process_weight) + "," + str(process_scale_minority) + "," + "\"" + str(process_scale_mode) + "\"" + "," + "\"" + str(process_scale_target) + "\"" + "," + str(process_scale_c) + "\r" - - -# In[ ]: - -# define Log function -def log(text, silent=True, force=False): - if not silent or force: - print(time.strftime('%Y.%m.%d, %H:%M:%S') + ': ' + text) - -def ping(): - return datetime.datetime.now() - -def 
pong(dt): - now = datetime.datetime.now() - diff = now - dt - ms = round(diff.total_seconds()*1000) - return ms - -log('init finshed', force=True) - - -# In[ ]: - -def getBS2(X, y): - tl = TomekLinks() - X_tl, y_tl = tl.fit_sample(X, y) - - num_pos_samples = sum(y) - num_tomek_links = len(y) - len(y_tl) - - return num_tomek_links / num_pos_samples - - -# In[ ]: - -def indexCategorical(df, columnName): - df[columnName] = pd.Categorical(df[columnName]).codes - return df - -def renameTargetDropSamePrefix(df, target) : - - rows, colsBefore = df.shape - - prefixToDrop = re.sub(r"(.*?)___.*", r"\1___", target) - log("renaming " + target + " to \"TARGET\" and dropping all other columns prefixed with " + prefixToDrop) - df.rename(columns={target: "TARGET"}, inplace = True) - - dfReturn = dropPrefix(df, prefixToDrop) - - rows, colsAfter = dfReturn.shape - log("reduced number of columns from {} to {}.".format(colsBefore, colsAfter)) - assert colsAfter < colsBefore - - return dfReturn - -def dropPrefix(df, prefix) : - prefix = prefix + ".*" - log("dropping " + prefix) - return df.select(lambda x: not re.search(prefix,x), axis = 1) - -def testDropPrefix(): - df = pd.DataFrame([ - [1,3,1,0], - [1,4,1,1], - [1,5,1,0], - [1.5,6,1,0], - [1.7,7,1,0], - [1,4,1,0], - [1,6,1,0], - [1,5,1,1], - [1,12,1,1], - [1,9,1,1], - [1,2,1,1], - [1,3,1,1], - [1,5,1,1], - [2,8,1,0], - [3,1,0,1], - [3,2,0,1], - [4,2,0,0], - [5,3,0,0]], columns=['PRE1___1', 'PRE1___2', 'PRE2___1','PRE3___1']) - - row, nCol13 = dropPrefix(df, "PRE2___").shape - row, nCol23 = dropPrefix(df, "PRE1___").shape - - assert nCol13 == 3, "wrong number of cols dropped" - assert nCol23 == 2, "wrong number of cols dropped" - -def pcaFeatureGroup(dfIn, featureGroupPrefix, numberOfDimensionsWhole, - numberOfDimensionsTarget, minResultingFeatures = 2): - """ - e.g. featureGroupPrefix = "RO___" - """ - rows, cBefore = dfIn.shape - npMatrix = dfIn.filter(regex = featureGroupPrefix + ".*").as_matrix() - - rows, cBeforeOfGroup = npMatrix.shape - - if(cBeforeOfGroup == 0): - log("There are no features belonging to group {}. Returning unmodified DataFrame.".format(featureGroupPrefix)) - return dfIn - - # scale - npMatrix = scale(npMatrix) - - # holds the number of the whole dataset. 
To be in scale, we first calculate the proportion of the current feature group - proportion = numberOfDimensionsTarget / numberOfDimensionsWhole - n_component_target = int(cBeforeOfGroup * proportion) - - if (n_component_target < minResultingFeatures): - n_component_target = minResultingFeatures - if (cBeforeOfGroup != 0): - # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA.fit_transform - pca = PCA(copy=True, - iterated_power='auto', - n_components=n_component_target, - random_state=None, - svd_solver='auto', - tol=0.0, - whiten=False) - dfB = dropPrefix(dfIn, featureGroupPrefix) - rB, cB = dfB.shape - - - npMatrixTransformed = pca.fit_transform(npMatrix) - dfA = pd.DataFrame(data=npMatrixTransformed[0:,0:], - columns=[featureGroupPrefix + str(num) for num in np.arange(1,npMatrixTransformed.shape[1]+1,1)])#, - #index = dfB.index ) - rIn, cIn = dfIn.shape - rA, cA = dfA.shape - - dfReturn = np.concatenate([dfA, dfB], axis = 1) # axis 1 is columns, so this direction -> - dfReturn = pd.DataFrame(data=dfReturn[0:,0:], # values - index=dfA.index, # 1st column as index - columns=dfA.columns.append(dfB.columns)) # 1st row as the column names - - rReturn, cReturn = dfReturn.shape - - - log("feature group has been compressed from {} to {} columns".format(cBeforeOfGroup, cA)) - - assert rReturn == rIn, "num rows of inputted and outputted dataframe do not match" - assert rIn == rA, "num rows of inputted and PCAd dataframe do not match" - assert rIn == rB, "num rows of inputted and non PCAd part of initial dataframe do not match" - assert cReturn == (cB + cA), "concatenating PCAd and non PCAd df into returned df resulted to wrong number of cols" - assert cBefore == (cB + cBeforeOfGroup), "number of cols from non PCAd and PCAd dataframe before transformation" + "should add up to initial number of columns" - - return dfReturn - else: - log("no columns found matching prefix " + featureGroupPrefix + ". 
Skipping...") - return dfIn - -def print_full(df): - pd.set_option('display.max_columns', df.shape[1]) - print(df) - pd.reset_option('display.max_rows') - - -# ## Read in the different datasources - -# In[ ]: - - -ts = ping() -dfAutomotive = pd.read_csv("in.csv") -nr, ncAutomotive = dfAutomotive.shape -ms = pong(ts) -log("read in dataframe with " + str(nr) + " columns and " + str(ncAutomotive) + " rows in " + str(ms) + "ms", force=True) - - -# In[ ]: - - -ts = ping() -dfForest = pd.read_csv("data/forestfires_id.csv") -nr, ncForest = dfForest.shape -ms = pong(ts) -log("read in forest dataframe with " + str(nr) + " columns and " + str(ncForest) + " rows in " + str(ms) + "ms") -#dfForest['area'] = np.log(1 + dfForest['area']) -dfForest['area'] = (dfForest['area'] > 50).astype(bool).astype(int) - -nPositive = sum(dfForest['area']) -nNegative = nr-nPositive -log("ratio of forest is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -import matplotlib.pyplot as plt - -plt.hist(dfForest['area'], bins=30) -plt.ylabel('Probability') - - -# In[ ]: - - -ts = ping() -dfVowel = pd.read_csv("data/vowel-context.csv") -nr, ncVowel = dfVowel.shape -ms = pong(ts) - -nPositive = sum(dfVowel['Class'] == 1) -nNegative = nr-nPositive -log("ratio of vowel is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -log("read in vowel dataframe with " + str(nr) + " columns and " + str(ncVowel) + " rows in " + str(ms) + "ms") - - -# In[ ]: - - -ts = ping() -dfGlass = pd.read_csv("data/glass.csv") -nr, ncGlass = dfGlass.shape -ms = pong(ts) - -sums = [] - -for i in [1, 2, 3, 5, 6, 7]: - sums.append(sum(dfGlass['Type'] == i)) - -nPositive = np.round(np.mean(sums)) -nNegative = nr-nPositive -log("average ratio of vowel is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -log("read in glass dataframe with " + str(nr) + " columns and " + str(ncGlass) + " rows in " + str(ms) + "ms", force=True) - - -# In[ ]: - -dfPima = pd.read_csv("data/pima.csv") -nrPima, ncPima = dfPima.shape -nPositive = sum(dfPima['Class']) -nNegative = nrPima-nPositive -log("average ratio of pima is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) -log("features={}".format(ncPima)) - -dfPhoneme = pd.read_csv("data/phoneme.csv") -nrPhoneme, ncPhoneme = dfPhoneme.shape -nPositive = sum(dfPhoneme['class']) -nNegative = nrPhoneme-nPositive -log("average ratio of phoneme is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -dfVehicle = pd.read_csv("data/vehicle.csv") -nrVehicle, ncVehicle = dfVehicle.shape - -sums = [] -for i in [1, 2, 3]: - sums.append(sum(dfVehicle['TARGET'] == i)) - -nPositive = np.round(np.mean(sums)) -nNegative = nrVehicle-nPositive -log("average ratio of vehicle is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -dfAbalone = pd.read_csv("data/abalone_9_18.csv") -nrAbalone, ncAbalone = dfAbalone.shape -nPositive = sum(dfAbalone['Rings']) -nNegative = nrAbalone-nPositive -log("ratio of abalone is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -dfSatimage = pd.read_csv("data/satimage.csv") -nrSatimage, ncSatimage = dfSatimage.shape - -sums = [] -for i in [1, 2, 3, 4, 5, 7]: - 
sums.append(sum(dfSatimage['CLASS'] == i)) -nPositive = np.round(np.mean(sums)) -nNegative = nrSatimage-nPositive -log("average ratio of satimage is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - -dfMammography = pd.read_csv("data/mammography.csv") -nrMammography, ncMammography = dfMammography.shape -nPositive = sum(dfMammography['target']) -nNegative = nrMammography-nPositive -log("ratio of mammography is {} majority class and {} minority class observations ({:.3f})".format(nNegative, nPositive, nPositive/nNegative)) - - -# In[ ]: - -from sklearn.preprocessing import scale - -def train_test_split_index(X, y, index): - """ - split standard-scaled X and y into training (index == 0) and testing (index == 1) - """ - X_scaled = scale(X) - - if not isinstance(y, np.ndarray): - y = y.as_matrix() - if not isinstance(X_scaled, np.ndarray): - X_scaled = X_scaled.as_matrix() - if not isinstance(index, np.ndarray): - index = index.as_matrix() - - X_train = X_scaled[index == 0] - X_test = X_scaled[index == 1] - y_train = y[index == 0] - y_test = y[index == 1] - - - - return X_train, X_test, y_train, y_test - -def train_test_split_scaled(X, y): - """ - randomly split standard-scaled X and y into training and testing - """ - X_scaled = scale(X) - X_train, X_test, y_train, y_test = train_test_split(X_scaled, y) - - if not isinstance(y_train, np.ndarray): - y_train = y_train.as_matrix() - if not isinstance(y_test, np.ndarray): - y_test = y_test.as_matrix() - if not isinstance(X_train, np.ndarray): - X_train = X_train.as_matrix() - if not isinstance(X_test, np.ndarray): - X_test = X_test.as_matrix() - - return X_train, X_test, y_train, y_test - - -# In[ ]: - -def getDataFrameForTarget(df, target = "DTC___1196802", prefixesToDrop = ["BEFUND___", "DK___"]) : - log("getting dataframe for target " + target + " while dropping " + str(prefixesToDrop)) - # select target and all other columns except columns with the same prefix - dfTemp = renameTargetDropSamePrefix(df, target) - - # get DTCs - #dfDTC2 = df.filter(regex=("(META|CP|RO|DTC|EE|SC|MV)___.*")) # doesn't work - for prefix in prefixesToDrop: - dfTemp = dropPrefix(dfTemp, prefix) - - # convert Categories to Indexes - dfTemp = indexCategorical(dfTemp, "META___CARID") - - return dfTemp - -# do PCA for every featuregroup separately -def doPCA(df, featureGroupsPCA = ["CP", "RO", "EE", "MV", "SC", "DTC"], numberOfDimensionsTarget = 100): - ## no meta - assert isinstance(df, pd.DataFrame), "dataframe needs to be a pandas.DataFrame to allow filtering for " + "different feature groups." - dfPCA = df - dummy, colsBefore = dfPCA.shape - for group in featureGroupsPCA: - log("Working group " + group + "...") - dfPCA = pcaFeatureGroup(dfPCA, featureGroupPrefix = group + "___", numberOfDimensionsWhole = colsBefore, - numberOfDimensionsTarget = numberOfDimensionsTarget) - - dummy, colsAfter = dfPCA.shape - log("reduced dimensions from {} to {} using PCA.".format(colsBefore, colsAfter)) - - return dfPCA - - -# # create the datasets according to the sampling strategy -# -# Variants include: -# 1. a naive approach to serve as baseline -# 2. one-sided selection -# 3. condensed nearest neighbour -# 4. SMOTE -# 5. assigning costs -# 6. preferably sample same cars -# 7. 
use heuristic to identify most valuable majority-class observations - -# In[ ]: - - -from sklearn.model_selection import train_test_split -from collections import Counter -from sklearn.datasets import make_classification -from imblearn.over_sampling import SMOTE -from imblearn.scaling import CSS -from imblearn.under_sampling import TomekLinks, OneSidedSelection, CondensedNearestNeighbour -from imblearn.under_sampling import RandomUnderSampler - -def createDatasetXY(df, indexFeatureStart = 0, indexFeatureEnd = -1, targetColumnName = "TARGET"): - """ - creates the feature matrix X and the target vector y from the given dataframe - - :param df: The dataframe to be used - :param indexFeatureStart: Index to start selecting features - :param indexFeatureEnd: Index to stop selecting features - :return: X, y - """ - - if(indexFeatureEnd == -1): - indexFeatureEnd = len(df.columns) - 4 - X = df.ix[:,indexFeatureStart:indexFeatureEnd] # 3 meta cols (META___{RANDOM, CARID, PLANNED}) and the target column - y = df[targetColumnName].astype(bool).astype(int) - - return X, y - -def shuffleTwo(a, b): - a1 = pd.DataFrame(a) - indexes = a1.index # get the indices, the first magical column - assert len(a) == len(b), "length of a ({}) doesn't match length of b ({})".format(len(a), len(b)) - p = np.random.permutation(len(indexes)) - indexesShuffled = indexes[p] - a2, b2 = a1.ix[indexesShuffled], b[indexesShuffled] - - return a2, b2 - -def createDatasetUsingMetaXy(df, indexFeatureStart = 0, indexFeatureEnd = -1, - targetColumnName = "TARGET", - metaColumnName = "META___PLANNED"): - # as majority-class observations, only such examples will be selected where metaColumnName is 1 - - if(indexFeatureEnd == -1): - indexFeatureEnd = len(df.columns) - 4 - X = df.ix[:,indexFeatureStart:indexFeatureEnd] # 3 meta cols (META___{RANDOM, CARID, PLANNED}) and the target column - prefix = re.sub(r'___.*', r'', metaColumnName) - y = df[targetColumnName].astype(bool).astype(int) - - # select only the "META == 1" or "TARGET == 1" rows. 
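    # i.e. keep the union of all minority-class rows (y > 0) and the majority rows flagged
    # by metaColumnName (e.g. META___PLANNED == 1); the indices are de-duplicated below.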
- indices = y[y > 0].index - indices = indices.append(X[X[metaColumnName] > 0].index) - indices = np.unique(indices) - X = X.loc[indices] - X = dropPrefix(X, prefix) - y = y.loc[indices] - - return X, y - -def createDatasetUsingRandomXy(df, indexFeatureStart = 0, indexFeatureEnd = -1, - targetColumnName = "TARGET", ratio = 0.1): - # ratio defines the ratio between majority:minority (10 means: 10 times as much majority) - # as majority observations such examples will be selected, where the metaColumnName is 1 - - if(indexFeatureEnd == -1): - indexFeatureEnd = len(df.columns) - 4 - X = df.ix[:,indexFeatureStart:indexFeatureEnd] # 3 meta cols (META___{RANDOM, CARID, PLANNED}) and the target row - y = df[targetColumnName].astype(bool).astype(int) - - - log('Original dataset shape {}'.format(Counter(y))) - - targetRatio = 1/ratio - num_pos = sum(y) - currentRatio = num_pos / (len(y)-num_pos) - log("Current ratio={}, targetRatio={}".format(currentRatio, targetRatio)) - if(targetRatio < currentRatio): - targetRatio = currentRatio - - rus = RandomUnderSampler(random_state=42, ratio = targetRatio) - X_res, y_res = rus.fit_sample(X, y) - log('Resampled dataset shape {}'.format(Counter(y_res))) - - X_res = pd.DataFrame(data=X_res[0:,0:], # values - #index=X.index, # 1st column as index - columns=X.columns) - - log("class returned by RandomXy for X is {}".format(str(type(X_res)))) - - return X_res, y_res - -def createNaiveDataset(X_train, y_train): - """ - creates train / test splits for X and y using the given dataframe - - :param df: The dataframe to be used - :param indexFeatureStart: Index to start selecting features - :param indexFeatureEnd: Index to stop selecting features - :return: X_train, X_test, y_train, y_test used for training - """ - log("creating dataset [naive mode]...") - - return X_train, y_train - -def createSMOTEDataset(X_train, y_train): - log("creating dataset [SMOTE]...") - - log('Original dataset shape {}'.format(Counter(y_train))) - - # SMOTE expects n_neighbors <= n_samples - n_neighbors = 5 - n_samples = sum(y_train) - n_samples_total, dummy = X_train.shape - # bug in sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance) - # that causes the number of samples to be 1 smaller. 
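    # sketch of the clamping below: with e.g. 4 minority samples and the default of 5,
    # n_neighbors becomes min(5, 4 - 1) = 3, so SMOTE can still interpolate between
    # existing minority samples instead of raising an error.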
- if ((n_samples-1) < n_neighbors): - log("reducing n_neighbors ({}) to number of samples ({})".format(n_neighbors, n_samples)) - n_neighbors = n_samples - 1 - if ((n_samples_total-1) < n_neighbors): - log("reducing n_neighbors ({}) to total number of samples ({})".format(n_neighbors, n_samples_total)) - n_neighbors = n_samples_total - 1 - - sm = SMOTE(random_state=42, k_neighbors=n_neighbors) - X_train_res, y_train_res = sm.fit_sample(X_train, y_train) - log('Resampled dataset shape {}'.format(Counter(y_train_res))) - - # shuffle - X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res) - - return X_train_res, y_train_res - -def createADASYNDataset(X_train, y_train): - from imblearn.over_sampling import ADASYN - ada = ADASYN() - X_train_res, y_train_res = ada.fit_sample(X_train, y_train) - - # shuffle - X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res) - - return X_train_res, y_train_res - -def createTomekDataset(X_train, y_train): - tl = TomekLinks(return_indices=True) - X_train_res, y_train_res, idx_resampled = tl.fit_sample(X_train, y_train) - - # shuffle - X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res) - - return X_train_res, y_train_res - -def createOSSDataset(X_train, y_train): - oss = OneSidedSelection(return_indices=True) - X_train_res, y_train_res, idx_resampled = oss.fit_sample(X_train, y_train) - - # shuffle - X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res) -# X_train_res = pd.DataFrame(X_train_res) -# assert len(X_train_res) == len(y_train_res) -# p = np.random.permutation(len(X_train_res)) -# X_train_res.reset_index(drop=True) -# X_train_res, y_train_res = X_train_res.ix[p], y_train_res[p] - - return X_train_res, y_train_res - -def createCNNDataset(X_train, y_train): - cnn = CondensedNearestNeighbour(return_indices=True) - X_train_res, y_train_res, idx_resampled = cnn.fit_sample(X_train, y_train) - - # shuffle - X_train_res, y_train_res = shuffleTwo(X_train_res, y_train_res) - - return X_train_res, y_train_res - -def createScaledDataset(X_train, y_train, targetClass = "majority", c = 0.2, mode = "constant", - verbose = False): - - css = CSS(mode=mode, target=targetClass, c=c, shuffle=True) - return css.fit_sample(X_train,y_train) # X_s, y_s - -def testVisualCreateScaledDataset(): - iVowel = 0 - dfVowelSub = dfVowel.copy() - dfVowelSub['Class'] = (dfVowelSub['Class'] == iVowel).astype(bool) - dfVowelSub['Class'] = dfVowelSub['Class'].astype(int) - - XVowel, yVowel = createDatasetXY(df = dfVowelSub, indexFeatureStart = 1, - indexFeatureEnd = ncVowel-2, targetColumnName = "Class") - zVowel = dfVowel['Train or Test'] - X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_index(XVowel, yVowel, zVowel) - - X_train, X_test, y_train, y_test = createNaiveDataset(X_train_pre, X_test_pre, y_train_pre, y_test_pre) - retVal = trainLR(X_train, X_test, y_train, y_test, balanced=None, scoring="auROC") - - pca = PCA(n_components=2) - pcaFitted = pca.fit(X_train) - X_r = pcaFitted.transform(X_train) - plt.scatter(X_r[:,0], X_r[:,1], c=y_train, alpha=0.5) - #np.savetxt("out/visualize/vowel_class_1/no_scale.csv", np.column_stack([X_r[:,0], X_r[:,1], y_train]), delimiter=",") - plt.show() - - X_train, X_test, y_train, y_test = createScaledDataset(X_train_pre, X_test_pre, y_train_pre, y_test_pre, c = 0.3) - retVal = trainLR(X_train, X_test, y_train, y_test, balanced=None, scoring="auROC") - pca = PCA(n_components=2) - X_r = pcaFitted.transform(X_train) - plt.scatter(X_r[:,0], X_r[:,1], c=y_train, alpha=0.5) - 
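    # note: the freshly created pca object above is never fitted; the transform reuses
    # pcaFitted from the unscaled data, which (intentionally or not) keeps both scatter
    # plots in the same coordinate frame, so the effect of CSS scaling stays comparable.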
#np.savetxt("out/visualize/vowel_class_1/scale_0.2.csv", np.column_stack([X_r[:,0], X_r[:,1], y_train]), delimiter=",") - plt.show() - -def testCreateScaledDataset(): - df = pd.DataFrame([ - [1,3,1,0], - [1,4,1,1], - [1,5,1,1], - [2,8,1,0], - [3,1,0,1], - [3,2,0,1], - [4,2,0,0], - [5,3,0,0]], columns=['a', 'b', 'target','train']) - dfTrainTest = df['train'] - X, y = createDatasetXY(df, indexFeatureStart = 0, indexFeatureEnd = 2, targetColumnName = "target") - X_train, X_test, y_train, y_test = train_test_split_index(X, y, dfTrainTest) - X_train - c = 0.3 - X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = createScaledDataset(X_train, X_test, y_train, - y_test, mode = "single", c = c) - - - (X_train.ix[6,0] + X_train.ix[7,0])/2*c + (1-c) * X_train.ix[6,0] == X_train_scaled.ix[6,0] - (X_train.ix[6,1] + X_train.ix[7,1])/2*c + (1-c) * X_train.ix[7,1] == X_train_scaled.ix[7,1] - X_train.ix[0,1] == X_train_scaled.ix[0,1] - X_train.ix[3,1] == X_train_scaled.ix[3,1] - - -# In[ ]: - -from sklearn import datasets, neighbors, linear_model, svm -from sklearn.ensemble import RandomForestClassifier -import numpy as np -from sklearn import metrics -from sklearn.metrics import roc_auc_score, make_scorer, auc, precision_recall_curve -from sklearn.ensemble import BaggingClassifier -from sklearn.model_selection import KFold, cross_val_score -from sklearn.neural_network import MLPClassifier - -lastModel = 0 -lastY = 0 -lastPred = 0 - -class trainResult(): - auROC = -1 - auPRC = -1 - accuracy = -1 - f1 = -1 - gmean = -1 - train_time = -1 - test_time = -1 - gmean = -1 - train_eval_time = -1 - train_accuracy = -1 - train_auroc = -1 - train_auprc = -1 - train_f1 = -1 - train_gmean = -1 - ms_process = -1 - -def get_au_prc(real, preds, pos_label=1): - precision, recall, _ = precision_recall_curve(real, preds, pos_label=pos_label) - auPRC = metrics.auc(precision, recall, reorder=True) - - return auPRC - -def getMetrics(y = np.array([1, 1, 2, 2]), pred = np.array([0.1, 0.4, 0.35, 0.8]), threshold = 0.5): - - if (len(np.unique(pred)) < 2) : - #log("all predictions the same. 
setting auc to 0 and f1 to 0.") - accuracy = metrics.accuracy_score(y, pred > threshold) - return (0, 0, accuracy, 0, 0) - - else: - fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1) - auc = metrics.auc(fpr, tpr) # AU ROC - f1 = metrics.f1_score(y, pred > threshold) - accuracy = metrics.accuracy_score(y, pred > threshold) - precision = metrics.precision_score(y, pred > threshold) - recall = metrics.recall_score(y, pred > threshold) - auPRC = get_au_prc(y, pred, pos_label=1) - - g_mean = math.sqrt(precision * recall) - return (auc, f1, accuracy, g_mean, auPRC) - -def getNumberOfTomekLinks(X, y): - from imblearn.under_sampling import TomekLinks - tl = TomekLinks(return_indices=True) - n_row_before, dummy = X.shape - X_resampled, y_resampled, idx_resampled = tl.fit_sample(X, y) - n_row_after, dummy = X_resampled.shape - tlFound = n_row_before - n_row_after - log(str(tlFound) + " tomek links found") - return tlFound - -from sklearn.model_selection import cross_val_score -def getCVPerformanceOld(clf, X_train, y_train, scoring = "roc_auc"): - - if len(np.unique(y_train)) != 2: - return 0 - - np.set_printoptions(threshold=np.inf) - pprint(y_train) - - # for more scorers see http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter - scores = cross_val_score(clf, X_train, y_train, cv = n_folds, scoring = scoring) - - return np.mean(scores) - -from sklearn.model_selection import cross_val_predict -from sklearn.metrics import roc_auc_score -def getCVPerformance(clf, X_train, y_train, scoring = "roc_auc"): - - if len(np.unique(y_train)) != 2: - return 0 - - pred = cross_val_predict(clf, X_train, y_train, cv=n_folds) - - return roc_auc_score(y_train, pred) - -def trainNNScale(X_train, X_test, y_train, y_test, - c_scale = 0, mode = "constant", targetClass = "minority"): - log("training NN") - retVal = trainResult() - ms_process_total = 0 - - - # train - pTrain = ping() - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used - - - if (str(type(y_train)) != ""): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != ""): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - from sklearn.model_selection import KFold - bestAuROC = 0 - bestAuPRC = 0 - bestAccuracy = 0 - bestGmean = 0 - bestF1 = 0 - bestSolver = 'lbfgs' - bestActivation = 'relu' - solverz = ['lbfgs', 'sgd', 'adam'] - activationz = ['identity', 'logistic', 'tanh', 'relu'] - layerz = [(2,2), (5,2), (5,5), (10,5), (10,10), (2,2,2), (5,5,5), (10,10,10)] - for solver in solverz: - for activation in activationz: - for layer in layerz: - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - - scoresAuROC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for train_index, test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - clf = MLPClassifier(activation = activation, solver=solver, alpha=1e-5, - hidden_layer_sizes=layer, random_state=1) - - # tests will be unaffected - if (c_scale > 0): - pProcess = ping() - X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, targetClass=targetClass) - 
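                        # CSS is applied to the training folds only, so the validation fold stays
                        # untouched. Judging from testCreateScaledDataset above, CSS appears to blend
                        # each sample x of the target class with its class mean m, x' = (1 - c) * x + c * m,
                        # i.e. c = 0 is a no-op and c = 1 collapses the class onto its centroid.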
ms_process_total += pong(pProcess) - - if (np.isnan(X_train_cv).any()): - X_train_cv = np.nan_to_num(X_train_cv) - - # train - clf.fit(X_train_cv, y_train_cv) - pred = clf.predict(X_test_cv) - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - meanScoreAuPRC = np.mean(scoresAuPRC) - - if(meanScoreAuPRC > bestAuPRC): - bestSolver = solver - bestActivation = activation - bestAuROC = auROC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - bestAuPRC = meanScoreAuPRC - - if (c_scale > 0): - log("scaling final train data...") - pProcess = ping() - X_train, y_train = createScaledDataset(X_train, y_train, - c=c_scale, mode=mode, targetClass=targetClass) - ms_process_total /= len(solverz) - ms_process_total /= len(activationz) - ms_process_total /= len(layerz) - ms_process_total += pong(pProcess) - - if(np.isnan(X_train).any()): - X_train= np.nan_to_num(X_train) - - - # note: the winning layer size is not tracked across the grid, so (5, 2) is refit here - clf = MLPClassifier(activation = bestActivation, solver=bestSolver, alpha=1e-5, - hidden_layer_sizes=(5, 2), random_state=1) - clf.fit(X_train, y_train) # final training on the full (possibly scaled) training data - retVal.train_time = pong(pTrain) - - # get CV train metrics (reuse the best CV scores from the grid search above) - pTrainCV = ping() - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auroc = bestAuROC - retVal.train_auprc = bestAuPRC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - if(c_scale > 0): - retVal.ms_process = ms_process_total - - pred = clf.predict(X_test) - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('NN score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - - return retVal - - -def trainOCC(X_train, X_test, y_train, y_test): - - #TODO : Implement cross validation http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html - - log("training OCC") - retVal = trainResult() - - pTrain = ping() - - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used - - - if (str(type(y_train)) != "<class 'numpy.ndarray'>"): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != "<class 'numpy.ndarray'>"): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - - bestAuROC = 0 - bestAuPRC = 0 - bestNu = 0.1 - bestKernel = 'linear' - bestGamma = 0.1 - for gamma in [0.001, 0.01, 0.1, 1]: - for nu in [0.01, 0.1, 0.5, 0.75, 1]: - for kernel in ['linear', 'poly', 'sigmoid']: # 'rbf' - #log("CV for gamma={}, nu={}, kernel={}".format(gamma, nu, kernel)) - - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - # print(kf) # print info about folds - - scoresAuROC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for train_index, test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, 
y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma, tol = 0.01) - #n_estimators = 10 - #clf = BaggingClassifier(svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma), - # max_samples=1.0 / n_estimators, n_estimators=n_estimators) - - #pprint(X_train_cv[y_train_cv == 1]) - #np.savetxt("occ_data.csv", X_train_cv, delimiter=",") - - nrow, ncol = X_train_cv[y_train_cv == 1].shape - - if (nrow == 0): - log("no samples (CV for gamma={}, nu={}, kernel={}), continuing...".format(gamma, nu, kernel)) - continue - - clf.fit(X_train_cv[y_train_cv == 1]) - pred = clf.predict(X_test_cv) - pred[pred < 0] = 0 # SVM outputs -1 for the "0" class - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreAuPRC = np.mean(scoresAuPRC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - - if(meanScoreAuPRC > bestAuPRC): - bestNu = nu - bestGamma = gamma - bestKernel = kernel - bestAuROC = auROC - bestAuPRC = meanScoreAuPRC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - -# retVal.train_eval_time = 0 -# retVal.train_accuracy = bestAccuracy -# retVal.train_auc = bestAuROC -# retVal.train_f1 = bestF1 -# retVal.train_gmean = bestGmean - - log('CV finished. Achieved best auROC={} using nu={}, gamma={} and kernel={}'.format(bestAuROC, bestNu, - bestGamma, bestKernel)) - clf = svm.OneClassSVM(nu=bestNu, kernel=bestKernel, gamma=bestGamma) - clf.fit(X_train[y_train == 1]) # final training using all data - retVal.train_time = pong(pTrain) - - # get CV train metrics - pTrainCV = ping() - bestAuROC = getCVPerformance(clf, X_train, y_train) - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auroc = bestAuROC - retVal.train_auprc = bestAuPRC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - - pTest = ping() - pred = clf.predict(X_test) - pred[pred < 0] = 0 # SVM outputs -1 for the "0" class - - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('OCC score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - #log('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test)) - - return retVal - - -def testTrainOCC(): - df = pd.DataFrame([ - [1,3,1,0], - [1,4,1,1], - [1,5,1,0], - [1.5,6,1,0], - [1.7,7,1,0], - [1,4,1,0], - [1,6,1,0], - [1,5,1,1], - [1,12,1,1], - [1,9,1,1], - [1,2,1,1], - [1,3,1,1], - [1,5,1,1], - [2,8,1,0], - [3,1,0,1], - [3,2,0,1], - [4,2,0,0], - [5,3,0,0]], columns=['a', 'b', 'target','train']) - dfTrainTest = df['train'] - X, y = createDatasetXY(df, indexFeatureStart = 0, indexFeatureEnd = 2, targetColumnName = "target") - X_train, X_test, y_train, y_test = train_test_split_index(X, y, dfTrainTest) - - trainOCC(X_train, X_test, y_train, y_test) - -def trainOCCScale(X_train, X_test, y_train, y_test, - c_scale = 0, mode = "constant", targetClass = "minority"): - log("training OCC") - retVal = trainResult() - - pTrain = ping() - - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - 
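    # as in trainOCC above: the one-class SVM is later fit on the minority ("1") samples
    # only, and raw predictions of -1 are mapped to 0 before the metrics are computed.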
X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype( - float) # otherwise indices from X will be used - - if (str(type(y_train)) != ""): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != ""): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - - ms_process_total = 0 - bestAuROC = 0 - bestAuPRC = 0 - bestNu = 0.1 - bestKernel = 'linear' - bestGamma = 0.1 - gammaz = [0.001, 0.01, 0.1, 1] - nuz = [0.01, 0.1, 0.5, 0.75, 1] - kernelz = ['linear', 'poly', 'sigmoid'] - for gamma in gammaz: - for nu in nuz: - for kernel in kernelz: # 'rbf' - # log("CV for gamma={}, nu={}, kernel={}".format(gamma, nu, kernel)) - - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - # print(kf) # print info about folds - - scoresAuROC = [] - scoresAuPRC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for train_index, test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - # tests will be unaffected - if (c_scale > 0): - pProcess = ping() - X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, - targetClass=targetClass) - ms_process_total += pong(pProcess) - - if ((np.isnan(X_train_cv)).any): - X_train_cv = np.nan_to_num(X_train_cv) - - clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma, tol=0.01) - # n_estimators = 10 - # clf = BaggingClassifier(svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma), - # max_samples=1.0 / n_estimators, n_estimators=n_estimators) - - # pprint(X_train_cv[y_train_cv == 1]) - # np.savetxt("occ_data.csv", X_train_cv, delimiter=",") - - nrow, ncol = X_train_cv[y_train_cv == 1].shape - - if (nrow == 0): - log("no samples (CV for gamma={}, nu={}, kernel={}), continueing...".format(gamma, nu, kernel)) - continue - - clf.fit(X_train_cv[y_train_cv == 1]) - pred = clf.predict(X_test_cv) - pred[pred < 0] = 0 # SVM outputs -1 for the "0" class - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - meanScoreAuPRC = np.mean(scoresAuPRC) - - if(meanScoreAuPRC > bestAuPRC): - bestAuROC = auROC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - bestAuPRC = meanScoreAuPRC - bestNu = nu - bestKernel = kernel - bestGamma = gamma - - # retVal.train_eval_time = 0 - # retVal.train_accuracy = bestAccuracy - # retVal.train_auc = bestAuROC - # retVal.train_f1 = bestF1 - # retVal.train_gmean = bestGmean - - log('CV finished. Achieved best auROC={} using nu={}, gamma={} and kernel={}'.format(bestAuROC, bestNu, - bestGamma, bestKernel)) - - if (c_scale > 0): - log("scaling final train data...") - pProcess = ping() - X_train, y_train = createScaledDataset(X_train, y_train, - c = c_scale, mode = mode, targetClass = targetClass) - ms_process_total /= len(gammaz) # allowed, because: Folds can be calculated out of the CV loop. - ms_process_total /= len(nuz) # allowed, because: Folds can be calculated out of the CV loop. 
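        # editor's note: ms_process_total was accumulated over every grid point and fold;
        # dividing by the three grid sizes (the divisions above and below) leaves roughly
        # the cost of scaling the training folds once, which is what a single run would pay.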
- ms_process_total /= len(kernelz) # allowed, because: Folds can be calculated out of the CV loop. - ms_process_total += pong(pProcess) - - if(np.isnan(X_train).any()): - X_train= np.nan_to_num(X_train) - - clf = svm.OneClassSVM(nu=bestNu, kernel=bestKernel, gamma=bestGamma) - clf.fit(X_train[y_train == 1]) # final training using all data - retVal.train_time = pong(pTrain) - - # get CV train metrics (reuse the best CV scores from the grid search above) - pTrainCV = ping() - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auroc = bestAuROC - retVal.train_auprc = bestAuPRC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - if(c_scale > 0): - retVal.ms_process = ms_process_total - - pTest = ping() - pred = clf.predict(X_test) - pred[pred < 0] = 0 # SVM outputs -1 for the "0" class - - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('OCC score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, - retVal.f1, retVal.accuracy, retVal.gmean)) - - - return retVal - -def trainKNN(X_train, X_test, y_train, y_test): - log("training KNN") - retVal = trainResult() - - knn = neighbors.KNeighborsClassifier() - - pTrain = ping() - model = knn.fit(X_train, y_train) - retVal.train_time = pong(pTrain) - - # get CV train metrics - pTrainCV = ping() - bestAuROC = getCVPerformance(model, X_train, y_train) - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auroc = bestAuROC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - - pred = model.predict(X_test) - - lastModel = model - lastY = y_test - lastPred = pred - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('KNN score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - - return retVal - -from sklearn import datasets, neighbors, linear_model, svm -from sklearn.ensemble import RandomForestClassifier -import numpy as np -from sklearn import metrics -from sklearn.metrics import roc_auc_score, make_scorer -from sklearn.ensemble import BaggingClassifier -from sklearn.model_selection import KFold, cross_val_score - -def trainKNNScale(X_train, X_test, y_train, y_test, - c_scale = 0, mode = "constant", targetClass = "minority"): - log("training KNN") - retVal = trainResult() - - # train - ms_process_total = 0 - pTrain = ping() - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used - - if (str(type(y_train)) != "<class 'numpy.ndarray'>"): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != "<class 'numpy.ndarray'>"): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - from sklearn.model_selection import KFold - - bestAuROC = 0 - bestAuPRC = 0 - bestNN = 3 - neighborz = [3, 5, 10] - - number_of_folds = n_folds - num_of_cpus = multiprocessing.cpu_count() - - for nn in neighborz: - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - - scoresAuROC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for train_index, 
test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - # tests will be unaffected - if (c_scale > 0): - pProcess = ping() - X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, targetClass=targetClass) - ms_process_total += pong(pProcess) - - clf = neighbors.KNeighborsClassifier(n_neighbors = nn, n_jobs = num_of_cpus) - - # train - if ((np.isnan(X_train_cv)).any): - X_train_cv = np.nan_to_num(X_train_cv) - - clf.fit(X_train_cv, y_train_cv) - pred = clf.predict(X_test_cv) - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - meanScoreAuPRC = np.mean(scoresAuPRC) - - if(meanScoreAuPRC > bestAuPRC): - bestNN = nn - bestAuROC = auROC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - bestAuPRC = meanScoreAuPRC - - - if (c_scale > 0): - log("scaling final train data...") - pProcess = ping() - X_train, y_train = createScaledDataset(X_train, y_train, - c=c_scale, mode=mode, targetClass=targetClass) - ms_process_total /= len(neighborz) - ms_process_total = pong(pProcess) - - if((np.isnan(X_train)).any): - X_train= np.nan_to_num(X_train) - - clf = neighbors.KNeighborsClassifier(n_neighbors = bestNN, n_jobs = num_of_cpus) - model = clf.fit(X_train, y_train) - retVal.train_time = pong(pTrain) - - # get CV train metrics - pTrainCV = ping() - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auc = bestAuROC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - if(c_scale > 0): - retVal.ms_process = ms_process_total - - pred = model.predict(X_test) - - lastModel = model - lastY = y_test - lastPred = pred - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('KNN score: auc={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.f1, retVal.accuracy, - retVal.gmean)) - - return retVal - -def trainRF(X_train, X_test, y_train, y_test): - log("training RF") - retVal = trainResult() - - - # train - pTrain = ping() - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used - - - if (str(type(y_train)) != ""): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != ""): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - from sklearn.model_selection import KFold - bestAuROC = 0 - bestAuPRC = 0 - bestAccuracy = 0 - bestGmean = 0 - bestF1 = 0 - bestEstimators = 10 - bestCriterion = 'gini' - estimatorz = [5,10,20] - criterionz = ['gini', 'entropy'] - for estimators in estimatorz: - for criterion in criterionz: - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - - scoresAuROC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for 
train_index, test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - clf = RandomForestClassifier(n_estimators=estimators, criterion=criterion) - - # train - clf.fit(X_train_cv, y_train_cv) - pred = clf.predict(X_test_cv) - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - meanScoreAuPRC = np.mean(scoresAuPRC) - - if(meanScoreAuPRC > bestAuPRC): - bestEstimators = estimators - bestCriterion = criterion - bestAuROC = auROC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - bestAuPRC = meanScoreAuPRC - -# retVal.train_eval_time = 0 -# retVal.train_accuracy = bestAccuracy -# retVal.train_auc = bestAuROC -# retVal.train_f1 = bestF1 -# retVal.train_gmean = bestGmean - - clf = RandomForestClassifier(n_estimators=bestEstimators, criterion=bestCriterion) - clf.fit(X_train_cv, y_train_cv) - retVal.train_time = pong(pTrain) - - # get CV train metrics - pTrainCV = ping() - bestAuROC = getCVPerformance(clf, X_train, y_train) - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auc = bestAuROC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - - pred = clf.predict(X_test) - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('RF score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - - return retVal - - - -def trainRFScale(X_train, X_test, y_train, y_test, balanced = None, - c_scale = 0, mode = "constant", targetClass = "minority"): - log("training RF") - retVal = trainResult() - ms_process_total = 0 - - num_of_cpus = multiprocessing.cpu_count() - - - # train - pTrain = ping() - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used - - - if (str(type(y_train)) != ""): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != ""): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - from sklearn.model_selection import KFold - bestAuROC = 0 - bestAuPRC = 0 - bestAccuracy = 0 - bestGmean = 0 - bestF1 = 0 - bestEstimators = 10 - bestCriterion = 'gini' - estimatorz = [5,10,20] - criterionz = ['gini', 'entropy'] - for estimators in estimatorz: - for criterion in criterionz: - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - - scoresAuROC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for train_index, test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - clf = RandomForestClassifier(n_estimators=estimators, criterion=criterion, - class_weight = 
balanced, n_jobs = num_of_cpus) - - # tests will be unaffected - if (c_scale > 0): - pProcess = ping() - X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode=mode, c=c_scale, targetClass=targetClass) - ms_process_total += pong(pProcess) - - if (np.isnan(X_train_cv).any()): - X_train_cv = np.nan_to_num(X_train_cv) - - # train - clf.fit(X_train_cv, y_train_cv) - pred = clf.predict(X_test_cv) - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - meanScoreAuPRC = np.mean(scoresAuPRC) - - if(meanScoreAuPRC > bestAuPRC): - bestEstimators = estimators - bestCriterion = criterion - bestAuROC = auROC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - bestAuPRC = meanScoreAuPRC - - if (c_scale > 0): - log("scaling final train data...") - pProcess = ping() - X_train, y_train = createScaledDataset(X_train, y_train, - c=c_scale, mode=mode, targetClass=targetClass) - ms_process_total /= len(estimatorz) - ms_process_total /= len(criterionz) - ms_process_total += pong(pProcess) - - if(np.isnan(X_train).any()): - X_train= np.nan_to_num(X_train) - - clf = RandomForestClassifier(n_estimators=bestEstimators, criterion=bestCriterion, - class_weight = balanced, n_jobs = num_of_cpus) - clf.fit(X_train, y_train) # final training on the full (possibly scaled) training data - retVal.train_time = pong(pTrain) - - # get CV train metrics (reuse the best CV scores from the grid search above) - pTrainCV = ping() - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auroc = bestAuROC - retVal.train_auprc = bestAuPRC - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - if(c_scale > 0): - retVal.ms_process = ms_process_total - - pred = clf.predict(X_test) - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('RF score: auROC={}, auPRC={}, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - - return retVal - - - -def trainLR(X_train, X_test, y_train, y_test, balanced = None, scoring = "none"): - """ - Trains a logistic regression based on cross validation. - :param balanced: Use 'balanced' or None - Weights associated with classes in the form {class_label: weight}. - If not given, all classes are supposed to have weight one. - - The “balanced” mode uses the values of y to automatically adjust weights inversely - proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - - Note that these weights will be multiplied with sample_weight (passed through the - fit method) if sample_weight is specified. - From http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html - :param scoring: Use "none" for default, "auROC" for AUC of ROC curve - """ - log("training LR ({})".format(scoring)) - - retVal = trainResult() - - number_of_folds = n_folds - tolerance = 0.01 - num_of_cpus = multiprocessing.cpu_count() - - - if (scoring == "auROCWeighted"): - log("\"auROCWeighted\" set. 
Using area under ROC curve with weighted samples...") - n_samples, n_features = X_train.shape - n_classes = len(np.unique(y_train)) - w = n_samples / (n_classes * np.bincount(y_train)) # bincount returns the number of instances for non-negative integer: 0, 1, ... - # w holds now the inverse weights of all classes - w_array = w[y_train] # pick weight based on corresponding label - scorer = auc_scorer = make_scorer(roc_auc_score, - average = "weighted", - sample_weight = w_array) # additional parameters can be specified, see - # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score - # and http://scikit-learn.org/dev/modules/model_evaluation.html - elif (scoring == "auROC"): - log("\"auROC\" set. Using area under ROC curve...") - scorer = auc_scorer = make_scorer(roc_auc_score) # additional parameters can be specified, see - # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score - # and http://scikit-learn.org/dev/modules/model_evaluation.html - else: - log("Scoring method \"" + scoring + "\" not recognized or set. Using default (accuracy)...") - scorer = None - - - lr = linear_model.LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1], - class_weight = balanced, - cv = number_of_folds, - penalty = 'l1', - scoring = scorer, - solver = 'liblinear', - tol = tolerance, - n_jobs = num_of_cpus) - - - - tain_time = 0 - test_time = 0 - - - pTrain = ping() - model = lr.fit(X_train, y_train) - retVal.train_time = pong(pTrain) - - # for logisticRegressionCV the cv is already built in, therefore, we can use clf.scores_[1]: - # clf.scores_[1].shape > (6, 3) > 6 = number of folds, 3 = number of tried out Cs. - pTrainCV = ping() - #bestAuROC = getCVPerformance(model, X_train, y_train) - aucMeanCspecific = max(np.mean(model.scores_[1], axis = 0)) # get CV train metrics - #bestAuROC = max(np.mean(model.scores_[1], axis = 0)) # Take the mean for each C and then the maximum - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auc = aucMeanCspecific - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - log("best AUC during training was" + str(aucMeanCspecific)) - - pred_all = model.predict_proba(X_test) - pred = pred_all[:,1] - - #pprint(np.column_stack((y_test, pred.round(3)))) - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('LR score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - - return retVal - -def trainLRScale(X_train, X_test, y_train, y_test, balanced = None, - c_scale = 0, mode = "constant", targetClass = "minority"): - """ - Trains a logistic regression based on cross validation. - :param balance: Use 'balanced' or None - Weights associated with classes in the form {class_label: weight}. - If not given, all classes are supposed to have weight one. - - The “balanced” mode uses the values of y to automatically adjust weights inversely - proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - - Note that these weights will be multiplied with sample_weight (passed through the - fit method) if sample_weight is specified. 
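    (CSS-specific parameters, as used by evalAll below:)
    :param c_scale: scaling strength c handed to createScaledDataset; 0 disables scaling
    :param mode: CSS scaling mode, "constant" or "linear"
    :param targetClass: class to scale, "minority", "majority" or "both"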
- From http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html - :param scoring: Use "none" for default, "auROC" for AUC of ROC curve - """ - - retVal = trainResult() - - number_of_folds = n_folds - tolerance = 0.01 - num_of_cpus = multiprocessing.cpu_count() - ms_process_total = 0 - - # train - pTrain = ping() - X_train_pre = pd.DataFrame(X_train).reset_index(drop=True).as_matrix() # otherwise indices from X will be used - X_test = pd.DataFrame(X_test).reset_index(drop=True).as_matrix().astype(float) # otherwise indices from X will be used - - - if (str(type(y_train)) != ""): - y_train_pre = y_train.reset_index(drop=True) # otherwise indices from X will be used - else: - y_train_pre = y_train - - if (str(type(y_test)) != ""): - y_test = y_test.reset_index(drop=True).as_matrix() # otherwise indices from X will be used - from sklearn.model_selection import KFold - bestAuROC = 0 - bestAuPRC = 0 - bestC= 0.001 - Cs = [0.001, 0.01, 0.1, 1] - for c in Cs: - kf = KFold(n_splits=n_folds) - kf.get_n_splits(X_train_pre) - - scoresAuROC = [] - scoresF1 = [] - scoresAccuracy = [] - scoresGmean = [] - scoresAuPRC = [] - - for train_index, test_index in kf.split(X_train): - X_train_cv, X_test_cv = X_train_pre[train_index], X_train_pre[test_index] - y_train_cv, y_test_cv = y_train_pre[train_index], y_train_pre[test_index] - - # tests will be unaffected - if (c_scale > 0): - pProcess = ping() - X_train_cv, y_train_cv = createScaledDataset(X_train_cv, y_train_cv, mode = mode, c = c_scale, targetClass = targetClass) - ms_process_total += pong(pProcess) - - clf = linear_model.LogisticRegression(C=c, - class_weight = balanced, - penalty = 'l1', - solver = 'liblinear', - tol = tolerance, - n_jobs = num_of_cpus) - - # train - if((np.isnan(X_train_cv)).any): - X_train_cv= np.nan_to_num(X_train_cv) - - clf.fit(X_train_cv, y_train_cv) - pred = clf.predict(X_test_cv) - - # eval - auROC, f1, accuracy, gmean, auPRC = getMetrics(y_test_cv, pred) - - scoresAuROC.append(auROC) - scoresF1.append(f1) - scoresAccuracy.append(accuracy) - scoresGmean.append(gmean) - scoresAuPRC.append(auPRC) - - meanScoreAuROC = np.mean(scoresAuROC) - meanScoreF1 = np.mean(scoresF1) - meanScoreAccuracy = np.mean(scoresAccuracy) - meanScoreGmean = np.mean(scoresGmean) - meanScoreAuPRC = np.mean(scoresAuPRC) - - if(meanScoreAuPRC > bestAuPRC): - bestC = c - bestAuROC = auROC - bestAccuracy = meanScoreAccuracy - bestGmean = meanScoreGmean - bestF1 = meanScoreF1 - bestAuPRC = meanScoreAuPRC - - tain_time = 0 - test_time = 0 - lr = linear_model.LogisticRegression(C=bestC, - class_weight = balanced, - penalty = 'l1', - solver = 'liblinear', - tol = tolerance, - n_jobs = num_of_cpus) - if (c_scale > 0): - log("scaling final train data...") - pProcess = ping() - X_train, y_train = createScaledDataset(X_train, y_train, - c = c_scale, mode = mode, targetClass = targetClass) - ms_process_total /= len(Cs) - ms_process_total = pong(pProcess) - - if((np.isnan(X_train)).any): - X_train= np.nan_to_num(X_train) - model = lr.fit(X_train, y_train) - retVal.train_time = pong(pTrain) - - pTrainCV = ping() - aucMeanCspecific = bestAuROC - bestAccuracy, bestF1, bestGmean = -1, -1, -1 - retVal.train_eval_time = pong(pTrainCV) - retVal.train_accuracy = bestAccuracy - retVal.train_auc = aucMeanCspecific - retVal.train_f1 = bestF1 - retVal.train_gmean = bestGmean - log("best AUC during training was" + str(aucMeanCspecific)) - if(c_scale > 0): - retVal.ms_process = ms_process_total - - pred_all = model.predict_proba(X_test) - 
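    # predict_proba returns one column per class; column 1 is P(y = 1), which feeds the
    # threshold-free metrics (auROC / auPRC) in getMetrics below.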
pred = pred_all[:,1] - - #pprint(np.column_stack((y_test, pred.round(3)))) - - pTest = ping() - retVal.auROC, retVal.f1, retVal.accuracy, retVal.gmean, retVal.auPRC = getMetrics(y_test, pred) - retVal.test_time = pong(pTest) - - log('LR score: auROC={}f, auPRC={}f, f1={}, accuracy={}, gmean={}'.format(retVal.auROC, retVal.auPRC, retVal.f1, retVal.accuracy, retVal.gmean)) - - return retVal - - -# ## Main loop, where all targets will be called subsequently - -# In[ ]: - -def evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset, target, bs2_measure, - modeltypes = ['RF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth', 'ADASYN']): - global X_train, X_test, y_train, y_test - - nAllSample = len(X_train_pre) + len(X_test_pre) - nAllSamplePos = sum(y_train_pre) + sum(y_test_pre) - nAllSampleNeg = nAllSample - nAllSamplePos - - nTrainSampleBefore = len(X_train_pre) - nTrainSamplePosBefore = sum(y_train_pre) - nTrainSampleNegBefore = nTrainSampleBefore - nTrainSamplePosBefore - - for approach in approaches: - c_opt_cv = -1 - if target == "BEFUND___TA___61_14_535": - c_manual_values = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] - else: - c_manual_values = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] - - for c_manual in c_manual_values: - for scalingMode in ["linear", "constant"]: - msProcess = 0 # reset - approachMod = approach - if("Scale" in approach): - log("trying approach " + approach + ", using c=" + str(c_manual) + "...") - elif(c_manual == 0.0 and scalingMode == "constant"): - log("trying approach " + approach + "...") - c_opt_cv = -1 - else: - continue # only scaling makes sense to be analysed with various c's - - approachMod = approach - if('naive' in approach): - pProcess = ping() - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = createNaiveDataset(X_train_pre, y_train_pre) - msProcess = pong(pProcess) - - if('SMOTE' in approach): - pProcess = ping() - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = createSMOTEDataset(X_train_pre, y_train_pre) - msProcess = pong(pProcess) - - if('tomek' in approach): - pProcess = ping() - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = createTomekDataset(X_train_pre, y_train_pre) - msProcess = pong(pProcess) - - if('ADASYN' in approach): - pProcess = ping() - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = createADASYNDataset(X_train_pre, y_train_pre) - msProcess = pong(pProcess) - - if('OSS' in approach): - pProcess = ping() - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = createOSSDataset(X_train_pre, y_train_pre) - msProcess = pong(pProcess) - - if('CNN' in approach): - pProcess = ping() - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = createCNNDataset(X_train_pre, y_train_pre) - msProcess = pong(pProcess) - - targetClass = "" - if('ScaleMajority' in approach): - #pProcess = ping() - targetClass = "majority" - approachMod = "CSS" - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = X_train_pre, y_train_pre - #X_train, y_train = createScaledDataset(X_train_pre, y_train_pre,c = c_manual, mode = scalingMode, - # targetClass = targetClass - # ) - #msProcess = pong(pProcess) - - if('ScaleMinority' in approach): - #pProcess = ping() - targetClass = "minority" - approachMod = "CSS" - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = X_train_pre, y_train_pre - #X_train, y_train = createScaledDataset(X_train_pre, 
y_train_pre, - # c = c_manual, mode = scalingMode, - # targetClass = targetClass) - #msProcess = pong(pProcess) - - if('ScaleBoth' in approach): - #pProcess = ping() - targetClass = "both" - approachMod = "CSS" - X_test, y_test = X_test_pre, y_test_pre - X_train, y_train = X_train_pre, y_train_pre - #X_train, y_train = createScaledDataset(X_train_pre, y_train_pre, - # c = c_manual, mode = scalingMode, - # targetClass = targetClass) - #msProcess = pong(pProcess) - - for modeltype in modeltypes: - log("evaluating approach {} using model {}...".format(approach, modeltype)) - - #try: - if(modeltype == 'LR'): - log("debug: {}, {}, {}".format(c_manual, scalingMode, targetClass)) - retVal = trainLRScale(X_train, X_test, y_train, y_test, balanced=None, - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - - if(modeltype == 'KNN'): - retVal = trainKNNScale(X_train, X_test, y_train, y_test, - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - - if(modeltype == 'WLR'): - retVal = trainLRScale(X_train, X_test, y_train, y_test, balanced='balanced', - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - - if(modeltype == 'OCC'): - retVal = trainOCCScale(X_train, X_test, y_train, y_test, - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - - if(modeltype == 'RF'): - retVal = trainRFScale(X_train, X_test, y_train, y_test, - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - - if(modeltype == 'WRF'): - retVal = trainRFScale(X_train, X_test, y_train, y_test, balanced='balanced', - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - - if(modeltype == 'NN'): - retVal = trainNNScale(X_train, X_test, y_train, y_test, - c_scale = c_manual, mode = scalingMode, targetClass = targetClass) - #except Exception as e: - # log("building model failed:" + str(e)) - - dummy, nFeature = X_train.shape - nTrainSample = len(X_train) - nTrainSamplePos = sum(y_train) - nTrainSampleNeg = nTrainSample - nTrainSamplePos - - if(retVal.ms_process != -1): - log("ret val != -1. 
Using ms process from model ({}).".format(retVal.ms_process)) - msProcess = retVal.ms_process - else: - log("preprocessing took {}ms".format(msProcess)) - - Report.logToFile(target = target, dataset = dataset, model_type=modeltype, - model_train_time=retVal.train_time, - model_train_eval_time=retVal.train_eval_time, - model_test_time=retVal.test_time, - model_accuracy= retVal.accuracy, - model_auroc = retVal.auROC, - model_auprc = retVal.auPRC, - model_f1 = retVal.f1, - model_gmean = retVal.gmean, - model_train_accuracy= retVal.train_accuracy, - model_train_auroc = retVal.train_auroc, - model_train_auprc = retVal.train_auprc, - model_train_f1 = retVal.train_f1, - model_train_gmean = retVal.train_gmean, - num_features = nFeature, - num_sample_dataset = nAllSample, - num_sample_dataset_pos = nAllSamplePos, - num_sample_dataset_neg = nAllSampleNeg, - num_sample_train_before = nTrainSampleBefore, - num_sample_train_before_pos = nTrainSamplePosBefore, - num_sample_train_before_meg = nTrainSampleNegBefore, - num_sample_train_after = nTrainSample, - num_sample_train_after_pos = nTrainSamplePos, - num_sample_train_after_neg = nTrainSampleNeg, - bs2 = bs2_measure, - process_time = msProcess, - process_name = approachMod, - process_naive = 1 if ('naive' in approach) else 0, - process_sampling_up_smote = 1 if ('SMOTE' in approach) else 0, - process_sampling_up_adasyn = 1 if ('ADASYN' in approach) else 0, - process_sampling_down_oss = 1 if ('OSS' in approach) else 0, - process_sampling_down_cnn = 1 if ('CNN' in approach) else 0, - process_sampling_down_tomek = 1 if ('tomek' in approach) else 0, - process_weight = 1 if (modeltype == 'WLR') else 0, - process_scale_minority = 1 if ('CSS' in approachMod) else 0, - process_scale_mode = scalingMode if ('CSS' in approachMod) else "", - process_scale_target = targetClass if ('CSS' in approachMod) else "", - process_scale_c = c_manual if ('CSS' in approachMod) else 0) - - # notify ubidots - if use_ubi: - try: - new_value = ubi_last_timestamp.save_value({'value': 10, 'context':{'lastTimestamp': "'" + str(datetime.datetime.now()) + "'"}}) - except Exception as e: - log("ubidots failed." 
+ str(e)) - - - -# In[ ]: - -def evalSubSC(dfIn, name, classColumnName, modeltypes, approaches, - indexFeatureStart, indexFeatureEnd, fixedSplit = "no"): - X, y = createDatasetXY(df = dfIn, indexFeatureStart = indexFeatureStart, - indexFeatureEnd= indexFeatureEnd, targetColumnName=classColumnName) - bs2_measure = getBS2(X = X, y = y) - - nSample, nFeature = X.shape - nSampleNeg = nSample - sum(y) - - if (fixedSplit == "no"): - X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(X, y) - else: - X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_index(X, y, dfIn[fixedSplit]) - - - evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset = name, target = name, bs2_measure = bs2_measure, - modeltypes = modeltypes, approaches = approaches) - - log("completed " + name) - - -# In[ ]: - -def evalSubMC(dfIn, name, classColumnName, indices, modeltypes, approaches, - indexFeatureStart, indexFeatureEnd, fixedSplit = "no"): - for i in indices: - log("working " + name + " " + str(i)) - dfSub = dfIn.copy() - dfSub[classColumnName] = (dfSub[classColumnName] == i).astype(bool) - dfSub[classColumnName] = dfSub[classColumnName].astype(int) - - X, y = createDatasetXY(df = dfSub, indexFeatureStart = indexFeatureStart, - indexFeatureEnd= indexFeatureEnd, targetColumnName=classColumnName) - bs2_measure = getBS2(X = X, y = y) - - nSample, nFeature = X.shape - nSampleNeg = len(y) - sum(y) - log("{} samples total, {} negative and {} positive".format(nSample, nSampleNeg, (nSample-nSampleNeg))) - if(fixedSplit == "no"): - X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(X, y) - else: - X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_index(X, y, dfSub[fixedSplit]) - evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset = name, target = name + str(i), - bs2_measure = bs2_measure, modeltypes = modeltypes, approaches = approaches) - - log("completed " + name) - - -# In[ ]: - -def evalSubGlas(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting glass", force=True) - evalSubMC(dfGlass, "Glass", "Type", [1,2,3,5,6,7], modeltypes, approaches, 0, ncGlass-1) - - -# In[ ]: - -def evalSubGlas67(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - evalSubMC(dfGlass, "Glass", "Type", [6,7], modeltypes, approaches, 0, ncGlass-1) - - -# In[ ]: - -def evalSubVowel(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = [ 'OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting Vowel", force=True) - evalSubMC(dfVowel, "Vowel", "Class", range(0,11), modeltypes, approaches, 1, ncVowel - 2, fixedSplit = "Train or Test") - - -# In[ ]: - -def evalSubForest(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting forest", force=True) - evalSubSC(dfForest, "Forest", "area", modeltypes, approaches, 0, ncForest-1) - - -# In[ ]: - -def evalSubPima(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', - 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting pima", force=True) - evalSubSC(dfPima, 
"Pima", "Class", modeltypes, approaches, 0, 8) - - -# In[ ]: - -def evalSubPhoneme(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', - 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting phoneme", force=True) - evalSubSC(dfPhoneme, "Phoneme", "class", modeltypes, approaches, 0, 5) - - -# In[ ]: - -def evalSubVehicle(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', - 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting vehicle", force=True) - evalSubMC(dfVehicle, "Vehicle", "TARGET", [1, 2, 3, 4], modeltypes, approaches, 0, 18) - - -# In[ ]: - -def evalSubAbalone(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', - 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting abalone", force=True) - evalSubSC(dfAbalone, "Abalone", "Rings", modeltypes, approaches, 0, 8) - - -# In[ ]: - -def evalSubSatimage(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', - 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting satimage", force=True) - evalSubMC(dfSatimage, "Satimage", "CLASS", [1,2,3,4,5,7], modeltypes, approaches, 0, 18, fixedSplit = "TRAIN_TEST") - - -# In[ ]: - -def evalSubMammography(modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', - 'ScaleMajority', 'ScaleMinority', 'ScaleBoth']): - log("starting mammography", force=True) - evalSubSC(dfMammography, "Mammography", "target", modeltypes, approaches, 0, 6) - - -# In[ ]: - -def evalSubAutomotive(modeltypes = ['LR', 'KNN', 'WLR', 'RF', 'WRF', 'OCC'], - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth'], - strategies = ['Random', 'Planned']): - # read in targets - targets = pd.DataFrame.from_csv("data/targets_w_lambda_sub_sub.csv").sort_values(by = ['CNT']) - rows, cols = targets.shape - i = 1 - - # eval each - for target in targets['TARGET']: - #target = "DTC___1196802" # 0x 124302 = 1196802 is the lambda rex thing, 1250 - #target = "DTC___1257473" # 21 instances, first random entry - log("starting automotive (target= {})".format(target), force=True) - - # determine prefixes to drop (aside from prefix of target, which will be dropped automatically) - prefixesToDrop = [] - if (target.startswith("DTC___")): - prefixesToDrop = ["BEFUND___", "DK___"] - elif (target.startswith("BEFUND___")): - prefixesToDrop = ["DK___"] - - - dfTemp = getDataFrameForTarget(dfAutomotive.copy(), target, prefixesToDrop) - # move "TARGET" to start - target_col = dfTemp['TARGET'] - dfTemp.drop(labels=['TARGET'], axis=1,inplace = True) - dfTemp.insert(0, 'TARGET', target_col) - - for strat in strategies: - XAutomotive, yAutomotive = 0, 0 - if strat == 'Planned': - XAutomotive, yAutomotive = createDatasetUsingMetaXy(df = dfTemp.copy(), indexFeatureStart = 1, - indexFeatureEnd= ncAutomotive, targetColumnName="TARGET", - metaColumnName = "META___PLANNED") - - if strat == 'Random': - XAutomotive, yAutomotive = createDatasetUsingRandomXy(df = dfTemp.copy(), indexFeatureStart = 1, - indexFeatureEnd= ncAutomotive, ratio = 100) - - - - XAutomotive = dropPrefix(XAutomotive, "META___") - - # do PCA in any case - XAutomotive = doPCA(XAutomotive, numberOfDimensionsTarget = 100) - bs2_measure = getBS2(X =XAutomotive, y = 
yAutomotive) - - - X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split_scaled(XAutomotive, yAutomotive) - - log("types: X_train_pre={}, X_test_pre={}, y_train_pre={}, y_test_pre={}".format(str(type(X_train_pre)), - str(type(X_test_pre)), - str(type(y_train_pre)), - str(type(y_test_pre)))) - - nRowsBasic, nColsBasic = X_train_pre.shape - log("basic shape of train dataset is {} rows and {} features.".format(nRowsBasic, nColsBasic)) - - evalAll(X_train_pre, X_test_pre, y_train_pre, y_test_pre, dataset = "Automotive_" + strat , target = target, - bs2_measure = bs2_measure, modeltypes = modeltypes, approaches = approaches) - log("completed automotive") - - -# In[ ]: - -def doIt_stepwise(): - global logpath - smoothing = 5 - base_log_path = "log/" - - modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR'] # no NN, painfully slow - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth'] - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "mammography" + ".csv" - evalSubMammography(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "satimage" + ".csv" - evalSubSatimage(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "vowel" + ".csv" - evalSubVowel(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "forest" + ".csv" - evalSubForest(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "glass" + ".csv" - evalSubGlas(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "pima" + ".csv" - evalSubPima(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "phoneme" + ".csv" - evalSubPhoneme(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "vehicle" + ".csv" - evalSubVehicle(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "abalone" + ".csv" - evalSubAbalone(approaches = approaches, modeltypes = modeltypes) - - for i in range(0,smoothing): - logpath = base_log_path + str(i) + "_" + "automotive" + ".csv" - evalSubAutomotive(approaches = approaches, modeltypes = modeltypes) - -doIt_stepwise() - - -# In[ ]: - -def doIt(): - smoothing = 5 - - for i in range(0,smoothing): - log("starting smoothing iteration {}".format(i), force=True) - - #approaches = ['ScaleBoth'] - #modeltypes = ['OCC'] - modeltypes = ['RF', 'WRF', 'OCC', 'LR', 'KNN', 'WLR', 'NN'] - approaches = ['OSS', 'CNN', 'naive', 'SMOTE', 'tomek', 'ADASYN', 'ScaleMajority', 'ScaleMinority', 'ScaleBoth'] - - evalSubMammography(approaches = approaches, modeltypes = modeltypes) - evalSubSatimage(approaches = approaches, modeltypes = modeltypes) - evalSubVowel(approaches = approaches, modeltypes = modeltypes) - evalSubForest(approaches = approaches, modeltypes = modeltypes) - evalSubGlas(approaches = approaches, modeltypes = modeltypes) - evalSubPima(approaches = approaches, modeltypes = modeltypes) - evalSubPhoneme(approaches = approaches, modeltypes = modeltypes) - evalSubVehicle(approaches = approaches, modeltypes = modeltypes) - evalSubAbalone(approaches = approaches, 
-                       modeltypes = modeltypes)
-        evalSubAutomotive(strategies = ['Random'], approaches = approaches, modeltypes = modeltypes)
-
-#doIt()
-
-
-# In[ ]:
-
-if False:
-    def doIt():
-        smoothing = 2
-
-        for i in range(0,smoothing):
-            log("starting smoothing iteration {}".format(i))
-
-            modeltypes = ['NN']
-
-            evalSubMammography(modeltypes = modeltypes)
-            evalSubSatimage(modeltypes = modeltypes)
-            evalSubVowel(modeltypes = modeltypes)
-            evalSubForest(modeltypes = modeltypes)
-            evalSubGlas(modeltypes = modeltypes)
-            evalSubPima(modeltypes = modeltypes)
-            evalSubPhoneme(modeltypes = modeltypes)
-            evalSubVehicle(modeltypes = modeltypes)
-            evalSubAbalone(modeltypes = modeltypes)
-            evalSubAutomotive(strategies = ['Random'], modeltypes = modeltypes)
-
-    doIt()
-
-
-# In[ ]:
-
-if False:
-    evalSubAutomotive(modeltypes = ['LR', 'KNN', 'WLR', 'RF'],
-                      strategies = ['Planned'])
-    evalSubAutomotive(modeltypes = ['OCC'],
-                      strategies = ['Planned'])
-
-
-# The following results are stored to CSV:
-# - TARGET: Name of the target, e.g. "DTC\_\_\_12345" or "Vowel1" or "class0"
-# - DATASET: Dataset, e.g. "automotive"
-# - MODEL_TYPE = "KNN", "LR" or "OCC" (one class classifier)
-# - MODEL_TRAIN_TIME: Training time of the Knn in ms
-# - MODEL_ACCURACY: Accuracy achieved by the classifier
-# - MODEL_AUC: AUC achieved by the classifier
-# - MODEL_F1: F1 achieved by the classifier
-# - MODEL_GMEAN: the g performance measure
-# - NUM_FEATURES: The number of features (after being compressed by PCA in the automotive set)
-# - NUM_SAMPLES_POS: The number of observations where (minority) target class was present
-# - NUM_SAMPLES_NEG: The number of negative observations, where target class was not present
-# - PROCESS_TIME: time in ms it took to complete all processing steps
-# - PROCESS_NAIVE: 0/1 if dataset has been processed the naive way
-# - PROCESS_SAMPLING_UP_SMOTE: 0/1 if minority class of dataset has been upsampled using SMOTE. 1 means 100% minority samples, 0.5 means half as many minority samples as majority samples
-# - PROCESS_SAMPLING_UP_ADASYN: 0/1 if minority class of dataset has been upsampled using ADASYN
-# - PROCESS_SAMPLING_DOWN_OSS: 0/1 if borderline samples have been dropped using one-sided selection
-# - PROCESS_SAMPLING_DOWN_CNN: 0/1 if CNN has been used to downsample
-# - PROCESS_SAMPLING_DOWN_TOMEK: 0/1 if samples part of tomek links have been dropped
-# - PROCESS_WEIGHT: 0/1 if trainingsamples are weighted inversly the number of instances
-# - PROCESS_SCALE_MINORITY: 0/1 if minority class samples have been scaled
-# - PROCESS_SCALE_MODE: "fixed" (normalization constant explicitly set), "auto" (scaled until no tomek links)
-# - PROCESS_SCALE_C: The normalization constant, in case SCALE_MODE is set to "fixed"

From d9d4410e9268ed3a0ec7780a5a161363c6f9b80a Mon Sep 17 00:00:00 2001
From: Bernhard Schlegel
Date: Wed, 28 Mar 2018 18:06:11 +0200
Subject: [PATCH 4/6] cleaned up code, removed unnecessary files, made names more verbose, introduced safe indexing

---
 imblearn/scaling/base.py |   2 +-
 imblearn/scaling/css.py  | 103 +++++++++++++++++++--------------------
 2 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/imblearn/scaling/base.py b/imblearn/scaling/base.py
index e8d77fe2f..0b3fc3690 100644
--- a/imblearn/scaling/base.py
+++ b/imblearn/scaling/base.py
@@ -15,4 +15,4 @@ class BaseScaler(BaseSampler):
     instead.
""" - _sampling_type = 'scaling' + _sampling_type = 'scaling' \ No newline at end of file diff --git a/imblearn/scaling/css.py b/imblearn/scaling/css.py index d54fce42b..25c8faa70 100644 --- a/imblearn/scaling/css.py +++ b/imblearn/scaling/css.py @@ -7,10 +7,12 @@ from collections import Counter import random import numpy as np +from sklearn.utils import check_random_state, safe_indexing + from .base import BaseScaler CSS_MODE = ('linear', 'constant') -CSS_TARGET = ('minority', 'majority', 'both') +CSS_SAMPLING_STRATEGY = ('minority', 'majority', 'both') class CSS(BaseScaler): @@ -22,7 +24,7 @@ class CSS(BaseScaler): Defines the scaling mode. Currently, two modes are implemented: `'constant'` and `'linear'`. - In `'constant'` mode, all samples of the `'target'` class will be scaled + In `'constant'` mode, all samples of the `'sampling_strategy'` class will be scaled by the same amount `c` to their class specific center. The following formula will be applied to calculate the new feature (`X`) values: `X[y==0] * (1-c) + col_means * c` @@ -36,7 +38,7 @@ class CSS(BaseScaler): `X[y==0] * (1-c) / norm + col_means * (distances * c) / norm - target : str (default = 'minority') + sampling_strategy : str (default = 'minority') defines which class to scale. Possible values are 'minority', 'majority', and 'both'. Note that all sample are scaled to their corresponding class center. @@ -44,7 +46,7 @@ class CSS(BaseScaler): c : float (default = 0.25) Defines the amount of the scaling. - target_class_value: int (default = None) + sampling_strategy_class_value: int (default = None) class level indicating the minority class. By default (`None`) the minority class will be automatically determined. Use any integer number (e.g. `0`, `1` or `-1`) to force the minority class. @@ -60,10 +62,10 @@ class will be automatically determined. Use any integer number (e.g. `0`, mode_ : str CSS mode ('constant' or 'linear') - target_ : str or int - Name of the target class ('majority', 'minority', 'both') + sampling_strategy_ : str or int + Name of the sampling_strategy class ('majority', 'minority', 'both') - target_class_value: int + sampling_strategy_class_value: int class level indicating the minority class c_ : dict of str/int : int @@ -85,7 +87,7 @@ class level indicating the minority class >>> X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2), 0.5 * rng.randn(n_samples_2, 2) + [2, 2]] >>> y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2)) >>> X_syn, y_syn = shuffle(X_syn, y_syn) - >>> css = CSS(mode="linear", target="both", c=0.1, shuffle=True) + >>> css = CSS(mode="linear", sampling_strategy="both", c=0.1, shuffle=True) >>> X_train_res, y_train_res = css.fit_sample(X_syn, y_syn) References @@ -96,23 +98,18 @@ class level indicating the minority class """ def __init__(self, + sampling_strategy='minority', mode='linear', - target='minority', c=0.25, minority_class_value=None, - shuffle=True, - random_state=None): - super(CSS, self).__init__(ratio=1) + shuffle=True): + super(CSS, self).__init__() + self.sampling_strategy = sampling_strategy self.mode = mode - self.target = target self.c = c self.minority_class_value = minority_class_value self.shuffle = shuffle - def _validate_estimator(self): - i = 1 - # nothing to do - def fit(self, X, y): """Find the classes statistics before to perform sampling. 
@@ -133,13 +130,9 @@ def fit(self, X, y):
 
         super(CSS, self).fit(X, y)
 
-        self._validate_estimator()
-
         return self
 
     def _shuffleTwo(self, a, b):
-        #if len(a) != len(b):
-        #    raise ValueError("lenth of a ({}) doesn't match length of b ({})".format(len(a), len(b)))
         indexes = np.array(range(0, len(a)))
         random.shuffle(indexes)
@@ -173,10 +166,10 @@ def _sample(self, X, y):
                              ' Choices are {}. Got \'{}\' instead.'.format(
                                  CSS_MODE, self.mode))
 
-        if self.target not in CSS_TARGET:
-            raise ValueError('Unknown kind for CSS target.'
+        if self.sampling_strategy not in CSS_SAMPLING_STRATEGY:
+            raise ValueError('Unknown kind for CSS sampling_strategy.'
                              ' Choices are {}. Got \'{}\' instead.'.format(
-                                 CSS_TARGET, self.target))
+                                 CSS_SAMPLING_STRATEGY, self.sampling_strategy))
 
         if self.c < 0 or self.c > 1:
             raise ValueError('Received scaling factor c={}, which'
                              ' is outside the allowed range '
                              '(0-1].'.format(self.c))
@@ -188,55 +181,59 @@ def _sample(self, X, y):
 
         if self.minority_class_value is not None and \
                 not isinstance(self.minority_class_value, int):
-            raise ValueError('Unallowed target class value \'{}\'.'
+            raise ValueError('Unallowed sampling_strategy class value \'{}\'.'
                              ' Valid values include None to automatically'
-                             ' infer the target class or any integer number'
+                             ' infer the sampling_strategy class or any integer number'
                              ' corresponding to the value of the label in y')
-
-        mcv = self.minority_class_value
-        if mcv is None:
+        minority_class = self.minority_class_value
+        if minority_class is None:
             # infer minority class value
             counts = Counter(y)
             least_common = counts.most_common()[:-1-1:-1]
-            mcv = least_common[0][0]
+            minority_class = least_common[0][0]
+
+        # get indices for later, safe indexing
+        majority_class_indices = (y != minority_class)
+        minority_class_indices = (y == minority_class)
 
-        # in the following _a is majority, _i is minority
-        if self.target is "majority" or self.target is "both":
-            col_means_a = np.mean(X[(y != mcv)], axis=0)
+        # in the following _majority is majority, _minority is minority
+        if self.sampling_strategy == "majority" or self.sampling_strategy == "both":
+            # mean_majority_class is the mean of all features (=columns)
+            mean_majority_class = np.mean(safe_indexing(X, majority_class_indices), axis=0)
             if self.mode is "linear":
-                distances_a = abs(np.subtract(X[y != mcv], col_means_a))
-        if self.target is "minority" or self.target is "both":
-            col_means_i = np.mean(X[(y == mcv)], axis=0)
+                distances_majority = abs(np.subtract(safe_indexing(X, majority_class_indices), mean_majority_class))
+        if self.sampling_strategy == "minority" or self.sampling_strategy == "both":
+            mean_minority_class = np.mean(safe_indexing(X, minority_class_indices), axis=0)
             if self.mode is "linear":
-                distances_i = abs(np.subtract(X[y == mcv], col_means_i))
+                distances_minority = abs(np.subtract(safe_indexing(X, minority_class_indices), mean_minority_class))
 
-        if self.target is "majority" or self.target is "both":
+        if self.sampling_strategy == "majority" or self.sampling_strategy == "both":
             if self.mode is "constant":
-                X_scaled_a = X[y != mcv] * (1 - self.c) + col_means_a * self.c
+                X_scaled_majority = safe_indexing(X, majority_class_indices) * (1 - self.c) + mean_majority_class * self.c
             elif self.mode is "linear":
-                scale_factors_mean = (distances_a * self.c)
-                scale_factors_values = (1 - self.c * distances_a)
+                scale_factors_mean = (distances_majority * self.c)
+                scale_factors_values = (1 - self.c * distances_majority)
 
-            X_scaled_a = X[y != mcv] * scale_factors_values + col_means_a * scale_factors_mean
-        if self.target is "minority" or self.target is "both":
+                X_scaled_majority = safe_indexing(X, majority_class_indices) * scale_factors_values + mean_majority_class * scale_factors_mean
+        if self.sampling_strategy == "minority" or self.sampling_strategy == "both":
             if self.mode is "constant":
-                X_scaled_i = X[y == mcv] * (1 - self.c) + col_means_i * self.c
+                X_scaled_minority = safe_indexing(X, minority_class_indices) * (1 - self.c) + mean_minority_class * self.c
             elif self.mode is "linear":
-                scale_factors_mean = (distances_i * self.c)
-                scale_factors_values = (1 - self.c * distances_i)
-                X_scaled_i = X[y == mcv] * scale_factors_values + col_means_i * scale_factors_mean
+                scale_factors_mean = (distances_minority * self.c)
+                scale_factors_values = (1 - self.c * distances_minority)
+                X_scaled_minority = safe_indexing(X, minority_class_indices) * scale_factors_values + mean_minority_class * scale_factors_mean
 
         # merge scaled and non scaled stuff
-        if self.target is "majority":
-            X_scaled = np.concatenate([X_scaled_a, X[y == mcv]], axis=0)
-        elif self.target is "minority":
-            X_scaled = np.concatenate([X[y != mcv], X_scaled_i], axis=0)
+        if self.sampling_strategy == "majority":
+            X_scaled = np.concatenate([X_scaled_majority, safe_indexing(X, minority_class_indices)], axis=0)
+        elif self.sampling_strategy == "minority":
+            X_scaled = np.concatenate([safe_indexing(X, majority_class_indices), X_scaled_minority], axis=0)
         else: #"both"
-            X_scaled = np.concatenate([X_scaled_a, X_scaled_i], axis=0)
+            X_scaled = np.concatenate([X_scaled_majority, X_scaled_minority], axis=0)
 
         # make sure that y is in same order like X
-        y_assembled = np.concatenate([y[y != mcv], y[y == mcv]], axis=0)
+        y_assembled = np.concatenate([y[majority_class_indices], y[minority_class_indices]], axis=0)
 
         # shuffle
         X_scaled_shuffled, y_res_shuffled, indices = self._shuffleTwo(X_scaled, y_assembled)

From b1afd231444a8ee593e992076b7b903de41b3779 Mon Sep 17 00:00:00 2001
From: Bernhard Schlegel
Date: Wed, 28 Mar 2018 18:13:48 +0200
Subject: [PATCH 5/6] corrected code style, moved checks to fit

---
 imblearn/scaling/css.py | 51 ++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/imblearn/scaling/css.py b/imblearn/scaling/css.py
index 25c8faa70..ac21c68f3 100644
--- a/imblearn/scaling/css.py
+++ b/imblearn/scaling/css.py
@@ -127,9 +127,33 @@ def fit(self, X, y):
             Return self.
 
         """
-        super(CSS, self).fit(X, y)
 
+        if self.mode not in CSS_MODE:
+            raise ValueError('Unknown kind for CSS mode.'
+                             ' Choices are {}. Got {} instead.'.format(
+                                 CSS_MODE, self.mode))
+
+        if self.sampling_strategy not in CSS_SAMPLING_STRATEGY:
+            raise ValueError('Unknown kind for CSS sampling_strategy.'
+                             ' Choices are {}. Got {} instead.'.format(
+                                 CSS_SAMPLING_STRATEGY, self.sampling_strategy))
+
+        if self.c < 0 or self.c > 1:
+            raise ValueError('Received scaling factor c={}, which'
+                             ' is outside the allowed range '
+                             '(0-1].'.format(self.c))
+        if self.c == 0:
+            raise ValueError('Received scaling factor c={}, which is'
+                             ' equal to no CSS at all.'.format(self.c))
+
+        if (self.minority_class_value is not None and
+                not isinstance(self.minority_class_value, int)):
+            raise ValueError('Unallowed sampling_strategy class value \'{}\'.'
+                             ' Valid values include None to automatically'
+                             ' infer the sampling_strategy class or any integer number'
+                             ' corresponding to the value of the label in y'.format(self.minority_class_value))
+
         return self
 
     def _shuffleTwo(self, a, b):
@@ -161,31 +185,6 @@ def _sample(self, X, y):
 
         """
 
-        if self.mode not in CSS_MODE:
-            raise ValueError('Unknown kind for CSS mode.'
-                             ' Choices are {}. Got \'{}\' instead.'.format(
-                                 CSS_MODE, self.mode))
-
-        if self.sampling_strategy not in CSS_SAMPLING_STRATEGY:
-            raise ValueError('Unknown kind for CSS sampling_strategy.'
-                             ' Choices are {}. Got \'{}\' instead.'.format(
-                                 CSS_SAMPLING_STRATEGY, self.sampling_strategy))
-
-        if self.c < 0 or self.c > 1:
-            raise ValueError('Received scaling factor c={}, which'
-                             ' is outside the allowed range '
-                             '(0-1].'.format(self.c))
-        if self.c is 0:
-            raise ValueError('Received scaling factor c={}, which is'
-                             ' equal to no CSS at.'.format(self.c))
-
-        if self.minority_class_value is not None and \
-                not isinstance(self.minority_class_value, int):
-            raise ValueError('Unallowed sampling_strategy class value \'{}\'.'
-                             ' Valid values include None to automatically'
-                             ' infer the sampling_strategy class or any integer number'
-                             ' corresponding to the value of the label in y')
-
         minority_class = self.minority_class_value
         if minority_class is None:
             # infer minority class value

From 8cbc0eb658ee090de7593550d03f165189c9dda4 Mon Sep 17 00:00:00 2001
From: Bernhard Schlegel
Date: Wed, 28 Mar 2018 20:41:17 +0200
Subject: [PATCH 6/6] renamed test according to new naming scheme

---
 imblearn/scaling/tests/test_css.py | 44 +++++++++++++++---------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/imblearn/scaling/tests/test_css.py b/imblearn/scaling/tests/test_css.py
index 55987d4fc..5252d0cef 100644
--- a/imblearn/scaling/tests/test_css.py
+++ b/imblearn/scaling/tests/test_css.py
@@ -39,74 +39,74 @@ def test_css_mode():
 
     # should two fail (illegal value for mode)
-    css = CSS(mode='constant2', target='minority', c=1.01, shuffle=False)
+    css = CSS(mode='constant2', sampling_strategy='minority', c=1.01, shuffle=False)
     assert_raises(ValueError, css.fit_sample, X, y)
-    css = CSS(mode='no mode', target='minority', c=0, shuffle=False)
+    css = CSS(mode='no mode', sampling_strategy='minority', c=0, shuffle=False)
     assert_raises(ValueError, css.fit_sample, X, y)
 
     # these two should not fail
     try:
-        css = CSS(mode='constant', target='minority', c=0.25, shuffle=False)
+        css = CSS(mode='constant', sampling_strategy='minority', c=0.25, shuffle=False)
         css.fit_sample(X,y)
-        css = CSS(mode='linear', target='minority', c=0.25, shuffle=False)
+        css = CSS(mode='linear', sampling_strategy='minority', c=0.25, shuffle=False)
         css.fit_sample(X,y)
     except Exception as e:
         raise ValueError('CSS raised an Exception unexpectedly! ({})'.format(e))
 
 
 def test_css_target():
     # should two fail (illegal value for c)
-    css = CSS(mode='constant', target='abc', c=0.5, shuffle=False)
+    css = CSS(mode='constant', sampling_strategy='abc', c=0.5, shuffle=False)
     assert_raises(ValueError, css.fit_sample, X, y)
 
     # these three should not fail
     try:
-        css = CSS(mode='constant', target='minority', c=0.05, shuffle=False)
+        css = CSS(mode='constant', sampling_strategy='minority', c=0.05, shuffle=False)
         css.fit_sample(X,y)
-        css = CSS(mode='constant', target='majority', c=0.05, shuffle=False)
+        css = CSS(mode='constant', sampling_strategy='majority', c=0.05, shuffle=False)
         css.fit_sample(X,y)
-        css = CSS(mode='constant', target='both', c=0.05, shuffle=False)
+        css = CSS(mode='constant', sampling_strategy='both', c=0.05, shuffle=False)
         css.fit_sample(X,y)
     except Exception as e:
         raise ValueError('CSS raised an Exception unexpectedly! ({})'.format(e))
 
 
 def test_css_c():
     # should two fail (illegal value for c)
-    css = CSS(mode='constant', target='minority', c=1.01, shuffle=False)
+    css = CSS(mode='constant', sampling_strategy='minority', c=1.01, shuffle=False)
     assert_raises(ValueError, css.fit_sample, X, y)
-    css = CSS(mode='constant', target='minority', c=0, shuffle=False)
+    css = CSS(mode='constant', sampling_strategy='minority', c=0, shuffle=False)
     assert_raises(ValueError, css.fit_sample, X, y)
 
     # these two should not fail
     try:
-        css = CSS(mode='constant', target='minority', c=0.01, shuffle=False)
+        css = CSS(mode='constant', sampling_strategy='minority', c=0.01, shuffle=False)
         css.fit_sample(X,y)
-        css = CSS(mode='linear', target='minority', c=0.99, shuffle=False)
+        css = CSS(mode='linear', sampling_strategy='minority', c=0.99, shuffle=False)
         css.fit_sample(X,y)
     except Exception as e:
         raise ValueError('CSS raised an Exception unexpectedly! ({})'.format(e))
 
 
 def test_sample_regular():
-    # minority samples are unaffected when target is majority
-    css = CSS(mode='constant', target='majority', c=1, shuffle=False)
+    # minority samples are unaffected when sampling_strategy is majority
+    css = CSS(mode='constant', sampling_strategy='majority', c=1, shuffle=False)
     X_s, y_s = css.fit_sample(X,y)
     assert_allclose(X[y == 1], X_s[y_s == 1], rtol=R_TOL)
 
-    # majority samples are unaffected when target is minority
-    css = CSS(mode='constant', target='minority', c=1, shuffle=False)
+    # majority samples are unaffected when sampling_strategy is minority
+    css = CSS(mode='constant', sampling_strategy='minority', c=1, shuffle=False)
     X_s, y_s = css.fit_sample(X,y)
     assert_allclose(X[y == 0], X_s[y_s == 0], rtol=R_TOL)
 
-    # both are affected if target is both
-    css = CSS(mode='constant', target='both', c=1, shuffle=False)
+    # both are affected if sampling_strategy is both
+    css = CSS(mode='constant', sampling_strategy='both', c=1, shuffle=False)
     X_s, y_s = css.fit_sample(X,y)
     if np.allclose(X[y == 0], X_s[y_s == 0], rtol=R_TOL):
         raise ValueError('np arrays should not be close!')
 
     # mathematical correctness of constant scaling majority (coarse)
-    css = CSS(mode='constant', target='majority', c=1, shuffle=False)
+    css = CSS(mode='constant', sampling_strategy='majority', c=1, shuffle=False)
     X_s, y_s = css.fit_sample(X, y)
     X_s_sub = X_s[y_s == 0]
     for i in range(2, len(X_s_sub)):
@@ -117,7 +117,7 @@ def test_sample_regular():
 
     # mathematical correctness of constant scaling majority (fine)
     c_test = 0.25
-    css = CSS(mode='constant', target='majority', c=c_test, shuffle=False)
+    css = CSS(mode='constant', sampling_strategy='majority', c=c_test, shuffle=False)
     X_s, y_s = css.fit_sample(X, y)
     X_sub = X[y==0]
     X_s_sub = X_s[y_s==0]
@@ -134,7 +134,7 @@ def test_sample_regular():
 
     # mathematical correctness of constant scaling minority (fine)
     c_test = 0.25
-    css = CSS(mode='constant', target='minority', c=c_test, shuffle=False)
+    css = CSS(mode='constant', sampling_strategy='minority', c=c_test, shuffle=False)
     X_s, y_s = css.fit_sample(X, y)
     X_sub = X[y==1]
     X_s_sub = X_s[y_s==1]
@@ -151,7 +151,7 @@ def test_sample_regular():
 
     # mathematical correctness of linear scaling both
    c_test = 0.1
-    css = CSS(mode='linear', target='both', c=c_test, shuffle=False)
+    css = CSS(mode='linear', sampling_strategy='both', c=c_test, shuffle=False)
     X_s, y_s = css.fit_sample(X, y)
     for lvl in [0,1]:
         X_sub = X[y == lvl]
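Taken together, the docstring example and the tests above suggest the following end-to-end usage. This is a sketch under the assumptions of this patch series (the `imblearn.scaling.CSS` class and its `fit_sample` method); the 500/50 class split mirrors the docstring example:

import numpy as np
from sklearn.utils import shuffle
from imblearn.scaling import CSS

rng = np.random.RandomState(42)
n_maj, n_min = 500, 50
X = np.r_[1.5 * rng.randn(n_maj, 2), 0.5 * rng.randn(n_min, 2) + [2, 2]]
y = np.array([0] * n_maj + [1] * n_min)
X, y = shuffle(X, y, random_state=42)

# contract only the minority class a quarter of the way towards its center
css = CSS(mode='constant', sampling_strategy='minority', c=0.25, shuffle=False)
X_res, y_res = css.fit_sample(X, y)

# with shuffle=False the majority samples come back unchanged and in order,
# which is what test_sample_regular asserts above
assert np.allclose(X[y == 0], X_res[y_res == 0])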