#!/usr/bin/env python

# import sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# import numpy
import numpy as np

# fix random seed for reproducibility
SEED = 7
np.random.seed(SEED)

def first_model():

    iris = load_iris()
    x = iris.data
    y = iris.target

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5, random_state=SEED)

    model = DecisionTreeClassifier()
    model = model.fit(x_train, y_train)
    preds = model.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds)*100)+"%")

# Task: load more classifiers and compare their accuracies, e.g.
# RandomForestClassifier, KNeighborsClassifier, GradientBoostingClassifier
# (one possible sketch is given in compare_models below)

# Q&A: what happens if you change test_size and/or the random_state value?

# Task: write a simple ensemble model which will combine multiple classifiers
# and check its accuracy with y_test

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
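
# A possible sketch for the comparison task above (hyperparameters are left at
# their sklearn defaults; the function name compare_models is illustrative):
from sklearn.ensemble import RandomForestClassifier

def compare_models():

    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=.5, random_state=SEED)

    classifiers = [
        DecisionTreeClassifier(),
        RandomForestClassifier(random_state=SEED),
        KNeighborsClassifier(),
        GradientBoostingClassifier(random_state=SEED),
    ]
    for clf in classifiers:
        clf = clf.fit(x_train, y_train)
        preds = clf.predict(x_test)
        print("Accuracy for %s: %s%%" % (clf.__class__.__name__, accuracy_score(y_test, preds)*100))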

def ensemble_model():

    iris = load_iris()
    x = iris.data
    y = iris.target

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5, random_state=SEED)

    model_1 = DecisionTreeClassifier()
    model_1 = model_1.fit(x_train, y_train)
    preds_1 = model_1.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

    model_2 = GaussianNB()
    model_2 = model_2.fit(x_train, y_train)
    preds_2 = model_2.predict(x_test)

    # Prediction accuracy
    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

    # average the two predictions (this yields continuous values, which accuracy_score rejects)
    try:
        preds_a = (preds_1+preds_2)/2.
        print("Ensemble preds", preds_a)
        print("Accuracy for ensemble model: " + str(accuracy_score(y_test, preds_a)*100)+"%")
    except Exception as exp:
        print("ERROR: %s" % exp)
        print("We need to handle continuous valies")

    # resolve the averaged ties: 1.5 means the two models disagreed between
    # classes 1 and 2; map it to either class and compare the accuracies
    preds_a = (preds_1+preds_2)/2.
    preds_a[preds_a==1.5] = 2
    print("Accuracy for ensemble model (1.5->2): " + str(accuracy_score(y_test, preds_a)*100)+"%")
    preds_a = (preds_1+preds_2)/2.
    preds_a[preds_a==1.5] = 1
    print("Accuracy for ensemble model (1.5->1): " + str(accuracy_score(y_test, preds_a)*100)+"%")

# Task: take 3 different classifiers and create ensemble with votes
# votes can be assigned as most common predictions among two classifiers


def vote_preds(preds1, preds2, preds3):
    votes = []
    for idx in range(len(preds1)):
        p1 = preds1[idx]
        p2 = preds2[idx]
        p3 = preds3[idx]
        if p1 == p2:
            votes.append(p1)
        elif p1 == p3:
            votes.append(p1)
        elif p2 == p3:
            votes.append(p2)
        else: # no agreement among the three; fall back to the rounded average
            ap = (p1+p2+p3)/3.
            votes.append(round(ap))
    return np.array(votes)
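
# An equivalent, vectorized majority vote (a sketch; note that on a three-way
# disagreement np.bincount(...).argmax() picks the smallest label, while
# vote_preds above rounds the average):
def vote_preds_np(preds1, preds2, preds3):
    stacked = np.vstack([preds1, preds2, preds3])  # shape (3, n_samples)
    return np.apply_along_axis(lambda col: np.bincount(col).argmax(), 0, stacked)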

def ensemble_votes():

    iris = load_iris()
    x = iris.data
    y = iris.target

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5, random_state=SEED)

    model_1 = DecisionTreeClassifier()
    model_1 = model_1.fit(x_train, y_train)
    preds_1 = model_1.predict(x_test)
    print("DecisionTree preds", preds_1)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

    model_2 = GaussianNB()
    model_2 = model_2.fit(x_train, y_train)
    preds_2 = model_2.predict(x_test)

    # Prediction accuracy
    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

    model_3 = KNeighborsClassifier()
    model_3 = model_3.fit(x_train, y_train)
    preds_3 = model_3.predict(x_test)

    # Prediction accuracy
    print("Accuracy for KNeighborsClassifier: " + str(accuracy_score(y_test, preds_3)*100)+"%")

    preds_a = vote_preds(preds_1,preds_2,preds_3)
    print("Accuracy for ensemble model with votes: " + str(accuracy_score(y_test, preds_a)*100)+"%")

# Introduce the concepts of scaling and cross validation

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

def cross_validation():  # renamed so it does not shadow sklearn's cross_val_predict
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    clf = SVC()
    cv = KFold(n_splits=4, random_state=SEED, shuffle=True)

    idx = 1
    for train_index, test_index in cv.split(X):
        clf.fit(X[train_index], y[train_index])
        ypred = clf.predict(X[test_index])
        acc = accuracy_score(y[test_index], ypred)
        print("Fold: %s, accuracy: %s" % (idx, acc))
        conf_matrix = confusion_matrix(y[test_index], ypred)
        print(conf_matrix)
        idx += 1
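
# The per-fold loop above can also be collapsed with sklearn's own
# cross_val_predict, which returns one out-of-fold prediction per sample;
# a minimal sketch:
def cross_validation_oof():
    from sklearn.model_selection import cross_val_predict as skl_cross_val_predict

    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    cv = KFold(n_splits=4, random_state=SEED, shuffle=True)
    ypred = skl_cross_val_predict(SVC(), X, y, cv=cv)
    print("Out-of-fold accuracy: %s" % accuracy_score(y, ypred))
    print(confusion_matrix(y, ypred))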

# Final task: write ensemble model (voting or not) which will perform
# best using cross validation techniques
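
# A possible sketch for the final task, assuming a hard-voting ensemble
# (the classifier choice and CV settings are illustrative):
def ensemble_cross_validation():
    from sklearn.ensemble import VotingClassifier
    from sklearn.model_selection import cross_val_score

    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    voter = VotingClassifier(
        estimators=[("dt", DecisionTreeClassifier()),
                    ("nb", GaussianNB()),
                    ("knn", KNeighborsClassifier())],
        voting="hard")
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    scores = cross_val_score(voter, X, y, cv=cv)
    print("Voting ensemble CV accuracy: %.2f%% +- (%.2f%%)" % (scores.mean()*100, scores.std()*100))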

# Bonus: https://www.programcreek.com/python/example/81062/sklearn.datasets.load_iris

# If time permits, introduce a Keras NN


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

from keras.models import Sequential
from keras.layers import Dense
# note: keras.wrappers.scikit_learn was removed in newer Keras releases,
# where the scikeras package provides KerasClassifier instead
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

def nn_model():
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    # convert integers to categorical variables (i.e. one hot encoded)
    cat_y = np_utils.to_categorical(encoded_y)
    print("input dataset labels : %s ... %s" % (y[0], y[-1]))
    print("categorical variables: %s ... %s" % (cat_y[0], cat_y[-1]))

    # create Keras NN model
    def base_model():
        model = Sequential()
        model.add(Dense(8, input_dim=4, activation='relu'))
        model.add(Dense(3, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    clf = KerasClassifier(build_fn=base_model, epochs=100, batch_size=5, verbose=0)

    # evaluate the model with 5-fold cross validation: each fold holds out 20% of the data for testing and trains on the remaining 80%
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    results = cross_val_score(clf, X, cat_y, cv=cv)
    print("NN validation accuracy: %.2f%% +- (%.2f%%)" % (results.mean()*100, results.std()*100))


def main():
    print("\n+++ first model")
    first_model()
    print("\n+++ ensemble model")
    ensemble_model()
    print("\n+++ ensemble votes")
    ensemble_votes()
    print("\n+++ cross validation technique")
    cross_validation()
    print("\n+++ neural networks")
    nn_model()

if __name__ == '__main__':
    main()
