Automated search for model parameters¶
In the last exercise, we showed how a model is tuned by setting the values of its parameters manually. This process can also take place automatically - by generating a number of models with different parameter settings and evaluating them. The aim of this task is to demonstrate such a search for the most suitable parameters of a classification model, specifically the optimal parameters of the k-NN model.
As in the previous task, we will work with the Titanic dataset, which we preprocessed in exercise no. 7. For the purposes of parameter tuning, we will preprocess it in the same way (using the same transformations) as in the previous exercise.
So first we import all the necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
We load the preprocessed Titanic data from exercise no. 7. They are located in the file ../data/titanic-processed.csv.
titanic = pd.read_csv("../data/titanic-processed.csv")
titanic.head()
Since we will create the same model (k-NN) as in the previous tasks, we will drop some of the attributes (those that contain too many missing values or too many distinct categorical values) and transform the remaining ones using One Hot Encoding or by assigning numerical indexes.
titanic = titanic.drop(columns=['cabin','deck','ticket','title'])
titanic['sex'] = titanic['sex'].map({"male": 0, "female": 1})
titanic['has_family'] = titanic['has_family'].map({False: 0, True: 1})
titanic['fare_ordinal'] = titanic['fare_ordinal'].map({"normal": 0, "more expensive": 1, "most expensive": 2})
titanic['age_ordinal'] = titanic['age_ordinal'].map({"child": 0, "young": 1, "adult": 2, "old": 3})
titanic = pd.get_dummies(titanic, columns=['embarked', 'title_short'])
titanic.head()
Since we are creating a k-NN model, it is also appropriate to preprocess the data by normalization. We therefore use MinMaxScaler again to scale the attributes to a uniform range.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
titanic = pd.DataFrame(scaler.fit_transform(titanic), index=titanic.index, columns=titanic.columns)
titanic.head()
We can now try to train the classification model on this preprocessed set. As in the previous exercise, we first divide the data into a feature matrix and a vector of target attribute values.
The target attribute in this task is survived (it expresses whether the given passenger survived the disaster or not). The target attribute will therefore be the vector y, and the remaining columns will form the feature matrix X.
X_titanic = titanic.drop('survived', axis=1) # create feature matrix - we will use all columns except the target attribute and store in X_titanic
y_titanic = titanic['survived'] # vector of target attribute values as the 'survived' column
print(X_titanic.shape)
print(y_titanic.shape)
Now we divide the data into a training and a test set. We will use the train_test_split() function to split the data, with the test set making up 30% of the dataset and the training set the remaining 70%.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_titanic, y_titanic, test_size=0.3, random_state=1) # split the dataset into training and testing parts, so that the testing part will be 30% of the total dataset
We will create an object of the k-NN model without specifying any parameters - this time we will search for them using Grid Search.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier() # initialize the kNN classifier
Grid Search for finding optimal algorithm settings¶
With the GridSearchCV function, we can automate the search for optimal algorithm parameters. Grid Search is an approach that automatically creates a set of models with different settings and validates them using cross-validation.
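Conceptually, Grid Search behaves like a loop over all candidate parameter values: it builds a model for each setting, cross-validates it and keeps the best-scoring one. The following minimal sketch (our own illustration, not part of the original exercise; it reuses X_train and y_train from the split above and the cross_val_score helper) shows the idea that GridSearchCV automates:
# illustrative sketch only - GridSearchCV performs this search (and the bookkeeping) for us
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

best_score, best_k = 0.0, None
for k in range(1, 50):                               # candidate values of the parameter k
    model = KNeighborsClassifier(n_neighbors=k)      # model with the tested parameter value
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')  # 5-fold cross-validation
    if scores.mean() > best_score:                   # remember the best average score
        best_score, best_k = scores.mean(), k
print(best_k, best_score)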
Setting the Grid Search parameters¶
We specify several input parameters to the Grid Search function in Scikit-learn, which define how the automated testing of the model parameters will take place.
In the example below, we will try to find the optimal value of the parameter k for the k-NN model using Grid Search. First, we define the range of values of the parameter k that we want to test.
from sklearn.model_selection import GridSearchCV # import libraries
# define the hyperparameter values
# for the parameter k, we will generate values from 1 to 49
k_range = list(range(1, 50))
print(k_range)
We will create a grid of model parameters. We have to be careful here - the names of the parameters in the grid must correspond to the names of the parameters of the model being tuned.
In this example, we have generated an array of integers that we want to use as the different values for testing the parameter k. In the k-NN model, this parameter is called n_neighbors (when setting the parameter of the k-NN classifier, we created models as, for example, KNeighborsClassifier(n_neighbors=3)), so in the collection that stores the parameters (param_grid) we assign the array of these values to the n_neighbors parameter.
# we will create the so-called parameter grid: we map the generated values to the parameter name
# in this case, we create an n_neighbors entry to which we assign the array of its examined values
param_grid = dict(n_neighbors=k_range)
print(param_grid)
Now that we have the parameter grid we want to explore, we'll run the Grid Search. GridSearchCV has the following parameters:
estimator - the model we want to train (in our case knn)
param_grid - the collection of model parameters and the lists of their values - beware, the parameter grid must be compatible with the parameters of the model!
cv - the number of cross-validation folds
scoring - the metric used to evaluate the models in cross-validation (e.g. accuracy, precision, recall, etc.)
# apply Grid Search - set the parameters:
# model - knn
# parameter array - param_grid
# we will use 5-fold cross-validation
# we will use the accuracy metric for evaluation
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy') #set the Grid Search
grid.fit(X_train, y_train) # apply Grid Search on training data
Evaluation of Grid Search results¶
Now we have a trained set of classifiers with different settings, and through the attributes of the grid object we can look at the concrete results of the models with different values of the input parameter k.
Using best_params_ we can see which parameter setting achieved the best result, and best_score_ gives the corresponding cross-validation score.
print("Best hyperparameters:")
print()
print(grid.best_params_)
print()
print(grid.best_score_)
Using cv_results_ we can get various metrics:
mean_test_score - average test score from cross-validation (using the metric defined as a Grid Search parameter)
std_test_score - standard deviation of the test score
rank_test_score - ranking of the model according to its test score
mean_train_score - average score on the training subsets
std_train_score - standard deviation of the score on the training data
mean_fit_time - average training time of the model
std_fit_time - standard deviation of the model training time
mean_score_time - average time needed to evaluate unseen examples
std_score_time - standard deviation of the evaluation time
params - model parameters
We can list all the metrics and information that the cv_results_ object stores:
sorted(grid.cv_results_.keys())
We can also look at specific results of specific models.
print(grid.cv_results_["mean_test_score"][24]) # result for a specific metric and a specific model (index 24, i.e. k = 25)
We can of course look at the results achieved by all models at once. We will list the average cross-validation score, its standard deviation and the parameters of the given model. To make the output more readable, we format it sensibly.
# see complete results
print("Individual scores for individual values of the parameter k:")
print()
means = grid.cv_results_['mean_test_score'] # we assign the results of test score averages to the variable means
stds = grid.cv_results_['std_test_score'] # we assign a list of standard deviations to the stds variable
params = grid.cv_results_['params']
for mean, std, param in zip(means, stds, params): # for all records, we print formatted output - zip pairs up the elements with the same index from several lists so they can be iterated together
    print("%0.3f (+/-%0.03f) for value %s" % (mean, std, param)) # output formatting
print()
We can also look at the specific results of a selected model by indexing into the individual elements of the results, including the partial scores from the individual cross-validation splits. After specifying the metric, we specify the index of the model we want to access.
# we can examine individual models and their specific results
print('Parameter k of model 0:')
print(grid.cv_results_["params"][0])
# model score with index 0 (k=1) for individual cross-validation splits
print()
print('CV score of model 0:')
print(grid.cv_results_["split0_test_score"][0])
print(grid.cv_results_["split1_test_score"][0])
print(grid.cv_results_["split2_test_score"][0])
print(grid.cv_results_["split3_test_score"][0])
print(grid.cv_results_["split4_test_score"][0])
# Average model score with index 0
print()
print('Average score of model 0')
print(grid.cv_results_["mean_test_score"][0])
Visualization of the dependence of the score on the value of the parameter k¶
To better understand how a single parameter influences the resulting model score, we can visualize this dependence. The relationship between the value of the parameter k and the achieved accuracy (the cross-validation test score) can then be plotted very simply using matplotlib.
# using matplotlib, we plot the dependence of the values of the parameter k and the score between these two quantities:
# YOUR CODE HERE
plt.plot( # YOUR CODE HERE )
plt.xlabel(' ... ')
plt.ylabel(' ... ')
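One possible way to create this plot (a sketch of a solution, assuming the grid search over k_range from above) is to plot the mean cross-validation scores against the candidate values of k:
# possible solution sketch - mean cross-validation accuracy for each candidate value of k
plt.plot(k_range, grid.cv_results_['mean_test_score'])
plt.xlabel('Value of k for k-NN')
plt.ylabel('Cross-validated accuracy')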
Simultaneous search of several parameters¶
We can specify several parameters simultaneously to the Grid Search method. The algorithm will thus search for all combinations of the defined parameters.
We will also try to find a combination of other parameters. For the k-NN algorithm, we can additionally set the parameter specifying distance weighting and the parameter specifying the distance metric used. We therefore create another list with values of the weights parameter and a list with values of the metric parameter.
# create parameter lists for k-NN algorithm weights and metrics
weights_range = # YOUR CODE HERE
metric_range = # YOUR CODE HERE
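As a hint, one possible choice of candidate values (a sketch only - the concrete lists are up to you, but these are valid options of KNeighborsClassifier in scikit-learn) could be:
# possible candidate values - uniform vs. distance-based weighting and several common distance metrics
weights_range = ['uniform', 'distance']
metric_range = ['euclidean', 'manhattan', 'minkowski']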
We then add both of these lists, together with the k_range list, to the parameter grid.
The parameter of the k-NN algorithm that specifies the weighting is called weights and the parameter defining the metric is called metric. We assign them the lists of values that we want to examine, i.e. we insert the value lists for the individual parameters into the param_grid collection.
# create a parameter array for the defined parameters and their ranges
param_grid = # YOUR CODE HERE
print(param_grid)
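One possible shape of this parameter grid (a sketch building on the candidate lists above) is a dictionary with one entry per model parameter:
# possible solution sketch - one entry per k-NN parameter, each with the list of its candidate values
param_grid = dict(n_neighbors=k_range, weights=weights_range, metric=metric_range)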
Now we run Grid Search in the same way - we specify the model, the parameter grid, the cross-validation setting and the evaluation metric.
# set the Grid Search parameters
grid = # YOUR CODE HERE
grid.fit(X_train, y_train)
We can access the results again in the same way.
Let's look at the best of the models and its score, and then, in the same way as in the previous task, print out the complete results.
# We will list the parameters and scores for the best of the models
# YOUR CODE HERE
# Print all the results
# YOUR CODE HERE
Using GridSearchCV, we trained models with different parameters on the training set and, at the same time, validated them on the training set using cross-validation. We thus identified the best parameters of the model. If we want to test the model on the test set to verify its quality, or to use it to predict new, unlabeled examples, we have to train a model with the identified parameters again. Then we can test it on the test set and output the classification contingency table.
Task 7.1.¶
Train the model with the best parameters on the training set and test it on the test set. Write a contingency table of results (confusion matrix).
# YOUR CODE HERE
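A possible solution sketch (our own illustration, assuming the best parameter combination stored in grid.best_params_ and the accuracy_score and confusion_matrix helpers from sklearn.metrics):
# possible solution sketch - retrain k-NN with the best parameters found and evaluate it on the test set
from sklearn.metrics import accuracy_score, confusion_matrix

knn_best = KNeighborsClassifier(**grid.best_params_)   # model with the best parameter combination
knn_best.fit(X_train, y_train)                         # train on the training set
y_pred = knn_best.predict(X_test)                      # predict the labels of the test set

print("Accuracy on the test set:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))                # contingency table (confusion matrix)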