Introduction to Scikit-learn¶
The examples in this lesson aim to introduce the Scikit-learn library. Scikit-learn is a machine learning library for the Python language that contains implementations of many classification, regression and clustering algorithms used in data analytics.
The following examples demonstrate basic use of the library, including converting data into a format suitable for model training and the general workflow for training models. The examples use the standard Iris dataset to build and evaluate a simple classification model.
First, as in the previous lessons, we import the libraries we will use.
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
We will load the dataset that we will work with into a Pandas data frame. As a check, we print the header of the imported data frame.
# using the load_dataset() function, we load the iris dataset from the repository of standard datasets into the iris data frame
iris = sns.load_dataset("iris")
# print the header of the iris data frame
iris.head(10)
To examine the dataset, we use the Seaborn library and draw a pairwise diagram for the individual attributes, with color differentiation according to the value of the target attribute (species).
# set the default rendering style in the Seaborn library using the set() function
sns.set()
# we draw a pairplot for all attributes with color differentiation according to the species attribute
# the optional height parameter sets the height (in inches) of each facet of the rendered plot
g = sns.pairplot(iris, hue='species', height=2);
Modeling with Scikit-learn¶
Typically, modeling using the Scikit-Learn API consists of the following steps:
- Splitting the data into a feature matrix and a vector of target attribute values
- Splitting the data into training and test sets
- Choosing and importing the Scikit-learn class of the model we want to create
- Setting the parameters of the learning algorithm
- Training the model on the training data using the fit() function

The next steps then depend on the type of model we are training:
- For predictive models - using the model to predict the target attribute of the test data using the predict() function
- For descriptive models - deriving model properties using the transform() or predict() functions
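A minimal end-to-end sketch of these steps follows; the choice of the k-nearest neighbors estimator and the variable names here are illustrative, and the same steps are developed in detail in the rest of this lesson.
# sketch of the typical Scikit-learn workflow (illustrative estimator choice)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)                                # feature matrix and target vector
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)  # split into training and test sets
model = KNeighborsClassifier(n_neighbors=3)                      # choose the model and set its parameters
model.fit(X_tr, y_tr)                                            # train the model on the training data
y_pred = model.predict(X_te)                                     # predict the target attribute of the test data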
Splitting the data into a feature matrix and a vector of target attribute values¶
In order to use the data for modeling with the Scikit-Learn library, we need to extract the feature matrix and the array of target attribute values from the Pandas data frame. We will thus separate the columns that will be used to train the models from the target attribute. The image below shows how the data frame will be split. We can implement this using data frame operations.
The naming convention is to store the feature matrix in a variable whose name starts with the letter X. The feature matrix is assumed to be two-dimensional, with dimensions n_samples x n_features (where n_samples is the number of examples and n_features is the number of features (attributes)), and is usually stored as a NumPy array or a Pandas data frame.
In addition to the feature matrix, we also work with a vector of target attribute values, which we usually name with the initial letter y. This array is usually one-dimensional, with length n_samples (the number of examples in the dataset), which corresponds to the first dimension of the feature matrix, and is generally a NumPy array or a Pandas column. A vector of target attribute values can contain numeric values or discrete values representing classes.
# from the iris data frame, we use the drop() function to remove the column with the target attribute and store the resulting data frame in X_iris;
# the axis parameter set to 1 specifies that we remove the entire column from the frame
X_iris = iris.drop('species', axis=1)
# we will print out the dimensionality of the feature matrix (size 150 x 4)
X_iris.shape
# analogously, to create the vector of target attribute values, we assign to y_iris the values of the "species" column from the iris data frame
y_iris = iris['species']
# we print out the dimensionality of the target attribute value vector (dimension 150)
y_iris.shape
Division into training and test sets¶
In this step, we will show how to divide the dataset into a training set, on which we will train the classification model, and a test set, which we will use for its evaluation. We will use the train_test_split() function from the sklearn library. This function divides the dataset into a training and a test set; its parameters are the feature matrix (in this case X_iris) and a vector with the values of the target attribute (y_iris). The random_state parameter initializes the internal random number generator used to assign examples to the training and test sets. The test_size parameter (in the range from 0 to 1) specifies the proportion of the dataset placed in the test set (e.g. a value of 0.5 means that the ratio of the test set to the training set will be 50:50).
from sklearn.model_selection import train_test_split # we import the necessary function from the library
# we will use a function to split the feature matrix and the value vector into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.4, random_state=1)
Using train_test_split() will result in:
- X_train - feature matrix of the training set
- X_test - feature matrix of the test set
- y_train - vector of target attribute values of the training set
- y_test - vector of target attribute values of the test set
We can list the dimensions and headers of the individual matrices and vectors using the shape attribute and the head() function.
# YOUR CODE HERE
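For reference, one possible way to perform this check might look as follows (a sketch; any equivalent commands work, and X_train etc. come from the split above):
# print the dimensions of the training and test feature matrices and target vectors
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# print the header of the training feature matrix
X_train.head()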
Model training¶
At the beginning, it is necessary to import the class of the model that we want to train. In the Scikit-Learn library, each model has a corresponding Python class. In this case, we import the KNeighborsClassifier class corresponding to the k-nearest neighbors classifier.
With the model = KNeighborsClassifier() command, we initialize the model, and the fit() function is used to train it. The parameters of the fit() function are the feature matrix of the training examples and the corresponding vector of target attribute values of the training examples (in this case X_train and y_train created in the previous step).
We can use the created model (the model object) to classify the examples in the test set. We do this by applying the predict() function to the created model. Its parameter is the feature matrix (without the vector of target attribute values) of the test set (in this case X_test). The output is a vector of predictions computed by the model, which we store in y_model.
# importing the class corresponding to the model we will train
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train) # training the model on the training set
y_model = model.predict(X_test) # using the model to predict the target attribute of the test set feature matrix
Evaluation of the model¶
We can verify the accuracy of the learned classifier in a simple way. For this, we use the vector of target attribute values of the test set (y_test), which we compare with the predicted values (the y_model vector). Sklearn provides multiple functions for calculating various classifier quality metrics. In the example below, we calculate the accuracy, i.e. the proportion of correctly classified examples out of all examples.
First, we import the necessary function from the sklearn library. Then we calculate the accuracy metric with the command accuracy_score(y_test, y_model). The parameters of the function are the vector of target attribute values of the test set and the vector of values predicted by the model.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_model)
To see in more detail how the classifier predicted the values, we can use a confusion matrix. It shows how the classifier coped with classification into the individual classes: on the diagonal of the matrix we see the examples that were classified correctly, and off the diagonal the misclassified examples. We compute the matrix using the confusion_matrix() function, whose mandatory parameters are the vector of target attribute values and the vector of predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_model)
print(cm)
To make the matrix easier to read, we can add captions with the values of the target attribute to the individual rows and columns by transforming it into a data frame.
print(pd.DataFrame(
    confusion_matrix(y_test, y_model, labels=['setosa', 'versicolor', 'virginica']),
    index=['true:setosa', 'true:versicolor', 'true:virginica'],
    columns=['pred:setosa', 'pred:versicolor', 'pred:virginica']))
Of course, we can also visualize the matrix using the Seaborn library or other libraries.
g = sns.heatmap(cm, cmap='magma', annot=True) # render using the heatmap() function from seaborn
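The heatmap above labels the axes only with numeric indices. A possible refinement (a sketch using heatmap()'s optional xticklabels and yticklabels parameters; the label order must match the labels passed to confusion_matrix()):
# label the heatmap axes with the class names
labels = ['setosa', 'versicolor', 'virginica']
cm = confusion_matrix(y_test, y_model, labels=labels)
g = sns.heatmap(cm, cmap='magma', annot=True, xticklabels=labels, yticklabels=labels)
plt.xlabel('predicted')  # matplotlib.pyplot was imported as plt above
plt.ylabel('true')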
Division of data into training, validation and test sets¶
In the previous example, we divided the dataset into a training and a test set. Methodologically, it is sometimes appropriate to divide the dataset into a training set, on which we will train the model, a validation set, on which we will tune the model parameters, and a test set, which we will use for evaluation.
In this example, we will show how to use Scikit-learn to split the dataset in this way. We will again use the train_test_split() function as for the training/test split, but this time we apply it twice. First, we divide the dataset into two subsets in the chosen ratio, creating a test set and a "temporary" set, which we then divide in two once more - into a training set and a validation set.
The example below splits the dataset into training/validation/test sets in the ratio 60/20/20.
# first, we split the whole dataset in two, creating a "temporary" set and a test set in the ratio 80/20
# then we divide the "temporary" set into training and validation sets in the same way; to obtain the desired overall ratio, this time we split 75/25
X_temp, X_test, y_temp, y_test = train_test_split(X_iris, y_iris, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1)
The result of these operations will be the following feature matrices:
- X_train (dimension 90 x 4)
- X_val (dimension 30 x 4)
- X_test (dimension 30 x 4)
And vectors of target attribute values:
- y_train
- y_val
- y_test
You can check their dimensions and headers below using the shape attribute and the head() function.
# YOUR CODE HERE
We can then train the model on the training set, use the validation set to optimize the model parameters, and evaluate the model on the test set.
The code below trains a k-NN model on the training set with the parameter value k=3 and evaluates it on the validation set; in Task 5.1 you will train a second model with a different value of k and compare the two.
# importing the class corresponding to the model we will train
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3) # training kNN model with parameter value k=3
model.fit(X_train, y_train) # training the model on the training set
y_model = model.predict(X_val) # we use the model for prediction on the validation set
print(f"Accuracy of the first model: {accuracy_score(y_val, y_model)}") # we report the accuracy on the validation set, to be compared with the second model from Task 5.1
Task 5.1:¶
Write code to train a model model2 of type k-nearest neighbors with parameter value k=5. As in the previous paragraph, test the trained model on the validation set and report its accuracy using accuracy_score.
# YOUR CODE HERE
Then take the model that performed better on the validation set, evaluate it on the test set, and calculate its accuracy.
# YOUR CODE HERE
N-fold cross-validation¶
The scikit-learn library also provides functions for cross-validation. We can use the cross_val_score() function from the model_selection package. Its main parameters are:
- model - the model we want to evaluate
- X_train - feature matrix of the training set
- y_train - vector of target attribute values of the training set
- cv - parameter defining the number of folds of the cross-validation
The cv parameter can be an integer indicating the number of folds (e.g. cv=5 divides the training set into 5 subsets, which are then used to train and test 5 models). Instead of an integer, we can also pass an object that defines a specific way of dividing the training set (a sketch follows the example below). The cross_val_score() function returns an array of metric values - in this case, an array of classifier accuracies.
# we import the necessary function
from sklearn.model_selection import cross_val_score
# we will use cross-validation for the model, dividing the X_train/y_train training set into 5 parts
score = cross_val_score(model, X_train, y_train, cv=5)
print(score) # we will print score values
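We can aggregate the per-fold scores into a single estimate and, as mentioned above, pass a splitter object instead of an integer as cv. A sketch, with StratifiedKFold chosen as one possible splitter (the shuffle and random_state settings are illustrative):
# aggregate the per-fold accuracies into a single estimate
print(score.mean(), score.std())
# instead of an integer, cv can be a splitter object defining how the data is divided
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # stratified folds preserve class proportions
print(cross_val_score(model, X_train, y_train, cv=skf))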
If we want to use a different metric to evaluate the quality of the classifier in cross-validation, or to evaluate using several metrics at once, we need to use the cross_validate() function. In addition to the parameters specifying the model, the training set and the number of folds, it also has a scoring parameter, which contains a list of the metrics we want to calculate for the models. In the following example, the list contains only the accuracy metric (we will explain and show some others in the following exercises). In addition to the selected metrics, the cross_validate() function also reports the model training time (fit_time) and the time needed to evaluate the model on the held-out fold (score_time).
A list of all metrics that can be used to evaluate classification models can be found in the documentation here: https://scikit-learn.org/stable/modules/model_evaluation.html#multimetric-scoring
The output of the function is a dictionary mapping each metric to an array with one value per cross-validation fold. The individual keys can be listed using the command scores.keys().
# we import the necessary functions for cross-validation
from sklearn.model_selection import cross_validate
scoring = ['accuracy'] # we choose the metrics we want to calculate for the models
# execute the cross-validation
# parameter return_train_score specifies whether we also want to return the result of the evaluation on the training set in the results
scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=10, return_train_score=False)
# list the keys of the output dictionary, sorted alphabetically
print(sorted(scores.keys()))
print(scores['test_accuracy']) # we will print the selected field of metrics
Tasks¶
Load the Winequality dataset from the data directory for exercise 10. The dataset describes wines using 11 attributes covering various (mostly chemical) properties. The target attribute that we will predict with our models is the wine quality. All attributes in the dataset are numeric, and the dataset does not contain missing values.
Description of individual attributes:
- fixed acidity - concentration of non-volatile acids
- volatile acidity - concentration of volatile acids
- citric acid - amount of citric acid in the wine
- residual sugar - amount of sugar remaining in the wine after fermentation
- chlorides - salt content of the wine
- free sulfur dioxide - amount of free SO2
- total sulfur dioxide - total amount of SO2 (undetectable in small amounts; in larger concentrations it affects the taste of the wine)
- density - ratio of the wine's density to the density of water
- pH - characterizes the acidity/alkalinity of the wine on a scale from 0 (very acidic) to 14 (very basic); most wines have a pH of 3-4
- sulphates - amount of additives (e.g. antioxidants), which can be measured via the amount of sulfates
- alcohol - alcohol content of the wine
- quality - the target attribute, the resulting "grade" characterizing the quality of the wine; takes integer values 0 - 10
Task 5.2¶
Prepare this dataset for use with the Scikit-learn library:
- First combine the files winequality_white.csv and winequality_red.csv into one data frame
- Split the combined data into a feature matrix and a vector of target attribute values
# YOUR CODE HERE
Task 5.3¶
Divide the prepared dataset into a training and a test set in the ratio 80/20. Print the dimensions of the feature matrix and of the vector of target attribute values for both the training and the test set.
# YOUR CODE HERE
Task 5.4¶
Train a k-NN model on the training set with a selected value of the parameter k. Use 10-fold cross-validation to find the best value of the k parameter.
# YOUR CODE HERE
Task 5.5¶
Test the best model on the test set. Print the accuracy of the classifier and plot the confusion matrix in any way you choose.
# YOUR CODE HERE