# Titanic import and preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Load the already-cleaned Titanic dataset, discard the columns that are not
# used as model inputs, and turn the two binary columns into 0/1 integers.
titanic = (
    pd.read_csv("../data/titanic-processed.csv")
    .drop(columns=['cabin', 'ticket', 'title', 'deck', 'fare_ordinal', 'age_ordinal'])
    .assign(
        # "male" -> 0, "female" -> 1
        sex=lambda frame: frame['sex'].map({"male": 0, "female": 1}),
        # False -> 0, True -> 1
        has_family=lambda frame: frame['has_family'].map({False: 0, True: 1}),
    )
)

# One-hot encode the remaining categorical columns; each category value
# becomes its own indicator column.
titanic = pd.get_dummies(titanic, columns=['embarked', 'title_short'])

# Preview the first rows (rendered as cell output when run in a notebook).
titanic.head()


from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target attribute 'survived'.
X_titanic = titanic.drop(columns=['survived'])
# Target vector: the 'survived' column on its own.
y_titanic = titanic['survived']

# Sanity check: print the dimensions of the feature matrix and of the
# target vector.
print(X_titanic.shape)
print(y_titanic.shape)

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic,
    y_titanic,
    test_size=0.3,
    random_state=1,
)


from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Build a Gaussian Naive Bayes classifier and train it on the training
# split (fit() returns the fitted estimator itself).
nb = GaussianNB().fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_nb = nb.predict(X_test)

# Report the share of test rows whose predicted label matches the true one.
print(f"Presnosť (accuracy) modelu: {accuracy_score(y_test, y_nb)}")


# Class-membership probabilities for the first test row only; the result
# has one row with one probability per class.
prediction = nb.predict_proba(X_test.iloc[:1])

print(prediction)

Classification using probabilistic models - Naive Bayes

Task 7.6.