Classification using composite models - Random Forests
Random Forests are currently one of the most widely used classification models. Scikit-learn provides an implementation of this algorithm in the RandomForestClassifier class. This classifier is used in the same way as the other classifiers.
We load the data again as in the previous examples. This time, however, for the Random Forests algorithm, we will not remove the redundant attributes, and we will also keep the nominal attribute Deck (even with missing values) and transform it using one-hot encoding.
# Titanic import and preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
titanic = pd.read_csv("../data/titanic-processed.csv")
titanic = titanic.drop(columns=['ticket', 'cabin'])
titanic['sex'] = titanic['sex'].map({"male": 0, "female": 1})
titanic['has_family'] = titanic['has_family'].map({False: 0, True: 1})
titanic['fare_ordinal'] = titanic['fare_ordinal'].map({"normal": 0, "more expensive": 1, "most expensive": 2})
titanic['age_ordinal'] = titanic['age_ordinal'].map({"child": 0, "young": 1, "adult": 2, "old": 3})
titanic = pd.get_dummies(titanic, columns=['embarked', 'title_short', 'deck', 'title'])
titanic.head()
We can modify the model with several parameters. Since this model consists of a number of decision-tree classifiers trained on different subsets of the input data, most of the parameters are the same as for decision trees (a short example of setting them follows the list):
n_estimators
- number of trees in the "forest"
bootstrap
- True/False - whether bootstrap samples are used when building the trees
oob_score
- True/False - whether or not to use out-of-bag examples to estimate accuracy
criterion
- criterion for choosing attributes - "gini", "entropy"
max_depth
- maximum tree depth (if set to None, full tree is expanded)
min_samples_split
- the smallest number of samples needed for node branching
min_samples_leaf
- the smallest possible number of examples in a leaf node
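For illustration, a minimal sketch of how these parameters might be set when constructing the classifier (the specific values below are only placeholders, not recommended settings):
# Illustrative example: a Random Forest with the parameters described above set explicitly
from sklearn.ensemble import RandomForestClassifier

rf_example = RandomForestClassifier(
    n_estimators=100,      # number of trees in the forest
    criterion="gini",      # attribute selection criterion ("gini" or "entropy")
    max_depth=None,        # None = each tree is grown fully
    min_samples_split=2,   # smallest number of samples needed to split a node
    min_samples_leaf=1,    # smallest possible number of examples in a leaf node
    bootstrap=True,        # use bootstrap samples when building the trees
    oob_score=True,        # estimate accuracy on out-of-bag examples
    random_state=1)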
X_titanic = titanic.drop('survived', axis=1) # create a feature matrix - use all columns except the target attribute and store in X_titanic
y_titanic = titanic['survived'] # create a vector of target attribute values as column 'survived'
print(X_titanic.shape) # for checking, we can print the dimensions of the matrix of values and the vector of the target attribute
print(y_titanic.shape)
from sklearn.model_selection import train_test_split # we import the function train_test_split()
X_train, X_test, y_train, y_test = train_test_split(X_titanic, y_titanic, test_size=0.3, random_state=1) # split the dataset into training and testing parts, so that the testing part will be 30% of the total dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
print(f"Presnosť (accuracy) modelu: {accuracy_score(y_test, y_rf)}")
cm = confusion_matrix(y_test, y_rf)
print(cm)
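To make the confusion matrix easier to read, we can optionally wrap it in a pandas DataFrame with labelled rows and columns (a small sketch; the row/column labels chosen here are only for illustration):
# Optional: display the confusion matrix with labelled rows and columns
# (rows = actual class, columns = predicted class)
cm_df = pd.DataFrame(cm,
                     index=['actual 0', 'actual 1'],
                     columns=['predicted 0', 'predicted 1'])
print(cm_df)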
Attribute importance is an essential output of the Random Forest model. We can access the importances through the feature_importances_ attribute of the trained model, and we can sort them and list them together with the corresponding attribute names.
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)
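The same information can also be visualised, for example as a horizontal bar chart of the most important attributes (a small sketch using the matplotlib/pandas imports above; showing the top 10 is an arbitrary choice):
# Plot the 10 most important attributes as a horizontal bar chart
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=True).tail(10).plot(kind='barh')
plt.xlabel('importance')
plt.title('Random Forest feature importances (top 10)')
plt.show()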
Task 7.5.
Try training the Random Forests model on the Titanic data.
Try different values of the parameters (especially set n_estimators
to different orders of magnitude, e.g. 10, 100, 1000).
Also vary the tree parameter settings: compare a Random Forest whose trees use the settings that came out as optimal in the previous task with a Random Forest consisting of many shallow trees. Do the results differ in any way?
# YOUR CODE HERE