Basic settings¶
We import the necessary libraries. The following cell must be run first, before the `pd`, `np`, `plt` and `sns` objects are used.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# we will set the display of graphs directly in the cells of the notebook
%matplotlib inline
# initialize the seaborn library
sns.set()
Data set - Titanic¶
RMS Titanic was a British passenger liner that sank on 15 April 1912 on her maiden commercial voyage. At the time of her service, she was the largest floating ship in the world. More than 1,500 people died in the accident out of the total estimated number of 2,224 passengers and crew members.
The dataset contains information about the passengers of the Titanic. Each passenger is described by the following attributes:
`pclass` - the class in which the passenger traveled
`survived` - indicates whether the passenger survived the accident (1) or not (0)
`name` - name of the passenger
`sex` - gender
`age` - age
`sibsp` - number of siblings/spouses of the passenger aboard
`parch` - number of parents/children of the passenger aboard
`ticket` - ticket number
`fare` - fare amount
`cabin` - the cabin in which the passenger was accommodated
`embarked` - the port where the passenger boarded (`S` - Southampton, `C` - Cherbourg, `Q` - Queenstown)
data = pd.read_csv("../data/titanic.csv") # load the data from the file ../data/titanic.csv
print(data.shape) # display the size of the data table in the form (number of rows, number of columns)
print(data.columns) # display the column names
data.head() # display the first 5 rows of the table
Editing values¶
Since we cannot directly interpret the ticket label and cannot extract useful information about the passengers from it, we will remove the `ticket` column from the data table.
data.drop(columns="ticket", inplace=True) # remove the ticket column directly from the data frame
In the `embarked` column, we replace the abbreviations `S`, `Q`, and `C` with `Southampton`, `Queenstown`, and `Cherbourg`.
# you can remap categorical attribute values using the map method of the Series object
data["embarked"] = data["embarked"].map({"S": "Southampton", "Q": "Queenstown", "C": "Cherbourg"})
data["embarked"].value_counts() # display the different values after replacement and their frequency
Replacement of missing values 1¶
One of the basic steps of data preprocessing is the handling of empty values. When values are empty, it is necessary to distinguish whether they are truly missing - i.e. the record should have a value, but for some reason it is not filled in - or whether the attribute simply does not apply to the given record. In the case of missing values, we can try to fill them in so as to disturb the relationships in the data as little as possible, or, if there are too many missing values, we can remove the affected columns or records from the data set.
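The strategies above (filling in values, dropping columns, dropping records) can be sketched on a small toy DataFrame. The column names and values here are hypothetical and serve only to illustrate the three options:

```python
import pandas as pd
import numpy as np

# toy frame with missing values (hypothetical data, not the Titanic dataset)
df = pd.DataFrame({"a": [1.0, np.nan, 3.0],
                   "b": [np.nan, np.nan, np.nan],
                   "c": [1, 2, 3]})

# strategy 1: fill missing values, e.g. with the column mean
filled = df["a"].fillna(df["a"].mean())

# strategy 2: drop a column that is (almost) entirely empty
reduced = df.drop(columns="b")

# strategy 3: drop records that still contain any missing value
complete = reduced.dropna()
```

Which strategy is appropriate depends on how many values are missing and how important the affected attribute is.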
# method isna returns a table with only Boolean values for each column (True - missing value, False - non-empty value)
# sum then counts the number of True values (ie the number of missing values for each column)
data.isna().sum() # count the number of missing values
We replace the missing values in the `fare` column.
p = data["fare"].hist() # draw a histogram of values
# calculate the mean value and the median
fare_mean = data["fare"].mean()
fare_median = data["fare"].median()
print("fare mean: {0:.4f}, median: {1:.4f}".format(fare_mean, fare_median))
# since fare values are significantly skewed, we will replace the missing values with the median, which better characterizes the most frequently occurring values
data["fare"].fillna(fare_median, inplace=True)
data["fare"].isna().sum() # check the number of missing values after replacement
Task 2.1¶
Replace the missing values of the `embarked` attribute with the most frequent value.
# replace missing values
# check the number of missing values after replacement
data["embarked"].isna().sum()
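One possible approach for Task 2.1, sketched on a toy Series (the values here are hypothetical stand-ins for `data["embarked"]`): the `mode` method returns the most frequent value(s), and `fillna` substitutes it for the missing ones.

```python
import pandas as pd
import numpy as np

# toy embarked column with a missing value (hypothetical stand-in for data["embarked"])
embarked = pd.Series(["Southampton", "Cherbourg", np.nan, "Southampton", "Queenstown"])

# mode() returns a Series of the most frequent values; we take the first one
most_frequent = embarked.mode()[0]
embarked = embarked.fillna(most_frequent)
```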
Derivation of new attributes¶
By deriving new attributes from existing values, we can often obtain useful information from a data set that is not directly expressed.
# we will create a new attribute family, which will indicate the total number of relatives (sum of sibsp + parch)
data["family"] = data.eval("sibsp + parch")
p = data["family"].hist()
# we will create a new binary attribute has_family, which will indicate whether the passenger traveled with his family
data["has_family"] = data.eval("family > 0")
data["has_family"].sum() # we count how many passengers traveled with their family
Passenger names are in the format `surname, title. name`. We will extract the title values.
# we define a function that extracts only part of the title from the entire name string
def extract_title(name):
    if pd.isna(name):  # use pd.isna to test whether name is an empty value
        # if name is empty, we return an empty value for the title as well
        # (empty values are represented in pandas by the numeric constant np.nan - Not a Number)
        return np.nan
    start = name.find(",") + 1
    end = name.find(".")
    # return the substring between "," and "." (without whitespace at the beginning and end)
    return name[start:end].strip()
# using the apply method, we apply our extract_title function to all the values of the name column and store the returned values in the title column
data["title"] = data["name"].apply(extract_title)
# we will no longer need the name column, so we will remove it from the data table
data.drop(columns="name", inplace=True)
# display the titles and how many times they occurred
data["title"].value_counts()
# using the apply method, we remap the titles to a shortened list assigned to the title_short column
def map_title(title):
    # map all rank or noble titles to the value 'rare title'
    if title in {"Master", "Dr", "Rev", "Col", "Major", "Don", "Jonkheer", "Sir", "Dona", "Lady", "Capt", "the Countess"}:
        return "rare title"
    elif title in {"Mlle", "Ms"}:  # titles from French
        return "Miss"
    elif title in {"Mme"}:
        return "Mrs"
    return title
data["title_short"] = data["title"].apply(map_title)
data["title_short"].value_counts()
Replacement of missing values 2¶
We can better estimate the missing values of one attribute based on other attributes. For example, we will display age statistics broken down by gender and title.
# in the pivot table, we group the data by gender and title of the passengers and calculate basic statistics about age
# the parameter margins=True adds overall statistics for each row and column of the pivot table
pd.pivot_table(data, index=["sex", "title_short"], values="age", aggfunc=["median", "mean", "min", "max", "count"], margins=True)
# the young age of men with a noble title is interesting; let's view the histogram of values in this group of passengers
l = data.query("sex == 'male' and title_short == 'rare title'")["age"].hist()
We can also use the `apply` method to transform or derive data from the values of an entire row. In the following code, we replace the missing age values with the median for the passenger's gender and title group.
# first we calculate the median for individual groups using a pivot table
ptable = pd.pivot_table(data, index=["sex", "title_short"], values="age", aggfunc="median")
ptable
The rows and columns of a pivot table can be labeled at multiple levels, so we need to specify a tuple of values for each level when accessing table values as an index.
In our table stored in the `ptable` variable, the columns have only one level (`age`), but the rows are labeled with two levels (`sex` and `title_short`).
# if we want e.g. to write a specific value in a row for a combination (female, Miss), we need to enter the index as a tuple
# (similarly, we would have to enter a tuple of values for the columns as well, if we had hierarchical column labels)
ptable["age"][("female", "Miss")]
# we save only non-empty age values in the age1 variable (the original data will not change)
age1 = data["age"].dropna() # the dropna method will only return records with non-empty values
# we define a function with which we replace the missing age values by gender and title
# `row` parameter will be an object representing one row in the table
def replace_missing_age(row):
    # row values for individual columns can be accessed by indexing
    age = row["age"]  # we look up the age, gender and title of the passenger
    sex = row["sex"]
    title = row["title_short"]
    # if age is a missing value, we replace it with the median of the group given by gender and title
    # (the computed median values are stored in the ptable pivot table)
    if pd.isna(age):
        return ptable["age"][(sex, title)]
    else:
        return age  # otherwise we return the known value
# we apply the function replace_missing_age to each row of the table (axis=1, the default value axis=0 would apply the function column by column)
# the result is the age values with the missing values replaced, which we store in the age2 variable
age2 = data.apply(replace_missing_age, axis=1)
# for comparison, we draw a histogram without replacement and after replacing the missing values
p = plt.hist([age1, age2])
data["age"] = age2 # we will replace the age column with filled missing values in the original data
Conversion of numeric attributes to ordinal - discretization of values¶
Numeric attributes can easily be converted to ordinal (ordered categorical) attributes by dividing them into intervals. The values can be divided into equally wide intervals, or according to the data into different intervals so that the number of values in each interval is approximately the same.
data["fare"].describe() # we will display the basic statistics for the fare attribute
data["fare_ordinal"] = pd.cut(data["fare"], 3) # using the cut method, we divide the values into 3 intervals of the same size
data["fare_ordinal"].value_counts() # display the labels of the intervals and their frequency
# most of the values belong approximately to the interval from 0-170, for a better understanding of the distribution of values, we will display a histogram
p = data["fare"].hist(bins=20) # the bins parameter specifies the number of intervals for the histogram calculation
# instead of dividing the data into intervals of equal size, we can divide the data using the qcut method into intervals with approximately the same number of values
data["fare_ordinal"] = pd.qcut(data["fare"], 3)
data["fare_ordinal"].value_counts() # display the labels of the intervals and their frequency
# with the cut method, we can directly enter the limit values of the intervals, e.g. we divide the fare into intervals [0-25], (25-100] and (100, 520]
# by default the first interval does not include the smallest value, so if we want to include 0 values as well, we set include_lowest to True
data["fare_ordinal"] = pd.cut(data["fare"], bins=[0, 25, 100, 520], include_lowest=True)
data["fare_ordinal"].value_counts()
# interval labels can be named directly by entering the labels parameter
data["fare_ordinal"] = pd.cut(data["fare"], bins=[0, 25, 100, 520], include_lowest=True, labels=["normal", "more expensive", "most expensive"])
data["fare_ordinal"].value_counts()
Dependencies between categorical attributes¶
You can explore dependencies between categorical attributes using crosstabs, which show the frequency of occurrence of all value combinations between selected categorical attributes, similar to how pivot tables summarize dependencies between categorical and numeric values. Crosstabs are generated using the crosstab
method.
# we calculate the frequency table for all combinations of values between the attributes pclass and sex
# note that in the crosstab function, unlike pivot_table, you must pass rows and columns directly as data attributes (Series objects)
pd.crosstab(index=data["pclass"], columns=data["sex"])
# similar to the pivot table, we can enter multiple rows or columns
# e.g. in the following table, we calculate for each combination of class and fare amount,
# how many women and how many men bought the given ticket
pd.crosstab(index=[data["pclass"], data["fare_ordinal"]], columns=data["sex"])
Task 2.2¶
Derive a new attribute `age_ordinal` by discretizing the `age` values into the intervals 0-13, 13-19, 19-65, and 65-maximum age, with the labels `child`, `young`, `adult`, and `old`.
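One possible solution sketch for Task 2.2, shown on toy age values (hypothetical, standing in for `data["age"]`); `include_lowest=True` makes the first interval cover age 0 as well:

```python
import pandas as pd

# toy age values (hypothetical stand-in for data["age"])
age = pd.Series([5, 15, 30, 70])

# discretize into the intervals 0-13, 13-19, 19-65, 65-max with the requested labels
age_ordinal = pd.cut(age, bins=[0, 13, 19, 65, age.max()],
                     include_lowest=True,
                     labels=["child", "young", "adult", "old"])
```

On the real data, the last bin boundary would be `data["age"].max()` and the result would be stored in `data["age_ordinal"]`.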
Task 2.3¶
Use the crosstab to find how many men and women have which title.
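A sketch of Task 2.3 on a toy DataFrame (hypothetical values standing in for `data["sex"]` and `data["title_short"]`):

```python
import pandas as pd

# toy data (hypothetical stand-in for the sex and title_short columns)
df = pd.DataFrame({"sex": ["male", "female", "female", "male"],
                   "title_short": ["Mr", "Mrs", "Miss", "Mr"]})

# frequency of each title for men and women
table = pd.crosstab(index=df["sex"], columns=df["title_short"])
```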
Task 2.4¶
Create a pivot table in which you can clearly see the number of rescued passengers for groups divided by age (`age_ordinal`), class, and gender. If you use the mean of the `survived` values as the aggregation function, how can you interpret the resulting numbers?
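A sketch for Task 2.4 on toy data (hypothetical values): since `survived` is a 0/1 attribute, its mean within a group is exactly the survival rate of that group.

```python
import pandas as pd

# toy data (hypothetical stand-in for the relevant columns)
df = pd.DataFrame({
    "age_ordinal": ["adult", "adult", "child", "child"],
    "pclass": [1, 1, 3, 3],
    "sex": ["male", "male", "female", "female"],
    "survived": [0, 1, 1, 1],
})

# mean of a 0/1 attribute = fraction of survivors in each group
rates = pd.pivot_table(df, index=["age_ordinal", "pclass", "sex"],
                       values="survived", aggfunc="mean")
```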
Task 2.5¶
Find out whether the fare or the place of embarkation has an effect on survival.
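One way to approach Task 2.5, sketched on toy data (hypothetical values): group by the fare category or the port of embarkation and compare the mean of `survived` between groups.

```python
import pandas as pd

# toy data (hypothetical stand-in for the relevant columns)
df = pd.DataFrame({
    "embarked": ["Southampton", "Southampton", "Cherbourg", "Cherbourg"],
    "fare_ordinal": ["normal", "normal", "most expensive", "most expensive"],
    "survived": [0, 0, 1, 1],
})

# survival rate by port of embarkation and by fare category
by_port = df.groupby("embarked")["survived"].mean()
by_fare = df.groupby("fare_ordinal")["survived"].mean()
```

Large differences between the group means would suggest that the attribute has an effect on survival.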
Task 2.6¶
Cabin designations begin with a letter that indicates the deck on which the cabin was located (e.g. cabin `C22` was located on deck `C`, etc.). Use the `apply` method to derive a new `deck` attribute with the deck name.
On which deck would you have a better chance of being saved?
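A sketch for the first part of Task 2.6 on toy cabin values (hypothetical, standing in for `data["cabin"]`); missing cabins are kept as missing decks, following the same pattern as `extract_title` above:

```python
import pandas as pd
import numpy as np

# toy cabin values (hypothetical stand-in for data["cabin"])
cabin = pd.Series(["C22", "B5", np.nan, "E101"])

def extract_deck(value):
    if pd.isna(value):
        return np.nan  # keep missing cabins as missing decks
    return value[0]    # the first letter of the cabin designation is the deck

deck = cabin.apply(extract_deck)
```

On the real data, the survival rate per deck could then be compared with a pivot table or a `groupby` over the new `deck` column.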