Basic settings¶
When analyzing the data, we will use several libraries, which must be imported into your notebook at the beginning. The numpy library provides basic types such as arrays representing numerical vectors or matrices. When analyzing data, we will most often work with the pandas library, which builds on numpy and defines basic types for representing data tables and their columns. We will use the matplotlib and seaborn libraries to draw graphs that will be displayed directly as part of the notebook.
# import the required libraries; this cell should be run first, before the libraries are used
# the imported modules are available as the pd, np, plt and sns objects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# set the display of graphs directly in the cells of the notebook
%matplotlib inline
# initialize the seaborn library
sns.set()
Dataset - World Happiness Report¶
The World Happiness Report is published annually by the United Nations and is based on questionnaire survey data collected from a representative sample of the population of individual countries. The questionnaire survey focuses on evaluating one's own life based on several criteria, which include economic indicators, social care, healthcare, freedom of decision, perception of kindness among people and perception of corruption.
The data available to you are evaluated for the years 2015, 2016 and 2017 and are described by the following attributes:
- Country - Name of the country
- Region - Geopolitical region of the country
- Rank - Overall ranking of countries for a given year, according to the achieved score
- Score - Achieved score expressing the overall happiness of the country
- GDP - Economic situation - gross domestic product per capita
- Family - Social support of the family
- Health - Health status - life expectancy
- Freedom - Freedom of expression and decision-making
- Trust - Perception of corruption
- Generosity - Perception of kindness
The overall rating includes the difference between the given country and Dystopia - the hypothetical country that has the lowest rating for all indicators (i.e. no country can be worse than Dystopia).
Loading data and basic statistics¶
The data table is represented in the pandas library by the type DataFrame (data frame), which is composed of individual columns represented by the type Series (data sequence). pandas directly supports reading and writing data in standard formats such as .csv (Comma-Separated Values - a text format with comma-separated values).
# load the data for the year 2015 from the file 2015.csv
data_2015 = pd.read_csv("../data/2015.csv")
# display the names of the columns (the columns property is an object of type pandas.Index, which represents the sequence
# of indexes for the rows or columns of the table; it is accessed similarly to a Python list)
data_2015.columns
The data is accessed by indexing, in the same way as Python lists or dictionaries. By default, the columns are indexed by their name and the values in the columns by the numerical index of the table row (starting from 0). For each numerical column, we can directly calculate basic statistics, such as the number of non-empty values, minimum, maximum, mean, standard deviation and quantiles.
# you can find out the size of the data table from the shape property
print(data_2015.shape[0]) # first dimension - number of rows
print(data_2015.shape[1]) # second dimension - number of columns
# you can access the data by indexing, e.g. to print the value of the 'Country' column on the first row of the table (index 0)
print(data_2015['Country'][0])
scores = data_2015['Score'] # scores is an object of type pandas.Series that represents one column of the table
# for numeric columns we can calculate basic statistics directly
score_count = scores.count() # number of non-empty values
score_mean = scores.mean() # average
score_std = scores.std() # standard deviation
score_min = scores.min() # minimum value
score_max = scores.max() # maximum value
score_q25 = scores.quantile(0.25) # first quartile (25th percentile)
score_q50 = scores.quantile(0.5) # median (50th percentile)
score_q75 = scores.quantile(0.75) # third quartile (75th percentile)
# statistics for all numerical attributes of the table can also be calculated at once using the describe() method
data_2015.describe()
# for categorical attributes we can display a list of different values and their frequency
# e.g. for 'Region'
data_2015['Region'].value_counts()
Data preparation¶
We will gradually load and modify the data for the years 2015, 2016 and 2017. From the table for the year 2015, we will remove the StdError column, which indicates the standard error of the score among the residents of one country; we will not use it in the analysis.
# columns are removed using the drop method
# by default, the drop method does not change the original data frame, but creates a new one, so we change the variable data_2015
# to the new value
data_2015 = data_2015.drop(columns="StdError")
# we add the 'Year' column to the table, whose value we set to 2015 for all rows
data_2015["Year"] = 2015
# using the head method, we display the first rows of the table (by default, the head method returns a new DataFrame object with the first
# 5 rows of the original table)
data_2015.head()
We load the data for the year 2016, from which we remove the columns LowerConfidence and UpperConfidence, and add the column Year with the value 2016 for all rows.
# load the data for the year 2016
data_2016 = pd.read_csv("../data/2016.csv")
# if we set the inplace parameter of the drop method to True, the columns are removed directly in the original frame instead
# of creating a new one; the columns parameter can also be given a list of several columns to be deleted at once
data_2016.drop(columns=["LowerConfidence", "UpperConfidence"], inplace=True)
# add the Year column
data_2016["Year"] = 2016
# you can specify the number of rows to display for the head method
data_2016.head(1) # print 1 row
From the print, you can notice that the table for 2016 does not contain the total score and ranking of the countries (columns Score and Rank). Since Score and Rank are calculated from the other indicators, we can simply compute them with the eval method. For the eval method, the expression for calculating the values is entered as a string, which can contain names of existing columns, constants (numbers, strings, etc.) and the operators +, -, *, /, **, %.
# we add a new column Score to the table, the value of which we calculate for each row as the sum of the attributes GDP, Family,
# Health, Freedom, Trust, Generosity and Dystopia
data_2016["Score"] = data_2016.eval("GDP + Family + Health + Freedom + Trust + Generosity + Dystopia")
# the Rank column is determined by the overall ranking of the countries sorted by score from the largest to the smallest
# we sort the rows of the table in descending order of the score (sort_values returns a new sorted frame by default,
# so we assign the result back to the data_2016 variable)
data_2016 = data_2016.sort_values(by="Score", ascending=False)
# for checking, we display the first 3 and the last 3 countries
data_2016.head(3)
# the tail method returns the last rows of the table
data_2016.tail(3)
# since the data is sorted in descending order by the total score, we set the values in the Rank column to
# the sequence of numbers 1, 2, ..., number of countries (number of rows in the table)
data_2016["Rank"] = range(1, data_2016.shape[0] + 1)
data_2016.head()
Task 1.1¶
- Load the data for the year 2017 from the file 2017.csv into the variable data_2017.
- Remove the unnecessary columns WhiskerHigh and WhiskerLow.
- Add a Year column and set its value to 2017 for all rows.
- Display the first row of the table.
data_2017 = None
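One possible solution (a sketch that mirrors the preparation of the 2016 data; fill it in place of the None placeholder above):
# load the data for the year 2017 from the file 2017.csv
data_2017 = pd.read_csv("../data/2017.csv")
# remove the unnecessary columns directly in the original frame
data_2017.drop(columns=["WhiskerHigh", "WhiskerLow"], inplace=True)
# add the Year column with the value 2017 for all rows
data_2017["Year"] = 2017
# display the first row of the table
data_2017.head(1)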
# using the concat method, we can combine the data for all years into one table; the parameter sort=False keeps
# the columns in their original order, and ignore_index=True replaces the original row indexes with a new continuous
# numbering that follows the order of the combined tables (i.e. first all 2015 rows, then 2016 and finally 2017)
data = pd.concat([data_2015, data_2016, data_2017], ignore_index=True, sort=False)
data.head(1)
Selection of data¶
It is possible to select only some columns from the table by name, or rows according to a specified condition. To select rows, you can use the query method, where the logical condition is entered simply as a string, similar to entering expressions in the eval method.
# we select only the Country, Rank, Score and Year columns
selected = data[["Country", "Rank", "Score", "Year"]]
# we will select only rows about Slovakia
selected.query("Country == 'Slovakia'")
# when filtering rows, you can enter a more complex condition with the operators <, >, <=, >=, !=, ==, in [list],
# not in [list] and with logical conjunctions and, or and not
# e.g. we select the rows about Slovakia's neighbors for 2017 and sort the result according to the overall ranking of the countries
q = "Country in ['Slovakia', 'Czech Republic', 'Poland', 'Hungary', 'Ukraine', 'Austria'] and Year == 2017"
selected.query(q).sort_values(by="Rank")
Dependencies between attributes¶
One of the basic tasks in the data understanding phase is to analyze the dependencies between pairs of attributes.
Dependencies between categorical and numerical attributes¶
Dependencies between categorical and numerical attributes can be examined e.g. using a pivot table that divides the data into groups according to categorical attribute values and summarizes for each group numerical attribute values using various aggregation functions (e.g. mean/min/max value, sample standard deviation, etc.)
We will create a pivot table using the pivot_table method, where, in addition to the data, we must enter at least one categorical attribute and one numeric one.
# e.g. to calculate the average score for all countries in a given year we can enter
pd.pivot_table(data, index="Year", values="Score")
# we can group data according to several categorical attributes at once,
# e.g. by region and year
pd.pivot_table(data, index=["Region", "Year"], values="Score")
# we can rearrange the pivot table by moving some categorical attributes from the rows of the table
# (parameter index) to columns (parameter columns)
# e.g. we can display the previous table more clearly as follows
table = pd.pivot_table(data, index="Region", columns="Year", values="Score")
table
# values of the pivot table can be directly displayed graphically, e.g. as a horizontal bar graph
table.plot(kind="barh")
# set the description of the x-axis
l = plt.xlabel("Average Happiness Score")
# you can calculate multiple aggregation functions in one table by setting the aggfunc parameter to a list of functions,
# e.g. to calculate the mean value and standard deviation of the score for each region:
pd.pivot_table(data, index="Region", values="Score", aggfunc=["mean", "std"])
# you can also calculate different aggregation functions for different numeric attributes at once
# e.g. in the following table we calculate the average value for the score (Score) and the minimum and maximum value
# for the rank (Rank) for each region
pd.pivot_table(data, index="Region", values=["Score", "Rank"], aggfunc={"Score": "mean", "Rank": ["min", "max"]})
Dependencies between numerical attributes - correlation¶
The basic type of dependence between two numerical attributes is a linear dependence, which can be expressed by a (Pearson) correlation coefficient.
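For two numerical attributes x and y with values $x_i$ and $y_i$ on $n$ rows, the Pearson correlation coefficient is defined as

$$r_{xy} = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2} \, \sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}}$$

where $\bar{x}$ and $\bar{y}$ are the mean values of the attributes. The coefficient ranges from -1 to 1; values close to ±1 indicate a strong linear dependence, while values close to 0 indicate no linear dependence.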
# we select only the numeric attributes
factors = data[["GDP", "Family", "Health", "Freedom", "Trust", "Generosity"]]
# we calculate the correlation table using the corr method
corr_table = factors.corr()
corr_table
We can display the values graphically using a heat map.
p = sns.heatmap(corr_table,
xticklabels=corr_table.columns, yticklabels=corr_table.columns, # label the axes with the column names
vmin=-1, vmax=1, # set the minimum and maximum value for the color palette
cmap='coolwarm', # change the preset color palette
square=True) # display square fields
The highest dependency is between the GDP and Health attributes. We will display the data on an X-Y graph, together with the regression line.
p = sns.lmplot(data=factors, x='GDP', y='Health', fit_reg=True)
You can display the dependence between all numerical attributes at once with a pair plot graph.
p = sns.pairplot(factors)
Dataset - Iris¶
The Iris dataset describes the flowers of the iris species Iris setosa, Iris virginica and Iris versicolor. The data was collected by the biologist Edgar Anderson and was first used in data analysis by the British statistician Ronald Fisher in 1936. The set contains 50 examples of each species, described by 4 attributes: the length and width of the petals (petal_length, petal_width) and the length and width of the sepals (sepal_length, sepal_width).
Task 1.2¶
- Load the data from the Iris dataset (file ../data/iris.csv) into the variable iris.
- Calculate basic statistics for the numeric attributes.
- For the nominal attribute species, display the different values and their frequencies.
- Determine if the dataset contains missing values.
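One possible solution (a sketch; the isna method, which was not used earlier in this notebook, returns a boolean mask of missing values that we can sum per column):
# load the Iris dataset
iris = pd.read_csv("../data/iris.csv")
# basic statistics of the numeric attributes
print(iris.describe())
# different values of the nominal attribute species and their frequencies
print(iris["species"].value_counts())
# number of missing values in each column (all zeros means no missing values)
print(iris.isna().sum())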
Task 1.3¶
Display a pair plot graph for the numeric attributes. Can you determine from the plot which attributes are most correlated?
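A sketch, assuming the iris variable loaded in Task 1.2:
# pairplot draws an X-Y graph for every pair of numeric attributes
p = sns.pairplot(iris)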
Task 1.4¶
Calculate the correlation matrix and display it using a heat map. Determine which two attributes are most and least correlated and display their X-Y plot along with the regression line.
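A sketch, assuming the iris variable from Task 1.2; the pair of attributes in the lmplot call is only illustrative and should be replaced by the pair read off from the correlation matrix:
# correlation matrix of the numeric attributes only
corr_iris = iris.drop(columns="species").corr()
# heat map of the correlation matrix
p = sns.heatmap(corr_iris, vmin=-1, vmax=1, cmap='coolwarm', square=True)
# X-Y plot with a regression line for the chosen pair of attributes
p = sns.lmplot(data=iris, x='petal_length', y='petal_width', fit_reg=True)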
Task 1.5¶
Using a pivot table, calculate the mean value and standard deviation of the sepal_length and sepal_width attributes grouped by the species attribute.
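A sketch, using the same pivot_table parameters as in the examples above and assuming the iris variable from Task 1.2:
# mean and standard deviation of the two attributes, grouped by species
pd.pivot_table(iris, index="species", values=["sepal_length", "sepal_width"], aggfunc=["mean", "std"])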
Task 1.6¶
- Filter from the dataset only the rows (examples) belonging to the class virginica.
- From the filtered rows, create a table with the attributes sepal_ratio = sepal_length / sepal_width and petal_ratio = petal_length / petal_width.
- Display the X-Y dependence between sepal_ratio and petal_ratio and calculate the correlation coefficient between them.
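One possible solution (a sketch, assuming the iris variable from Task 1.2; note that the Series type also has a corr method for computing the correlation of two individual columns, which was not shown earlier):
# filter the rows belonging to the virginica class
virginica = iris.query("species == 'virginica'")
# create a table with the two derived ratio attributes
ratios = pd.DataFrame({
    "sepal_ratio": virginica["sepal_length"] / virginica["sepal_width"],
    "petal_ratio": virginica["petal_length"] / virginica["petal_width"],
})
# display the X-Y dependence with a regression line
p = sns.lmplot(data=ratios, x="sepal_ratio", y="petal_ratio", fit_reg=True)
# calculate the correlation coefficient between the two ratios
print(ratios["sepal_ratio"].corr(ratios["petal_ratio"]))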