Basic settings¶
We import the necessary libraries. The following cell should be run first, before the `pd`, `np`, `plt` and `sns` objects are used.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# display of graphs directly in the cells of the notebook
%matplotlib inline
# initialize the seaborn library
sns.set()
Options for loading and writing data¶
The `pandas` library supports reading and writing data in various formats, such as text CSV files or Microsoft Excel files. For each of the supported formats it is possible to adjust the settings, such as the character used to separate values, the decimal point character, or whether the file contains a header with column/attribute names on the first line.
You can read more about the options for loading and writing data in the `pandas` library here.
# for CSV files you can change e.g. value separator (parameter `delimiter`) and decimal point character (`decimal`)
# if the file does not have a header with attribute names on the first line (parameter `header=None`), the attributes can be named
# in the `names` parameter
data = pd.read_csv("../data/iris.csv", delimiter=";", decimal=",", header=None, names=["sepal_length", "sepal_width", "petal_length", "petal_width", "species"])
data.head()
# similarly, you can load data from an Excel file (by default, the table from the first worksheet is loaded)
data = pd.read_excel("../data/iris.xlsx")
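If the workbook contains several sheets, a specific one can be selected with the `sheet_name` parameter, which accepts a sheet name or a zero-based position. A minimal sketch (the sheet name "iris" is only an assumed example, not taken from the file above):
# load a specific worksheet by name (the sheet name here is an assumption)
data = pd.read_excel("../data/iris.xlsx", sheet_name="iris")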
To write data to a file, use the data frame methods directly.
# by default, a column with row indexes is also written to the file; if we want to write only the attributes without the index, we set `index=False`
data.to_csv("../data/iris_processed.csv", index=False)
# when writing, we can choose which attributes are written
data.to_excel("../data/iris_processed.xlsx", index=False, columns=["sepal_length", "sepal_width"])
Joining tables¶
The `concat` function is used for basic data joining; it allows you to join data by rows or by columns, aligned according to the indexes of the individual rows. The `pandas` library also supports joining tables using keys, similar to relational databases and the SQL language.
# first, we will load and pre-process the data we will work with: World Happiness Report from 2015 and 2016
data_2015 = pd.read_csv("../data/2015.csv") # data for 2015
data_2015 = data_2015[["Country", "Rank", "Score"]] # we only select `Country`, `Rank` and `Score` columns
# we will rename the columns `Rank` and `Score` to `Rank 2015` and `Score 2015`
data_2015 = data_2015.rename(columns={"Rank":"Rank 2015", "Score":"Score 2015"})
data_2015.head()
# similarly, we will pre-process the data for 2016
data_2016 = pd.read_csv("../data/2016.csv")
data_2016 = data_2016[["Country", "Rank", "Score"]]
data_2016 = data_2016.rename(columns={"Rank":"Rank 2016", "Score":"Score 2016"})
data_2016.head()
# when `concat` joins by columns (`axis=1`), the rows are aligned according to their indexes
# if the numbers of rows in the joined tables differ, the missing positions are filled with missing values
data_all = pd.concat([data_2015, data_2016], axis=1)
data_all.tail()
# we will use the `merge` function to join the tables according to keys
# the basic parameters set the joined data (left and right tables) and the attributes that will be used as the key
data_all = pd.merge(left=data_2015, right=data_2016, left_on="Country", right_on="Country")
data_all.tail()
# we check the number of rows and missing values in the joined table
print(len(data_2015), len(data_2016), len(data_all))
data_all.isna().sum()
By default, only those rows whose key is found in both joined tables are included in the resulting table (i.e. the `inner` method). The other options are similar to SQL:
`merge` method | SQL | Description |
---|---|---|
`left` | LEFT OUTER JOIN | Only keys from the left table |
`right` | RIGHT OUTER JOIN | Only keys from the right table |
`outer` | FULL OUTER JOIN | Keys found in the left or the right table (union) |
`inner` | INNER JOIN | Keys found in both the left and the right table (intersection) |
# joining according to the keys of the left table (if the key is not in the right table, the missing values are added)
data_all = pd.merge(left=data_2015, right=data_2016, left_on="Country", right_on="Country", how="left")
print(len(data_2015), len(data_2016), len(data_all))
data_all.isna().sum()
# union of the keys from both tables
data_all = pd.merge(left=data_2015, right=data_2016, left_on="Country", right_on="Country", how="outer")
print(len(data_2015), len(data_2016), len(data_all))
data_all.isna().sum()
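To see which rows did not find a match in the other table, we can filter the joined table by the missing values (a small sketch using the columns created above):
# countries present only in the 2015 data (no 2016 match) and vice versa
print(data_all[data_all["Score 2016"].isna()]["Country"])
print(data_all[data_all["Score 2015"].isna()]["Country"])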
Data normalization¶
With some data analysis methods it is necessary to compare attribute values on the same scale, i.e. the data needs to be normalized. One of the basic methods is rescaling to a selected interval (most often 0-1). For data normalization we will use objects from the `sklearn` library, which contains a number of advanced methods and algorithms for data analysis.
# we will import the necessary objects from the `sklearn` library
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# we load the `iris` dataset
iris = pd.read_csv("../../6/data/iris.csv")
# we will store the numerical attributes in the `iris_data` variable
iris_data = iris[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
# we save the names of species (classes) in the variable `iris_labels`
iris_labels = iris["species"]
# we will use the MinMaxScaler object to normalize the data to the interval 0-1
# create a normalization object
min_max = MinMaxScaler()
# we transform the data; the result is a numerical `numpy` array, which we convert back to a `pandas` data frame
norm_array = min_max.fit_transform(iris_data)
# we will create a new data frame from the `numpy` array, we will keep the column names from the original `iris_data` set
iris_data_norm = pd.DataFrame(norm_array, columns=iris_data.columns)
iris_data_norm.describe()
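The same rescaling can also be computed directly with `pandas` operations, which makes it clear what `MinMaxScaler` does: each value is shifted by the column minimum and divided by the column range. A small sketch using the `iris_data` frame from above:
# manual min-max normalization: (x - min) / (max - min), computed column by column
manual_norm = (iris_data - iris_data.min()) / (iris_data.max() - iris_data.min())
manual_norm.describe()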
Standardization is a special method of data normalization in which the data is rescaled to have a mean of 0 and a standard deviation of 1. For each original value, the mean of the whole column is subtracted and the result is divided by the standard deviation of the original data.
standard = StandardScaler()
std_array = standard.fit_transform(iris_data)
iris_data_std = pd.DataFrame(std_array, columns=iris_data.columns)
iris_data_std.describe()
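As with min-max normalization, the standardization described above can be reproduced manually. A small sketch; note that `StandardScaler` uses the population standard deviation (`ddof=0`), while `pandas` defaults to the sample standard deviation:
# manual standardization: (x - mean) / std, with the population standard deviation
manual_std = (iris_data - iris_data.mean()) / iris_data.std(ddof=0)
manual_std.describe()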
# we draw a histogram of the original, normalized and standardized data for the `sepal_length` attribute
# we create a figure divided into 1 row and 3 columns, we display one histogram in each column
fig, axis = plt.subplots(1, 3)
pl = iris_data["sepal_length"].hist(ax=axis[0]) # `ax` parameter determines in which part of the figure the graph will be displayed
pl = iris_data_norm["sepal_length"].hist(ax=axis[1])
pl = iris_data_std["sepal_length"].hist(ax=axis[2])
Data visualization using principal component analysis¶
Visualization is a very useful tool in data analysis. Its main limitation is that only two- and three-dimensional graphs can be displayed, i.e. a single graph can show the relationships between at most 2-3 attributes at once (further attributes can be mapped to other visual aspects such as the color or size of the points).
If we want to visualize data with a larger number of attributes at once, we have to project the data set into a lower-dimensional, 2- or 3-dimensional space. One such projection method is Principal Component Analysis (PCA), which tries to find a projection of the attribute values that preserves as much of the mutual relationships in the original data as possible. PCA computes new attributes from the original ones, the so-called principal components, which are ordered by how much information from the original data they capture. For visualization, we compute and display the two most important components, so that as many of the original relationships in the data as possible are shown.
# import the PCA method from the `sklearn` library
from sklearn.decomposition import PCA
# we only calculate the two most important principal components
pca = PCA(n_components=2)
pca_array = pca.fit_transform(iris_data_norm)
# the result is a numerical `numpy` array, which we convert to a `pandas` data frame
# we name the columns `component_1` and `component_2`
iris_data_pca = pd.DataFrame(pca_array, columns=["component_1", "component_2"])
# let's see how the examples look after the projection; the values of the components can no longer be simply interpreted
iris_data_pca.head()
# let's see how much information (variance) in the original data is projected into individual components
pca.explained_variance_ratio_
# we calculate what percentage of the information is contained in total in the first two principal components that we visualize
print("{0:.4f}".format(np.sum(pca.explained_variance_ratio_) * 100))
# we connect the projected data with the original labels of the examples and display the data on the X-Y graph
# we draw the color according to the assignment to individual plant species
iris_pca = pd.concat([iris_data_pca, iris_labels], axis=1)
pl = sns.scatterplot(data=iris_pca, x="component_1", y="component_2", hue="species")
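To see how each original attribute contributes to the two displayed components, we can inspect the component loadings stored in the fitted PCA object (a small sketch):
# rows are the principal components, columns are the original attributes
pd.DataFrame(pca.components_, columns=iris_data_norm.columns, index=["component_1", "component_2"])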
Tasks¶
Task 3.1¶
Download the World Happiness Report 2016 and 2017 data.
- Select only the `Country`, `Score` and `Rank` attributes.
- Join the tables by country name so that all values from both tables are included. Rename the `Score` and `Rank` attributes appropriately according to the year.
- Add a new attribute `Diff` whose values will be equal to `Score 2017` - `Score 2016`.
- Display the histogram of the `Diff` attribute.
In 2017, are there more states that have improved or worsened?
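A possible starting sketch (the file names `2016.csv` and `2017.csv` and their column names are assumptions, based on the 2015/2016 files used above):
# load and reduce both years (file and column names are assumed)
d16 = pd.read_csv("../data/2016.csv")[["Country", "Rank", "Score"]]
d16 = d16.rename(columns={"Rank": "Rank 2016", "Score": "Score 2016"})
d17 = pd.read_csv("../data/2017.csv")[["Country", "Rank", "Score"]]
d17 = d17.rename(columns={"Rank": "Rank 2017", "Score": "Score 2017"})
# the outer join keeps all countries from both tables
both = pd.merge(left=d16, right=d17, on="Country", how="outer")
both["Diff"] = both["Score 2017"] - both["Score 2016"]
pl = both["Diff"].hist()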
Task 3.2¶
Download the World Happiness Report 2017 data.
- Select only the basic factors (`GDP`, `Family`, `Health`, `Freedom`, `Trust`, `Generosity`) in the table.
- Normalize the data to the interval 0-1.
- Add the names of the states and regions to the normalized table.
- Save the data to an Excel file.
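A possible sketch (the file name and the exact column names such as `GDP` or `Region` are assumptions; the real 2017 file may use longer column names):
# normalize the selected factors and add the identifying columns back (column names are assumed)
d17 = pd.read_csv("../data/2017.csv")
factors = d17[["GDP", "Family", "Health", "Freedom", "Trust", "Generosity"]]
factors_norm = pd.DataFrame(MinMaxScaler().fit_transform(factors), columns=factors.columns)
result = pd.concat([d17[["Country", "Region"]], factors_norm], axis=1)
result.to_excel("../data/2017_normalized.xlsx", index=False)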
Task 3.3¶
Load the preprocessed data from the previous task.
- Calculate the two principal components.
- Visualize the principal components on an X-Y graph, showing the region of the state as the color.
What can you say about Central Europe? How much of the total variance of the original data is preserved in the visualization?
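A possible sketch, continuing from the file written in Task 3.2 (the file name and the `Region` column are the same assumptions as above):
# project the normalized factors onto two principal components and color the points by region
prep = pd.read_excel("../data/2017_normalized.xlsx")
factors = prep.drop(columns=["Country", "Region"])
pca_2 = PCA(n_components=2)
components = pd.DataFrame(pca_2.fit_transform(factors), columns=["component_1", "component_2"])
plot_data = pd.concat([components, prep["Region"]], axis=1)
pl = sns.scatterplot(data=plot_data, x="component_1", y="component_2", hue="Region")
print(np.sum(pca_2.explained_variance_ratio_))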