Clustering¶
In the following examples, we will describe how to work with clustering models in the Scikit-learn library using sample demonstration tasks. In these tasks we will use either generated or simplified data so that the outputs can be clearly visualized in graphs (the datasets will therefore contain only 2 attributes so that they can be plotted, which is rare in practice).
The examples demonstrate the use of:
- k-means methods
- grid/density-based methods
- hierarchical clustering
K-means methods¶
To demonstrate how k-means methods are used and how they work, we will choose the K-Means method (its parameters and settings are described below).
First, we import the libraries we will work with. We will also need Seaborn and matplotlib for plotting the outputs and numpy for working with arrays.
import matplotlib.pyplot as plt # we import matplotlib for plotting
import seaborn as sns; sns.set() # import seaborn for more advanced visualizations and set the environment
import numpy as np # we import numpy for working with arrays
# we will set rendering of visualizations in Jupyter notebooks
%matplotlib inline
Now we will prepare the data for the demonstration of this method. To show how K-Means works, we will generate sample synthetic data with two numerical attributes. To generate the data, we will use the make_blobs function from the sample data generators of the Scikit-learn library. It is intended precisely for creating sample datasets with a specific distribution. The make_blobs function will create a defined number of data points (n_samples) in four clusters (centers). We can also define how "densely" the generated points should be spread around the centers (cluster_std).
The output of the function is a feature matrix (a numpy array) and a vector of target attribute values, which in this case represents the actual cluster membership of each example. This vector is not used when building clustering models (in practice the real values are often not even available). If such values are available (e.g. from an expert's opinion), they can be used to express the quality of the created clusters (an example will be part of the next lesson).
from sklearn.datasets import make_blobs # import the data generator function (in older Scikit-learn versions it was located in sklearn.datasets.samples_generator)
# we will generate 300 records with a defined distribution
# in four groups, with a defined deviation from the centers
blobs, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# the output is a matrix of example flags (blobs) and a vector of true cluster membership values y_true
# blobs and y_true are numpy arrays
# print one record from the generated data points
print(blobs[:1])
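Both outputs are ordinary numpy arrays, so we can also quickly verify their structure. A small sketch (only a check, not part of the task itself):
# the feature matrix contains 300 examples described by 2 attributes
print(blobs.shape)
# the vector of true cluster memberships contains one value per example
print(y_true.shape)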
We can easily display the generated data using the Seaborn library. Since the data are described by two numerical attributes, we can use scatterplot for visualization. It works both with pandas data frames and with numpy arrays, so we pass the individual columns of the input numpy array as parameters.
# plot the points with a scatterplot
# the column with index 0 goes on the x-axis
# the column with index 1 goes on the y-axis
g = sns.scatterplot(x=blobs[:, 0], y=blobs[:, 1])
Now let's try to build a KMeans model from the Scikit-learn library.
The way of creating models is very similar to creating classification models with this library. We first import the necessary library, initialize the model with the defined parameters and train it on the input data using the fit function. To assign examples to a cluster, we can then use the predict function, whose output this time is the cluster to which the given object (or objects) belongs. The difference is that, since clustering is unsupervised learning, we do not work with values of a target attribute, because they do not exist. The functions for creating clustering models therefore use only the feature matrix, without the vector of target attribute values, both during training (fit) and prediction (predict).
A mandatory parameter of the K-Means algorithm is the number of clusters we are looking for. This is defined by the value of the n_clusters
parameter.
The example below will create a K-Means model on the generated data. Given the structure of the generated data, we will try to train the model for 4 clusters. In the next step we will apply the predict function to the training data - we want to assign all examples from the dataset to the created clusters so that we can clearly visualize the clustering results. We can also obtain individual or all centroids from the created model using the cluster_centers_ attribute.
from sklearn.cluster import KMeans # first we import the necessary library, in this case KMeans for the given model
kmeans = KMeans(n_clusters=4) # initialize the K-Means model, set the parameter value K - the number of clusters - to 4
kmeans.fit(blobs) # train the model on the input data
y_kmeans = kmeans.predict(blobs) # we assign all the data to the created clusters
centers = kmeans.cluster_centers_ # we load the centroids of the created clusters into the variable centers
print("All centroids:") # print centroids for all clusters
print(centers)
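Since cluster_centers_ is an ordinary numpy array, an individual centroid can be obtained simply by indexing it. A minimal sketch (the index 0 is chosen only as an example):
print("Centroid of cluster 0:") # print the centroid of a single cluster by indexing the centers array
print(centers[0])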
If we want to classify an unknown example into one of the clusters, we again use the predict function. As its parameter we must pass the example transformed into a numpy array (in the same shape as the training data). The predict function then returns the identifier of the cluster into which the model classifies the unknown example.
# we will create a sample example and transform it into a numpy array in the desired shape (1 row, 2 columns)
x = np.array([1.98686, 3.76876]).reshape(1, 2)
prediction = kmeans.predict(x) # we will predict its membership in the kmeans model cluster
print(prediction) # print the output on the screen
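Because the data were generated, we also have the vector of true cluster memberships (y_true). As mentioned above, it is not used during training, but it can be used to express the quality of the created clusters. The sketch below is only a small preview using the Adjusted Rand Index from sklearn.metrics (a value close to 1 means the found clusters correspond well to the true groups); the evaluation of clustering quality itself will be part of the next lesson:
from sklearn.metrics import adjusted_rand_score # import an external evaluation metric
# compare the true memberships (y_true) with the assignments produced by the model (y_kmeans)
print(adjusted_rand_score(y_true, y_kmeans))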
In this sample case, since we are working with two attributes, we can use the Seaborn library to visualize the created clusters and their centroids. We will use the training data (blobs), their cluster assignments (y_kmeans) and the computed centroids (centers). We will therefore plot 2 scatter plot visualizations in one graph:
- first we plot the data points, colored according to their clusters
- then we plot the cluster centers
In both cases we plot the values of the first and second column (the first and second attribute, or the centroid coordinates) on the X and Y axes.
# draw the first scatterplot - the X and Y axes correspond to the first and second column of the data array
# we want to distinguish individual points by color according to clusters (we have cluster membership in y_kmeans)
# we set the color palette to Set1
g=sns.scatterplot(x=blobs[:, 0], y=blobs[:, 1], hue=y_kmeans, palette="Set1")
# we also draw the centroids in the created visualization, again with a scatterplot, since they have the same structure as the input data
# we plot the points of the centers array (the coordinates are its two columns)
# we set the size of the points with the s parameter (we want to highlight the centroids a little)
# we define the color and the plotting symbol with the marker parameter
g=sns.scatterplot(x=centers[:, 0], y=centers[:, 1], s=150, color=".1", marker="X")
Now we will show what happens if we do not choose a suitable value of the k parameter. We will create a model with 6 clusters, assign the examples from the input set to the created clusters in the same way and visualize the structure of the clusters with a scatter plot.
This demonstrates the need for a correct choice of this clustering parameter.
# we create a new model, this time for 6 clusters
kmeans2 = KMeans(n_clusters=6)
kmeans2.fit(blobs) # train the model on the input data
y_kmeans2 = kmeans2.predict(blobs) # we will use the model to assign data to the created clusters
# we plot the data with a scatterplot, differentiated by color according to clusters
g = sns.scatterplot(x=blobs[:, 0], y=blobs[:, 1], hue=y_kmeans2, s=50, palette='Set1')
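How could we choose a more suitable value of k if we did not know the structure of the data in advance? One common heuristic is the so-called elbow method: we train K-Means for several values of k and plot the inertia_ attribute of each model (the sum of squared distances of the examples from their nearest centroid). The sketch below only illustrates this idea and is not part of the original task:
# train K-Means for several values of k and store the inertia of each model
inertias = []
ks = range(1, 10)
for k in ks:
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(blobs)
    inertias.append(model.inertia_)
# plot the inertia against k - the "elbow" of the curve suggests a suitable number of clusters
plt.plot(ks, inertias, marker='o')
plt.xlabel('k (number of clusters)')
plt.ylabel('inertia')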
Grid or density-based clustering¶
As we said, k-means methods are not suitable in cases where we cannot assume that the objects form spherical clusters. In such cases it is advisable to use another type of clustering, e.g. grid- or density-based methods. The example below demonstrates on generated sample data what problems K-Means can have and how they can be handled by another model.
For this task, we will again generate data of a specific shape. This time we will not generate "heaps" of data points, but will use the make_moons function to create data grouped in two dimensions in the shape of crescents. As in the previous example, we can define how many data points should be created and with how much noise.
from sklearn.datasets import make_moons # we import the necessary library for dataset generation
# we will generate the input data
# 200 examples
moon_data, y_true = make_moons(200, noise=.05, random_state=0)
# moon_data again contains a numpy array of examples described by 2 attributes and y_true the actual value of cluster membership
# again, using the scatter plot, we plot the generated data
# on the x-axis the first column and on the y-axis the second
sns.scatterplot(x=moon_data[:, 0], y=moon_data[:, 1])
Let's try to train the K-Means model on such a dataset and see how it copes with clustering on such data.
# we initialize the K-Means model, for 2 clusters
kmeans_moons = KMeans(n_clusters=2)
kmeans_moons.fit(moon_data) # train the model on the input data
labels = kmeans_moons.predict(moon_data) # we assign the input data to clusters
Now we can visualize how K-Means coped with data of this shape by plotting the cluster assignments with a scatter plot.
# plot the data points using a scatter plot
# we differentiate by color according to belonging to clusters (labels)
sns.scatterplot(x=moon_data[:, 0], y=moon_data[:, 1], hue=labels, s=50, palette='Set1')
From the output, we can see that K-Means cannot identify clusters well in such structured data.
So we will try to use a different type of method, a method based on density, which should be able to detect clusters of non-spherical shapes as well.
We will use the DBSCAN method from the Scikit-learn library, which we train on the input data in the same way. Its parameter is the eps value - the largest distance between two examples at which they are still considered to be from the same neighborhood. The DBSCAN model in Scikit-learn also provides a fit_predict function, which trains the model and immediately assigns the input data to the clusters they belong to.
from sklearn.cluster import DBSCAN # we import the necessary libraries
dbscan = DBSCAN(eps=0.3) # initialize the DBSCAN model with the defined maximum neighborhood distance (eps=0.3 is an example value that works well for this generated dataset)
labels = dbscan.fit_predict(moon_data) # train the model on the input data and assign the data to clusters
# using a scatter plot, we draw the data points and color them according to their belonging to the clusters
sns.scatterplot(x=moon_data[:, 0], y=moon_data[:, 1], hue=labels, s=50, palette='Set1')
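Unlike K-Means, DBSCAN does not have to assign every example to a cluster: examples that do not have enough neighbors within the eps distance are marked as noise and get the label -1. A short check of the output (only illustrative):
# examples labelled -1 are considered noise by DBSCAN
print("Number of clusters found:", len(set(labels)) - (1 if -1 in labels else 0))
print("Number of noise points:", np.sum(labels == -1))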
Hierarchical clustering¶
Now we will show an example of hierarchical agglomerative clustering. We will use the data from the first task.
First, we draw a dendrogram - a hierarchy of clusters - for this data using the dendrogram function from the Scipy library. Before we can draw it, we first need to create its structure. The linkage function creates a hierarchical clustering model, which we can then use to plot the dendrogram. We pass the input data and the cluster joining algorithm (the method parameter) to the function.
import scipy.cluster.hierarchy as shc # we import the necessary libraries
plt.figure(figsize=(30, 20)) # set the size of the rendered image
plt.title("Dendogram:") # we will write its name
links = shc.linkage(blobs, method='ward') # create a hierarchical cluster model
dend = shc.dendrogram(links) # draw the dendrogram
In this way we visualize the structure of the dendrogram. Using Scikit-learn we can then create an agglomerative model for a defined number of clusters, which corresponds to cutting the dendrogram hierarchy at a certain level.
from sklearn.cluster import AgglomerativeClustering # we import the necessary libraries
aggcl = AgglomerativeClustering(n_clusters=5) # set the parameters and the defined number of clusters (where the agglomerative model "stops")
labels_agg = aggcl.fit_predict(blobs) # train the model and assign the data to the created clusters
We can then visualize the model in the same way using the Seaborn library.
sns.scatterplot(x=blobs[:,0], y=blobs[:,1], hue=labels_agg, palette='Set1')
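The number of clusters passed to AgglomerativeClustering corresponds to the level at which we "cut" the dendrogram. As a small additional sketch, we can create the same model with the Ward joining algorithm set explicitly (the same method we used for the linkage function) and with 4 clusters, which matches the structure visible in the dendrogram; the parameter values are only illustrative:
# the linkage parameter selects the cluster joining algorithm (ward is also the default value)
aggcl4 = AgglomerativeClustering(n_clusters=4, linkage='ward')
labels_agg4 = aggcl4.fit_predict(blobs) # train the model and assign the data to clusters
sns.scatterplot(x=blobs[:,0], y=blobs[:,1], hue=labels_agg4, palette='Set1')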