# import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the customer table from disk and preview its first five rows.
# (The bare preview expression is only displayed when it ends a notebook cell.)

data = pd.read_csv(filepath_or_buffer='../data/customers.csv')
data.head(n=5)


# Prepare the feature table: remove the identifier column, which carries no
# information for clustering, and encode gender numerically (Male -> 0, Female -> 1).
# NOTE: Series.map yields NaN for any value that is missing from the mapping.
data = data.drop(columns="CustomerID").assign(
    Gender=lambda frame: frame["Gender"].map({"Male": 0, "Female": 1})
)
data.head(n=5)


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sweep candidate cluster counts and record two quality measures for each one:
# inertia (lower means tighter clusters) and the mean silhouette score
# (higher means better separated clusters).
results_inertia = []
results_silhouette = []

K = range(2, 10)  # candidate numbers of clusters: 2..9

for k in K:
    # A fixed seed makes the sweep reproducible between runs; n_init is given
    # explicitly because its default differs across scikit-learn releases.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    # fit_predict returns the labels of the training rows directly, which
    # avoids a second assignment pass over the same data.
    predictions = model.fit_predict(data)
    results_inertia.append(model.inertia_)
    results_silhouette.append(silhouette_score(data, predictions))

print("Inertia:")
print(results_inertia)
print("Silhouette:")
print(results_silhouette)


# Plot both quality curves side by side to help choose the number of clusters.
plt.figure(figsize=(12, 4))  # wide canvas so the two charts fit next to each other

plt.subplot(1, 2, 1)  # grid of 1 row x 2 columns; draw into the first cell
plt.plot(K, results_inertia, 'bx-')
plt.xlabel('Number of clusters')
# Inertia is the sum of squared distances of samples to their closest centroid.
plt.ylabel('Sum of squared distances')
plt.title('Inertia')

plt.subplot(1, 2, 2)  # same 1 x 2 grid; draw into the second cell
plt.plot(K, results_silhouette, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette')
plt.title('Silhouette')

plt.show()  # render once at the end so both charts appear in a single figure


# Fit the final clustering with six clusters and keep one label per row.
# A fixed seed keeps the cluster ids stable between runs, so anything built on
# `labels` afterwards is reproducible; n_init is given explicitly because its
# default differs across scikit-learn releases.
model = KMeans(n_clusters=6, n_init=10, random_state=42)
labels = model.fit_predict(data)  # fit and label the training rows in one step


# print out the cluster label assigned to every row

print(labels)


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Train a shallow decision tree to reproduce the cluster labels from the
# features. A fixed seed makes tie-breaking between equally good splits
# deterministic, so the same tree is produced on every run.
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(data, labels)        # learn to map feature rows to cluster labels

# Predictions are made on the same rows the tree was trained on, so the
# metrics below measure how faithfully the tree mimics the clustering, not
# how well it generalizes to unseen data.
y_dt = dt.predict(data)

from sklearn.metrics import accuracy_score,precision_score, recall_score # compute the metrics

print(f"Accuracy: {accuracy_score(labels, y_dt)}")

cm = confusion_matrix(labels, y_dt)  # rows: cluster labels, columns: tree predictions
print(cm)


from sklearn import tree
from sklearn.tree import export_graphviz

# Write the fitted tree in Graphviz DOT format so its rules can be rendered.
# Class names come from the fitted tree itself (sorted ascending in classes_),
# so they always match the labels the tree was trained on. With out_file set,
# export_graphviz returns None, so its result is deliberately not kept.
with open("decision_tree.txt", "w") as f:
    export_graphviz(
        dt,
        out_file=f,
        feature_names=data.columns.values,
        class_names=[str(label) for label in dt.classes_],
    )


# YOUR CODE HERE

# Clustering - evaluation and interpretation of clusters - example 2

# Task 9.5.