Detection of anomalies and outliers¶
Outliers and anomalies can be searched for in several ways.
Outliers are easy to find, e.g. using different visualizations, if we are looking for them within individual attributes or within combinations of values of a small number of attributes.
Of the simple visualization techniques for detecting outliers, we can use, for example, box plots, histograms, or scatter plots.
Task 9.6.¶
Which visualization techniques supported by the Seaborn library can we directly use for outlier detection? Use selected techniques to detect outliers within the Titanic dataset.
import numpy as np
import pandas as pd
import seaborn as sns
titanic = pd.read_csv("../data/titanic-processed.csv")
titanic.head()
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
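One possible sketch of a solution - box plots and scatter plots are among the Seaborn techniques that show outliers directly (each plot belongs in its own cell):
g = sns.boxplot(x='fare', data=titanic) # box plot - outliers appear as points beyond the whiskers
g = sns.scatterplot(x='age', y='fare', data=titanic) # scatter plot - outliers lie far from the main cloud of points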
In addition to such visualizations, outliers can be detected using clustering algorithms. In that case, it is advisable to use a method that can set aside small groups of examples lying far from the standard ones. Density-based methods such as DBSCAN are therefore a good choice: we set the distance threshold for points belonging to a cluster so that all standard examples are separated from the distant ones, which we consider outliers.
Using the Titanic dataset as an example, we demonstrate the use of the DBSCAN method for detecting outliers with respect to the age and fare attributes.
# we will preprocess the data in the same way as in the previous exercises:
# - remove attributes that we will not use (e.g. duplicates)
# - map binary and ordinal attributes to numeric indexes
# - transform categorical attributes without an ordering using the One Hot approach
titanic = titanic.drop(columns=['cabin','deck','ticket','title'])
titanic['sex'] = titanic['sex'].map({"male": 0, "female": 1})
titanic['has_family'] = titanic['has_family'].map({False: 0, True: 1})
titanic['fare_ordinal'] = titanic['fare_ordinal'].map({"normal": 0, "more expensive": 1, "most expensive": 2})
titanic['age_ordinal'] = titanic['age_ordinal'].map({"child": 0, "young": 1, "adult": 2, "old": 3})
titanic = pd.get_dummies(titanic, columns=['embarked', 'title_short'])
We will train the DBSCAN model with a chosen value of the eps parameter. We will try to find a value that separates the examples into clusters properly - the goal is to "split off" the outliers from the standard examples. We can then plot the results using the Seaborn library and its scatter plot.
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=100) # initialize the DBSCAN model with the chosen neighborhood distance
labels = dbscan.fit_predict(titanic) # fit the model on the input data and obtain cluster labels (-1 = noise/outlier)
g = sns.scatterplot(x='age', y='fare', hue=labels, data=titanic) # draw a scatter plot, colored according to clusters
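Finding a suitable eps usually takes some experimentation. One common heuristic, sketched below (the choice of k here is illustrative), is to plot the sorted distances of each example to its k-th nearest neighbor; a reasonable eps lies near the "knee" where the curve starts to rise steeply.
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
k = 5 # illustrative choice; often set to DBSCAN's min_samples
nn = NearestNeighbors(n_neighbors=k).fit(titanic)
distances, _ = nn.kneighbors(titanic) # distances to the k nearest neighbors of each example
k_distances = np.sort(distances[:, -1]) # distance to the k-th neighbor, sorted in ascending order
plt.plot(k_distances) # the "knee" of this curve suggests a reasonable eps
plt.ylabel('distance to the k-th nearest neighbor')
plt.show()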
Besides simple visualization techniques on low-dimensional data, we can use clustering to detect anomalies. These methods are also useful for prediction tasks with a very imbalanced target attribute - the minority class can then be "detected" by means of clustering.
As an example, we will show how to detect suspicious transactions in data describing credit card payments.
Understanding and interpreting the data is difficult - the attributes are transformed and anonymized features; we only know that they describe the payer and the payment itself.
from sklearn.preprocessing import normalize # import the libraries we will use
from sklearn.metrics import confusion_matrix
data = pd.read_csv("../data/creditcard.csv") # load the data from the file into a data frame
data.head() # print the first 5 records
Let's look at the target attribute distribution:
print(data["Class"].value_counts())
g = sns.countplot(x='Class', data=data)
We transform the dataset in the same way as for prediction tasks - we separate the feature matrix from the vector of values of the target attribute Class, which we can then use for verification.
features=data.drop(["Time","Class"],axis=1)
labels=pd.DataFrame(data[["Class"]])
We normalize the feature matrix.
from sklearn.preprocessing import normalize
features=normalize(features)
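Note that normalize() rescales each row to unit (L2) norm, which is not the same as standardizing the individual attributes. A possible alternative (a sketch; whether it works better depends on the data and the distance metric) is per-column standardization with StandardScaler:
from sklearn.preprocessing import StandardScaler
# features = StandardScaler().fit_transform(data.drop(["Time","Class"],axis=1)) # zero mean, unit variance per attribute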
Now let's try to train the clustering model so that it suitably separates the anomalous transactions from the majority ones. We then compare the clustering results with the actual values stored in the vector of target attribute values.
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
#kmeans=KMeans(n_clusters=2, max_iter=300)
#kmeans.fit(features)
#y_kmeans=kmeans.predict(features)
dbscan = DBSCAN(eps=0.5) # initialize DBSCAN with the chosen neighborhood distance
y_dbscan = dbscan.fit_predict(features) # cluster the data; the label -1 marks noise points
The clustering results can also be summarized by the sizes of the resulting clusters. This gives us at least a rough estimate of the quality of the clustering model (the ratio of examples in the clusters). Of course, it says nothing about whether the examples in the individual clusters really correspond to the class assignment.
#clusters, counts = np.unique(y_kmeans, return_counts=True) # we use the unique function to identify different values and return their numbers
#print(np.asarray((clusters, counts)))
clusters, counts = np.unique(y_dbscan, return_counts=True) # we use the unique function to identify different values and return their numbers
print(np.asarray((clusters, counts)))
#print(confusion_matrix(labels,y_kmeans))
y_pred_dbscan = (y_dbscan == -1).astype(int) # recode: noise points (-1) become the positive class 1, clustered points become 0
print(confusion_matrix(labels, y_pred_dbscan))
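From this matrix we can also compute how well the "noise = fraud" labeling works; a quick sketch:
from sklearn.metrics import precision_score, recall_score
print(precision_score(labels, y_pred_dbscan)) # share of noise points that are actual frauds
print(recall_score(labels, y_pred_dbscan)) # share of actual frauds that ended up among the noise points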
Fraud = data[data['Class']==1] # select the records flagged as fraud
Valid = data[data['Class']==0] # select the records flagged as OK
outlier_fraction = len(Fraud)/float(len(Valid)) # the proportion of anomalies (frauds) in the data, which we then use as a parameter of the LOF method
print(outlier_fraction)
We can also use the Local Outlier Factor method to search for outliers.
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=40, metric='euclidean', contamination=outlier_fraction) # create the model; the density around each example is computed from its neighbors, and the contamination parameter gives the expected share of anomalies
y_lof = lof.fit_predict(features) # train the model
#scores_prediction = lof.negative_outlier_factor_
We can evaluate the results of the model using the confusion matrix, but first we need to recode the outputs of the LOF model, which marks anomalies with -1 and regular data with 1. To compare them with the original values of the Class attribute using the confusion_matrix() function, we map regular points (1) to 0 and anomalies (-1) to 1.
y_lof[y_lof == 1] = 0 # regular points: 1 -> 0 (must be done before recoding the anomalies)
y_lof[y_lof == -1] = 1 # anomalies: -1 -> 1
print(confusion_matrix(labels,y_lof))
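For a per-class summary of precision and recall we can also use scikit-learn's classification_report (a sketch):
from sklearn.metrics import classification_report
print(classification_report(labels, y_lof, target_names=['normal', 'fraud']))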