Commit 8becf39c authored by Wolf's avatar Wolf
Browse files

Upload New File

parent 890f7779
# load libraries
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
def nearest_center_distance(centers, samples):
    """Return each sample's Euclidean distance to its closest center.

    Args:
        centers: array-like of shape (k, d), one row per cluster center.
        samples: array-like of shape (m, d), one row per example.

    Returns:
        1-D array of length m; entry i is the smallest distance between
        sample i and any of the k centers.
    """
    # cdist yields a (k, m) matrix; reducing over axis 0 keeps, per sample,
    # the distance to the nearest center.
    return distance.cdist(centers, samples).min(axis=0)


def binarize_scores(scores, threshold):
    """Turn continuous anomaly scores into 0/1 predictions.

    A score strictly greater than ``threshold`` maps to 1.0 (anomalous),
    everything else maps to 0.0 (normal). The input is not modified.

    Args:
        scores: array-like of anomaly scores.
        threshold: decision cut-off.

    Returns:
        Float array of 0.0 / 1.0 with the same shape as ``scores``.
    """
    # One comparison on the untouched scores. Overwriting the scores in two
    # passes would re-test values already replaced by 0 and mislabel them
    # whenever the threshold is negative.
    return np.where(np.asarray(scores) > threshold, 1.0, 0.0)


# Guarded so that importing this file does not trigger file I/O or clustering;
# running it as a script behaves as before and leaves all variables global.
if __name__ == "__main__":
    # load data set (last column is the binary label)
    data = np.loadtxt('crimerate_binary.csv', delimiter=',')
    n, p = data.shape
    # split data into a training set and a testing set
    size_train = int(0.75 * n)  # first 75% of rows for training, the rest for testing
    sample_train = data[0:size_train, 0:-1]
    label_train = data[0:size_train, -1]
    sample_test = data[size_train:, 0:-1]
    label_test = data[size_train:, -1]
    # ----------------------------------------
    # clustering-based anomaly detection
    # tutorial slides, page 61 - 67
    # use k-means clustering for detection
    # ----------------------------------------
    # step 1. choose number of clusters for k-means clustering algorithm
    numcluster = 2
    # step 2. run k-means clustering algorithm on both training & testing data
    # (unsupervised: labels are never shown to the clustering algorithm)
    sample = data[:, 0:-1]
    label = data[:, -1]
    kmeans = KMeans(n_clusters=numcluster, random_state=0).fit(sample)
    # step 3. compute center of each cluster
    center = kmeans.cluster_centers_
    # step 4. compute distance from every example to its nearest cluster center
    # and treat that distance as the anomaly score
    adscore = nearest_center_distance(center, sample)
    # keep only the scores of the testing examples because that's what we evaluate
    adscore = adscore[size_train:]
    # evaluate AUC score on the continuous scores
    auc_score = roc_auc_score(label_test, adscore)
    # to get detection error and f1-score, we need to threshold the anomaly score
    # original author's note: the range of adscore is [0.65, 4.10]
    # (presumably for the shipped data with numcluster = 2 -- verify after changes)
    threshold = 1.2
    adlabel = binarize_scores(adscore, threshold)
    # now evaluate error and f1-score
    err = 1 - accuracy_score(label_test, adlabel)
    f1score = f1_score(label_test, adlabel)
    # step 5. print results
    print('\nClustering-based Approach (K-means)')
    print('Detection Error = %.4f' % err)
    print('F1 Score = %.4f' % f1score)
    print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# 1. play with different number of clusters (`numcluster`, step 1), what do you observe?
# 2. play with different thresholds (`threshold`, set after the AUC evaluation), what do you observe?
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment