Commit f5aade08 authored by Shaya Wolf

anomalies

parent c99f6ff5
# load libraries
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# ----------------------------------------
# classification-based anomaly detection
# tutorial slides, page 49 - 59
# use linear regression model for detection
# ----------------------------------------
# step 1. choose a classification model (linear regression)
model = linear_model.LinearRegression()
# step 2. train the model using examples
model.fit(sample_train, label_train)
# step 3. apply model to predict whether an example is normal or anomaly
label_pred = model.predict(sample_test)
# we can treat the output above as an anomalous score and compute an AUC from it
auc_score = roc_auc_score(label_test, label_pred)
# because this is a regression model, we need to threshold its output to get detection error and f1-score
threshold = 0.4
label_pred[label_pred <= threshold] = 0
label_pred[label_pred > threshold] = 1
err = 1 - accuracy_score(label_test,label_pred)
f1score = f1_score(label_test,label_pred)
# step 4. print results
print('\nClassification-based Approach (Linear Regression Model)')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# play with different values of the threshold variable above (see the sketch below), what do you observe?
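# --- a minimal sketch (not part of the original tutorial) for the assignment above:
# sweep several thresholds and recompute error / f1-score for each.
# note: label_pred was overwritten by the thresholding above, so we recompute
# the raw regression scores first.
raw_pred = model.predict(sample_test)
for t in [0.2, 0.3, 0.4, 0.5, 0.6]:
    pred_t = (raw_pred > t).astype(int)
    print('threshold=%.1f  error=%.4f  f1=%.4f' %
          (t, 1 - accuracy_score(label_test, pred_t), f1_score(label_test, pred_t)))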
# load libraries
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# ----------------------------------------
# classification-based anomaly detection
# tutorial slides, page 49 - 59
# use Logistic Regression model for detection (no need to threshold its output, page 54)
# ----------------------------------------
# step 1. choose a classification model (logistic regression)
model = linear_model.LogisticRegression(C=1)
# step 2. train the model using examples
model.fit(sample_train, label_train)
# step 3. apply model to predict whether an example is normal or anomaly
label_pred = model.predict(sample_test)
# directly get detection error and f1-score
err = 1 - accuracy_score(label_test,label_pred)
f1score = f1_score(label_test,label_pred)
# to get an AUC score, we first need an anomalous score;
# here we use the predicted probability of the anomaly class as the anomalous score
adscore = model.predict_proba(sample_test)
adscore = adscore[:,1]
auc_score = roc_auc_score(label_test, adscore)
# step 4. print results
print('\nClassification-based Approach (Logistic Regression Model)')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# play with different values of the hyper-parameter C in the model constructor above (see the sketch below), what do you observe?
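# --- a minimal sketch (not part of the original tutorial) for the assignment above:
# re-train with several values of the regularization strength C and compare AUC.
for c in [0.01, 0.1, 1, 10, 100]:
    m = linear_model.LogisticRegression(C=c)
    m.fit(sample_train, label_train)
    print('C=%g  AUC=%.4f' %
          (c, roc_auc_score(label_test, m.predict_proba(sample_test)[:, 1])))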
# load libraries
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# ----------------------------------------
# clustering-based anomaly detection
# tutorial slides, page 61 - 67
# use k-means clustering for detection
# ----------------------------------------
# step 1. choose number of clusters for k-means clustering algorithm
numcluster = 2
# step 2. run k-means clustering algorithm on both training & testing data
sample = data[:,0:-1]
label = data[:,-1]
kmeans = KMeans(n_clusters=numcluster, random_state=0).fit(sample)
# step 3. compute center of each cluster
center = kmeans.cluster_centers_
# step 4. compute distance from each example to every cluster center
discenter = distance.cdist(center,sample)
# treat the distance to the nearest center as the anomalous score
adscore = np.min(discenter,axis=0)
# keep only the distances of the testing examples, since those are what we evaluate on
adscore = adscore[len(label_train):]
# evaluate AUC score
auc_score = roc_auc_score(label_test, adscore)
# to get detection error and f1-score, we need to threshold anomalous score
# the range of adscore is [0.65, 4.10]
threshold = 1.2
adscore[adscore <= threshold] = 0
adscore[adscore > threshold] = 1
# now evaluate error and f1-score
err = 1 - accuracy_score(label_test,adscore)
f1score = f1_score(label_test,adscore)
# step 5. print results
print('\nClustering-based Approach (K-means)')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# 1. play with different values of numcluster above (see the sketch below), what do you observe?
# 2. play with different values of the threshold above, what do you observe?
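# --- a minimal sketch (not part of the original tutorial) for assignment 1 above:
# vary the number of clusters and recompute the AUC of the distance-based score.
for k in [1, 2, 3, 5, 10]:
    km = KMeans(n_clusters=k, random_state=0).fit(sample)
    d = np.min(distance.cdist(km.cluster_centers_, sample), axis=0)
    print('k=%d  AUC=%.4f' % (k, roc_auc_score(label_test, d[len(label_train):])))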
# load libraries
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# --------------------------------------------------------------
# Support Vector Data Descriptor (SVDD)-based anomaly detection
# tutorial slides, page 69 - 73
# --------------------------------------------------------------
# step 1. construct SVDD model
model = svm.OneClassSVM(kernel='rbf',gamma=0.1,nu=0.8)
# step 2. train the model using ONLY NORMAL examples
model.fit(sample_train[label_train==0,:])
# the following code trains the model using both normal and abnormal examples
# model.fit(sample_train)
# step 3. apply model to predict whether an example is normal or anomaly
# OneClassSVM returns +1 for inliers (normal) and -1 for outliers (anomaly),
# so map +1 -> 0 and -1 -> 1 to match the 0/1 labels in the data
label_pred = model.predict(sample_test)
label_pred[label_pred == 1] = 0
label_pred[label_pred == -1] = 1
# evaluate detection error and f1-score
err = 1 - accuracy_score(label_test,label_pred)
f1score = f1_score(label_test,label_pred)
# evaluate AUC score by first computing the anomalous score
# lower decision values mean more anomalous, so flip the sign
adscore = -model.decision_function(sample_test)
auc_score = roc_auc_score(label_test, adscore)
# step 4. print results
print('\nSVDD-based Approach')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# 1. train the SVDD model using both normal and abnormal examples (use the commented-out fit call above), what do you observe?
# 2. play with different parameters of the SVDD model (gamma and nu above, see the sketch below), what do you observe?
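# --- a minimal sketch (not part of the original tutorial) for assignment 2 above:
# vary nu (an upper bound on the fraction of training errors) and compare AUC.
for nu in [0.1, 0.3, 0.5, 0.8]:
    m = svm.OneClassSVM(kernel='rbf', gamma=0.1, nu=nu)
    m.fit(sample_train[label_train == 0, :])
    print('nu=%.1f  AUC=%.4f' %
          (nu, roc_auc_score(label_test, -m.decision_function(sample_test))))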
# load libraries
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# ------------------------------------
# Statistics-based anomaly detection
# tutorial slides, page 75 - 83
# ------------------------------------
# step 1. construct a distribution model
model = KernelDensity(kernel='gaussian', bandwidth=1e0)
# step 2. estimate the distribution using ONLY NORMAL examples
model.fit(sample_train[label_train==0,:])
# the following code trains the model using both normal and abnormal examples
# model.fit(sample_train)
# step 3. apply model to estimate the log-density of testing examples
# score_samples returns log-density; a low density means anomalous, so flip
# the sign to get an anomalous score (the constant 1 does not affect the ranking)
adscore = 1 - model.score_samples(sample_test)
# get AUC score
auc_score = roc_auc_score(label_test, adscore)
# to get detection error and f1-score, we need to threshold anomalous score
# the range of adscore is [94.5, 101]
threshold = 97
adscore[adscore <= threshold] = 0
adscore[adscore > threshold] = 1
# now evaluate error and f1-score
err = 1 - accuracy_score(label_test,adscore)
f1score = f1_score(label_test,adscore)
# step 4. print results
print('\nStatistics-based Approach')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# 1. estimate the distribution using both normal and abnormal examples (use the commented-out fit call above), what do you observe?
# 2. play with different values of the bandwidth hyper-parameter of the distribution model (see the sketch below), what do you observe?
# 3. play with different values of the threshold above, what do you observe?
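# --- a minimal sketch (not part of the original tutorial) for assignment 2 above:
# vary the kernel bandwidth and compare the AUC of the negated log-density score.
for bw in [0.1, 0.5, 1.0, 2.0, 5.0]:
    m = KernelDensity(kernel='gaussian', bandwidth=bw)
    m.fit(sample_train[label_train == 0, :])
    print('bandwidth=%.1f  AUC=%.4f' %
          (bw, roc_auc_score(label_test, -m.score_samples(sample_test))))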
# load libraries
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# ------------------------------------
# Neighborhood-based anomaly detection
# tutorial slides, page 85 - 93
# ------------------------------------
# step 1. choose number of neighbors
num_neighbor = 30
# step 2. construct a neighborhood using ONLY NORMAL examples
nbrs = NearestNeighbors(n_neighbors=num_neighbor, algorithm='ball_tree').fit(sample_train[label_train==0,:])
# the following code trains the model using both normal and abnormal examples
# nbrs = NearestNeighbors(n_neighbors=num_neighbor, algorithm='ball_tree').fit(sample_train)
# step 3. compute distance from each testing example to its num_neighbor neighbors
distances, indices = nbrs.kneighbors(sample_test)
# treat the average distance to the neighbors as the anomalous score
adscore = np.mean(distances, axis=1)
# get AUC score
auc_score = roc_auc_score(label_test, adscore)
# to get detection error and f1-score, we need to threshold anomalous score
# the range of adscore is [0.9, 3.3]
threshold = 2
adscore[adscore <= threshold] = 0
adscore[adscore > threshold] = 1
# now evaluate error and f1-score
err = 1 - accuracy_score(label_test,adscore)
f1score = f1_score(label_test,adscore)
# step 4. print results
print('\nNeighborhood-based Approach')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# 1. construct the neighborhood using both normal and abnormal examples (use the commented-out fit call above), what do you observe?
# 2. play with different values of num_neighbor above (see the sketch below), what do you observe?
# 3. play with different values of the threshold above, what do you observe?
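# --- a minimal sketch (not part of the original tutorial) for assignment 2 above:
# vary the neighborhood size and compare the AUC of the average-distance score.
for k in [5, 10, 30, 50, 100]:
    nb = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
    nb.fit(sample_train[label_train == 0, :])
    d, _ = nb.kneighbors(sample_test)
    print('k=%d  AUC=%.4f' % (k, roc_auc_score(label_test, np.mean(d, axis=1))))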
# load libraries
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
# load data set
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
[n,p] = data.shape
# split data into a training set and a testing set
size_train = int(0.75*n) # we use the first 75% of the data for training, the rest for testing
sample_train = data[0:size_train,0:-1]
label_train = data[0:size_train,-1]
sample_test = data[size_train:,0:-1]
label_test = data[size_train:,-1]
# ------------------------------------
# Spectral-based anomaly detection
# tutorial slides, page 95 - 100
# ------------------------------------
# step 1. choose number of PC components and construct a PCA model
num_component = 2
model = PCA(n_components=num_component)
# step 2. estimate PCA model using ONLY NORMAL examples
model.fit(sample_train[label_train==0,:])
# the following code trains the model using both normal and abnormal examples
# model.fit(sample_train)
# step 3. use the PCA model to project testing examples onto the low-dimensional feature space
sample_test_pca = model.transform(sample_test)
# step 4. reconstruct testing examples from the low dimensional space back to the original space
sample_test_recovered = model.inverse_transform(sample_test_pca)
# step 5. compute the reconstruction error and treat it as the anomalous score
dif = np.subtract(sample_test, sample_test_recovered)
adscore = np.sqrt(np.sum(dif**2, axis=1))
# evaluate AUC score
auc_score = roc_auc_score(label_test, adscore)
# to get detection error and f1-score, we need to threshold anomalous score
# the range of adscore is [0.6, 3]
threshold = 1.5
adscore[adscore <= threshold] = 0
adscore[adscore > threshold] = 1
# now evaluate error and f1-score
err = 1 - accuracy_score(label_test,adscore)
f1score = f1_score(label_test,adscore)
# step 6. print results
print('\nSpectral-based Approach (PCA)')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)
# -----------
# Assignment
# -----------
# 1. train the PCA model using both normal and abnormal examples (use the commented-out fit call above), what do you observe?
# 2. play with different values of num_component above (see the sketch below), what do you observe?
# 3. play with different values of the threshold above, what do you observe?
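# --- a minimal sketch (not part of the original tutorial) for assignment 2 above:
# vary the number of principal components and compare the AUC of the
# reconstruction-error score (assumes the data has at least 10 features).
for k in [1, 2, 5, 10]:
    m = PCA(n_components=k)
    m.fit(sample_train[label_train == 0, :])
    rec = m.inverse_transform(m.transform(sample_test))
    err_k = np.sqrt(np.sum((sample_test - rec) ** 2, axis=1))
    print('n_components=%d  AUC=%.4f' % (k, roc_auc_score(label_test, err_k)))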