# 1_Classification_LinearRegression.py
# committed by Wolf

# load libraries 
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score

# read the comma-separated data set into a 2-D array
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
n, p = data.shape  # n rows (examples), p columns

# hold-out split: the leading 75% of the rows form the training set and the
# remaining rows form the testing set
size_train = int(0.75 * n)
# every column except the last is a feature; the last column is the label
sample_train, sample_test = data[:size_train, :-1], data[size_train:, :-1]
label_train, label_test = data[:size_train, -1], data[size_train:, -1]

# ----------------------------------------
# classification-based anomaly detection 
# tutorial slides, page 49 - 59
# use linear regression model for detection 
# ----------------------------------------

# step 1. choose a classification model (linear regression)
model = linear_model.LinearRegression()

# step 2. train the model using examples
model.fit(sample_train, label_train)

# step 3. apply model to predict whether an example is normal or anomaly
# the regression output is kept as a continuous score in its own array so
# that the thresholding below never overwrites it
score_pred = model.predict(sample_test)
# we can treat above output as anomalous score, and get AUC score from it
auc_score = roc_auc_score(label_test, score_pred)
# because this is a regression model, we need to threshold its output to get
# detection error and f1-score: score > threshold -> 1, otherwise -> 0
threshold = 0.4
# build the labels in a single pass from the untouched scores; two in-place
# assignments would re-test the zeros just written, so any negative
# threshold would turn every prediction into 1
label_pred = np.where(score_pred > threshold, 1.0, 0.0)
err = 1 - accuracy_score(label_test, label_pred)
f1score = f1_score(label_test, label_pred)

# step 4. report the three evaluation metrics, one per line
print('\nClassification-based Approach (Linear Regression Model)')
for template, value in (
    ('Detection Error = %.4f', err),
    ('F1 Score = %.4f', f1score),
    ('AUC Score = %.4f', auc_score),
):
    print(template % value)


# -----------
# Assignment 
# -----------
# play with different values of the `threshold` variable (set in step 3), what do you observe?