# load libraries
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# load data set (comma-separated numeric file, one example per row)
data = np.loadtxt('crimerate_binary.csv', delimiter=',')
n, p = data.shape  # n = number of rows, p = number of columns (label included)

# split data into a training set and a testing set
# we use the first 75% of the rows for training, the rest for testing
size_train = int(0.75 * n)
# the last column is used as the label, every column before it as a feature
sample_train = data[0:size_train, 0:-1]
label_train = data[0:size_train, -1]
sample_test = data[size_train:, 0:-1]
label_test = data[size_train:, -1]

# ----------------------------------------
# classification-based anomaly detection
# tutorial slides, page 49 - 59
# use linear regression model for detection
# ----------------------------------------

# step 1. choose a classification model (linear regression)
model = linear_model.LinearRegression()

# step 2. train the model using examples
model.fit(sample_train, label_train)

# step 3. apply model to predict whether an example is normal or anomaly
# the regression output is continuous; keep it in its own variable so the
# thresholding below never overwrites the scores it is still comparing
score_pred = model.predict(sample_test)

# we can treat the above output as an anomaly score, and get the AUC score from it
auc_score = roc_auc_score(label_test, score_pred)

# because this is a regression model, we need to threshold its output
# to get the detection error and f1-score
threshold = 0.4
# a single comparison against the untouched scores: score > threshold -> 1,
# otherwise 0.  (Writing the 0s in place first and then testing "> threshold"
# would flip every example to 1 as soon as the threshold is negative.)
# astype(float) keeps the labels as floats, like label_test from np.loadtxt
label_pred = (score_pred > threshold).astype(float)

err = 1 - accuracy_score(label_test, label_pred)
f1score = f1_score(label_test, label_pred)

# step 4. print results
print('\nClassification-based Approach (Linear Regression Model)')
print('Detection Error = %.4f' % err)
print('F1 Score = %.4f' % f1score)
print('AUC Score = %.4f' % auc_score)

# -----------
# Assignment
# -----------
# play with different values of the `threshold` variable above, what do you observe?