本篇博文主要介绍如何使用SVDD算法和Isolation Forest算法来进行异常检测
首先是SVDD算法:主要使用sklearn中的svm.OneClassSVM()来做单分类的异常检测,先用train_test_split划分数据集,再用GridSearchCV做网格搜索交叉验证来调参。
import numpy as np
from sklearn import svm

try:
    # scikit-learn >= 0.18 keeps both helpers in model_selection.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    # Older releases shipped them in separate (since removed) modules.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV


class Calculator_timeout(object):
    """One-class SVM novelty detector over two parallel data lists.

    The two lists are stacked column-wise into a two-column feature matrix
    ``X``; the last ``test_window`` rows of ``X`` are kept as ``X_testt``
    and are the rows scored by :meth:`get_pic`.
    """

    def __init__(self, Appid, respondse_list=None, request_data_list=None,
                 test_window=360):
        """Build the feature matrix.

        Args:
            Appid: Identifier stored on the instance as ``self.Appid``.
            respondse_list: Optional values for the first feature column.
                When omitted, ``self.respondse_list`` must already exist
                (e.g. provided by a subclass); the original post loads it
                from MySQL through SparkSQL.
            request_data_list: Optional values for the second feature
                column; same fallback rule as ``respondse_list``.
            test_window: Number of trailing rows of ``X`` to keep as the
                prediction set ``X_testt``.

        Raises:
            AttributeError: If a list is neither passed in nor already
                present on the instance.
        """
        self.Appid = Appid
        if respondse_list is not None:
            self.respondse_list = respondse_list
        if request_data_list is not None:
            self.request_data_list = request_data_list

        self.X1 = np.array(self.respondse_list)
        self.y1 = np.array(self.request_data_list)
        # Row indices, used only as the ``y`` argument of the split and the
        # grid search below.
        self.y = np.arange(len(self.X1))
        self.X = np.c_[self.X1, self.y1]
        # Clamp at 0 so a window larger than the data keeps every row.
        self.X_testt = self.X[max(len(self.X) - test_window, 0):]

    def get_best_parameters(self):
        """Grid-search ``OneClassSVM`` hyper-parameters on half of the data.

        Returns:
            dict: ``best_params_`` of the fitted grid search; the keys are
            exactly the keys of the parameter grid defined below.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.5, random_state=42)

        steps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        parameters = {
            "nu": steps,
            "gamma": steps,
            "cache_size": [200],
            "kernel": ['rbf', 'linear'],
            "max_iter": [-1],
            "shrinking": [True, False],
            "tol": [0.001],
            "verbose": [False],
        }

        svr = svm.OneClassSVM()
        # The estimator is the first positional argument. The original call
        # passed an undefined ``sc`` before it (presumably a SparkContext
        # for a Spark-backed GridSearchCV -- confirm if that is intended).
        # NOTE(review): y_train holds row indices, not labels, so this score
        # compares predictions against indices -- confirm the metric.
        clf = GridSearchCV(svr, parameters, scoring='precision_weighted')
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print(clf.best_params_)
        return clf.best_params_

    def get_pic(self):
        """Fit on all rows with the tuned parameters and score the tail.

        Returns:
            tuple: ``(pred_test, X_testt)`` -- the result of
            ``clf.predict(self.X_testt)`` and the rows that were scored.
        """
        best_params = self.get_best_parameters()
        # The grid keys are exactly the OneClassSVM keyword arguments.
        clf = svm.OneClassSVM(**best_params)

        clf.fit(self.X)
        pred_test = clf.predict(self.X_testt)
        print(self.X1)
        print(len(self.respondse_list))
        # Single-argument print behaves the same on Python 2 and 3.
        print("novelty detection result: %s" % (pred_test,))
        return pred_test, self.X_testt
下一篇将介绍如何使用SparkSQL和MLlib实现K-Means算法进行预测。