from numpy import *
import matplotlib.pyplot as plt


def loadDataSet():
    data_mat = []
    label_mat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        line_arr = line.strip().split()
        data_mat.append([1.0, float(line_arr[0]), float(line_arr[1])])
        label_mat.append(int(line_arr[2]))
    return data_mat, label_mat


def sigmoid(in_x):
    # the sigmoid function
    return 1.0 / (1 + exp(-in_x))


def gradAscent(data_mat_in, class_labels):
    data_matrix = mat(data_mat_in)             # convert the list of lists to a NumPy matrix
    label_mat = mat(class_labels).transpose()  # convert the labels to a column vector
    m, n = shape(data_matrix)                  # matrix dimensions (100 rows, 3 columns for testSet.txt)
    alpha = 0.001
    max_cycles = 500
    weights = ones((n, 1))                     # n x 1 column vector of ones
    for k in range(max_cycles):
        h = sigmoid(data_matrix * weights)     # matrix multiplication; h is an m x 1 column of predictions
        error = (label_mat - h)                # error between the true labels and the predictions
        weights = weights + alpha * data_matrix.transpose() * error  # w = w + alpha * gradient
    return weights


def plotBestFit(weights):
    data_mat, label_mat = loadDataSet()
    data_arr = array(data_mat)
    n = shape(data_arr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(label_mat[i]) == 1:
            xcord1.append(data_arr[i, 1])
            ycord1.append(data_arr[i, 2])
        else:
            xcord2.append(data_arr[i, 1])
            ycord2.append(data_arr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # decision boundary: w0 + w1*x1 + w2*x2 = 0
    ax.plot(x, y)  # plot the decision boundary
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()


def stocGradAscent0(data_matrix, class_labels):
    m, n = shape(data_matrix)
    alpha = 0.01
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(data_matrix[i] * weights))  # element-wise product summed to a scalar
        error = class_labels[i] - h                 # a scalar error
        weights = weights + alpha * error * data_matrix[i]  # update the whole weight vector
    return weights


def stocGradAscent1(data_matrix, class_labels, num_iter=150):
    m, n = shape(data_matrix)
    weights = ones(n)
    for j in range(num_iter):
        data_index = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01  # alpha decays each step; the constant keeps it from reaching 0
            rand_index = int(random.uniform(0, len(data_index)))  # pick a remaining sample at random (without replacement)
            chosen = data_index[rand_index]
            h = sigmoid(sum(data_matrix[chosen] * weights))
            error = class_labels[chosen] - h
            weights = weights + alpha * error * data_matrix[chosen]
            del data_index[rand_index]
    return weights


def classifyVector(in_x, weights):
    prob = sigmoid(sum(in_x * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0


def colicTest():
    fr_train = open('horseColicTraining.txt')
    fr_test = open('horseColicTest.txt')
    training_set = []
    training_labels = []
    for line in fr_train.readlines():
        curr_line = line.strip().split()  # split on any whitespace so tab- or space-separated files both work
        line_arr = []
        for i in range(21):
            line_arr.append(float(curr_line[i]))
        training_set.append(line_arr)
        training_labels.append(float(curr_line[21]))
    train_weights = stocGradAscent1(array(training_set), training_labels, 200)
    error_count = 0
    num_test_vec = 0.0
    for line in fr_test.readlines():
        num_test_vec += 1
        curr_line = line.strip().split()
        line_arr = []
        for i in range(21):
            line_arr.append(float(curr_line[i]))
        if int(classifyVector(array(line_arr), train_weights)) != int(curr_line[21]):
            error_count += 1
    error_rate = float(error_count) / num_test_vec
    print('the error rate of this test is: %s' % error_rate)
    return error_rate


def multiTest():
    num_tests = 10
    error_sum = 0.0
    for k in range(num_tests):
        error_sum += colicTest()
    print('after %s iterations the average error rate is: %s' % (num_tests, error_sum / float(num_tests)))
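

# --- Usage sketch (added for illustration; not part of the original module) ---
# A minimal way to exercise the functions above, assuming testSet.txt and the two
# horse-colic data files sit in the working directory. gradAscent returns a NumPy
# matrix, so it is converted with getA() before plotting; the stochastic versions
# already return a plain 1-D ndarray.
if __name__ == '__main__':
    data_mat, label_mat = loadDataSet()

    batch_weights = gradAscent(data_mat, label_mat)      # full-batch gradient ascent
    plotBestFit(batch_weights.getA())                    # getA() turns the matrix into an ndarray

    stoc_weights = stocGradAscent1(array(data_mat), label_mat, num_iter=150)
    plotBestFit(stoc_weights)                            # stochastic version needs no conversion

    multiTest()                                          # average horse-colic error rate over 10 runs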