Logistic Regression (Linear and Non-linear)

    Part 1: Linear logistic regression

    The code is as follows:

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import scipy.optimize as opt
    import seaborn as sns
    
    # Read the data set
    path = 'ex2data1.txt'
    data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    
    # Split the data into positive and negative examples
    positive = data[data['Admitted'].isin([1])]
    negative = data[data['Admitted'].isin([0])]
    
    '''
    # Visualize the distribution of the data
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(positive['Exam 1'], positive['Exam 2'], s=60, c='b', marker='o', label='Admitted')
    ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
    ax.legend()
    ax.set_xlabel('Exam 1 Score')
    ax.set_ylabel('Exam 2 Score')
    plt.show()
    '''
    
    # Implementation of the sigmoid function
    def sigmoid(h):
        return 1 / (1 + np.exp(-h))
    
    
    '''
    # Test the sigmoid function
    nums = np.arange(-10, 11, step=1)
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(nums, sigmoid(nums), 'k')
    plt.show()
    '''
    
    # Compute the value of the cost function
    def cost(theta, X, y):
        theta = np.matrix(theta)
        X = np.matrix(X)
        y = np.matrix(y)
    
        part1 = np.multiply(-y, np.log(sigmoid(X * theta.T)))
        part2 = np.multiply((1-y), np.log(1-sigmoid(X * theta.T)))
        return np.sum(part1-part2) / len(X)
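    # For reference, the expression above is the cross-entropy cost
    #     J(theta) = (1/m) * sum( -y*log(h) - (1-y)*log(1-h) ),  where h = sigmoid(X * theta.T)
    # With all parameters set to zero, h = 0.5 everywhere, so cost(theta, X, y) should come
    # out to about 0.693 (= ln 2) -- a quick sanity check once X, y and theta are built below.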
    
    # Insert a column of all ones before the first column of the original matrix
    data.insert(0, 'ones', 1)
    
    cols = data.shape[1]
    
    X = data.iloc[:, 0:cols-1]
    y = data.iloc[:, cols-1:cols]
    
    X = np.array(X.values)
    y = np.array(y.values)
    theta = np.zeros(3)  # a 1-D array (row vector) of three parameters
    
    
    # Return the gradient vector (one partial derivative per parameter)
    def gradient(theta, X, y):
        theta = np.matrix(theta)
        X = np.matrix(X)
        y = np.matrix(y)
    
        # Vectorized form of grad_j = (1/m) * sum( (h - y) * x_j )
        error = sigmoid(X * theta.T) - y
        grad = error.T * X / len(X)
        # fmin_tnc expects a flat array, not a 1 x n matrix
        return np.array(grad).ravel()
    
    # Use an advanced optimization algorithm to find the best theta
    result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
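    # fmin_tnc returns a tuple: the optimized parameters, the number of function
    # evaluations, and a termination status code; result[0] holds the fitted theta.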
    
    #print(cost(result[0], X, y))
    
    # Evaluate the performance of the learned theta
    # Compute predictions on the original data set
    def predict(theta, X):
        theta = np.matrix(theta)
        X = np.matrix(X)
    
        probability = sigmoid(X * theta.T)
        return [1 if i > 0.5 else 0 for i in probability]
    
    
    theta_min = result[0]
    predictions = predict(theta_min, X)
    
    correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
    accuracy = sum(correct) / len(correct) * 100
    print('accuracy = {0:.1f}%'.format(accuracy))  # about 89% on the training set
    
    
    # Plotting
    theta_temp = theta_min
    theta_temp = theta_temp / theta_temp[2]
    
    x = np.arange(130, step=0.1)
    y = -(theta_temp[0] + theta_temp[1] * x)
    # Plot the original data points
    sns.set(context='notebook', style='ticks', font_scale=1.5)
    sns.lmplot(x='Exam 1', y='Exam 2', hue='Admitted', data=data,
               height=6,
               fit_reg=False,
               scatter_kws={"s": 25}
               )
    # Draw the decision boundary
    plt.plot(x, y, 'grey')
    plt.xlim(0, 130)
    plt.ylim(0, 130)
    plt.title('Decision Boundary')
    plt.show()
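
    With the fitted parameters in hand, scoring a new example is just one more call to
    sigmoid. The snippet below is a minimal sketch (the two exam scores are made-up
    values for illustration, not part of the original data set):

    # Predict the admission probability for a hypothetical applicant
    new_scores = np.array([1, 45, 85])   # intercept term plus two exam scores
    prob = sigmoid(new_scores.dot(theta_min))
    print('admission probability: {0:.3f}'.format(prob))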

    Part 2: Non-linear logistic regression (with regularization)

    The code is as follows:

    import pandas as pd
    import numpy as np
    import scipy.optimize as opt
    import matplotlib.pyplot as plt
    
    
    path = 'ex2data2.txt'
    data = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])
    
    positive = data[data['Accepted'].isin([1])]
    negative = data[data['Accepted'].isin([0])]
    
    '''
    # Show the distribution of the raw data
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
    ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Unaccepted')
    ax.legend()  # show the Accepted / Unaccepted legend in the upper-right corner
    ax.set_xlabel('Test 1 Score')
    ax.set_ylabel('Test 2 Score')
    plt.show()
    '''
    degree = 5
    x1 = data['Test 1']
    x2 = data['Test 2']
    # Insert a column of all ones at column index 3 of data
    data.insert(3, 'Ones', 1)
    
    # Create polynomial feature terms, up to total degree 4
    for i in range(1, degree):
        for j in range(0, i):
            data['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)
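    # Note: with degree = 5 the loop above generates the 10 terms x1^(i-j) * x2^j for
    # i = 1..4 and j = 0..i-1, i.e. all monomials of total degree 1 to 4 that contain
    # at least one factor of x1; pure powers of x2 (x2, x2^2, ...) are never produced.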
    
    # Remove the original Test 1 and Test 2 columns from the data
    data.drop('Test 1', axis=1, inplace=True)
    data.drop('Test 2', axis=1, inplace=True)
    
    
    # Implementation of the sigmoid function
    def sigmoid(h):
        return 1 / (1 + np.exp(-h))
    
    
    def cost(theta, X, y, learnRate):
        theta = np.matrix(theta)
        X = np.matrix(X)
        y = np.matrix(y)
    
        first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
        second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
        reg = (learnRate / (2 * len(X))) * np.sum(np.power(theta[:, 1:theta.shape[1]], 2))
        return np.sum(first - second) / len(X) + reg
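    # The regularized cost computed above is
    #     J(theta) = (1/m) * sum( -y*log(h) - (1-y)*log(1-h) ) + (lambda/(2m)) * sum_{j>=1} theta_j^2
    # Despite its name, the learnRate argument plays the role of the regularization
    # strength lambda here, not of a gradient-descent learning rate.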
    
    
    learnRate = 1
    cols = data.shape[1]
    
    X = data.iloc[:, 1:cols]
    y = data.iloc[:, 0:1]
    
    X = np.array(X)
    y = np.array(y)
    theta = np.zeros(X.shape[1])
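    # X now holds 11 columns (the ones column plus the 10 polynomial features),
    # so theta is initialized with 11 parameters.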
    
    
    # Compute predictions on the original data set
    def predict(theta, X):
        theta = np.matrix(theta)
        X = np.matrix(X)
    
        probability = sigmoid(X * theta.T)
        return [1 if i > 0.5 else 0 for i in probability]
    
    
    def gradientReg(theta, X, y, learnRate):
        theta = np.matrix(theta)
        X = np.matrix(X)
        y = np.matrix(y)
    
        error = sigmoid(X * theta.T) - y
        # Unregularized gradient plus the penalty term for every parameter
        grad = error.T * X / len(X) + (learnRate / len(X)) * theta
        # The intercept term (first parameter) is not regularized
        grad[0, 0] = np.sum(np.multiply(error, X[:, 0])) / len(X)
        # fmin_tnc expects a flat array, not a 1 x n matrix
        return np.array(grad).ravel()
    
    result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradientReg, args=(X, y, learnRate))
    print(result)
    
    theta_min = np.matrix(result[0])
    predictions = predict(theta_min, X)
    correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
    accuracy = sum(correct) / len(correct) * 100
    
    print('accuracy = {0:.1f}%'.format(accuracy))
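
    Since the regularization strength strongly affects how wiggly the learned boundary is,
    it is worth re-running the optimization with a few different values. The loop below is
    a minimal sketch of such an experiment (the specific lambda values are illustrative
    choices, not from the original post):

    # Compare training accuracy for several regularization strengths
    # (lambda = 0 may trigger overflow warnings from np.log as the unregularized fit saturates)
    for lam in (0, 1, 10, 100):
        res = opt.fmin_tnc(func=cost, x0=np.zeros(X.shape[1]), fprime=gradientReg, args=(X, y, lam))
        preds = predict(np.matrix(res[0]), X)
        acc = sum(int(a == b) for a, b in zip(preds, y.ravel())) / len(y) * 100
        print('lambda = {0}: training accuracy = {1:.1f}%'.format(lam, acc))
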
Original post: https://www.cnblogs.com/qiang-wei/p/9839458.html