Plan A -- my regression model
(1) Input.py: reads the data through the Excel interface
(2) linear_R.py: imports the data from Input.py; this script selects the best mode
(3) input_decide.py: after the model is chosen, preprocesses the input x and y lists
(4) Finally, run the main script Linear_main.py
Summary of the analysis:
R-squared reaches 0.998, so the in-sample prediction accuracy is quite high.
However, the residuals of the last three years deviate strongly, and the true outcome depends heavily on those most recent three years, so other models need to be considered.
Plan B -- Excel data analysis
R-squared does not reach 0.9, and the forecast is far below the data of the last three years; it performs worse than my model, so it is rejected outright.
Plan C -- time series
The forecast is 1722, but the deviation over the last five years of the regression fit is still large, so it is not necessarily accurate; a better algorithm is needed.
Plan D -- weighted moving average (a minimal sketch is given below)
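Plan D is only named in the write-up above; the following is a minimal, illustrative sketch of what a weighted moving-average forecast could look like. The helper name and the 1:2:3 weights are hypothetical choices, not the author's implementation; the most recent years get the largest weights.

#weighted_moving_average.py -- illustrative sketch only; the weights are hypothetical
def weighted_moving_average_forecast(values, weights):
    #forecast the next value from the last len(weights) observations;
    #the last weight applies to the most recent observation
    recent = values[-len(weights):]
    return sum(v * w for v, w in zip(recent, weights)) / sum(weights)

#example: weight the last three years 1:2:3, so the newest year counts most
#forecast = weighted_moving_average_forecast(list_y, [1, 2, 3])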
Mainland China military expenditure data
The data is read in through an Excel interface:
Input.py

#coding=utf-8
#Input.py -- read the data through the Excel interface
import xlrd

excelFileName="中国军费开支1.xlsx"
wb=xlrd.open_workbook(excelFileName)
sheet=wb.sheets()[0]
#column 2 holds the y values and column 0 the x values; [1:] skips the header row
list_y=sheet.col_values(2)[1:]
list_x=sheet.col_values(0)[1:]
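Note: the calls above follow the old xlrd 1.x API; xlrd 2.0 and later no longer reads .xlsx files. In a newer environment the same two columns can be read with openpyxl instead. This is a minimal sketch, assuming the same workbook layout (one header row, x in column A, y in column C):

#coding=utf-8
#alternative Input.py using openpyxl (sketch; assumes one header row, x in column A, y in column C)
from openpyxl import load_workbook

excelFileName="中国军费开支1.xlsx"
wb=load_workbook(excelFileName, data_only=True)
sheet=wb.worksheets[0]
rows=list(sheet.iter_rows(min_row=2, values_only=True))
list_x=[row[0] for row in rows]
list_y=[row[2] for row in rows]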
linear_R.py
This script imports the data from Input.py and selects the best mode:
#coding=utf-8
#linear_R.py
#This script selects the best mode
import math,input_process,Input

def Mean(sample_list):
    mean_value=float(sum(sample_list))/len(sample_list)
    return mean_value

def Variance_sample(sample_list):
    #mean of the list
    mean=Mean(sample_list)
    #length of the list
    length=len(sample_list)
    sum1=0
    for i in range(length):
        sum1+=(sample_list[i]-mean)**2
    #population variance (divide by n)
    variance=sum1/length
    return variance

def Deviation(sample_list):
    variance=Variance_sample(sample_list)
    deviation=math.sqrt(variance)
    return deviation

def R_xy(s_xy,s_x,s_y):
    r_xy=float(s_xy)/(s_x*s_y)
    return r_xy

def S_xy(list_x,list_y):
    x_mean=Mean(list_x)
    y_mean=Mean(list_y)
    if len(list_x)!=len(list_y):
        print("error: list_x and list_y have different lengths")
    s_xy=0
    total=0
    for i in range(len(list_x)):
        total+=list_x[i]*list_y[i]
    #covariance: mean of x*y minus mean(x)*mean(y)
    s_xy=(float(total)/len(list_x))-(x_mean*y_mean)
    return s_xy
#R-squared for the linear mode (y=a+b*x)
def R_square_mode_linear(list_x,list_y):
    #standard deviations of list_x and list_y
    s_x=Deviation(list_x)
    s_y=Deviation(list_y)
    #covariance of list_x and list_y
    s_xy=S_xy(list_x,list_y)
    #r_xy is the correlation coefficient of {x_i} and {y_i}
    r_xy=R_xy(s_xy,s_x,s_y)
    #R-squared
    r_square=r_xy**2
    return r_square

#R-squared for mode 1 (y=a+b/x): replace x by 1/x
def R_square_mode_1(list_x,list_y):
    list_x=input_process.Reciprocal_list(list_x)
    s_x=Deviation(list_x)
    s_y=Deviation(list_y)
    s_xy=S_xy(list_x,list_y)
    r_xy=R_xy(s_xy,s_x,s_y)
    r_square=r_xy**2
    return r_square

#R-squared for mode 2 (y=a*x**b): take the log of both x and y
def R_square_mode_2(list_x,list_y):
    log_list_x=input_process.log_list(list_x)
    log_list_y=input_process.log_list(list_y)
    s_x=Deviation(log_list_x)
    s_y=Deviation(log_list_y)
    s_xy=S_xy(log_list_x,log_list_y)
    r_xy=R_xy(s_xy,s_x,s_y)
    r_square=r_xy**2
    return r_square

#R-squared for mode 3 (y=a*e**(b*x)): take the log of y only
def R_square_mode_3(list_x,list_y):
    log_list_y=input_process.log_list(list_y)
    s_x=Deviation(list_x)
    s_y=Deviation(log_list_y)
    s_xy=S_xy(list_x,log_list_y)
    r_xy=R_xy(s_xy,s_x,s_y)
    r_square=r_xy**2
    return r_square

#R-squared for mode 5 (y=a*e**(-b/x)): replace x by -1/x and take the log of y
def R_square_mode_5(list_x,list_y):
    negative_reciprocal_x=input_process.negative_reciprocal_list(list_x)
    log_list_y=input_process.log_list(list_y)
    s_x=Deviation(negative_reciprocal_x)
    s_y=Deviation(log_list_y)
    s_xy=S_xy(negative_reciprocal_x,log_list_y)
    r_xy=R_xy(s_xy,s_x,s_y)
    r_square=r_xy**2
    return r_square

#R-squared for mode 6 (y=b*x**2+a): replace x by x**2
def R_square_mode_6(list_x,list_y):
    square_list_x=input_process.square_list(list_x)
    s_x=Deviation(square_list_x)
    s_y=Deviation(list_y)
    s_xy=S_xy(square_list_x,list_y)
    r_xy=R_xy(s_xy,s_x,s_y)
    r_square=r_xy**2
    return r_square

#R-squared for mode 7 (y=a+b*lnx): replace x by ln(x)
def R_square_mode_7(list_x,list_y):
    list_x=input_process.log_list(list_x)
    s_x=Deviation(list_x)
    s_y=Deviation(list_y)
    s_xy=S_xy(list_x,list_y)
    r_xy=R_xy(s_xy,s_x,s_y)
    r_square=r_xy**2
    return r_square
def R_square_list(list_x,list_y,mode_list):
    r_square_list=[]
    #compute the R-squared value for every mode in mode_list
    for mode in mode_list:
        if mode=="linear":
            r_square_list.append(R_square_mode_linear(list_x,list_y))
        if mode==1:
            r_square_list.append(R_square_mode_1(list_x,list_y))
        if mode==2:
            r_square_list.append(R_square_mode_2(list_x,list_y))
        if mode==3:
            r_square_list.append(R_square_mode_3(list_x,list_y))
        if mode==5:
            r_square_list.append(R_square_mode_5(list_x,list_y))
        if mode==6:
            r_square_list.append(R_square_mode_6(list_x,list_y))
        if mode==7:
            r_square_list.append(R_square_mode_7(list_x,list_y))
    return r_square_list

def Dict_r_mode(r_square_list,mode_list):
    #map each R-squared value to its mode
    dict_r_mode=dict(zip(r_square_list,mode_list))
    return dict_r_mode

#automatic model selection
def Mode_choose(r_square_list):
    #pick the mode whose R-squared value is the largest
    best_r_square=max(r_square_list)
    best_mode=dict_r_mode[best_r_square]   #dict_r_mode is the module-level dict built below
    return best_mode

#load the data from Input.py
list_x=Input.list_x
list_y=Input.list_y
#available modes: mode="linear" (y=a+b*x)
#mode=1 (y=a+b/x)
#mode=2 (y=a*x**b)
#mode=3 (y=a*e**(b*x))
#mode=4 (y=a*e**(b/x))  -- listed for reference but not implemented, so it is not in mode_list
#mode=5 (y=a*e**(-b/x))
#mode=6 (y=b*x**2+a)
#mode=7 (y=a+b*lnx)
mode_list=["linear",1,2,3,5,6,7]
r_square_list=R_square_list(list_x,list_y,mode_list)
dict_r_mode=Dict_r_mode(r_square_list,mode_list)
mode=Mode_choose(r_square_list)
#for testing, mode can be forced manually, e.g. mode="linear" or mode=1
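Each R_square_mode_* function above computes the squared sample correlation coefficient of the (possibly transformed) x and y lists: s_xy = mean(x*y) - mean(x)*mean(y), r_xy = s_xy/(s_x*s_y), and R^2 = r_xy**2. As an optional sanity check (not part of the original pipeline; it assumes numpy is installed and that Input.list_x and Input.list_y are plain numeric lists), numpy.corrcoef should give the same r_xy for the linear mode:

#check_r_square.py -- optional sanity check, not part of the original pipeline
import numpy
import Input, linear_R

r = numpy.corrcoef(Input.list_x, Input.list_y)[0, 1]   #Pearson correlation of x and y
print("r_xy (numpy):", r)
print("R^2 (numpy):", r**2)
print("R^2 (linear_R):", linear_R.R_square_mode_linear(Input.list_x, Input.list_y))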
input_decide.py
After the model is chosen, this script preprocesses the input x and y lists:
#coding=utf-8
#input_decide.py
#After the model is chosen, preprocess the input x and y lists
import math,Input,input_process,linear_R

list_x1=Input.list_x
list_y1=Input.list_y
mode=linear_R.mode

def List_x(list_x1,mode):
    if mode=="linear":
        #list_x unchanged
        list_x=list_x1
    if mode==1:
        #replace x by 1/x
        list_x=input_process.Reciprocal_list(list_x1)
    if mode==2:
        #replace x by ln(x)
        list_x=input_process.log_list(list_x1)
    if mode==3:
        #list_x unchanged
        list_x=list_x1
    if mode==5:
        #replace x by -1/x
        list_x=input_process.negative_reciprocal_list(list_x1)
    if mode==6:
        #replace x by x**2
        list_x=input_process.square_list(list_x1)
    if mode==7:
        #replace x by ln(x)
        list_x=input_process.log_list(list_x1)
    return list_x

def List_y(list_y1,mode):
    if mode=="linear":
        #list_y unchanged
        list_y=list_y1
    if mode==1:
        #list_y unchanged
        list_y=list_y1
    if mode==2:
        #replace y by ln(y)
        list_y=input_process.log_list(list_y1)
    if mode==3:
        #replace y by ln(y)
        list_y=input_process.log_list(list_y1)
    if mode==5:
        #replace y by ln(y)
        list_y=input_process.log_list(list_y1)
    if mode==6:
        #list_y unchanged
        list_y=list_y1
    if mode==7:
        #list_y unchanged
        list_y=list_y1
    return list_y

list_x=List_x(list_x1,mode)
list_y=List_y(list_y1,mode)
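Why these transformations work: each non-linear mode is reduced to a straight line in the transformed variables. For example, mode 3 (y=a*e**(b*x)) becomes ln(y)=ln(a)+b*x, so ln(y) is regressed on x; mode 2 (y=a*x**b) becomes ln(y)=ln(a)+b*ln(x); mode 5 (y=a*e**(-b/x)) becomes ln(y)=ln(a)+b*(-1/x); modes 1, 6 and 7 are already linear in 1/x, x**2 and ln(x) respectively. In every case the slope of the transformed fit is b, and the intercept is either a itself or ln(a), which is why Linear_a in Linear_main.py applies math.e**a1 for modes 2, 3 and 5.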
Finally, run the main script Linear_main.py:
#coding=utf-8
#Linear_main.py -- main script
#Contents:
#14. simple (one-variable) linear regression
#r^2 automatically judges whether the model fits
#residuals are used to flag suspect values
#other function forms are converted to simple linear regression
#1. vocabulary: permutation, combination, factorial, probability
import math,pylab,numpy,linear_R,input_decide,input_process,Input
def Mean(sample_list):
    mean_value=float(sum(sample_list))/len(sample_list)
    return mean_value

def Variance_sample(sample_list):
    #mean of the list
    mean=Mean(sample_list)
    #length of the list
    length=len(sample_list)
    sum1=0
    for i in range(length):
        sum1+=(sample_list[i]-mean)**2
    #population variance (divide by n)
    variance=sum1/length
    return variance

def Deviation(sample_list):
    variance=Variance_sample(sample_list)
    deviation=math.sqrt(variance)
    return deviation

def S_xy(list_x,list_y):
    x_mean=Mean(list_x)
    y_mean=Mean(list_y)
    if len(list_x)!=len(list_y):
        print("error: list_x and list_y have different lengths")
    s_xy=0
    total=0
    for i in range(len(list_x)):
        total+=list_x[i]*list_y[i]
    #covariance: mean of x*y minus mean(x)*mean(y)
    s_xy=(float(total)/len(list_x))-(x_mean*y_mean)
    return s_xy

#14. simple linear regression
def R_xy(s_xy,s_x,s_y):
    r_xy=float(s_xy)/(s_x*s_y)
    return r_xy
#r squared judges whether the model fits: the larger r**2, the better the fit
def R_estimate(r_xy_value):
    if r_xy_value**2>0.8:
        return True
    else:
        return False

#linear regression coefficients
def Linear_b(s_xy,s_x):
    #slope: covariance divided by the variance of x
    b=s_xy/s_x**2
    return b

def Linear_a(b,x_mean,y_mean):
    a=y_mean-b*x_mean
    #for mode 2, 3 or 5 (mode is the module-level variable set below)
    #the intercept of the transformed fit is ln(a), so exponentiate it
    if mode==5 or mode==2 or mode==3:
        a1=y_mean-b*x_mean
        a=math.e**a1
    return a

#forecast a future value
def Forecast_linear(b,a,forecast_x,mode):
    #each mode uses its own formula
    y_forecast=0
    if mode=="linear":
        y_forecast=b*forecast_x+a
    if mode==1:
        y_forecast=a+b/forecast_x
    if mode==2:
        y_forecast=a*(forecast_x**b)
    if mode==3:
        y_forecast=a*(math.e**(b*forecast_x))
    if mode==5:
        y_forecast=a*(math.e**(float(-b)/forecast_x))
    if mode==6:
        y_forecast=a+(b*forecast_x**2)
    if mode==7:
        #numpy.log works for scalars and for the numpy arrays used when drawing
        y_forecast=a+b*numpy.log(forecast_x)
    return y_forecast
#draw the regression plot
def Draw_linear_regression_model(list_x_initial,list_y_initial,b,a,mode):
    #first plot the observed points, then the fitted curve
    #list_x_initial and list_y_initial are the observed points, b and a are the coefficients, mode is the model type
    pylab.plot(list_x_initial,list_y_initial,'ro')   #'ro' draws red points
    #choose the x range from the data
    x_max=max(list_x_initial)*1.5
    x=numpy.arange(0.01,x_max)   #the x range can be changed freely
    #compute the fitted y values
    y=Forecast_linear(b,a,x,mode)
    #draw the fitted curve against x (not against the index)
    pylab.plot(x,y)
    #pad margins so that markers don't get clipped by the axes
    pylab.margins(0.3)
    pylab.grid(True)
    pylab.title("linear regression model")
    pylab.show()
#post-fit checks
#after the regression is built, compute the fitted y for every observed x
def Linear_y(b,a,list_x_initial,mode):
    list_linear_y=[]
    for i in list_x_initial:
        value=Forecast_linear(b,a,i,mode)
        list_linear_y.append(value)
    return list_linear_y

#residuals: observed y minus fitted y
def Residual(list_y_initial,list_linear_y):
    residual_list=[]
    for i in range(len(list_y_initial)):
        value=list_y_initial[i]-list_linear_y[i]
        residual_list.append(value)
    return residual_list

#use the residuals to flag possibly wrong data points
def WrongNumber_check(list_x_initial,list_y_initial,residual_list):
    #points whose prediction is suspect
    wrongNumber_predict=[]
    #(x, y) tuples for each observed point
    list_dot=zip(list_x_initial,list_y_initial)
    #map each residual to its point
    list_residual_dot=zip(residual_list,list_dot)
    dict_residual_dot=dict(list_residual_dot)
    #benchmark: 1.5 times the mean absolute residual
    absolute_residual_list=input_process.Absolute_list(residual_list)
    benchmark=1.5*Mean(absolute_residual_list)
    #any point whose absolute residual exceeds the benchmark is flagged
    for i in residual_list:
        if math.fabs(i)>benchmark:
            wrongNumber=dict_residual_dot[i]
            wrongNumber_predict.append(wrongNumber)
    return wrongNumber_predict

#draw the residual plot
def Draw_residual(residual_list):
    x=[i for i in range(1,len(residual_list)+1)]
    y=residual_list
    pylab.plot(x,y,'ro')
    pylab.title("draw residual to check wrong number")
    #pad margins so that markers don't get clipped by the axes
    pylab.margins(0.3)
    pylab.grid(True)
    pylab.show()
def print_out(x_mean,y_mean,s_x,s_y,s_xy,r_xy,r_estimate,
              b,a,forecast_value,wrongNumber_Predict,mode):
    print("x_mean:",x_mean)
    print("y_mean:",y_mean)
    print("s_x:",s_x)
    print("s_y:",s_y)
    print("s_xy:",s_xy)
    print("r_xy:",r_xy)
    print("r_square:",r_square)   #r_square is the module-level value computed below
    print("r_estimate:",r_estimate)
    print("b:",b)
    print("a:",a)
    print("forecast_value:",forecast_value)
    print("wrongNumber_Predict:",wrongNumber_Predict)
    print("the best mode:",mode)
    #print the chosen equation
    if mode=="linear":
        print("y=a+b*x")
    if mode==1:
        print("y=a+b/x")
    if mode==2:
        print("y=a*x**b")
    if mode==3:
        print("y=a*e**(b*x)")
    if mode==5:
        print("y=a*e**(-b/x)")
    if mode==6:
        print("y=b*x**2+a")
    if mode==7:
        print("y=a+b*lnx")

#execution
def execution_linear_regression():
    #print all parameters
    print_out(x_mean,y_mean,s_x,s_y,s_xy,r_xy,r_estimate,
              b,a,forecast_value,wrongNumber_Predict,mode)
    #draw the regression plot
    Draw_linear_regression_model(list_x_initial,list_y_initial,b,a,mode)
    #draw the residual plot
    Draw_residual(residual_list)
#the best mode chosen by linear_R.py
mode=linear_R.mode
#original (untransformed) data
list_x_initial=Input.list_x
list_y_initial=Input.list_y
#final transformed lists from input_decide.py, used for the linear fit
list_x=input_decide.list_x
list_y=input_decide.list_y
#x value at which to forecast
forecast_x=28
#available modes:
#mode="linear" (y=a+b*x)
#mode=1 (y=a+b/x)
#mode=2 (y=a*x**b)
#mode=3 (y=a*e**(b*x))
#mode=5 (y=a*e**(-b/x))
#mode=6 (y=b*x**2+a)
#mode=7 (y=a+b*lnx)
mode_list=["linear",1,2,3,5,6,7]
#mean of x
x_mean=Mean(list_x)
#mean of y
y_mean=Mean(list_y)
#standard deviation of list_x
s_x=Deviation(list_x)
#standard deviation of list_y
s_y=Deviation(list_y)
s_xy=S_xy(list_x,list_y)
#r_xy is the correlation coefficient of {x_i} and {y_i}
r_xy=R_xy(s_xy,s_x,s_y)
#R squared
r_square=r_xy**2
#judge whether r is acceptable
r_estimate=R_estimate(r_xy)
#slope
b=Linear_b(s_xy,s_x)
#intercept (exponentiated for modes 2, 3 and 5)
a=Linear_a(b,x_mean,y_mean)
forecast_value=Forecast_linear(b,a,forecast_x,mode)
#fitted y for every observed x
list_linear_y=Linear_y(b,a,list_x_initial,mode)
#residuals
residual_list=Residual(list_y_initial,list_linear_y)
wrongNumber_Predict=WrongNumber_check(list_x_initial,list_y_initial,residual_list)
#print the detailed parameters and draw the plots
execution_linear_regression()
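The fitted slope and intercept can be cross-checked with numpy.polyfit. This is optional and not part of the original scripts; it assumes numpy is installed and reuses the already-transformed lists from input_decide:

#optional cross-check of b and a with a degree-1 fit on the transformed lists
import math, numpy
import input_decide, linear_R

slope, intercept = numpy.polyfit(input_decide.list_x, input_decide.list_y, 1)
print("b (polyfit):", slope)
#for modes 2, 3 and 5 the intercept of the transformed fit is ln(a), so exponentiate it
if linear_R.mode in (2, 3, 5):
    print("a (polyfit):", math.e**intercept)
else:
    print("a (polyfit):", intercept)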
Parameters printed by the program after execution:
R-squared reaches 0.998, so the in-sample prediction accuracy is quite high; however, as summarized above, the residuals of the last three years deviate strongly, so other models still need to be considered.
