python机器学习-乳腺癌细胞挖掘(博主亲自录制视频)
项目合作QQ:231469242
多重共线性测试需要改进
文件夹需要两个包
python3.0 anaconda
normality_check.py 正太检验
# -*- coding: utf-8 -*- ''' Author:Toby QQ:231469242,all right reversed,no commercial use normality_check.py 正态性检验脚本 ''' import scipy from scipy.stats import f import numpy as np import matplotlib.pyplot as plt import scipy.stats as stats # additional packages from statsmodels.stats.diagnostic import lillifors #正态分布测试 def check_normality(testData): #20<样本数<50用normal test算法检验正态分布性 if 20<len(testData) <50: p_value= stats.normaltest(testData)[1] if p_value<0.05: print("use normaltest") print ("data are not normal distributed") return False else: print("use normaltest") print ("data are normal distributed") return True #样本数小于50用Shapiro-Wilk算法检验正态分布性 if len(testData) <50: p_value= stats.shapiro(testData)[1] if p_value<0.05: print ("use shapiro:") print ("data are not normal distributed") return False else: print ("use shapiro:") print ("data are normal distributed") return True if 300>=len(testData) >=50: p_value= lillifors(testData)[1] if p_value<0.05: print ("use lillifors:") print ("data are not normal distributed") return False else: print ("use lillifors:") print ("data are normal distributed") return True if len(testData) >300: p_value= stats.kstest(testData,'norm')[1] if p_value<0.05: print ("use kstest:") print ("data are not normal distributed") return False else: print ("use kstest:") print ("data are normal distributed") return True #对所有样本组进行正态性检验 def NormalTest(list_groups): for group in list_groups: #正态性检验 status=check_normality(group) if status==False : return False return True
Rsquare_multimode.py 多种模型计算R平方
加入了线性显著检测和r相关系数显著检测,多重共线性,自相关,残差正太检验等等
# -*- coding: utf-8 -*- #斯皮尔曼等级相关(Spearman’s correlation coefficient for ranked data) import math,pylab,scipy import numpy as np import scipy.stats as stats from scipy.stats import t from scipy.stats import f import pandas as pd import matplotlib.pyplot as plt from statsmodels.stats.diagnostic import lillifors import normality_check import statsmodels.formula.api as sm x=[4.03,3.76,3.77,3.34,3.47,2.92,3.20,2.71,3.53,4.51] y=[6.47,6.13,6.19,4.89,5.63,4.52,5.89,4.79,5.27,6.08] list_group=[x,y] sample=len(x) #显著性 a=0.05 #数据可视化 plt.plot(x,y,'ro') #斯皮尔曼等级相关,非参数检验 def Spearmanr(x,y): print("use spearmanr,Nonparametric tests") #样本不一致时,发出警告 if len(x)!=len(y): print ("warming,the samples are not equal!") r,p=stats.spearmanr(x,y) print("spearman r**2:",r**2) print("spearman p:",p) if sample<500 and p>0.05: print("when sample < 500,p has no mean(>0.05)") print("when sample > 500,p has mean") #皮尔森 ,参数检验 def Pearsonr(x,y): print("use Pearson,parametric tests") r,p=stats.pearsonr(x,y) print("pearson r**2:",r**2) print("pearson p:",p) if sample<30: print("when sample <30,pearson has no mean") #皮尔森 ,参数检验,带有详细参数 def Pearsonr_details(x,y,xLabel,yLabel,formula): n=len(x) df=n-2 data=pd.DataFrame({yLabel:y,xLabel:x}) result = sm.ols(formula, data).fit() print(result.summary()) #模型F分布显著性分析 print(' ') print("linear relation Significant test:...................................") #如果F检验的P值<0.05,拒绝H0,x和y无显著关系,H1成立,x和y有显著关系 if result.f_pvalue<0.05: print ("P value of f test<0.05,the linear relation is right.") #R的显著检验 print(' ') print("R significant test:...................................") r_square=result.rsquared r=math.sqrt(r_square) t_score=r*math.sqrt(n-2)/(math.sqrt(1-r**2)) t_std=t.isf(a/2,df) if t_score<-t_std or t_score>t_std: print ("R is significant according to its sample size") else: print ("R is not significant") #残差分析 print(' ') print("residual error analysis:...................................") states=normality_check.check_normality(result.resid) if states==True: print("the residual error are normal distributed") else: print("the residual error are not normal distributed") #残差偏态和峰态 Skew = stats.skew(result.resid, bias=True) Kurtosis = stats.kurtosis(result.resid, fisher=False,bias=True) if round(Skew,1)==0: print("residual errors normality Skew:in middle,perfect match") elif round(Skew,1)>0: print("residual errors normality Skew:close right") elif round(Skew,1)<0: print("residual errors normality Skew:close left") if round(Kurtosis,1)==3: print("residual errors normality Kurtosis:in middle,perfect match") elif round(Kurtosis,1)>3: print("residual errors normality Kurtosis:more peak") elif round(Kurtosis,1)<3: print("residual errors normality Kurtosis:more flat") #自相关分析autocorrelation print(' ') print("autocorrelation test:...................................") DW = np.sum( np.diff( result.resid.values )**2.0 )/ result.ssr if round(DW,1)==2: print("Durbin-Watson close to 2,there is no autocorrelation.OLS model works well") #共线性检查 print(' ') print("multicollinearity test:") conditionNumber=result.condition_number if conditionNumber>30: print("conditionNumber>30,multicollinearity exists") else: print("conditionNumber<=30,multicollinearity not exists") #绘制残差图,用于方差齐性检验 Draw_residual(list(result.resid)) ''' result.rsquared Out[28]: 0.61510660055413524 ''' #kendalltau非参数检验 def Kendalltau(x,y): print("use kendalltau,Nonparametric tests") r,p=stats.kendalltau(x,y) print("kendalltau r**2:",r**2) print("kendalltau p:",p) #选择模型 def R_mode(x,y,xLabel,yLabel,formula): #正态性检验 Normal_result=normality_check.NormalTest(list_group) print ("normality result:",Normal_result) if len(list_group)>2: Kendalltau(x,y) if Normal_result==False: Spearmanr(x,y) Kendalltau(x,y) if Normal_result==True: Pearsonr_details(x,y,xLabel,yLabel,formula) #调整的R方 def Adjust_Rsquare(r_square,n,k): adjust_rSquare=1-((1-r_square)*(n-1)*1.0/(n-k-1)) return adjust_rSquare ''' n=len(x) n=10 k=1 r_square=0.615 Adjust_Rsquare(r_square,n,k) Out[11]: 0.566875 ''' #绘图 def Plot(x,y,yLabel,xLabel,Title): plt.plot(x,y,'ro') plt.ylabel(yLabel) plt.xlabel(xLabel) plt.title(Title) plt.show() #绘图参数 yLabel='Alcohol' xLabel='Tobacco' Title='Sales in Several UK Regions' Plot(x,y,yLabel,xLabel,Title) formula='Alcohol ~ Tobacco' #绘制残点图 def Draw_residual(residual_list): x=[i for i in range(1,len(residual_list)+1)] y=residual_list pylab.plot(x,y,'ro') pylab.title("draw residual to check wrong number") # Pad margins so that markers don't get clipped by the axes,让点不与坐标轴重合 pylab.margins(0.3) #绘制网格 pylab.grid(True) pylab.show() R_mode(x,y,xLabel,yLabel,formula) ''' result.fittedvalues表示预测的y值阵列 result.fittedvalues Out[42]: 0 6.094983 1 5.823391 2 5.833450 3 5.400915 4 5.531682 5 4.978439 6 5.260090 7 4.767201 8 5.592035 9 6.577813 dtype: float64 #计算残差的偏态 S = stats.skew(result.resid, bias=True) Out[44]: -0.013678125910039975 K = stats.kurtosis(result.resid, fisher=False,bias=True) K Out[47]: 1.5271300905736027 '''
result.params 得到两个参数:x的系数和截距
截距
result.params[0]
x系数
result.params[1]
dubin watson解读
--残差是否符合正太分布
D.W统计量是用来检验残差分布是否为正态分布的,因为用OLS进行回归估计是假设模型残差服从正态分布的,因此,如果残差不服从正态分布,那么,模型将是有偏的,也就是说模型的解释能力是不强的。
D.W统计量在2左右说明残差是服从正态分布的,若偏离2太远,那么你所构建的模型
的解释能力就要受影响了。
jarque-bera解读
----样本是否符合正太分布
JB统计量全称叫Jarque-Bera统计量,是用来检验一组样本是否能够认为来自正态总体的一种方法,它依据OLS残差,对大样本进行检验(或称为渐进检验)。
![](https://gss0.bdstatic.com/94o3dSag_xI4khGkpoWK1HF6hhy/baike/s%3D104/sign=d95100e86f81800a6ae58d0e853533d6/0ff41bd5ad6eddc459d34d9c3adbb6fd526633a4.jpg)
Jarque和Bera建立了如下检验统计量——JB统计量:
若变量服从正态分布,则S为零,K为3,因而JB统计量的值为零;如果变量不是正态变量,则JB统计量将为一个逐渐增大值。
如果JB统计量值较大,比如为11,则可以计算出卡方值大于11的概率为0.004,这个概率过小,因此不能认为样本来自正态分布。反之,成立。
Jarque-Bera的P值接近于0,表明显著性高,数据服从正态分布。
Omnibus解读
Omnibus统计量的P值都接近于0,自变量的作用显著。
Omnibus tests are a kind of statistical test. They test whether the explained variance in a set of data is significantly greater than the unexplained variance, overall. One example is the F-test in the analysis of variance. There can be legitimate significant effects within a model even if the omnibus test is not significant. For instance, in a model with two independent variables, if only one variable exerts a significant effect on the dependent variable and the other does not, then the omnibus test may be non-significant. This fact does not affect the conclusions that may be drawn from the one significant variable. In order to test effects within an omnibus test, researchers often use contrasts.
https://en.wikipedia.org/wiki/Omnibus_test