python机器学习-乳腺癌细胞挖掘(博主亲自录制视频)
单因素方差分析 + Kruskal-Wallis 检验
onewayTest.py
# -*- coding: utf-8 -*-
import numpy as np,variance_check
# additional packages
from scipy.stats.mstats import kruskalwallis
from scipy import stats
# Sample observations for the three groups being compared.
group1 = [27, 2, 4, 18, 7, 9]
group2 = [20, 8, 14, 36, 21, 22]
group3 = [34, 31, 3, 23, 30, 6]
list_groups = [group1, group2, group3]

# Preliminary checks: normality of each group, homogeneity of variance
# (Levene) and whether the groups hold the same number of observations.
normality = variance_check.NormalTest(list_groups)
leveneResult = variance_check.Levene_test(*list_groups)
equal_lenth = variance_check.Equal_lenth(list_groups)
def Choose_mode(normality,leveneResult,group1,group2,group3):
    """Choose and run a three-group comparison test and report the outcome.

    One-way ANOVA (``stats.f_oneway``) is used only when both assumption
    checks passed.  In every other case the rank-based Kruskal-Wallis test
    is used, so the function always runs a test and always returns a bool.

    Args:
        normality: True when every group passed the normality check.
        leveneResult: True when the groups passed the equal-variance check.
        group1: First sample.
        group2: Second sample.
        group3: Third sample.

    Returns:
        True when the chosen test gives p < 0.05, otherwise False.
    """
    if normality and leveneResult:
        print("Use anova test:")
        statistic, p = stats.f_oneway(group1, group2, group3)
        print("statistic,p: %s %s" % (statistic, p))
        if p < 0.05:
            print("There is significant difference")
            return True
        print("There is no significant difference")
        return False

    # At least one ANOVA assumption failed: fall back to the non-parametric
    # test.  The groups come from the parameters, not from a module-level
    # variable, so the function tests exactly what the caller passed in.
    print("Use kruskawallis test:")
    h, p = kruskalwallis(group1, group2, group3)
    print("H value: %s" % (h,))
    print("p %s" % (p,))
    if p < 0.05:
        print('There is a significant difference between the cities.')
        return True
    print('No significant difference between the cities.')
    return False
# Run the comparison on the sample groups using the assumption-check results.
Choose_mode(normality, leveneResult, *list_groups)
作者toby,qq:231469242

variance_check.py
# -*- coding: utf-8 -*-
'''
用于方差齐性检验
正态性检验
配对相等检验
'''
import scipy,math
from scipy.stats import f
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# additional packages
from statsmodels.stats.diagnostic import lillifors
#多重比较
from statsmodels.sandbox.stats.multicomp import multipletests
#用于排列组合
import itertools
'''
#测试数据
group1=[2,3,7,2,6]
group2=[10,8,7,5,10]
group3=[10,13,14,13,15]
list_groups=[group1,group2,group3]
list_total=group1+group2+group3
'''
# Significance level.
# NOTE(review): the checks visible in this file compare p-values against a
# literal 0.05 rather than reading this name - confirm whether it is used.
a = 0.05
#正态分布测试
def _normality_verdict(method_banner, p_value):
    """Print the test used, its p-value and the verdict; return the verdict."""
    print(method_banner)
    print("p of normal: %s" % (p_value,))
    if p_value < 0.05:
        print("data are not normal distributed")
        return False
    print("data are normal distributed")
    return True


def check_normality(testData):
    """Test one sample for normality, choosing the test by sample size.

    * n <= 20        -> Shapiro-Wilk (``stats.shapiro``)
    * 20 < n < 50    -> D'Agostino-Pearson (``stats.normaltest``)
    * 50 <= n <= 300 -> Lilliefors (``lillifors``, imported at file level)
    * n > 300        -> Kolmogorov-Smirnov (``stats.kstest``)

    Args:
        testData: Sequence of numeric observations.

    Returns:
        False when the selected test gives p < 0.05, otherwise True.
    """
    sample_size = len(testData)

    if sample_size <= 20:
        return _normality_verdict("use shapiro:", stats.shapiro(testData)[1])

    if sample_size < 50:
        return _normality_verdict("use normaltest",
                                  stats.normaltest(testData)[1])

    if sample_size <= 300:
        return _normality_verdict("use lillifors:", lillifors(testData)[1])

    # kstest(..., 'norm') on its own compares against the *standard* normal
    # N(0, 1), which rejects any data whose mean/scale differ from 0/1.
    # Compare against a normal with the sample's own mean and standard
    # deviation instead.  Estimating these from the data makes the KS
    # p-value conservative (too large), which is acceptable for a screen.
    loc = np.mean(testData)
    scale = np.std(testData, ddof=1)
    p_value = stats.kstest(testData, 'norm', args=(loc, scale))[1]
    return _normality_verdict("use kstest:", p_value)
#对所有样本组进行正态性检验
def NormalTest(list_groups):
    """Report whether every group passes ``check_normality``.

    Groups are checked in order and checking stops at the first group that
    fails, so later groups are not tested in that case.

    Args:
        list_groups: Iterable of samples, one per group.

    Returns:
        True when no group fails the normality check, otherwise False.
    """
    return all(check_normality(samples) for samples in list_groups)
#排列组合函数
def Combination(list_groups):
    """Return every unordered pair of groups, in input order.

    The pairs are requested directly from ``itertools.combinations`` rather
    than building the combinations of every size and slicing the size-2
    entry out, which failed with IndexError for fewer than three groups.

    Args:
        list_groups: Sequence of groups.

    Returns:
        A list of 2-tuples ``(group_i, group_j)`` with i < j; empty when
        fewer than two groups are given.
    """
    return list(itertools.combinations(list_groups, 2))
'''
Out[57]:
[[([2, 3, 7, 2, 6], [10, 8, 7, 5, 10]),
([2, 3, 7, 2, 6], [10, 13, 14, 13, 15]),
([10, 8, 7, 5, 10], [10, 13, 14, 13, 15])]]
'''
#方差齐性检测
def Levene_test(group1,group2,group3,*more_groups):
    """Check homogeneity of variance across groups with Levene's test.

    Args:
        group1: First sample.
        group2: Second sample.
        group3: Third sample.
        *more_groups: Optional further samples to include in the same test.

    Returns:
        False when the test gives p < 0.05 (variances differ), otherwise
        True.
    """
    leveneResult = scipy.stats.levene(group1, group2, group3, *more_groups)
    p = leveneResult[1]
    print("levene test:")
    if p < 0.05:
        print("variances of groups are not equal")
        return False
    print("variances of groups are equal")
    return True
'''
H0成立,三组数据方差无显著差异
Out[9]: LeveneResult(statistic=0.24561403508771934, pvalue=0.7860617221429711)
'''
#比较组内的样本是否相等,如果不相等,不适合于tukey等方法
#此函数有问题,无法解决nan排除
def Equal_lenth(list_groups):
    """Report whether all groups hold the same number of usable values.

    A value is skipped when its ``str()`` form is ``'nan'`` or ``'-inf'``.
    Every group in ``list_groups`` is examined, not just the first three.

    NOTE(review): ``'inf'`` (positive infinity) is still counted as a usable
    value, matching the original filter - confirm whether that is intended.

    Args:
        list_groups: Sequence of groups (sequences of observations).

    Returns:
        True when every group has the same count of usable values (also for
        zero or one group), otherwise False.
    """
    skipped = ('nan', '-inf')
    usable_counts = set()
    for group in list_groups:
        usable = [x for x in group if str(x) not in skipped]
        usable_counts.add(len(usable))
    return len(usable_counts) <= 1
'''
#返回True or false
normality=NormalTest(list_groups)
leveneResult=Levene_test(list_groups[0],list_groups[1],list_groups[2])
'''
作者toby,qq:231469242
练习例题
8.2 Multiple Groups
The following example is taken from the really good, but somewhat advanced book
by A.J. Dobson: “An Introduction to Generalized Linear Models”:
• Get the data
The file Data/data_others/Table 6.6 Plant experiment.xls, which can also
be found on https://github.com/thomas-haslwanter/statsintro/tree/master/Data/data_others,
contains data from an experiment with plants in three different
growing conditions. Read the data into Python. Hint: use the module xlrd.
• Perform an ANOVA
Are the three groups different? (Correct answer: yes, they are.)
• Multiple Comparisons
Using the Tukey test, which of the pairs are different? (Correct answer: only
TreatmentA and TreatmentB differ.)
• Kruskal–Wallis
Would a nonparametric comparison lead to a different result? (Correct answer:
no.)

# -*- coding: utf-8 -*-
# Import standard packages
import variance_check
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
from scipy.stats.mstats import kruskalwallis
# Plant-weight data from an experiment with three different growing
# conditions.  Each condition has its own plants, so the three groups are
# independent samples, not paired measurements.
list_Control=[4.17,5.58,5.18,6.11,4.5,4.61,5.17,4.53,5.33,5.14]
list_treatmentA=[4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69]
list_treatmentB=[6.31,5.12,5.54,5.5,5.37,5.29,4.92,6.15,5.8,5.26]
list_groups=[list_Control,list_treatmentA,list_treatmentB]

# Assumption checks: normality of each group and homogeneity of variance.
normality=variance_check.NormalTest(list_groups)
leveneResult=variance_check.Levene_test(list_groups[0],list_groups[1],list_groups[2])

# Omnibus test: do the three group means differ at all?
print(stats.f_oneway(list_Control,list_treatmentA,list_treatmentB))

# Post-hoc pairwise comparisons.  The groups are independent, so the
# independent-samples t-test is used (a paired test would pair plants by
# their arbitrary position in the lists).  Three comparisons are made, so
# each p-value is Bonferroni-adjusted (multiplied by 3, capped at 1).
pairwise_comparisons=[
    ("Control vs treatmentA",list_Control,list_treatmentA),
    ("Control vs treatmentB",list_Control,list_treatmentB),
    ("treatmentA vs treatmentB",list_treatmentA,list_treatmentB),
]
for pair_label,first_sample,second_sample in pairwise_comparisons:
    t_stat,p_raw=stats.ttest_ind(first_sample,second_sample)
    p_adjusted=min(1.0,p_raw*len(pairwise_comparisons))
    print("%s: t=%.4f, p=%.4f, Bonferroni-adjusted p=%.4f"
          % (pair_label,t_stat,p_raw,p_adjusted))
'''
Interpretation: all three groups look normally distributed and the variances
are homogeneous; after the Bonferroni adjustment only treatmentA and
treatmentB differ significantly.

Recorded output of the assumption checks and the ANOVA:
use shapiro:
p of normal: 0.747474491596
data are normal distributed
use shapiro:
p of normal: 0.451944738626
data are normal distributed
use shapiro:
p of normal: 0.564250946045
data are normal distributed
levene test:
variances of groups are equal
F_onewayResult(statistic=4.846087862380136, pvalue=0.015909958325622899)

Pairwise results (approximate, re-run the script for exact digits):
Control vs treatmentA:    t approx  1.19, unadjusted p approx 0.25
Control vs treatmentB:    t approx -2.13, unadjusted p approx 0.047
treatmentA vs treatmentB: t approx -3.01, unadjusted p approx 0.008
'''
# Non-parametric cross-check; printed so the result is also visible when the
# file is run as a script rather than in an interactive session.
print(kruskalwallis(*list_groups))
'''
Consistent with the ANOVA result:
KruskalResult(statistic=7.9882287494437154, pvalue=0.018423755731471966)
'''
作者toby,qq:231469242
SPSS 检验与 Python 统计结果一致:group 与 weight 有显著关系;多重检验显示 treatmentA 和 treatmentB 之间有显著差异,
此样本 Tukey、LSD、Bonferroni 的结果一致



https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149( 欢迎关注博主主页,学习python视频资源,还有大量免费python经典文章)

