python机器学习-乳腺癌细胞挖掘(博主亲自录制视频)
单因素方差+kruskalwallis
onewayTest.py
# -*- coding: utf-8 -*- import numpy as np,variance_check # additional packages from scipy.stats.mstats import kruskalwallis from scipy import stats group1=[27,2,4,18,7,9] group2=[20,8,14,36,21,22] group3=[34,31,3,23,30,6] list_groups=[group1,group2,group3] #前期检验 normality=variance_check.NormalTest(list_groups) leveneResult=variance_check.Levene_test(list_groups[0],list_groups[1],list_groups[2]) equal_lenth=variance_check.Equal_lenth(list_groups) def Choose_mode(normality,leveneResult,group1,group2,group3): if normality==True and leveneResult==True: print"Use anova test:" statistic,p=stats.f_oneway(group1,group2,group3) print"statistic,p:",statistic,p if p<0.05: print "There is significant difference" return True else: print "There is no significant difference" return False if normality==False: print"Use kruskawallis test:" h, p = kruskalwallis(list_groups) print"H value:",h print"p",p # Print the results if p<0.05: print('There is a significant difference between the cities.') return True else: print('No significant difference between the cities.') return False Choose_mode(normality,leveneResult,group1,group2,group3)
作者toby,qq:231469242
variance_check.py
# -*- coding: utf-8 -*- ''' 用于方差齐性检验 正太性检验 配对相等检验 ''' import scipy,math from scipy.stats import f import numpy as np import matplotlib.pyplot as plt import scipy.stats as stats # additional packages from statsmodels.stats.diagnostic import lillifors #多重比较 from statsmodels.sandbox.stats.multicomp import multipletests #用于排列组合 import itertools ''' #测试数据 group1=[2,3,7,2,6] group2=[10,8,7,5,10] group3=[10,13,14,13,15] list_groups=[group1,group2,group3] list_total=group1+group2+group3 ''' a=0.05 #正态分布测试 def check_normality(testData): #20<样本数<50用normal test算法检验正态分布性 if 20<len(testData) <50: p_value= stats.normaltest(testData)[1] if p_value<0.05: print"use normaltest" print"p of normal:",p_value print "data are not normal distributed" return False else: print"use normaltest" print"p of normal:",p_value print "data are normal distributed" return True #样本数小于50用Shapiro-Wilk算法检验正态分布性 if len(testData) <50: p_value= stats.shapiro(testData)[1] if p_value<0.05: print "use shapiro:" print"p of normal:",p_value print "data are not normal distributed" return False else: print "use shapiro:" print"p of normal:",p_value print "data are normal distributed" return True if 300>=len(testData) >=50: p_value= lillifors(testData)[1] if p_value<0.05: print "use lillifors:" print"p of normal:",p_value print "data are not normal distributed" return False else: print "use lillifors:" print"p of normal:",p_value print "data are normal distributed" return True if len(testData) >300: p_value= stats.kstest(testData,'norm')[1] if p_value<0.05: print "use kstest:" print"p of normal:",p_value print "data are not normal distributed" return False else: print "use kstest:" print"p of normal:",p_value print "data are normal distributed" return True #对所有样本组进行正态性检验 def NormalTest(list_groups): for group in list_groups: #正态性检验 status=check_normality(group) if status==False : return False return True #排列组合函数 def Combination(list_groups): combination= [] for i in range(1,len(list_groups)+1): iter = 
itertools.combinations(list_groups,i) combination.append(list(iter)) #需要排除第一个和最后一个 return combination[1:-1][0] ''' Out[57]: [[([2, 3, 7, 2, 6], [10, 8, 7, 5, 10]), ([2, 3, 7, 2, 6], [10, 13, 14, 13, 15]), ([10, 8, 7, 5, 10], [10, 13, 14, 13, 15])]] ''' #方差齐性检测 def Levene_test(group1,group2,group3): leveneResult=scipy.stats.levene(group1,group2,group3) p=leveneResult[1] print"levene test:" if p<0.05: print"variances of groups are not equal" return False else: print"variances of groups are equal" return True ''' H0成立,三组数据方差无显著差异 Out[9]: LeveneResult(statistic=0.24561403508771934, pvalue=0.7860617221429711) ''' #比较组内的样本是否相等,如果不相等,不适合于tukey等方法 #此函数有问题,无法解决nan排除 def Equal_lenth(list_groups): list1=list_groups[0] list2=list_groups[1] list3=list_groups[2] list1_removeNan=[x for x in list1 if str(x) != 'nan' and str(x)!= '-inf'] list2_removeNan=[x for x in list2 if str(x) != 'nan' and str(x)!= '-inf'] list3_removeNan=[x for x in list3 if str(x) != 'nan' and str(x)!= '-inf'] len1=len(list1_removeNan) len2=len(list2_removeNan) len3=len(list3_removeNan) if len1==len2==len3: return True else: return False ''' #返回True or false normality=NormalTest(list_groups) leveneResult=Levene_test(list_groups[0],list_groups[1],list_groups[2]) '''
作者toby,qq:231469242
练习例题
8.2 Multiple Groups
The following example is taken from the really good, but somewhat advanced book
by A.J. Dobson: “An Introduction to Generalized Linear Models”:
• Get the data
The file Data/data_others/Table 6.6 Plant experiment.xls, which can also
be found on https://github.com/thomas-haslwanter/statsintro/tree/master/Data/
data_others, contains data from an experiment with plants in three different
growing conditions. Read the data into Python. Hint: use the module xlrd.
• Perform an ANOVA
Are the three groups different? (Correct answer: yes, they are.)
• Multiple Comparisons
Using the Tukey test, which of the pairs are different? (Correct answer: only
TreatmentA and TreatmentB differ.)
• Kruskal–Wallis
Would a nonparametric comparison lead to a different result? (Correct answer:
no.)
# -*- coding: utf-8 -*- # Import standard packages import variance_check import numpy as np import matplotlib.pyplot as plt from scipy import stats import pandas as pd from scipy.stats.mstats import kruskalwallis #数据data from an experiment with plants in three different growing conditions #所以用配对T试验进行事后多重检测 list_Control=[4.17,5.58,5.18,6.11,4.5,4.61,5.17,4.53,5.33,5.14] list_treatmentA=[4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69] list_treatmentB=[6.31,5.12,5.54,5.5,5.37,5.29,4.92,6.15,5.8,5.26] list_groups=[list_Control,list_treatmentA,list_treatmentB] normality=variance_check.NormalTest(list_groups) leveneResult=variance_check.Levene_test(list_groups[0],list_groups[1],list_groups[2]) print(stats.f_oneway(list_Control,list_treatmentA,list_treatmentB)) print(stats.ttest_rel(list_Control,list_treatmentA)) print(stats.ttest_rel(list_Control,list_treatmentB)) print(stats.ttest_rel(list_treatmentA,list_treatmentB)) ''' #解读:三组数据正态分布,方差剂型符合,只有treatmentA和treatmentB有显著区别 use shapiro: p of normal: 0.747474491596 data are normal distributed use shapiro: p of normal: 0.451944738626 data are normal distributed use shapiro: p of normal: 0.564250946045 data are normal distributed levene test: variances of groups are equal F_onewayResult(statistic=4.846087862380136, pvalue=0.015909958325622899) Ttest_relResult(statistic=0.99384151305794055, pvalue=0.3462672871440382) Ttest_relResult(statistic=-1.772083360883858, pvalue=0.11014394200586315) Ttest_relResult(statistic=-2.8463513880802855, pvalue=0.0192031388472628) ''' kruskalwallis(list_groups) ''' #与方差检验结果一致 KruskalResult(statistic=7.9882287494437154, pvalue=0.018423755731471966) '''
作者toby,qq:231469242
SPSS检验与Python统计结果一致:group与weight有显著关系;多重比较显示只有treatmentA和treatmentB之间存在显著差异。
此样本的Tukey、LSD、Bonferroni结果一致。
https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149( 欢迎关注博主主页,学习python视频资源,还有大量免费python经典文章)