信息熵:变量的不确定性越大,熵越大。熵可用下面的公式描述:
-(p1*logp1+p2*logp2+...+pn*logpn)
pi表示事件i发生的概率
ID3:
GAIN(A)=INFO(D)-INFO_A(D)
节点A的信息增益为不加节点A时的信息量INFO(D)-加上A后的信息量INFO_A(D)
算法步骤:
1、树以代表训练样本的某个结点开始
2、如果样本都在同一类,则将该节点设置为叶子,并使用该类标号
3、否则,算法使用熵度量每个样本的分类结点,选择可以获得最大信息的节点
4、所有的属性都是分类的,连续值必须离散化
停止条件:该节点上所有的样本都属于一个类
没有剩余的属性
没有属性时,比如已经分到第三个属性,但是没有第四个属性,这时将样本分到最多的那类
C4.5与ID3区别在于属性度量方式的不同
优点:直观、便于理解、小规模数据有效
缺点:处理连续变量不好
类别较多时,错误增加比较快
可规模性一般
package dTree;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class dataClass {
public static void main(String[] args) {
double [][]exerciseData = {{1,1,0,0},{1,3,1,1},{3,2,1,1},{2,2,1,1},{3,2,1,1},{2,3,0,1},{2,1,0,0},{3,2,0,1},{2,1,0,1},{1,1,1,0}};//每一列表示一个属性值,最后一列表示决策层
int[] index = gainResult(exerciseData);//输出的结果表示按照决策树规则所对应的属性参考顺序
for(int i = 0;i<index.length;i++){
System.out.print(" "+(index[i]+1));
}
}
private static int[] gainResult(double[][] exerciseData) {
int dataQuantity = exerciseData.length;
int attributeQuantity = exerciseData[0].length-1;
int []attribute = new int[attributeQuantity];
int []newAttribute = new int [attributeQuantity];
double [][]newExerciseData = exerciseData ;
double [][]maxgainIndexData = new double[dataQuantity][attributeQuantity];
for(int i = 0;i<attributeQuantity;i++){
attribute[i] = MaxgainIndex(newExerciseData);
for(int j = 0;j<maxgainIndexData.length;j++){
maxgainIndexData[j][i] = newExerciseData[j][attribute[i]];
}
newExerciseData = NewData(newExerciseData,attribute[i]);
}
boolean flag =true;
for(int i = 0;i<maxgainIndexData[0].length;i++){//寻找第i列所对应的exerciseData
for(int k = 0;k<exerciseData[0].length-1;k++){
flag = true;
for(int j = 0;j<exerciseData.length;j++){
if(maxgainIndexData[j][i]!=exerciseData[j][k]){
flag = false;
break;
}
}
if(flag==true){
newAttribute[i] = k;
}
}
}
return newAttribute;
}
//矩阵转置
private static double[][] Transpose(double[][] exerciseData){
int rows = exerciseData.length;
int columns = exerciseData[0].length;
double [][]newData = new double [columns][rows];
for(int i = 0;i<columns;i++){
for(int j= 0;j<rows;j++){
newData[i][j] = exerciseData[j][i];
}
}
return newData;
}
private static double[][] NewData(double[][] exerciseData,int maxIndex) {//删除exerciseData中maxindex列的数据,产生新数据
double [][]newExerciseData = new double[exerciseData.length][];
for(int i = 0;i<exerciseData.length;i++){
newExerciseData[i] = new double[exerciseData[i].length-1];
for(int j = 0;j<newExerciseData[i].length;j++){
if(j>=maxIndex){
newExerciseData[i][j] = exerciseData[i][j+1];
}else{
newExerciseData[i][j] = exerciseData[i][j];
}
}
}
return newExerciseData;
}
private static int MaxgainIndex(double[][] exerciseData) {//获取exerciseData最大增益率所对应的一列
double []gainRatio = gainAll(exerciseData);
double maxGain = gainRatio[0];//最大增益率
int maxIndex = 0;//最大增益率所对应的索引值
for(int i=1;i<gainRatio.length-1;i++){
if(maxGain<gainRatio[i]){
maxGain = gainRatio[i];
maxIndex = i;
}
}
return maxIndex;
}
public static double[] gainAll(double [][]Data){//得到Data中每一列的增益值
int col = Data.length;//数据个数
int vol = Data[0].length;//属性个数
double [][]count = new double[vol][];
double []info = new double[vol];
double Lcount[][] = new double[vol][];//第i个属性的第j个分类的比率
double Mcount[][] = new double[vol][];
List <List<Map1>>listM = new ArrayList<List<Map1>>();
List <List<Map1>>listM2 = new ArrayList<List<Map1>>();
double []gain;
//矩阵的属性统计
for (int i = 0;i<vol;i++){
//属性i的不重复的分类集(mapList加入了属性i以及对应的决策层的值)
List<Map> mapList = new ArrayList<Map>();
for(int j = 0;j<col;j++){
Map y = new HashMap();
y.put(Data[j][i],Data[j][vol-1]);
if(!mapList.contains(y)){
mapList.add(y);
}
}
//属性i全部分类集(重复,listM2加入了i值以及决策层的值)
List<Map> AllmapList = new ArrayList<Map>();
for(int j = 0;j<col;j++){
Map y = new HashMap();
y.put(Data[j][i],Data[j][vol-1]);
AllmapList.add(y);
}
count[i] = new double[mapList.size()];
double sum = 0;
double num = 0;
List<Map1>LM = new ArrayList<Map1>();
for(int j=0;j<mapList.size();j++){
Iterator it =((Map)(mapList.get(j))).keySet().iterator();
num = (Double) it.next();
for(int k = 0;k<AllmapList.size();k++){
if(mapList.get(j).equals(AllmapList.get(k))){
count[i][j] = count[i][j]+1;
}
}
Map1 p = new Map1();
p.setKey(count[i][j]);
p.setValue(num);
LM.add(p);
}
listM2.add(LM);
}
for( int k = 0;k<vol;k++){
List <Double>list = new ArrayList<Double>();
for(int i = 0;i<col;i++){
if(!list.contains(Data[i][k])){
list.add(Data[i][k]);
}
}
Lcount[k] = new double[list.size()];
Mcount[k] = new double[list.size()];
for(int j = 0;j<col;j++){
int index = list.indexOf(Data[j][k]);
Lcount[k][index] = Lcount[k][index]+1;
Mcount[k][index] = Mcount[k][index]+1;
}
double LastSum = 0;
for(int i = 0;i<Lcount[k].length;i++){
LastSum = LastSum+Lcount[k][i];
}
for(int j = 0;j<Lcount[k].length;j++){
Lcount[k][j] = Lcount[k][j]/LastSum;
}
List<Map1> LM = new ArrayList<Map1>();
for(int i = 0;i<Lcount[k].length;i++){
Map1 p = new Map1();
p.setKey(Mcount[k][i]);
p.setValue(list.get(i));
LM.add(p);
}
listM.add(LM);
}
gain = new double[listM2.size()];
for(int i = 0; i<listM2.size()-1;i++){
List listi = new ArrayList();
listi = listM.get(i);
double sum = 0;
for(int j=0;j<listi.size();j++){
Map1 p = (Map1) listi.get(j);
double key = p.getKey();
double value = p.getValue();
for(int k = 0;k<listM2.get(i).size();k++){
Map1 p1 = (Map1) listM2.get(i).get(k);
if(p1.value==value){
sum = sum+xlog2(p1.key/p.key);
}
//System.out.println(sum);
}
gain[i]+=sum*Lcount[i][j];
sum = 0;
}
}
for(int i = 0;i<Lcount[Lcount.length-1].length;i++){
gain[listM2.size()-1] += -xlog2(Lcount[Lcount.length-1][i]);
}
for(int j = 0;j<gain.length-1;j++){
gain[j] = gain[gain.length-1]+gain[j];
}
double[]Scount = new double [Lcount.length-1];
for(int j= 0;j<Lcount.length-1;j++){
double sum = 0;
for(int k = 0;k<Lcount[j].length;k++){
sum += xlog2(Lcount[j][k]);
}
Scount[j] = -sum;
}
for(int j= 0;j<Scount.length;j++){
gain[j] = gain[j]/Scount[j];
}
return gain;
}
public static boolean contain(Map mapList,double key,double value){
if(value==Double.parseDouble(mapList.get(key).toString())){
return true;
}else{
return false;
}
}
public static double xlog2(double x){
return x*(Math.log(x)/Math.log((double)2));
}
}