zoukankan html css js c++ java

文本聚类——Kmeans

上两篇文章分别用朴素贝叶斯算法和KNN算法对newgroup文本进行了分类測试。本文使用Kmeans算法对文本进行聚类。

1、文本预处理

文本预处理在前面两本文章中已经介绍，此处（略）。

2、文本向量化

package com.datamine.kmeans;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

/**
 * 计算文档的属性向量，将全部文档向量化
 * @author Administrator
 */
public class ComputeWordsVector {

	/**
	 * 计算文档的TF-IDF属性向量。返回Map<文件名称，<特征词，TF-IDF值>>
	 * @param testSampleDir 处理好的聚类样本測试例子集
	 * @return 全部測试例子的属性向量构成的map
	 * @throws IOException
	 */
	public Map<String,Map<String,Double>> computeTFMultiIDF(String testSampleDir) throws IOException{
		
		String word;
		Map<String,Map<String,Double>> allTestSampleMap = new TreeMap<String, Map<String,Double>>();
		Map<String,Double> idfPerWordMap = computeIDF(testSampleDir);
		Map<String,Double> tfPerDocMap = new TreeMap<String, Double>();
		
		File[] samples = new File(testSampleDir).listFiles();
		System.out.println("the total number of test files is " + samples.length);
		for(int i = 0;i<samples.length;i++){
			
			tfPerDocMap.clear();
			FileReader samReader = new FileReader(samples[i]);
			BufferedReader samBR = new BufferedReader(samReader);
			Double wordSumPerDoc = 0.0; //计算每篇文档的总词数
			while((word = samBR.readLine()) != null){
				if(!word.isEmpty()){
					wordSumPerDoc++;
					if(tfPerDocMap.containsKey(word))
						tfPerDocMap.put(word, tfPerDocMap.get(word)+1.0);
					else
						tfPerDocMap.put(word, 1.0);
				}
			}
			
			Double maxCount = 0.0,wordWeight; //记录出现次数最多的词的次数，用作归一化  ？？？
			Set<Map.Entry<String, Double>> tempTF = tfPerDocMap.entrySet();
			for(Iterator<Map.Entry<String, Double>> mt = tempTF.iterator();mt.hasNext();){
				Map.Entry<String, Double> me = mt.next();
				if(me.getValue() > maxCount)
					maxCount = me.getValue();
			}
			
			for(Iterator<Map.Entry<String, Double>> mt = tempTF.iterator();mt.hasNext();){
				Map.Entry<String, Double> me = mt.next();
				Double IDF = Math.log(samples.length / idfPerWordMap.get(me.getKey()));
				wordWeight = (me.getValue() / wordSumPerDoc) * IDF;
				tfPerDocMap.put(me.getKey(), wordWeight);
			}
			TreeMap<String,Double> tempMap = new TreeMap<String, Double>();
			tempMap.putAll(tfPerDocMap);
			allTestSampleMap.put(samples[i].getName(), tempMap);
		}
		printTestSampleMap(allTestSampleMap);
		return allTestSampleMap;
	}
	
	/**
	 * 输出測试例子map内容，用于測试
	 * @param allTestSampleMap
	 * @throws IOException 
	 */
	private void printTestSampleMap(
			Map<String, Map<String, Double>> allTestSampleMap) throws IOException {
		// TODO Auto-generated method stub
		File outPutFile = new File("E:/DataMiningSample/KmeansClusterResult/allTestSampleMap.txt");
		FileWriter outPutFileWriter = new FileWriter(outPutFile);
		Set<Map.Entry<String, Map<String,Double>>> allWords = allTestSampleMap.entrySet();
		
		for(Iterator<Entry<String, Map<String, Double>>> it = allWords.iterator();it.hasNext();){
			
			Map.Entry<String, Map<String,Double>> me = it.next();
			outPutFileWriter.append(me.getKey()+" ");
			
			Set<Map.Entry<String, Double>> vectorSet = me.getValue().entrySet();
			for(Iterator<Map.Entry<String, Double>> vt = vectorSet.iterator();vt.hasNext();){
				Map.Entry<String, Double> vme = vt.next();
				outPutFileWriter.append(vme.getKey()+" "+vme.getValue()+" ");
			}
			outPutFileWriter.append("
");
			outPutFileWriter.flush();
		}
		outPutFileWriter.close();
		
	}

	/**
	 * 统计每一个词的总出现次数，返回出现次数大于n次的词汇构成终于的属性词典
	 * @param strDir 处理好的newsgroup文件文件夹的绝对路径
	 * @param wordMap 记录出现的每一个词构成的属性词典
	 * @return newWordMap 返回出现次数大于n次的词汇构成终于的属性词典
	 * @throws IOException
	 */
	public SortedMap<String, Double> countWords(String strDir,
			Map<String, Double> wordMap) throws IOException {
		
		File sampleFile = new File(strDir);
		File[] sample = sampleFile.listFiles();
		String word;
		
		for(int i =0 ;i < sample.length;i++){
			
			if(!sample[i].isDirectory()){
				FileReader samReader = new FileReader(sample[i]);
				BufferedReader samBR = new BufferedReader(samReader);
				while((word = samBR.readLine()) != null){
					if(!word.isEmpty() && wordMap.containsKey(word))
						wordMap.put(word, wordMap.get(word)+1);
					else
						wordMap.put(word, 1.0);
				}
				samBR.close();
			}else{
				countWords(sample[i].getCanonicalPath(),wordMap);
			}
		}
		
		/*
		 * 去除停顿词后。先用DF算法选取特征词，后面再增加特征词的选取算法
		 */
		SortedMap<String,Double> newWordMap = new TreeMap<String, Double>();
		Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = allWords.iterator();it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			if(me.getValue() > 100) //DF算法降维
				newWordMap.put(me.getKey(), me.getValue());
		}
		
		return newWordMap;
	}
	
	/**
	 * 计算IDF，即属性词典中每一个词在多少个文档中出现过
	 * @param testSampleDir 聚类算法測试样本所在的文件夹
	 * @return 单词IDFmap <单词，包括该单词的文档数>
	 * @throws IOException
	 */
	public Map<String,Double> computeIDF(String testSampleDir) throws IOException{
		
		Map<String,Double> IDFPerWordMap = new TreeMap<String, Double>();
		//记下当前已经遇到过的该文档中的词
		Set<String> alreadyCountWord = new HashSet<String>();
		String word;
		File[] samples = new File(testSampleDir).listFiles();
		for(int i = 0;i<samples.length;i++){
			
			alreadyCountWord.clear();
			FileReader tsReader = new FileReader(samples[i]);
			BufferedReader tsBR = new BufferedReader(tsReader);
			while((word = tsBR.readLine()) != null){
				
				if(!alreadyCountWord.contains(word)){
					if(IDFPerWordMap.containsKey(word))
						IDFPerWordMap.put(word, IDFPerWordMap.get(word)+1.0);
					else
						IDFPerWordMap.put(word, 1.0);
					alreadyCountWord.add(word);
				}
			}
		}
		return IDFPerWordMap;
	}

	/**
	 * 创建聚类算法的測试例子集。主要是过滤出仅仅含有特征词的文档写到一个文件夹下
	 * @param srcDir 源文件夹，已经预处理可是还没有过滤非特征词的文档文件夹
	 * @param desDir 目的文件夹，聚类算法的測试例子文件夹
	 * @return 创建測试例子集中特征词数组
	 * @throws IOException 
	 */
	public String[] createTestSamples(String srcDir, String desDir) throws IOException {
		
		SortedMap<String,Double> wordMap = new TreeMap<String, Double>();
		wordMap = countWords(srcDir,wordMap);
		System.out.println("special words map sizes:" + wordMap.size());
		String word,testSampleFile;
		
		File[] sampleDir = new File(srcDir).listFiles();
		for(int i =0;i<sampleDir.length;i++){
			
			File[] sample = sampleDir[i].listFiles();
			for(int j =0;j<sample.length;j++){
				
				testSampleFile = desDir + sampleDir[i].getName()+"_"+sample[j].getName();
				FileReader samReader = new FileReader(sample[j]);
				BufferedReader samBR = new BufferedReader(samReader);
				FileWriter tsWriter = new FileWriter(new File(testSampleFile));
				while((word = samBR.readLine()) != null){
					if(wordMap.containsKey(word))
						tsWriter.append(word + "
");
				}
				tsWriter.flush();
				tsWriter.close();
			}
		}
	
		//返回属性词典
		String[] terms = new String[wordMap.size()];
		int i = 0;
		Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = allWords.iterator();it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			terms[i] = me.getKey();
			i++;
		}
		
		return terms;
		
	}
	
	
	

	
	
}

3、Kmeans算法

Kmeans算法是很经典的聚类算法，算法主要过程例如以下：先选K个（或者随机选择）初始聚类点作为初始中心点，然后就算其它全部点到K个聚类中心点的距离，将点分到近期的聚类中。聚类完后。再次计算各个类中的中心点，中心点发生变化，于是更新中心点，然后再计算其它点到中心点的距离又一次聚类。中心点又发生变化，如此迭代下去。

初始点选取策略：随机选。均匀抽样，最大最小法等....

距离的度量方法：1-余弦相似度，2-向量内积

算法停止条件：计算准则函数及设置最大迭代次数

空聚类的处理：注意空聚类导致的程序bug

package com.datamine.kmeans;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

/**
 * kmeans聚类算法的实现类，将newsgroup文档集聚成10类、20类、30类
 * 算法结束条件：当每一个点近期的聚类中心点就是它所属的聚类中心点时。算法结束
 * @author Administrator
 *
 */
public class KmeansCluster {

	/**
	 * kmeans算法主过程
	 * @param allTestSampleMap 聚类算法測试样本map(已经向量化) <文件名称，<特征词，TF-IDF值>>
	 * @param k 聚类的数量
	 * @return 聚类结果 <文件名称，聚类完毕后所属的类别号>
	 */
	private Map<String, Integer> doProcess(
			Map<String, Map<String, Double>> allTestSampleMap, int k) {
		
		//0、首先获取allTestSampleMap全部文件名称顺序组成的数组
		String[] testSampleNames = new String[allTestSampleMap.size()];
		int count =0,tsLength = allTestSampleMap.size();
		Set<Map.Entry<String, Map<String,Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
		for(Iterator<Map.Entry<String, Map<String,Double>>> it = allTestSampleMapSet.iterator();it.hasNext();){
			Map.Entry<String, Map<String,Double>> me = it.next();
			testSampleNames[count++] = me.getKey();
		}
		
		//1、初始点的选择算法是随机选择或者是均匀分开选择。这里採用后者
		Map<Integer,Map<String,Double>> meansMap = getInitPoint(allTestSampleMap,k);
		double [][] distance = new double[tsLength][k]; //distance[i][k]记录点i到聚类中心k的距离
		
		//2、初始化k个聚类
		int[] assignMeans = new int[tsLength]; //记录全部点属于的聚类序号，初始化全部为0
		Map<Integer,Vector<Integer>> clusterMember = new TreeMap<Integer, Vector<Integer>>();//记录每一个聚类的成员点序号
		Vector<Integer> mem = new Vector<Integer>();
		int iterNum = 0; //迭代次数
		
		while(true){
			System.out.println("Iteration No." + (iterNum++) + "-------------------------");
			//3、计算每一个点和每一个聚类中心的距离
			for(int i = 0;i < tsLength;i++){
				for(int j = 0;j<k;j++)
					distance[i][j] = getDistance(allTestSampleMap.get(testSampleNames[i]),meansMap.get(j));
			}
			
			//4、找出每一个点近期的聚类中心
			int [] nearestMeans = new int[tsLength];
			for(int i = 0;i < tsLength;i++){
				nearestMeans[i] = findNearestMeans(distance,i);
			}
			
			//5、推断当前全部点属于的聚类序号是否已经全部是其离的近期的聚类，假设是或者达到最大的迭代次数。那么结束算法
			int okCount = 0;
			for(int i= 0;i<tsLength;i++){
				if(nearestMeans[i] == assignMeans[i])
					okCount ++;
			}
			System.out.println("okCount = " + okCount);
			if(okCount == tsLength || iterNum >= 10)
				break;
			
			//6、假设前面条件不满足，那么须要又一次聚类再次进行一次迭代，须要改动每一个聚类的成员和每一个点属于的聚类信息
			clusterMember.clear();
			for(int i = 0;i < tsLength;i++){
				assignMeans[i] = nearestMeans[i];
				if(clusterMember.containsKey(nearestMeans[i])){
					clusterMember.get(nearestMeans[i]).add(i);
				}
				else{
					mem.clear();
					mem.add(i);
					Vector<Integer> tempMem = new Vector<Integer>();
					tempMem.addAll(mem);
					clusterMember.put(nearestMeans[i], tempMem);
				}
			}
			
			//7、又一次计算每一个聚类的中心点
			for(int i = 0;i<k;i++){
				
				if(!clusterMember.containsKey(i)) //注意kmeans可能产生空聚类
					continue;
				
				Map<String,Double> newMean = computeNewMean(clusterMember.get(i),allTestSampleMap,testSampleNames);
				Map<String,Double> tempMean = new TreeMap<String,Double>();
				tempMean.putAll(newMean);
				meansMap.put(i, tempMean);
			}
		
		}
		
		//8、形成聚类结果而且返回
 		Map<String,Integer> resMap = new TreeMap<String,Integer>();
		for(int i = 0;i<tsLength;i++){
			resMap.put(testSampleNames[i], assignMeans[i]);
		}
		
		return resMap;
	}
	
	/**
	 * 计算当前聚类的新中心，採用向量平均
	 * @param clusterM 该点到全部聚类中心的距离
	 * @param allTestSampleMap 全部測试例子 <文件名称，向量>
	 * @param testSampleNames 全部測试例子名构成的数组
	 * @return 新的聚类中心向量
	 */
	private Map<String, Double> computeNewMean(Vector<Integer> clusterM,
			Map<String, Map<String, Double>> allTestSampleMap,
			String[] testSampleNames) {
		
		double memberNum = (double)clusterM.size();
		Map<String,Double> newMeanMap = new TreeMap<String,Double>();
		Map<String,Double> currentMemMap = new TreeMap<String, Double>();
		
		for(Iterator<Integer> it = clusterM.iterator();it.hasNext();){
			int me = it.next();
			currentMemMap = allTestSampleMap.get(testSampleNames[me]);
			Set<Map.Entry<String, Double>> currentMemMapSet = currentMemMap.entrySet();
			for(Iterator<Map.Entry<String, Double>> jt = currentMemMapSet.iterator();jt.hasNext();){
				Map.Entry<String, Double> ne = jt.next();
				if(newMeanMap.containsKey(ne.getKey()))
					newMeanMap.put(ne.getKey(), newMeanMap.get(ne.getKey())+ne.getValue());
				else
					newMeanMap.put(ne.getKey(), ne.getValue());
			}
		}
		
		Set<Map.Entry<String, Double>> newMeanMapSet = newMeanMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = newMeanMapSet.iterator();it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			newMeanMap.put(me.getKey(), newMeanMap.get(me.getKey()) / memberNum);
		}
		
		return newMeanMap;
	}

	/**
	 * 找出距离当前点近期的聚类中心
	 * @param distance 点到全部聚类中心的距离
	 * @param m 点（文本号）
	 * @return 近期聚类中心的序号j
	 */
	private int findNearestMeans(double[][] distance, int m) {
		
		double minDist = 10;
		int j = 0;
		for(int i = 0;i<distance[m].length;i++){
			if(distance[m][i] < minDist){
				minDist = distance[m][i];
				j = i;
			}
		}
		return j;
	}

	/**
	 * 计算两个点的距离
	 * @param map1 点1的向量map
	 * @param map2 点2的向量map
	 * @return 两个点的欧式距离
	 */
	private double getDistance(Map<String, Double> map1, Map<String, Double> map2) {

		return 1 - computeSim(map1,map2);
	}

	/**计算两个文本的类似度
	 * @param testWordTFMap 文本1的<单词,词频>向量
	 * @param trainWordTFMap 文本2<单词,词频>向量
	 * @return Double 向量之间的类似度 以向量夹角余弦计算（加上凝视部分代码就可以）或者向量内积计算（不加凝视部分，效果相当而速度更快）
	 * @throws IOException 
	 */
	private double computeSim(Map<String, Double> testWordTFMap,
			Map<String, Double> trainWordTFMap) {
		// TODO Auto-generated method stub
		double mul = 0;//, testAbs = 0, trainAbs = 0;
		Set<Map.Entry<String, Double>> testWordTFMapSet = testWordTFMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = testWordTFMapSet.iterator(); it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			if(trainWordTFMap.containsKey(me.getKey())){
				mul += me.getValue()*trainWordTFMap.get(me.getKey());
			}
			//testAbs += me.getValue() * me.getValue();
		}
		//testAbs = Math.sqrt(testAbs);
		
		/*Set<Map.Entry<String, Double>> trainWordTFMapSet = trainWordTFMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = trainWordTFMapSet.iterator(); it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			trainAbs += me.getValue()*me.getValue();
		}
		trainAbs = Math.sqrt(trainAbs);*/
		return mul ;/// (testAbs * trainAbs);
	}

	/**
	 * 获取kmeans算法迭代的初始点
	 * @param allTestSampleMap <文件名称，<特征词。TF-IDF值>>
	 * @param k 聚类的数量
	 * @return  meansMap k个聚类的中心点向量
	 */
	private Map<Integer, Map<String, Double>> getInitPoint(
			Map<String, Map<String, Double>> allTestSampleMap, int k) {
		
		int count = 0, i = 0;
		//保存k个聚类的中心向量
		Map<Integer,Map<String,Double>> meansMap = new TreeMap<Integer, Map<String,Double>>();
		System.out.println("本次聚类的初始点相应的文件为：");
		Set<Map.Entry<String, Map<String,Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
		for(Iterator<Map.Entry<String, Map<String,Double>>> it = allTestSampleMapSet.iterator();it.hasNext();){
			Map.Entry<String, Map<String,Double>> me = it.next();
			if(count == i*allTestSampleMapSet.size() / k){
				meansMap.put(i, me.getValue());
				System.out.println(me.getKey());
				i++;
			}
			count++ ;
		}
		
		return meansMap;
	}

	/**
	 * 输出聚类结果到文件里
	 * @param kmeansClusterResult 聚类结果
	 * @param kmeansClusterResultFile 输出聚类结果到文件里
	 * @throws IOException 
	 */
	private void printClusterResult(Map<String, Integer> kmeansClusterResult,
			String kmeansClusterResultFile) throws IOException {

		FileWriter resultWriter = new FileWriter(kmeansClusterResultFile);
		Set<Map.Entry<String, Integer>> kmeansClusterResultSet = kmeansClusterResult.entrySet();
		for(Iterator<Map.Entry<String, Integer>> it = kmeansClusterResultSet.iterator();it.hasNext();){
			Map.Entry<String, Integer> me = it.next();
			resultWriter.append(me.getKey()+" "+me.getValue()+"
");
		}
		resultWriter.flush();
		resultWriter.close();
	}
	
	/**
	 * 评估函数依据聚类结果文件统计熵 和 混淆矩阵
	 * @param kmeansClusterResultFile 聚类结果文件
	 * @param k 聚类数目
	 * @return 聚类结果的熵值
	 * @throws IOException 
	 */
	private double evaluateClusterResult(String kmeansClusterResultFile, int k) throws IOException {

		Map<String,String> rightCate = new TreeMap<String, String>();
		Map<String,String> resultCate = new TreeMap<String, String>();
		FileReader crReader = new FileReader(kmeansClusterResultFile);
		BufferedReader crBR  = new BufferedReader(crReader);
		String[] s;
		String line;
		while((line = crBR.readLine()) != null){
			s = line.split(" ");
			resultCate.put(s[0], s[1]);
			rightCate.put(s[0], s[0].split("_")[0]);
		}
		crBR.close();
		return computeEntropyAndConfuMatrix(rightCate,resultCate,k);//返回熵
	}
	
	/**
	 * 计算混淆矩阵并输出，返回熵
	 * @param rightCate 正确的类目相应map
	 * @param resultCate 聚类结果相应map
	 * @param k 聚类的数目
	 * @return 返回聚类熵
	 */
	private double computeEntropyAndConfuMatrix(Map<String, String> rightCate,
			Map<String, String> resultCate, int k) {
		
		//k行20列，[i,j]表示聚类i中属于类目j的文件数
		int[][] confusionMatrix = new int[k][20];
		
		//首先求出类目相应的数组索引
		SortedSet<String> cateNames = new TreeSet<String>();
		Set<Map.Entry<String, String>> rightCateSet = rightCate.entrySet();
		for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator();it.hasNext();){
			Map.Entry<String, String> me = it.next();
			cateNames.add(me.getValue());
		}
		
		String[] cateNamesArray = cateNames.toArray(new String[0]);
		Map<String,Integer> cateNamesToIndex = new TreeMap<String, Integer>();
		for(int i =0;i < cateNamesArray.length ;i++){
			cateNamesToIndex.put(cateNamesArray[i], i);
		}
		
		for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator();it.hasNext();){
			Map.Entry<String, String> me = it.next();
			confusionMatrix[Integer.parseInt(resultCate.get(me.getKey()))][cateNamesToIndex.get(me.getValue())]++;
		}
		
		//输出混淆矩阵
		double [] clusterSum = new double[k]; //记录每一个聚类的文件数
		double [] everyClusterEntropy = new double[k]; //记录每一个聚类的熵
		double clusterEntropy = 0;
		
		System.out.print("      ");
		
		for(int i=0;i<20;i++){
			System.out.printf("%-6d",i);
		}
		
		System.out.println();
		
		for(int i =0;i<k;i++){
			System.out.printf("%-6d",i);
			for(int j = 0;j<20;j++){
				clusterSum[i] += confusionMatrix[i][j];
				System.out.printf("%-6d",confusionMatrix[i][j]);
			}
			System.out.println();
		}
		System.out.println();
		
		//计算熵值
		for(int i = 0;i<k;i++){
			if(clusterSum[i] != 0){
				for(int j = 0;j< 20 ;j++){
					double p = (double)confusionMatrix[i][j]/clusterSum[i];
					if(p!=0)
						everyClusterEntropy[i] += -p * Math.log(p); 
				}
				clusterEntropy += clusterSum[i]/(double)rightCate.size() * everyClusterEntropy[i];  
			}
		}
		return clusterEntropy;
	}

	public void KmeansClusterMain(String testSampleDir) throws IOException {
		
		//首先计算文档TF-IDF向量，保存为Map<String,Map<String,Double>> 即为Map<文件名称,Map<特征词，TF-IDF值>>
		ComputeWordsVector computV = new ComputeWordsVector();
		
		//int k[] = {10,20,30}; 三组分类
		int k[] = {20};
		
		Map<String,Map<String,Double>> allTestSampleMap = computV.computeTFMultiIDF(testSampleDir);
		
		for(int i =0;i<k.length;i++){
			System.out.println("開始聚类。聚成"+k[i]+"类");
			String KmeansClusterResultFile = "E:\DataMiningSample\KmeansClusterResult\";
			Map<String,Integer> KmeansClusterResult = new TreeMap<String, Integer>();
			KmeansClusterResult = doProcess(allTestSampleMap,k[i]);
			KmeansClusterResultFile += k[i];
			printClusterResult(KmeansClusterResult,KmeansClusterResultFile);
			System.out.println("The Entropy for this Cluster is " + evaluateClusterResult(KmeansClusterResultFile,k[i]));
		}
		
	}
	
	
	public static void main(String[] args) throws IOException {
		
		KmeansCluster test = new KmeansCluster();
		
		String KmeansClusterResultFile = "E:\DataMiningSample\KmeansClusterResult\20";
		System.out.println("The Entropy for this Cluster is " + test.evaluateClusterResult(KmeansClusterResultFile,20));
	}


	
}

4、程序入口

package com.datamine.kmeans;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class ClusterMain {

	/**
	 * Kmeans 聚类主程序入口
	 * @param args
	 * @throws IOException 
	 */
	public static void main(String[] args) throws IOException {
		
		//数据预处理 在分类算法中已经实现 这里（略）
		
		ComputeWordsVector computeV = new ComputeWordsVector();
		
		KmeansCluster kmeansCluster = new KmeansCluster();
		
		String srcDir = "E:\DataMiningSample\processedSample\";
		String desDir = "E:\DataMiningSample\clusterTestSample\";
		
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		String beginTime = sdf.format(new Date());
		System.out.println("程序開始运行时间："+beginTime);
		
		String[] terms = computeV.createTestSamples(srcDir,desDir);
		kmeansCluster.KmeansClusterMain(desDir);
		
		String endTime = sdf.format(new Date());
		System.out.println("程序结束运行时间："+endTime);
		
	}
	
	
}

5、聚类结果

snippet_file_name="blog_20160328_4_4592322" name="code" class="java">程序開始运行时间：2016-03-14 17:02:38 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 3 3 1 1 0 5 2 1 0 0 2 27 11 53 4 6 15 176 14 127 7 5 5 8 0 14 31 16 34 2 2 2 1 5 661 96 18 257 26 9 3 0 0 13 25 13 6 2 3 2 6 2 78 575 213 15 119 15 6 2 1 4 131 2 4 2 6 0 2 1 13 151 563 11 50 3 3 1 2 14 125 4 8 1 0 3 0 0 78 25 37 348 13 2 0 0 2 5 38 5 6 2 1 1 2 8 24 21 23 166 38 45 45 26 10 37 87 34 27 22 15 8 35 12 24 45 6 629 28 20 14 0 3 87 10 4 1 8 0 13 0 1 10 8 4 25 781 40 1 1 0 70 5 10 2 8 4 2 3 11 0 1 1 11 34 831 1 0 1 7 7 0 1 1 1 8 0 6 2 4 1 7 7 4 633 4 5 11 18 9 5 13 8 10 3 1 9 4 1 20 1 3 286 961 0 17 8 4 2 2 0 5 3 6 1 2 2 0 1 1 0 858 51 1 1 2 16 8 69 4 7 7 17 5 12 8 5 2 5 46 13 793 6 5 2 30 5 0 1 0 2 4 6 3 4 4 2 14 746 3 1 2 3 55 11 29 39 15 18 12 13 7 3 4 13 195 38 36 5 6 18 5 11 0 2 0 1 1 0 4 1 4 1 4 16 6 846 3 6 16 274 0 2 4 2 1 5 7 0 0 10 30 12 5 28 363 9 289 23 0 0 2 0 0 6 0 1 1 3 1 3 2 9 8 843 48 18 1 1 1 0 2 13 2 6 3 3 9 12 18 5 444 16 164 69 for this Cluster is 1.2444339205006887

查看全文

相关阅读:
转载：c++内存泄露机制
 推荐一款不错的dialog小工具：artDialog
写的一些推广方法拿出来分享下
 struts2标签具体解释
 父亲节：再见，总有一天
 Hadoop是什么
 熊猫烟花集团完美见证异速联远程接入系统
 OpenStack Networking
管道(Pipe)/createPipe
百度2014校园招聘算法——给出一组数据A=[a_0, a_1, a-2, ... a_n](当中n可变)，打印出该数值元素的全部组合。

原文地址：https://www.cnblogs.com/brucemengbm/p/7229347.html