  • Twenty Newsgroups Classification Example: TrainNaiveBayesJob (Part 1)

    Continuing from the previous post, the next part of the log looks like this:

    + echo 'Training Naive Bayes model'
    Training Naive Bayes model
    + ./bin/mahout trainnb -i /home/mahout/mahout-work-mahout/20news-train-vectors -el -o /home/mahout/mahout-work-mahout/model -li /home/mahout/mahout-work-mahout/labelindex -ow
    

    The mahout trainnb command is implemented by the TrainNaiveBayesJob class, whose main work consists of three steps: (1) create a labelindex file; (2) run a prepareJob whose Mapper and Reducer are IndexInstancesMapper and VectorSumReducer; (3) run another prepareJob whose Mapper and Reducer are WeightsMapper and VectorSumReducer. This post analyzes the first two steps.
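    To make the flow of step (2) easier to picture, here is a rough, hypothetical sketch of it written as a plain Hadoop driver instead of through Mahout's prepareJob helper. Class packages and wiring details differ across Mahout versions, and the real job also places the labelindex file in the DistributedCache for the mapper; treat this only as an outline of the Mapper/Reducer pairing described above, not the actual TrainNaiveBayesJob code.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
    import org.apache.mahout.classifier.naivebayes.training.IndexInstancesMapper;
    import org.apache.mahout.common.mapreduce.VectorSumReducer;
    import org.apache.mahout.math.VectorWritable;

    public class SumObservationsSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = new Job(conf, "index instances and sum vectors per label");
            job.setJarByClass(SumObservationsSketch.class);
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            // Mapper: replace the directory-name key with its integer label index.
            job.setMapperClass(IndexInstancesMapper.class);
            // Reducer (also usable as a combiner): element-wise sum of the vectors per label.
            job.setCombinerClass(VectorSumReducer.class);
            job.setReducerClass(VectorSumReducer.class);
            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(VectorWritable.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));   // e.g. 20news-train-vectors
            FileOutputFormat.setOutputPath(job, new Path(args[1])); // per-label summed vectors
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }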

    The code that creates the label index is as follows:

    private long createLabelIndex(Path labPath) throws IOException {
        long labelSize = 0;
        if (hasOption(LABELS)) {
          Iterable<String> labels = Splitter.on(",").split(getOption(LABELS));
          labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath);
        } else if (hasOption(EXTRACT_LABELS)) {
          SequenceFileDirIterable<Text, IntWritable> iterable =
                  new SequenceFileDirIterable<Text, IntWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), getConf());
          labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
        }
        return labelSize;
      }

    The main work here is to convert the category directory names (such as alt.atheism) into integer indices.
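    As a concrete toy illustration (not Mahout code): the keys of the training vectors look like /alt.atheism/51060, and each distinct category name is assigned a consecutive integer. All keys below except /alt.atheism/51060 are made up for the example.

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class LabelIndexSketch {
        public static void main(String[] args) {
            // Keys as they appear in 20news-train-vectors: /<category>/<doc id>.
            String[] docKeys = {"/alt.atheism/51060", "/comp.graphics/38343", "/alt.atheism/51119"};
            Map<String, Integer> labelIndex = new LinkedHashMap<String, Integer>();
            for (String k : docKeys) {
                String label = k.split("/")[1];               // the category directory name
                if (!labelIndex.containsKey(label)) {
                    labelIndex.put(label, labelIndex.size()); // next free index
                }
            }
            System.out.println(labelIndex);                   // {alt.atheism=0, comp.graphics=1}
        }
    }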

    Next, look at the Mapper. The core code of IndexInstancesMapper is:

     // in setup(): load the label -> index map from the distributed cache
     labelIndex = BayesUtils.readIndexFromCache(ctx.getConfiguration());

     // in map(): keys look like /alt.atheism/51060
     String label = labelText.toString().split("/")[1];
     if (labelIndex.containsKey(label)) {
       ctx.write(new IntWritable(labelIndex.get(label)), instance);
     }

    The setup method first reads the labelindex map from the distributed cache; then, for an input key such as /alt.atheism/51060, map extracts the string after the first slash (the category name), looks it up in the index, and writes the matching integer as the output key while leaving the value (the document vector) unchanged.

    VectorSumReducer:

     // in reduce(): sum all the vectors that arrive under the same label key
     Vector vector = null;
     for (VectorWritable v : values) {
       if (vector == null) {
         vector = v.get();
       } else {
         vector.assign(v.get(), Functions.PLUS);
       }
     }
     ctx.write(key, new VectorWritable(vector));


    This code adds up, per category, the word-count vectors of all documents belonging to that category. Since there are 20 categories, the reducer should emit 20 records, which matches the Reduce output records counter reported in the job log.
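    As a minimal sketch of the accumulation itself (assuming the mahout-math jar is on the classpath), the element-wise summation performed by vector.assign(v.get(), Functions.PLUS) behaves like this:

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.function.Functions;

    public class VectorSumSketch {
        public static void main(String[] args) {
            // Two term-count vectors from documents of the same category.
            Vector a = new DenseVector(new double[] {1, 0, 2});
            Vector b = new DenseVector(new double[] {0, 3, 1});
            a.assign(b, Functions.PLUS);   // element-wise accumulation, as in the reducer
            System.out.println(a);         // prints the summed vector (1, 3, 3)
        }
    }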



    The relevant files can be inspected with the following test code:

    package mahout.fansy.test.bayes.read;
    
    import java.io.IOException;
    import java.net.URI;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.util.ReflectionUtils;
    
    public class ReadLabelIndex {
    
        public static Configuration conf = new Configuration();
        static String fPath = "";
        static String trainPath = "";
        static {
            conf.set("mapred.job.tracker", "ubuntu:9001");
            fPath = "hdfs://ubuntu:9000/home/mahout/mahout-work-mahout/labelindex"; // labelindex data file
            trainPath = "hdfs://ubuntu:9000/home/mahout/mahout-work-mahout/"
                    + "20news-train-vectors/part-r-00000"; // training sample data
        }
    
        public static void main(String[] args) throws IOException {
            // readFromFile(fPath);
            readFromFile(trainPath);
        }
    
        /**
         * Read a SequenceFile (for example the labelindex file) and print its records.
         * @param fPath path of the file to read
         * @return a map of the records (left empty for now, see the note below)
         * @throws IOException
         */
        public static Map<Writable, Writable> readFromFile(String fPath) throws IOException {
            FileSystem fs = FileSystem.get(URI.create(fPath), conf);
            Path path = new Path(fPath);
            Map<Writable, Writable> map = new HashMap<Writable, Writable>();
            SequenceFile.Reader reader = null;
            try {
                reader = new SequenceFile.Reader(fs, path, conf);
                Writable key = (Writable)
                        ReflectionUtils.newInstance(reader.getKeyClass(), conf);
                Writable value = (Writable)
                        ReflectionUtils.newInstance(reader.getValueClass(), conf);
                while (reader.next(key, value)) {
                    // map.put(key, value); // how to deep-copy a Writable?
                    System.out.println(key.toString() + ", " + value.toString());
                    break; // only print the first record
                }
            } finally {
                IOUtils.closeStream(reader);
            }
            return map;
        }
    
    }
    


    I wanted to make this a generic utility, which requires deep-copying the Writable key/value objects, but I have not found a solution yet, so this is left as an open question for later.
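    One possible approach, not from the original post: Hadoop's WritableUtils.clone deep-copies a Writable by serializing it and reading it back, which would let the loop above store independent copies in the map, e.g. map.put(WritableUtils.clone(key, conf), WritableUtils.clone(value, conf)). A small sketch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.io.WritableUtils;

    public class WritableCloneSketch {
        public static void main(String[] args) {
            Configuration conf = new Configuration();
            Text key = new Text("alt.atheism");
            IntWritable value = new IntWritable(0);
            // clone() does a serialization round-trip, so the copies are independent
            // of the key/value objects that SequenceFile.Reader.next() keeps reusing.
            Writable keyCopy = WritableUtils.clone(key, conf);
            Writable valueCopy = WritableUtils.clone(value, conf);
            System.out.println(keyCopy + ", " + valueCopy);
        }
    }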


    Share, grow, enjoy.

    When reposting, please cite the original blog: http://blog.csdn.net/fansy1990


