  • Converting text to sequence files in Hadoop

    Back when I was using Hadoop, much of Mahout required its input to be sequence files, so I often had to convert text files into sequence files or sequence files back into text (I was analyzing Mahout's source code at the time and needed to see what its input files contained, and plain text is far easier to inspect). There are generally two ways to do this. The first, following the method in "Hadoop: The Definitive Guide", is to read the sequence file directly and write its records out to a text file; a small sketch of that approach follows this paragraph. The second is to write a job and simply set the output file format, which likewise converts a sequence file into readable text (this is the method I usually use). After quite a long time, I tried this again today, and to my surprise it no longer worked. For example, I had written a Java program to convert a text file into a sequence file; it appears right after the sketch below.
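    As a reference for the first approach, here is a minimal sketch that dumps a sequence file to standard output as text. It follows the well-known reader pattern from "Hadoop: The Definitive Guide" and uses the old-style SequenceFile.Reader API matching the Hadoop version used in this post; the class name and example path are made up for illustration:

    package mahout.fansy.canopy.transformdata;

    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.util.ReflectionUtils;

    public class SequenceFile2Text {
        public static void main(String[] args) throws Exception {
            // e.g. hdfs://localhost:9000/user/fansy/output/part-m-00000 (hypothetical path)
            String uri = args[0];
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create(uri), conf);
            Path path = new Path(uri);
            SequenceFile.Reader reader = null;
            try {
                reader = new SequenceFile.Reader(fs, path, conf);
                // Instantiate key/value holders of whatever types the file declares.
                Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
                Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
                while (reader.next(key, value)) {
                    System.out.printf("%s\t%s%n", key, value);
                }
            } finally {
                IOUtils.closeStream(reader);
            }
        }
    }

    Back to the problem at hand: here is the text-to-sequence-file program that no longer worked.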

    package mahout.fansy.canopy.transformdata;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
    import org.apache.mahout.common.AbstractJob;
    import org.apache.mahout.math.RandomAccessSparseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.VectorWritable;

    public class Text2VectorWritable extends AbstractJob {

        @Override
        public int run(String[] arg0) throws Exception {
            addInputOption();
            addOutputOption();
            if (parseArguments(arg0) == null) {
                return -1;
            }
            Path input = getInputPath();
            Path output = getOutputPath();
            Configuration conf = getConf();
            Job job = new Job(conf, "text2vectorWritable with input:" + input.getName());
            // job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            job.setMapperClass(Text2VectorWritableMapper.class);
            // Intermediate (map output) types; note that the job's final
            // output key/value classes are never set anywhere here.
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(VectorWritable.class);
            job.setNumReduceTasks(0); // map-only job
            job.setJarByClass(Text2VectorWritable.class);

            FileInputFormat.addInputPath(job, input);
            SequenceFileOutputFormat.setOutputPath(job, output);
            if (!job.waitForCompletion(true)) {
                throw new InterruptedException("Canopy Job failed processing " + input);
            }
            return 0;
        }

        public static class Text2VectorWritableMapper extends Mapper<Writable, Text, Writable, VectorWritable> {

            @Override
            public void map(Writable key, Text value, Context context) throws IOException, InterruptedException {
                // Parse one comma-separated line of numbers into a Mahout vector.
                String[] str = value.toString().split(",");
                Vector vector = new RandomAccessSparseVector(str.length);
                for (int i = 0; i < str.length; i++) {
                    vector.set(i, Double.parseDouble(str[i]));
                }
                VectorWritable va = new VectorWritable(vector);
                context.write(key, va);
            }
        }
    }
    

    When I ran this, however, it kept complaining that my map's value type was not Text, no matter what type I set it to. I began to wonder whether a map's output has to be in Text format, so I added a Reducer to the program above, as follows:

    package mahout.fansy.canopy.transformdata;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
    import org.apache.mahout.common.AbstractJob;
    import org.apache.mahout.math.RandomAccessSparseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.VectorWritable;

    public class Text2VectorWritableCopy extends AbstractJob {

        @Override
        public int run(String[] arg0) throws Exception {
            addInputOption();
            addOutputOption();
            if (parseArguments(arg0) == null) {
                return -1;
            }
            Path input = getInputPath();
            Path output = getOutputPath();
            Configuration conf = getConf();
            Job job = new Job(conf, "text2vectorWritableCopy with input:" + input.getName());
            // job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            job.setMapperClass(Text2VectorWritableMapper.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(VectorWritable.class);
            // This version adds an (identity) reducer and, unlike the first
            // version, also declares the job's final output classes.
            job.setReducerClass(Text2VectorWritableReducer.class);
            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(VectorWritable.class);
            job.setJarByClass(Text2VectorWritableCopy.class);

            FileInputFormat.addInputPath(job, input);
            SequenceFileOutputFormat.setOutputPath(job, output);
            if (!job.waitForCompletion(true)) {
                throw new InterruptedException("Canopy Job failed processing " + input);
            }
            return 0;
        }

        public static class Text2VectorWritableMapper extends Mapper<LongWritable, Text, LongWritable, VectorWritable> {

            @Override
            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                // Parse one comma-separated line of numbers into a Mahout vector.
                String[] str = value.toString().split(",");
                Vector vector = new RandomAccessSparseVector(str.length);
                for (int i = 0; i < str.length; i++) {
                    vector.set(i, Double.parseDouble(str[i]));
                }
                VectorWritable va = new VectorWritable(vector);
                context.write(key, va);
            }
        }

        public static class Text2VectorWritableReducer extends Reducer<LongWritable, VectorWritable, LongWritable, VectorWritable> {

            @Override
            public void reduce(LongWritable key, Iterable<VectorWritable> values, Context context) throws IOException, InterruptedException {
                // Identity reduce: pass every vector through unchanged.
                for (VectorWritable v : values) {
                    context.write(key, v);
                }
            }
        }
    }
    

    When I ran this version, it worked.
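    For completeness, a note on launching these jobs: Mahout's AbstractJob implements Hadoop's Tool interface, so the class still needs a small entry point if it is to be run with hadoop jar. A sketch of one (my addition, not part of the original listing) is below; -i and -o are the input/output options registered by addInputOption() and addOutputOption():

    // Hypothetical entry point, added inside Text2VectorWritableCopy.
    // AbstractJob implements Tool, so it can be driven by
    // org.apache.hadoop.util.ToolRunner.
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new Configuration(), new Text2VectorWritableCopy(), args);
    }

    It would then be invoked along the lines of: hadoop jar yourjar.jar mahout.fansy.canopy.transformdata.Text2VectorWritableCopy -i <text input> -o <sequence-file output> (jar name and paths hypothetical).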

    Still, whether a map's output really must be in Text format remained to be verified.
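    My best explanation, offered as a hedged reading of Hadoop's behavior rather than settled fact: the map output does not have to be Text. setMapOutputKeyClass()/setMapOutputValueClass() only declare the intermediate types used for the shuffle, but in a map-only job (setNumReduceTasks(0)) the map output bypasses the shuffle and goes straight through the output format, and SequenceFileOutputFormat builds its writer from the job's final output classes, which default to LongWritable and Text when never set. That would account for the "value is not Text" complaint in the first program, and it suggests the real fix in the second program is less the added Reducer than the setOutputKeyClass()/setOutputValueClass() calls. If that is right, the original map-only job should also work with just those declarations added, roughly like this (untested sketch):

    // Hypothetical repair of the map-only Text2VectorWritable job:
    // declare the *final* output classes so SequenceFileOutputFormat
    // accepts LongWritable/VectorWritable records instead of expecting
    // the default LongWritable/Text.
    job.setMapperClass(Text2VectorWritableMapper.class);
    job.setNumReduceTasks(0);                       // still map-only
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);      // final output key class
    job.setOutputValueClass(VectorWritable.class);  // final output value class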


    Share, be happy, grow


    Please credit the source when reposting: http://blog.csdn.net/fansy1990


