  • Hadoop WordCount

    Mapper

    import java.io.IOException;

    // Commons Lang StringUtils (ships with Hadoop); line.split(" ") would work as well.
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    // The number of map tasks is determined by the number of input splits
    // (see the note after this class).
    public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split each input line on spaces and emit a (word, 1) pair per word.
            String line = value.toString();
            String[] words = StringUtils.split(line, " ");
            for (String word : words) {
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }
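
    The comment above refers to the fact that MapReduce launches one map task per
    input split. If you need to influence the split (and therefore map) count, the
    new-API FileInputFormat accepts split-size hints. A minimal sketch for the
    Runner shown later (the 64 MB figure is only an illustration, not from the
    original post):

        // Optional: cap each split at 64 MB so large files are divided into
        // more splits, and therefore more map tasks.
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);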
    

      

    Reducer 

    import java.io.IOException;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the counts for each word. Adding the values (rather than just
            // counting them) keeps the result correct even if this class is also
            // used as a combiner (see the note after this class).
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            context.write(key, new LongWritable(count));
        }
    }
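
    Not part of the original post, but a common companion to this reducer: because
    it sums the incoming values, the same class can be registered as a combiner to
    pre-aggregate counts on the map side and reduce shuffle traffic. In either
    Runner this is a single extra line:

        // Optional: reuse the reducer as a map-side combiner.
        job.setCombinerClass(WCReducer.class);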
    

    Runner

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WCRunner {

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);

            job.setJarByClass(WCRunner.class);

            job.setMapperClass(WCMapper.class);
            job.setReducerClass(WCReducer.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            // The number of reduce tasks determines how many output files are
            // produced; which keys end up in which file is decided by the
            // Partitioner, HashPartitioner by default (set via
            // job.setPartitionerClass(...); a custom sketch follows this class).
            job.setNumReduceTasks(10);

            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
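
    The Partitioner mentioned in the comment above decides which reduce task (and
    therefore which part-r-NNNNN file) each key is sent to; HashPartitioner is the
    default. A minimal sketch of a custom Partitioner, registered with
    job.setPartitionerClass(FirstLetterPartitioner.class); the class name and the
    first-letter routing rule here are illustrative, not from the original post:

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    // Illustrative only: groups words by their first letter, so all words that
    // share an initial character land in the same output file.
    public class FirstLetterPartitioner extends Partitioner<Text, LongWritable> {

        @Override
        public int getPartition(Text key, LongWritable value, int numPartitions) {
            String word = key.toString();
            int hash = word.isEmpty() ? 0 : Character.toLowerCase(word.charAt(0));
            // Mask the sign bit so the result stays in [0, numPartitions).
            return (hash & Integer.MAX_VALUE) % numPartitions;
        }
    }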
    

      

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class WCRunner2 extends Configured implements Tool {

        @Override
        public int run(String[] args) throws Exception {
            // Use the Configuration injected by ToolRunner so that generic
            // options such as -D property overrides take effect.
            Configuration conf = getConf();
            Job job = Job.getInstance(conf);

            job.setJarByClass(WCRunner2.class);

            job.setMapperClass(WCMapper.class);
            job.setReducerClass(WCReducer.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            return job.waitForCompletion(true) ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            System.exit(ToolRunner.run(new WCRunner2(), args));
        }
    }
    

    Run: hadoop jar wc.jar com.easytrack.hadoop.mr.WCRunner2 /wordcount.txt /wc/output4
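
    Because WCRunner2 goes through ToolRunner, Hadoop's generic options are parsed
    before run() sees the remaining arguments, so configuration can be overridden
    from the command line. For example (the property name assumes Hadoop 2.x or
    later, and /wc/output5 is just an illustrative path):

    hadoop jar wc.jar com.easytrack.hadoop.mr.WCRunner2 -D mapreduce.job.reduces=5 /wordcount.txt /wc/output5

    Each reduce task writes one part-r-NNNNN file into the output directory, which
    must not already exist when the job starts; the results can be inspected with
    hadoop fs -cat /wc/output4/part-r-00000.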
