  • Hadoop AWS Word Count Example

    In AWS, start a cluster with Elastic MapReduce (EMR).

    Then log in to the master node and compile the following program:


    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    
    public class WordCount {
    
    	// Mapper: tokenizes each input line and emits (word, 1) for every token
    	public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
    		private final IntWritable one = new IntWritable(1);
    		private Text word = new Text();
    		
    		@Override
    		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    			String line = value.toString();
    			StringTokenizer tokenizer = new StringTokenizer(line);
    			while(tokenizer.hasMoreTokens()) {
    				word.set(tokenizer.nextToken());
    				context.write(word, one);
    			}
    		}
    		
    	}
    	
    	// Reducer: sums the 1s emitted for each word and writes (word, total)
    	public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    
    		@Override
    		protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    			int sum = 0;
    			for(IntWritable value : values) {
    				sum += value.get();
    			}
    			context.write(key, new IntWritable(sum));
    		}
    
    	}
    	
    	
    	
    	public static void main(String[] args) throws Exception {
    		Configuration conf = new Configuration();
    		// new Job(conf, name) is the hadoop-0.20 API (deprecated in later releases)
    		Job job = new Job(conf, "Word Count hadoop-0.20");

    		// set the driver, mapper, and reducer classes
    		job.setJarByClass(WordCount.class);
    		job.setMapperClass(WordCountMapper.class);
    		job.setReducerClass(WordCountReducer.class);

    		// set the output key/value type classes
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);

    		// accept the input and output directories at run time
    		FileInputFormat.addInputPath(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));

    		System.exit(job.waitForCompletion(true) ? 0 : 1);
    	}
    }
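
    On Hadoop 2.x and later, the new Job(conf, name) constructor is deprecated in favor of Job.getInstance, and the reducer can double as a combiner: summing partial counts is associative and commutative, so map-side pre-aggregation cuts shuffle traffic without changing the result. Below is a minimal sketch of such a driver reusing the mapper and reducer above; the class name WordCountV2 and the job name are illustrative, not part of the original example.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCountV2 {
    	public static void main(String[] args) throws Exception {
    		Configuration conf = new Configuration();
    		// Job.getInstance replaces the deprecated new Job(conf, name)
    		Job job = Job.getInstance(conf, "word count v2");
    		job.setJarByClass(WordCountV2.class);
    		job.setMapperClass(WordCount.WordCountMapper.class);
    		// the reducer is reused as a combiner for map-side pre-aggregation
    		job.setCombinerClass(WordCount.WordCountReducer.class);
    		job.setReducerClass(WordCount.WordCountReducer.class);
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);
    		FileInputFormat.addInputPath(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    		System.exit(job.waitForCompletion(true) ? 0 : 1);
    	}
    }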



    Setup: the export below puts the Hadoop jars on the master node (under /home/hadoop and /home/hadoop/lib, per the paths in the command) onto the classpath; then compile, package, and run:

    export CLASSPATH=$CLASSPATH:/home/hadoop/*:/home/hadoop/lib/*:'.'

    javac WordCount.java

    jar cvf WordCount.jar *.class

    hadoop jar WordCount.jar WordCount s3://15-319-s13/book-dataset/pg_00 /output
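
    The first argument is the input path (a public course dataset on S3) and the second is the output directory, which this invocation places on the cluster's HDFS. Since Hadoop on EMR also understands s3:// URIs for output, the second argument could instead be something like s3://your-bucket/output (bucket name hypothetical).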

    After the job finishes successfully, the output directory is on the Hadoop file system (HDFS), so you can view it like this:

    hadoop fs -cat /output/part-r-00000  | less
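
    If the job ran with more than one reducer there will be several part-r-* files; the standard hadoop fs -getmerge /output result.txt concatenates them all into a single local file.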



    Main references:

    http://kickstarthadoop.blogspot.com/2011/04/word-count-hadoop-map-reduce-example.html

    http://kickstarthadoop.blogspot.com/2011/05/word-count-example-with-hadoop-020.html
