  • Hadoop AWS Word Count Example

    Launch a cluster with Elastic MapReduce (EMR) on AWS.

    Then log in to the master node and compile the following program:


    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    
    public class WordCount {
    
    	// Mapper: tokenizes each input line and emits a (word, 1) pair for every token
    	public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
    		private final IntWritable one = new IntWritable(1);
    		private Text word = new Text();
    		
    		@Override
    		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    			String line = value.toString();
    			StringTokenizer tokenizer = new StringTokenizer(line);
    			while(tokenizer.hasMoreTokens()) {
    				word.set(tokenizer.nextToken());
    				context.write(word, one);
    			}
    		}
    		
    	}
    	
    	// Reducer: sums the counts emitted for each word and writes the total
    	public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    
    		@Override
    		protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    			int sum = 0;
    			for(IntWritable value : values) {
    				sum += value.get();
    			}
    			context.write(key, new IntWritable(sum));
    		}
    
    	}
    	
    	
    	
    	public static void main(String[] args) throws Exception {
    		Configuration conf = new Configuration();
    		Job job = new Job(conf, "Word Count hadoop-0.20");

    		// set the driver, mapper, and reducer classes
    		job.setJarByClass(WordCount.class);
    		job.setMapperClass(WordCountMapper.class);
    		job.setReducerClass(WordCountReducer.class);

    		// set the output key and value types
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);

    		// accept the input and output directories at run time
    		FileInputFormat.addInputPath(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));

    		System.exit(job.waitForCompletion(true) ? 0 : 1);
    	}
    }
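
    Note: this driver uses the old-style constructor new Job(conf, name), which matches the hadoop-0.20 API the example targets but is deprecated on Hadoop 2.x and later. If your EMR cluster runs a newer Hadoop, a minimal sketch of the same driver using the non-deprecated Job.getInstance factory might look like this (the WordCountDriver class name is just an illustration; it reuses the mapper and reducer defined above):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCountDriver {

    	public static void main(String[] args) throws Exception {
    		Configuration conf = new Configuration();
    		// Job.getInstance replaces the deprecated new Job(conf, name) constructor
    		Job job = Job.getInstance(conf, "Word Count");

    		job.setJarByClass(WordCountDriver.class);
    		job.setMapperClass(WordCount.WordCountMapper.class);
    		job.setReducerClass(WordCount.WordCountReducer.class);

    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);

    		// input and output paths come from the command line, as in the original driver
    		FileInputFormat.addInputPath(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));

    		System.exit(job.waitForCompletion(true) ? 0 : 1);
    	}
    }

    Run it the same way, e.g. hadoop jar WordCount.jar WordCountDriver <input> <output>, assuming both classes are packaged into the jar.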



    Setup:

    export CLASSPATH=$CLASSPATH:/home/hadoop/*:/home/hadoop/lib/*:'.'

    javac WordCount.java

    jar cvf WordCount.jar *.class

    hadoop jar WordCount.jar WordCount s3://15-319-s13/book-dataset/pg_00 /output

    After the job completes successfully, the output directory lives on the Hadoop file system (HDFS), so you can view the results like this:

    hadoop fs -cat /output/part-r-00000  | less



    Main references:

    http://kickstarthadoop.blogspot.com/2011/04/word-count-hadoop-map-reduce-example.html

    http://kickstarthadoop.blogspot.com/2011/05/word-count-example-with-hadoop-020.html
