zoukankan      html  css  js  c++  java
  • Hadoop入门经典:WordCount

    转:http://blog.csdn.net/jediael_lu/article/details/38705371

    以下程序在hadoop1.2.1上测试成功。

    本例先将源代码呈现,然后详细说明执行步骤,最后对源代码及执行过程进行分析。

    一、源代码

     1 package org.jediael.hadoopdemo.wordcount;  
     2   
     3 import java.io.IOException;  
     4 import java.util.StringTokenizer;  
     5   
     6 import org.apache.hadoop.conf.Configuration;  
     7 import org.apache.hadoop.fs.Path;  
     8 import org.apache.hadoop.io.IntWritable;  
     9 import org.apache.hadoop.io.LongWritable;  
    10 import org.apache.hadoop.io.Text;  
    11 import org.apache.hadoop.mapreduce.Job;  
    12 import org.apache.hadoop.mapreduce.Mapper;  
    13 import org.apache.hadoop.mapreduce.Reducer;  
    14 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
    15 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;  
    16 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
    17 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;  
    18   
    19 public class WordCount {  
           // MapReduce word-count job: the mapper emits (token, 1) for every
           // token of every input line, and the reducer adds up the 1s per token.
    20   
           /**
            * Mapper: receives one input line as {@code value} and writes one
            * (token, 1) pair per token. Lines are split with StringTokenizer's
            * default delimiters (whitespace), so punctuation stays attached to
            * the surrounding word.
            */
    21     public static class WordCountMap extends  
    22             Mapper<LongWritable, Text, Text, IntWritable> {  
    23   
               // Both Writable objects are fields and are reused for every record.
    24         private final IntWritable one = new IntWritable(1);  
    25         private Text word = new Text();  
    26   
               /**
                * @param key     not read by this method
                * @param value   one line of input text
                * @param context receives the (token, 1) output pairs
                */
    27         public void map(LongWritable key, Text value, Context context)  
    28                 throws IOException, InterruptedException {  
    29             String line = value.toString();  
    30             StringTokenizer token = new StringTokenizer(line);  
    31             while (token.hasMoreTokens()) {  
                       // Overwrite the shared Text with the next token, then emit it.
    32                 word.set(token.nextToken());  
    33                 context.write(word, one);  
    34             }  
    35         }  
    36     }  
    37   
           /**
            * Reducer: sums all counts received for one token and writes
            * (token, total).
            */
    38     public static class WordCountReduce extends  
    39             Reducer<Text, IntWritable, Text, IntWritable> {  
    40   
               /**
                * @param key     the token being counted
                * @param values  the counts emitted for this token
                * @param context receives the single (token, total) output pair
                */
    41         public void reduce(Text key, Iterable<IntWritable> values,  
    42                 Context context) throws IOException, InterruptedException {  
    43             int sum = 0;  
    44             for (IntWritable val : values) {  
    45                 sum += val.get();  
    46             }  
    47             context.write(key, new IntWritable(sum));  
    48         }  
    49     }  
    50   
           /**
            * Configures the job and runs it to completion.
            *
            * @param args args[0] is the input path, args[1] is the output path
            */
           // NOTE(review): args is not length-checked; running with fewer than
           // two arguments fails on the args[0] / args[1] accesses below.
    51     public static void main(String[] args) throws Exception {  
    52         Configuration conf = new Configuration();  
    53         Job job = new Job(conf);  
    54         job.setJarByClass(WordCount.class);  
    55         job.setJobName("wordcount");  
    56   
               // Types of the job's final (reducer) output key and value.
    57         job.setOutputKeyClass(Text.class);  
    58         job.setOutputValueClass(IntWritable.class);  
    59   
    60         job.setMapperClass(WordCountMap.class);  
    61         job.setReducerClass(WordCountReduce.class);  
    62   
               // Plain-text input and output formats.
    63         job.setInputFormatClass(TextInputFormat.class);  
    64         job.setOutputFormatClass(TextOutputFormat.class);  
    65   
    66         FileInputFormat.addInputPath(job, new Path(args[0]));  
    67         FileOutputFormat.setOutputPath(job, new Path(args[1]));  
    68   
               // Submit the job and wait for it to finish.
               // NOTE(review): the boolean result is discarded, so main returns
               // normally even if the job reports failure; consider
               // System.exit(job.waitForCompletion(true) ? 0 : 1).
    69         job.waitForCompletion(true);  
    70     }  
    71 }  

    二、执行程序

    1、从eclipse中导出至wordcount.jar,并上传至hadoop服务器,本例中,将程序上传至/home/jediael/project。

    2、安装hadoop伪分布模式,可参考Hadoop1.2.1伪分布模式安装指南,本实例将运行在hadoop的伪分布环境中。

    3、在HDFS中创建目录wcinput,用作输入目录,并将需要分析的文件复制到目录下。

    1. [root@jediael conf]# hadoop fs -mkdir wcinput  
      [root@jediael conf]# hadoop fs -copyFromLocal * wcinput   
      [root@jediael conf]# hadoop fs -ls wcinput   
      Found 26 items   
      -rw-r--r-- 1 root supergroup 1524 2014-08-20 12:29 /user/root/wcinput/automaton-urlfilter.txt   
      -rw-r--r-- 1 root supergroup 1311 2014-08-20 12:29 /user/root/wcinput/configuration.xsl   
      -rw-r--r-- 1 root supergroup 131090 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xml   
      -rw-r--r-- 1 root supergroup 4649 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xsd   
      -rw-r--r-- 1 root supergroup 824 2014-08-20 12:29 /user/root/wcinput/domain-urlfilter.txt   
      -rw-r--r-- 1 root supergroup 3368 2014-08-20 12:29 /user/root/wcinput/gora-accumulo-mapping.xml   
      -rw-r--r-- 1 root supergroup 3279 2014-08-20 12:29 /user/root/wcinput/gora-cassandra-mapping.xml   
      -rw-r--r-- 1 root supergroup 3447 2014-08-20 12:29 /user/root/wcinput/gora-hbase-mapping.xml   
      -rw-r--r-- 1 root supergroup 2677 2014-08-20 12:29 /user/root/wcinput/gora-sql-mapping.xml   
      -rw-r--r-- 1 root supergroup 2993 2014-08-20 12:29 /user/root/wcinput/gora.properties   
      -rw-r--r-- 1 root supergroup 983 2014-08-20 12:29 /user/root/wcinput/hbase-site.xml   
      -rw-r--r-- 1 root supergroup 3096 2014-08-20 12:29 /user/root/wcinput/httpclient-auth.xml   
      -rw-r--r-- 1 root supergroup 3948 2014-08-20 12:29 /user/root/wcinput/log4j.properties   
      -rw-r--r-- 1 root supergroup 511 2014-08-20 12:29 /user/root/wcinput/nutch-conf.xsl   
      -rw-r--r-- 1 root supergroup 42610 2014-08-20 12:29 /user/root/wcinput/nutch-default.xml   
      -rw-r--r-- 1 root supergroup 753 2014-08-20 12:29 /user/root/wcinput/nutch-site.xml   
      -rw-r--r-- 1 root supergroup 347 2014-08-20 12:29 /user/root/wcinput/parse-plugins.dtd   
      -rw-r--r-- 1 root supergroup 3016 2014-08-20 12:29 /user/root/wcinput/parse-plugins.xml   
      -rw-r--r-- 1 root supergroup 857 2014-08-20 12:29 /user/root/wcinput/prefix-urlfilter.txt   
      -rw-r--r-- 1 root supergroup 2484 2014-08-20 12:29 /user/root/wcinput/regex-normalize.xml   
      -rw-r--r-- 1 root supergroup 1736 2014-08-20 12:29 /user/root/wcinput/regex-urlfilter.txt   
      -rw-r--r-- 1 root supergroup 18969 2014-08-20 12:29 /user/root/wcinput/schema-solr4.xml   
      -rw-r--r-- 1 root supergroup 6020 2014-08-20 12:29 /user/root/wcinput/schema.xml   
      -rw-r--r-- 1 root supergroup 1766 2014-08-20 12:29 /user/root/wcinput/solrindex-mapping.xml   
      -rw-r--r-- 1 root supergroup 1044 2014-08-20 12:29 /user/root/wcinput/subcollections.xml   
      -rw-r--r-- 1 root supergroup 1411 2014-08-20 12:29 /user/root/wcinput/suffix-urlfilter.txt  

    4、运行程序

     
    1. [root@jediael project]# hadoop org.jediael.hadoopdemo.wordcount.WordCount wcinput wcoutput3   
      14/08/20 12:50:25 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.   
      14/08/20 12:50:26 INFO input.FileInputFormat: Total input paths to process : 26   
      14/08/20 12:50:26 INFO util.NativeCodeLoader: Loaded the native-hadoop library   
      14/08/20 12:50:26 WARN snappy.LoadSnappy: Snappy native library not loaded   
      14/08/20 12:50:26 INFO mapred.JobClient: Running job: job_201408191134_0005   
      14/08/20 12:50:27 INFO mapred.JobClient: map 0% reduce 0%   
      14/08/20 12:50:38 INFO mapred.JobClient: map 3% reduce 0%   
      14/08/20 12:50:39 INFO mapred.JobClient: map 7% reduce 0%   
      14/08/20 12:50:50 INFO mapred.JobClient: map 15% reduce 0%   
      14/08/20 12:50:57 INFO mapred.JobClient: map 19% reduce 0%   
      14/08/20 12:50:58 INFO mapred.JobClient: map 23% reduce 0%   
      14/08/20 12:51:00 INFO mapred.JobClient: map 23% reduce 5%   
      14/08/20 12:51:04 INFO mapred.JobClient: map 30% reduce 5%   
      14/08/20 12:51:06 INFO mapred.JobClient: map 30% reduce 10%   
      14/08/20 12:51:11 INFO mapred.JobClient: map 38% reduce 10%   
      14/08/20 12:51:16 INFO mapred.JobClient: map 38% reduce 11%   
      14/08/20 12:51:18 INFO mapred.JobClient: map 46% reduce 11%   
      14/08/20 12:51:19 INFO mapred.JobClient: map 46% reduce 12%   
      14/08/20 12:51:22 INFO mapred.JobClient: map 46% reduce 15%   
      14/08/20 12:51:25 INFO mapred.JobClient: map 53% reduce 15%   
      14/08/20 12:51:31 INFO mapred.JobClient: map 53% reduce 17%   
      14/08/20 12:51:32 INFO mapred.JobClient: map 61% reduce 17%   
      14/08/20 12:51:39 INFO mapred.JobClient: map 69% reduce 17%   
      14/08/20 12:51:40 INFO mapred.JobClient: map 69% reduce 20%   
      14/08/20 12:51:45 INFO mapred.JobClient: map 73% reduce 20%   
      14/08/20 12:51:46 INFO mapred.JobClient: map 76% reduce 23%   
      14/08/20 12:51:52 INFO mapred.JobClient: map 80% reduce 23%   
      14/08/20 12:51:53 INFO mapred.JobClient: map 84% reduce 23%   
      14/08/20 12:51:55 INFO mapred.JobClient: map 84% reduce 25%   
      14/08/20 12:51:59 INFO mapred.JobClient: map 88% reduce 25%   
      14/08/20 12:52:00 INFO mapred.JobClient: map 92% reduce 25%   
      14/08/20 12:52:02 INFO mapred.JobClient: map 92% reduce 29%   
      14/08/20 12:52:06 INFO mapred.JobClient: map 96% reduce 29%   
      14/08/20 12:52:07 INFO mapred.JobClient: map 100% reduce 29%   
      14/08/20 12:52:11 INFO mapred.JobClient: map 100% reduce 30%   
      14/08/20 12:52:15 INFO mapred.JobClient: map 100% reduce 100%   
      14/08/20 12:52:17 INFO mapred.JobClient: Job complete: job_201408191134_0005   
      14/08/20 12:52:18 INFO mapred.JobClient: Counters: 29   
      14/08/20 12:52:18 INFO mapred.JobClient: Job Counters   
      14/08/20 12:52:18 INFO mapred.JobClient: Launched reduce tasks=1   
      14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=192038   
      14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0   
      14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0   
      14/08/20 12:52:18 INFO mapred.JobClient: Launched map tasks=26   
      14/08/20 12:52:18 INFO mapred.JobClient: Data-local map tasks=26   
      14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=95814   
      14/08/20 12:52:18 INFO mapred.JobClient: File Output Format Counters   
      14/08/20 12:52:18 INFO mapred.JobClient: Bytes Written=123950   
      14/08/20 12:52:18 INFO mapred.JobClient: FileSystemCounters   
      14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_READ=352500   
      14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_READ=247920   
      14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_WRITTEN=2177502   
      14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=123950   
      14/08/20 12:52:18 INFO mapred.JobClient: File Input Format Counters   
      14/08/20 12:52:18 INFO mapred.JobClient: Bytes Read=244713   
      14/08/20 12:52:18 INFO mapred.JobClient: Map-Reduce Framework   
      14/08/20 12:52:18 INFO mapred.JobClient: Map output materialized bytes=352650   
      14/08/20 12:52:18 INFO mapred.JobClient: Map input records=7403   
      14/08/20 12:52:18 INFO mapred.JobClient: Reduce shuffle bytes=352650   
      14/08/20 12:52:18 INFO mapred.JobClient: Spilled Records=45210   
      14/08/20 12:52:18 INFO mapred.JobClient: Map output bytes=307281   
      14/08/20 12:52:18 INFO mapred.JobClient: Total committed heap usage (bytes)=3398606848   
      14/08/20 12:52:18 INFO mapred.JobClient: CPU time spent (ms)=14400   
      14/08/20 12:52:18 INFO mapred.JobClient: Combine input records=0   
      14/08/20 12:52:18 INFO mapred.JobClient: SPLIT_RAW_BYTES=3207   
      14/08/20 12:52:18 INFO mapred.JobClient: Reduce input records=22605   
      14/08/20 12:52:18 INFO mapred.JobClient: Reduce input groups=6749   
      14/08/20 12:52:18 INFO mapred.JobClient: Combine output records=0   
      14/08/20 12:52:18 INFO mapred.JobClient: Physical memory (bytes) snapshot=4799041536   
      14/08/20 12:52:18 INFO mapred.JobClient: Reduce output records=6749   
      14/08/20 12:52:18 INFO mapred.JobClient: Virtual memory (bytes) snapshot=19545337856   
      14/08/20 12:52:18 INFO mapred.JobClient: Map output records=22605 

    5、查看结果


    1. [root@jediael project]# hadoop fs -ls wcoutput3   
      Found 3 items   
      -rw-r--r-- 1 root supergroup 0 2014-08-20 12:52 /user/root/wcoutput3/_SUCCESS   
      drwxr-xr-x - root supergroup 0 2014-08-20 12:50 /user/root/wcoutput3/_logs   
      -rw-r--r-- 1 root supergroup 123950 2014-08-20 12:52 /user/root/wcoutput3/part-r-00000   
      [root@jediael project]# hadoop fs -cat wcoutput3/part-r-00000  
      !!      2  
      !ci.*.*.us      1  
      !co.*.*.us      1  
      !town.*.*.us    1  
      "AS     22  
      "Accept"        1  
      "Accept-Language"       1  
      "License");     22  
      "NOW"   1  
      "WiFi"  1  
      "Z"     1  
      "all"   1  
      "content"       1  
      "delete 1  
      "delimiter"     1  

    三、程序分析

    1、WordCountMap类继承了org.apache.hadoop.mapreduce.Mapper,4个泛型类型分别是map函数输入key的类型,输入value的类型,输出key的类型,输出value的类型。
     
    2、WordCountReduce类继承了org.apache.hadoop.mapreduce.Reducer,4个泛型类型含义与map类相同。
     
    3、map的输出类型必须与reduce的输入类型相同;而一般情况下,map的输出类型与reduce的输出类型也相同,因此,reduce的输入类型与输出类型通常相同(本例中均为Text与IntWritable)。
     
    4、hadoop根据以下代码确定输入内容的格式:
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat是hadoop默认的输入格式,它继承自FileInputFormat。在TextInputFormat中,它将数据集切割成小数据集InputSplit,每一个InputSplit由一个mapper处理。此外,InputFormat还提供了一个RecordReader的实现,将一个InputSplit解析成<key,value>的形式,并提供给map函数:
    key:该行起始位置在文件中的字节偏移量,数据类型是LongWritable。
    value:每行数据的内容,类型是Text。
    因此,在本例中,map函数的key/value类型是LongWritable与Text。
     
    5、Hadoop根据以下代码确定输出内容的格式:
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat是hadoop默认的输出格式,它会将每条记录以一行的形式存入文本文件,如
    the 30
    happy 23
    ……
  • 相关阅读:
    洛谷 1850 NOIP2016提高组 换教室
    2018牛客多校第三场 C.Shuffle Cards
    2018牛客多校第一场 B.Symmetric Matrix
    2018牛客多校第一场 A.Monotonic Matrix
    2018牛客多校第一场 D.Two Graphs
    2018宁夏邀请赛L Continuous Intervals
    2018宁夏邀请赛K Vertex Covers
    BZOJ
    HDU
    ACM International Collegiate Programming Contest, Egyptian Collegiate Programming Contest (ECPC 2015)
  • 原文地址:https://www.cnblogs.com/nucdy/p/5669516.html
Copyright © 2011-2022 走看看