zoukankan      html  css  js  c++  java
  • Hadoop入门经典:WordCount


    以下程序在hadoop1.2.1上测试成功。

    本例先将源代码呈现,然后详细说明执行步骤,最后对源代码及执行过程进行分析。

    一、源代码

    package org.jediael.hadoopdemo.wordcount;
    
    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    public class WordCount {
    
    	public static class WordCountMap extends
    			Mapper<LongWritable, Text, Text, IntWritable> {
    
    		private final IntWritable one = new IntWritable(1);
    		private Text word = new Text();
    
    		public void map(LongWritable key, Text value, Context context)
    				throws IOException, InterruptedException {
    			String line = value.toString();
    			StringTokenizer token = new StringTokenizer(line);
    			while (token.hasMoreTokens()) {
    				word.set(token.nextToken());
    				context.write(word, one);
    			}
    		}
    	}
    
    	public static class WordCountReduce extends
    			Reducer<Text, IntWritable, Text, IntWritable> {
    
    		public void reduce(Text key, Iterable<IntWritable> values,
    				Context context) throws IOException, InterruptedException {
    			int sum = 0;
    			for (IntWritable val : values) {
    				sum += val.get();
    			}
    			context.write(key, new IntWritable(sum));
    		}
    	}
    
    	public static void main(String[] args) throws Exception {
    		Configuration conf = new Configuration();
    		Job job = new Job(conf);
    		job.setJarByClass(WordCount.class);
    		job.setJobName("wordcount");
    
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);
    
    		job.setMapperClass(WordCountMap.class);
    		job.setReducerClass(WordCountReduce.class);
    
    		job.setInputFormatClass(TextInputFormat.class);
    		job.setOutputFormatClass(TextOutputFormat.class);
    
    		FileInputFormat.addInputPath(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
    		job.waitForCompletion(true);
    	}
    }
    

    二、执行程序

    1、从eclipse中导出至wordcount.jar,并上传至hadoop服务器,本例中,将程序上传至/home/jediael/project。

    2、安装hadoop伪分布模式,可参考Hadoop1.2.1伪分布模式安装指南,本实例将运行在hadoop的伪分布环境中。

    3、在HDFS中创建目录wcinput,用作输入目录,并将需要分析的文件复制到目录下。

    [root@jediael conf]# hadoop fs -mkdir wcinput
    [root@jediael conf]# hadoop fs -copyFromLocal * wcinput 
    [root@jediael conf]# hadoop fs -ls wcinput 
    Found 26 items 
    -rw-r--r-- 1 root supergroup 1524 2014-08-20 12:29 /user/root/wcinput/automaton-urlfilter.txt 
    -rw-r--r-- 1 root supergroup 1311 2014-08-20 12:29 /user/root/wcinput/configuration.xsl 
    -rw-r--r-- 1 root supergroup 131090 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xml 
    -rw-r--r-- 1 root supergroup 4649 2014-08-20 12:29 /user/root/wcinput/domain-suffixes.xsd 
    -rw-r--r-- 1 root supergroup 824 2014-08-20 12:29 /user/root/wcinput/domain-urlfilter.txt 
    -rw-r--r-- 1 root supergroup 3368 2014-08-20 12:29 /user/root/wcinput/gora-accumulo-mapping.xml 
    -rw-r--r-- 1 root supergroup 3279 2014-08-20 12:29 /user/root/wcinput/gora-cassandra-mapping.xml 
    -rw-r--r-- 1 root supergroup 3447 2014-08-20 12:29 /user/root/wcinput/gora-hbase-mapping.xml 
    -rw-r--r-- 1 root supergroup 2677 2014-08-20 12:29 /user/root/wcinput/gora-sql-mapping.xml 
    -rw-r--r-- 1 root supergroup 2993 2014-08-20 12:29 /user/root/wcinput/gora.properties 
    -rw-r--r-- 1 root supergroup 983 2014-08-20 12:29 /user/root/wcinput/hbase-site.xml 
    -rw-r--r-- 1 root supergroup 3096 2014-08-20 12:29 /user/root/wcinput/httpclient-auth.xml 
    -rw-r--r-- 1 root supergroup 3948 2014-08-20 12:29 /user/root/wcinput/log4j.properties 
    -rw-r--r-- 1 root supergroup 511 2014-08-20 12:29 /user/root/wcinput/nutch-conf.xsl 
    -rw-r--r-- 1 root supergroup 42610 2014-08-20 12:29 /user/root/wcinput/nutch-default.xml 
    -rw-r--r-- 1 root supergroup 753 2014-08-20 12:29 /user/root/wcinput/nutch-site.xml 
    -rw-r--r-- 1 root supergroup 347 2014-08-20 12:29 /user/root/wcinput/parse-plugins.dtd 
    -rw-r--r-- 1 root supergroup 3016 2014-08-20 12:29 /user/root/wcinput/parse-plugins.xml 
    -rw-r--r-- 1 root supergroup 857 2014-08-20 12:29 /user/root/wcinput/prefix-urlfilter.txt 
    -rw-r--r-- 1 root supergroup 2484 2014-08-20 12:29 /user/root/wcinput/regex-normalize.xml 
    -rw-r--r-- 1 root supergroup 1736 2014-08-20 12:29 /user/root/wcinput/regex-urlfilter.txt 
    -rw-r--r-- 1 root supergroup 18969 2014-08-20 12:29 /user/root/wcinput/schema-solr4.xml 
    -rw-r--r-- 1 root supergroup 6020 2014-08-20 12:29 /user/root/wcinput/schema.xml 
    -rw-r--r-- 1 root supergroup 1766 2014-08-20 12:29 /user/root/wcinput/solrindex-mapping.xml 
    -rw-r--r-- 1 root supergroup 1044 2014-08-20 12:29 /user/root/wcinput/subcollections.xml 
    -rw-r--r-- 1 root supergroup 1411 2014-08-20 12:29 /user/root/wcinput/suffix-urlfilter.txt
    4、运行程序

    [root@jediael project]# hadoop org.jediael.hadoopdemo.wordcount.WordCount wcinput wcoutput3 
    14/08/20 12:50:25 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same. 
    14/08/20 12:50:26 INFO input.FileInputFormat: Total input paths to process : 26 
    14/08/20 12:50:26 INFO util.NativeCodeLoader: Loaded the native-hadoop library 
    14/08/20 12:50:26 WARN snappy.LoadSnappy: Snappy native library not loaded 
    14/08/20 12:50:26 INFO mapred.JobClient: Running job: job_201408191134_0005 
    14/08/20 12:50:27 INFO mapred.JobClient: map 0% reduce 0% 
    14/08/20 12:50:38 INFO mapred.JobClient: map 3% reduce 0% 
    14/08/20 12:50:39 INFO mapred.JobClient: map 7% reduce 0% 
    14/08/20 12:50:50 INFO mapred.JobClient: map 15% reduce 0% 
    14/08/20 12:50:57 INFO mapred.JobClient: map 19% reduce 0% 
    14/08/20 12:50:58 INFO mapred.JobClient: map 23% reduce 0% 
    14/08/20 12:51:00 INFO mapred.JobClient: map 23% reduce 5% 
    14/08/20 12:51:04 INFO mapred.JobClient: map 30% reduce 5% 
    14/08/20 12:51:06 INFO mapred.JobClient: map 30% reduce 10% 
    14/08/20 12:51:11 INFO mapred.JobClient: map 38% reduce 10% 
    14/08/20 12:51:16 INFO mapred.JobClient: map 38% reduce 11% 
    14/08/20 12:51:18 INFO mapred.JobClient: map 46% reduce 11% 
    14/08/20 12:51:19 INFO mapred.JobClient: map 46% reduce 12% 
    14/08/20 12:51:22 INFO mapred.JobClient: map 46% reduce 15% 
    14/08/20 12:51:25 INFO mapred.JobClient: map 53% reduce 15% 
    14/08/20 12:51:31 INFO mapred.JobClient: map 53% reduce 17% 
    14/08/20 12:51:32 INFO mapred.JobClient: map 61% reduce 17% 
    14/08/20 12:51:39 INFO mapred.JobClient: map 69% reduce 17% 
    14/08/20 12:51:40 INFO mapred.JobClient: map 69% reduce 20% 
    14/08/20 12:51:45 INFO mapred.JobClient: map 73% reduce 20% 
    14/08/20 12:51:46 INFO mapred.JobClient: map 76% reduce 23% 
    14/08/20 12:51:52 INFO mapred.JobClient: map 80% reduce 23% 
    14/08/20 12:51:53 INFO mapred.JobClient: map 84% reduce 23% 
    14/08/20 12:51:55 INFO mapred.JobClient: map 84% reduce 25% 
    14/08/20 12:51:59 INFO mapred.JobClient: map 88% reduce 25% 
    14/08/20 12:52:00 INFO mapred.JobClient: map 92% reduce 25% 
    14/08/20 12:52:02 INFO mapred.JobClient: map 92% reduce 29% 
    14/08/20 12:52:06 INFO mapred.JobClient: map 96% reduce 29% 
    14/08/20 12:52:07 INFO mapred.JobClient: map 100% reduce 29% 
    14/08/20 12:52:11 INFO mapred.JobClient: map 100% reduce 30% 
    14/08/20 12:52:15 INFO mapred.JobClient: map 100% reduce 100% 
    14/08/20 12:52:17 INFO mapred.JobClient: Job complete: job_201408191134_0005 
    14/08/20 12:52:18 INFO mapred.JobClient: Counters: 29 
    14/08/20 12:52:18 INFO mapred.JobClient: Job Counters 
    14/08/20 12:52:18 INFO mapred.JobClient: Launched reduce tasks=1 
    14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=192038 
    14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 
    14/08/20 12:52:18 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 
    14/08/20 12:52:18 INFO mapred.JobClient: Launched map tasks=26 
    14/08/20 12:52:18 INFO mapred.JobClient: Data-local map tasks=26 
    14/08/20 12:52:18 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=95814 
    14/08/20 12:52:18 INFO mapred.JobClient: File Output Format Counters 
    14/08/20 12:52:18 INFO mapred.JobClient: Bytes Written=123950 
    14/08/20 12:52:18 INFO mapred.JobClient: FileSystemCounters 
    14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_READ=352500 
    14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_READ=247920 
    14/08/20 12:52:18 INFO mapred.JobClient: FILE_BYTES_WRITTEN=2177502 
    14/08/20 12:52:18 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=123950 
    14/08/20 12:52:18 INFO mapred.JobClient: File Input Format Counters 
    14/08/20 12:52:18 INFO mapred.JobClient: Bytes Read=244713 
    14/08/20 12:52:18 INFO mapred.JobClient: Map-Reduce Framework 
    14/08/20 12:52:18 INFO mapred.JobClient: Map output materialized bytes=352650 
    14/08/20 12:52:18 INFO mapred.JobClient: Map input records=7403 
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce shuffle bytes=352650 
    14/08/20 12:52:18 INFO mapred.JobClient: Spilled Records=45210 
    14/08/20 12:52:18 INFO mapred.JobClient: Map output bytes=307281 
    14/08/20 12:52:18 INFO mapred.JobClient: Total committed heap usage (bytes)=3398606848 
    14/08/20 12:52:18 INFO mapred.JobClient: CPU time spent (ms)=14400 
    14/08/20 12:52:18 INFO mapred.JobClient: Combine input records=0 
    14/08/20 12:52:18 INFO mapred.JobClient: SPLIT_RAW_BYTES=3207 
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce input records=22605 
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce input groups=6749 
    14/08/20 12:52:18 INFO mapred.JobClient: Combine output records=0 
    14/08/20 12:52:18 INFO mapred.JobClient: Physical memory (bytes) snapshot=4799041536 
    14/08/20 12:52:18 INFO mapred.JobClient: Reduce output records=6749 
    14/08/20 12:52:18 INFO mapred.JobClient: Virtual memory (bytes) snapshot=19545337856 
    14/08/20 12:52:18 INFO mapred.JobClient: Map output records=22605
    5、查看结果

    [root@jediael project]# hadoop fs -ls wcoutput3 
    Found 3 items 
    -rw-r--r-- 1 root supergroup 0 2014-08-20 12:52 /user/root/wcoutput3/_SUCCESS 
    drwxr-xr-x - root supergroup 0 2014-08-20 12:50 /user/root/wcoutput3/_logs 
    -rw-r--r-- 1 root supergroup 123950 2014-08-20 12:52 /user/root/wcoutput3/part-r-00000 
    [root@jediael project]# hadoop fs -cat wcoutput3/part-r-00000
    !!      2
    !ci.*.*.us      1
    !co.*.*.us      1
    !town.*.*.us    1
    "AS     22
    "Accept"        1
    "Accept-Language"       1
    "License");     22
    "NOW"   1
    "WiFi"  1
    "Z"     1
    "all"   1
    "content"       1
    "delete 1
    "delimiter"     1
    ………………

    三、程序分析

    1、WordCountMap类继承了org.apache.hadoop.mapreduce.Mapper,4个泛型类型分别是map函数输入key的类型,输入value的类型,输出key的类型,输出value的类型。

    2、WordCountReduce类继承了org.apache.hadoop.mapreduce.Reducer,4个泛型类型含义与map类相同。

    3、map的输出类型与reduce的输入类型相同,而一般情况下,map的输出类型与reduce的输出类型相同,因此,reduce的输入类型与输出类型相同。

    4、hadoop根据以下代码确定输入内容的格式:
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat是hadoop默认的输入格式,它继承自FileInputFormat。在TextInputFormat中,它将数据集切割成小数据集InputSplit,每一个InputSplit由一个mapper处理。此外,InputFormat还提供了一个RecordReader的实现,将一个InputSplit解析成<key,value>的形式,并提供给map函数:
    key:该行数据的起始位置在文件中的字节偏移量,数据类型是LongWritable。
    value:每行数据的内容,类型是Text。
    因此,在本例中,map函数的key/value类型是LongWritable与Text。

    5、Hadoop根据以下代码确定输出内容的格式:
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat是hadoop默认的输出格式,它会将每条记录以一行的形式存入文本文件,如
    the 30
    happy 23
    ……











  • 相关阅读:
    课时15.DTD文档声明下(了解)
    Python-01 学习第一节
    常用数据库备份还原命令
    Oracle排除记录集
    存储过程分页语句
    TFS统计编码行数语句
    数据库所有表替换所有列的特定字符串
    MSSQL查询所有数据库表,指定数据库的字段、索引
    统计整个库所有表的记录数
    执行oracle函数的四种方法
  • 原文地址:https://www.cnblogs.com/jediael/p/4304065.html
Copyright © 2011-2022 走看看