The general idea is to use a text file on HDFS as the input. MapReduce splits the text via an InputFormat; for each line, the byte offset of the line's first character relative to the start of the file becomes the key of the input key-value pair, and the line's content becomes the value. The map function processes each pair and emits intermediate results of the form <word, 1>, and the reduce function then totals the occurrences of each word. The program consists of two main parts: the Mapper and the Reducer.
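As a quick illustration with made-up input: if the file contains the two lines "hello world" and "hello hadoop", the Mapper receives <0, hello world> and <12, hello hadoop> (12 being the byte offset where the second line starts), the map phase emits <hello,1>, <world,1>, <hello,1>, <hadoop,1>, and after shuffling the reduce phase outputs <hadoop,1>, <hello,2>, <world,1>.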
Mapper code
public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
    // The first type (Object) is the input key type; the second (Text) is the input value type;
    // the third (Text) is the output key type; the fourth (IntWritable) is the output value type.
    public static final IntWritable one = new IntWritable(1);
    public static Text word = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {    // declare the exceptions that may be thrown
        // StringTokenizer is a class in the Java utility package used to split a string into tokens
        StringTokenizer tokenizer = new StringTokenizer(value.toString(), " ");
        while (tokenizer.hasMoreTokens()) {    // loop over every word split out of the current line
            word.set(tokenizer.nextToken());   // returns the string between the current position and the next delimiter
            context.write(word, one);          // write <word, 1> to the context, counting one occurrence
        }
    }
}
The map function takes three parameters. The first two, Object key and Text value, are the input key and value; the third, Context context, is used to write out key-value pairs, as in context.write(word, one), and it also tracks the state of the map task. The map phase uses Hadoop's default job input format, splits the input value into words with StringTokenizer, sets each word as the output key with a value of 1, and then writes out each <key, value> pair directly.
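One caveat: StringTokenizer with an explicit " " delimiter splits only on single spaces, not on tabs or runs of whitespace. A minimal sketch of an alternative map body that splits on any whitespace (this regex-based variant is an assumption for illustration, not part of the original program):

// Hypothetical variant of the map body: split on any run of whitespace instead of a single space.
String[] tokens = value.toString().split("\\s+");
for (String token : tokens) {
    if (!token.isEmpty()) {        // split can produce a leading empty string
        word.set(token);
        context.write(word, one);  // emit <word, 1> exactly as in the original Mapper
    }
}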
Reducer code
public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // The type parameters have the same meaning as in the Mapper: input key type, input value type,
    // output key type, output value type.
    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {   // loop over the values and accumulate them
            sum += value.get();
        }
        result.set(sum);
        System.out.println(sum);
        context.write(key, result);
    }
}
The <key, value> pairs emitted by map first pass through the shuffle phase, which gathers all values sharing the same key into <key, values> pairs and hands them to the reduce side. When reduce receives a <key, values> pair, it copies the input key directly to the output key, iterates over values with a for loop and sums them; the sum is the total number of occurrences of the word represented by that key. It then sets this total as the value and writes out <key, value> directly.
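Because this reduce logic is just an associative, commutative sum, the same class could optionally also be registered as a combiner so that partial sums are computed on the map side before the shuffle. This is an optional addition, not part of the original job setup:

// Optional: reuse the Reducer as a combiner to pre-aggregate <word, 1> pairs on the map side.
job.setCombinerClass(doReducer.class);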
Full code:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setJobName("WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(doMapper.class);
        job.setReducerClass(doReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        Path in = new Path("hdfs://192.168.68.130:9000/user/hadoop/wordcount.txt"); // location of the text file to be counted
        Path out = new Path("hdfs://192.168.68.130:9000/user/hadoop/output3");      // note: output3 must not already exist
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
        public static final IntWritable one = new IntWritable(1);
        public static Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString(), " ");
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            System.out.println(sum);
            context.write(key, result);
        }
    }
}
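As a possible refinement (not in the original code), the hard-coded HDFS paths could be read from the command line instead, so the same jar works against any input file and output directory. A minimal sketch of the relevant lines in main, assuming args[0] is the input path and args[1] is the output path:

// Hypothetical variant: take the input and output paths from the program arguments
// instead of hard-coding them (args[0] = input file, args[1] = output directory).
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);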