  • Hadoop WordCount example

    1. Import the dependencies

     <dependencies>
        <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>4.11</version>
          <scope>test</scope>
        </dependency>
    
        <dependency>
          <groupId>org.apache.logging.log4j</groupId>
          <artifactId>log4j-core</artifactId>
          <version>2.9.1</version>
        </dependency>
    
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>2.7.2</version>
        </dependency>
    
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>2.7.2</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-hdfs</artifactId>
          <version>2.7.2</version>
        </dependency>
    
        <!-- tools.jar from the local JDK; adjust systemPath to match your own install -->
        <dependency>
          <groupId>jdk.tools</groupId>
          <artifactId>jdk.tools</artifactId>
          <version>1.8</version>
          <scope>system</scope>
          <systemPath>E:/jdk1.8.0_73/jdk1.8.0_73/lib/tools.jar</systemPath>
        </dependency>
      </dependencies>
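
      The three Hadoop artifacts pin the same version in three places. An optional tidy-up (a sketch; the original POM does not do this) is to pull the version into a Maven property and reference it as ${hadoop.version} in each dependency:

     <properties>
       <hadoop.version>2.7.2</hadoop.version>
     </properties>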

    2. Write WcMapper

    package com.wn.wordcount;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class WcMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
    
        private Text word=new Text();
        private IntWritable one=new IntWritable(1);
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //grab this line of input
            String line = value.toString();
            //split the line on spaces
            String[] words = line.split(" ");
            //iterate over the words, emitting (word, 1) for each
            for (String word:words){
                this.word.set(word);
                context.write(this.word,this.one);
            }
        }
    }
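
      Note that line.split(" ") produces empty tokens when the input contains consecutive spaces, and those empty strings would then be counted as words. A more robust map body (a sketch, not part of the original post) tokenizes on general whitespace instead:

    import java.util.StringTokenizer;

    //alternative body for map(): StringTokenizer skips runs of spaces, tabs and newlines
    StringTokenizer tokenizer = new StringTokenizer(value.toString());
    while (tokenizer.hasMoreTokens()) {
        this.word.set(tokenizer.nextToken());
        context.write(this.word, this.one);
    }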

    3. Write WcReducer

    package com.wn.wordcount;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class WcReducer extends Reducer<Text, IntWritable,Text,IntWritable> {
    
        private IntWritable total=new IntWritable();
    
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            //sum the counts for this key
            int sum=0;
            for (IntWritable value : values) {
                sum+=value.get();
            }
            //wrap the sum and emit it
            total.set(sum);
            context.write(key,total);
        }
    }

    4. Write WcDriver

    package com.wn.wordcount;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    public class WcDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            //get a Job instance
            Job job = Job.getInstance(new Configuration());
            //set the jar by locating the driver class
            job.setJarByClass(WcDriver.class);
            //set the mapper and reducer
            job.setMapperClass(WcMapper.class);
            job.setReducerClass(WcReducer.class);
            //set the mapper and reducer output types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            //set the input and output paths
            FileInputFormat.setInputPaths(job,new Path(args[0]));
            FileOutputFormat.setOutputPath(job,new Path(args[1]));
            //submit the job and wait for completion
            boolean b = job.waitForCompletion(true);
            System.exit(b?0:1);
        }
    }
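
      FileOutputFormat refuses to submit the job if the output directory already exists. When rerunning against the same path, either delete the directory first or add a guard like this before waitForCompletion (a sketch, not part of the original driver):

    import org.apache.hadoop.fs.FileSystem;

    //delete a pre-existing output directory so reruns don't fail
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path output = new Path(args[1]);
    if (fs.exists(output)) {
        fs.delete(output, true);    //true = recursive delete
    }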

    5. Package the project

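      The jar is built with the standard Maven lifecycle; this assumes Maven is installed and the command is run from the project root:

    mvn clean package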

      When packaging completes, the .jar file is generated under the module's target directory

    6. Upload and run

      6.1 Upload the .jar file to the Hadoop machine with the rz command

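        If rz (from the lrzsz package) is not available, scp does the same job; the user, host, and destination path below are placeholders for your own environment:

    scp target/hadoop-1.0-SNAPSHOT.jar user@hadoop-node:~/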

      6.2 Run the wordcount program

    hadoop jar hadoop-1.0-SNAPSHOT.jar com.wn.wordcount.WcDriver /input.txt /output
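
        Note that the job fails at submission if /output already exists on HDFS; when rerunning, remove the old directory first:

    hadoop fs -rm -r /output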

        When the job finishes, the corresponding output directory is created on HDFS

      6.3 View the results

    hadoop fs -cat /output/*
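
        A successful run leaves a _SUCCESS marker and one part-r-NNNNN file per reducer in the output directory, so a single reducer's output can also be read directly:

    hadoop fs -cat /output/part-r-00000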


    7. The WordCount data flow

      7.1 The input files are split into splits; because the test files are small, each file forms a single split. Each split is then divided into lines to form <key,value> pairs, where the key is the byte offset of the line and the value is the line of text (for example, a first line "hello world" becomes <0, "hello world">). This step is performed automatically by the MapReduce framework.

      7.2 The <key,value> pairs from the split are handed to the user-defined map method, which processes them and produces new <key,value> pairs (here, one <word,1> pair per word).

      7.3 After collecting the <key,value> pairs emitted by the map method, the mapper sorts them by key and, if a combiner is configured, runs a combine step that adds up the values sharing the same key, producing the mapper's final output (see the sketch below).
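
      The driver shown above does not register a combiner, so by default no combine step actually runs. A minimal way to enable it for WordCount (a sketch, not part of the original driver) is to reuse the reducer, which is safe here because summing counts is associative and commutative:

    //in WcDriver, before submitting the job
    job.setCombinerClass(WcReducer.class);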

      7.4 The reducer first sorts the data it receives from the mappers, then hands it to the user-defined reduce method, which produces the new <key,value> pairs that form wordcount's final output.
