zoukankan      html  css  js  c++  java
  • hadoop-job(MapReduce计算单词出现的个数)

    1.============map===============

    package com.it18zhang.hadoop.mr;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import java.io.IOException;

    /**
    * Mapper
    */
    public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
    * key : 行首偏移量,字节数,意义不大。
    * value : 一行文本
    */
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    //
    String line = value.toString() ;
    String[] arr = line.split(" ");

    Text keyOut = new Text() ;
    IntWritable valueOut = new IntWritable(1) ;
    for(String word : arr){
    keyOut.set(word);
    context.write(keyOut,valueOut);
    }
    }
    }

    2.============reducer===============

    package com.it18zhang.hadoop.mr;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;

    /**
    * reducer
    */
    public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
    * key : word
    * values : 该key下聚合的value
    */
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    int count = 0 ;
    for(IntWritable iw : values){
    count = count + iw.get() ;
    }
    context.write(key , new IntWritable(count));
    }
    }

    3.============统计===============

    package com.it18zhang.hadoop.mr;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;
    /**
     * Driver for the word-count job: wires {@link WordCountMapper} and
     * {@link WordCountReducer} into a single-reducer job and runs it.
     *
     * <p>Usage: {@code App <input path> <output path>}. Any existing output
     * path is deleted recursively before the job is submitted.
     */
    public class App {
        /**
         * Configures, submits and waits for the word-count job, then exits the
         * JVM with status 0 if the job succeeded and 1 otherwise.
         *
         * @param args {@code args[0]} is the input path, {@code args[1]} the output path
         * @throws IllegalArgumentException if fewer than two arguments are supplied
         * @throws Exception                if job setup or execution fails
         */
        public static void main(String[] args) throws Exception {
            if (args == null || args.length < 2) {
                throw new IllegalArgumentException("参数不足,需要2个参数");
            }
            Configuration conf = new Configuration();
            Path inputPath = new Path(args[0]);
            Path outputPath = new Path(args[1]);

            // Resolve the file system from the output path itself rather than the
            // configured default, so that an explicit scheme (e.g. file:///) works.
            FileSystem fs = outputPath.getFileSystem(conf);
            // Recursively delete the output directory; the job refuses to start
            // if the output path already exists.
            fs.delete(outputPath, true);

            // Create the job; the name is arbitrary.
            Job job = Job.getInstance(conf);
            job.setJobName("word_count_add");
            // Locate the job jar via the class it contains.
            job.setJarByClass(App.class);

            // Input to process and destination for the results.
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, outputPath);

            // Mapper and reducer implementations.
            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WordCountReducer.class);

            // Output key/value types (shared by mapper and reducer here).
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Number of reduce tasks.
            job.setNumReduceTasks(1);

            // Run the job and report its outcome through the process exit code
            // so that calling scripts can detect a failed run.
            boolean succeeded = job.waitForCompletion(true);
            System.exit(succeeded ? 0 : 1);
        }
    }

  • 相关阅读:
    webpack基础
    LeetCode232. 用栈实现队列做题笔记
    mysql 时间加减一个月
    leetcode 1381. 设计一个支持增量操作的栈 思路与算法
    LeetCode 141. 环形链表 做题笔记
    leetcode 707. 设计链表 做题笔记
    leetcode 876. 链表的中间结点 做题笔记
    leetcode 143. 重排链表 做题笔记
    leetcode 1365. 有多少小于当前数字的数字 做题笔记
    LeetCode1360. 日期之间隔几天 做题笔记
  • 原文地址:https://www.cnblogs.com/nyfz/p/9041992.html
Copyright © 2011-2022 走看看