zoukankan      html  css  js  c++  java
  • hadoop-job(MapReduce计算单词出现的个数)

    1.============map===============

    package com.it18zhang.hadoop.mr;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import java.io.IOException;

    /**
    * Mapper
    */
    public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
    * key : 行首偏移量,字节数,意义不大。
    * value : 一行文本
    */
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    //
    String line = value.toString() ;
    String[] arr = line.split(" ");

    Text keyOut = new Text() ;
    IntWritable valueOut = new IntWritable(1) ;
    for(String word : arr){
    keyOut.set(word);
    context.write(keyOut,valueOut);
    }
    }
    }

    2.============reducer===============

    package com.it18zhang.hadoop.mr;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;

    /**
    * reducer
    */
    public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
    * key : word
    * values : 该key下聚合的value
    */
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    int count = 0 ;
    for(IntWritable iw : values){
    count = count + iw.get() ;
    }
    context.write(key , new IntWritable(count));
    }
    }

    3.============统计===============

    package com.it18zhang.hadoop.mr;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;
    /**
     * Driver for the word-count job: configures the job with
     * {@link WordCountMapper} and {@link WordCountReducer}, runs it, and
     * reports failure through the process exit status.
     */
    public class App {

        /**
         * Runs the word-count job.
         *
         * @param args {@code args[0]} is the input path, {@code args[1]} is the
         *             output path; any existing output path is deleted
         *             recursively before the job starts
         * @throws IllegalArgumentException if fewer than two arguments are given
         * @throws Exception                if job setup or execution fails
         */
        public static void main(String[] args) throws Exception {
            if (args == null || args.length < 2) {
                throw new IllegalArgumentException("参数不足,需要2个参数");
            }
            Path inputPath = new Path(args[0]);
            Path outputPath = new Path(args[1]);

            Configuration conf = new Configuration();

            // Recursively delete the output directory. The filesystem is
            // resolved from the output path itself so that a path whose scheme
            // differs from the configured default filesystem is handled by the
            // matching FileSystem implementation.
            FileSystem fs = outputPath.getFileSystem(conf);
            fs.delete(outputPath, true);

            // Create the job; the job name is arbitrary.
            Job job = Job.getInstance(conf);
            job.setJobName("word_count_add");
            // Locate the job jar via the class it contains.
            job.setJarByClass(App.class);

            // Input files to process and the directory for the results.
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, outputPath);

            // Mapper and reducer classes.
            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WordCountReducer.class);

            // Output key/value types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // Number of reduce tasks.
            job.setNumReduceTasks(1);

            // Run the job and wait; surface a failed job as a non-zero exit
            // status instead of silently returning.
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        }
    }

  • 相关阅读:
    java类型与Hadoop类型之间的转换
    Elasticsearch之四种查询类型和搜索原理(博主推荐)
    Eclipse/MyEclipse如何快速提取变量(最强帮手)
    8592 KMP算法
    SDUT 1304-取数字问题(DFS)
    堆排序(小根堆)
    Android显示GIF图片
    HDU 1007 近期点对
    java 显示目录下全部文件
    UVa 11292
  • 原文地址:https://www.cnblogs.com/nyfz/p/9041992.html
Copyright © 2011-2022 走看看