  • Hadoop MapReduce: outputting the files in which each word appears (inverted index)

    Below are the four input files and their contents.
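
    The original post showed the files as a screenshot that is not reproduced here. Purely for illustration, assume four hypothetical files like these (names and contents are assumptions, not the originals):

        a.txt: hello world hello hadoop
        b.txt: hello mapreduce
        c.txt: world of hadoop
        d.txt: hadoop mapreduce world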

    Code implementation:

    Mapper:
    package cn.tedu.invert;
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    public class InvertMapper extends Mapper<LongWritable, Text, Text, Text> {
        
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Get the name of the file this input split comes from
            FileSplit fileSplit = (FileSplit)context.getInputSplit();
            String pathName = fileSplit.getPath().getName();
            
            // Split the line content into words
            String[] words = value.toString().split(" ");
            
            // Emit each word paired with the name of the file it appears in
            for(String word:words){
                context.write(new Text(word), new Text(pathName));
            }
        }
    }
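
    For every line of input, the mapper emits one (word, file name) pair per word. With the hypothetical a.txt above, the map phase would emit pairs such as:

        hello   a.txt
        world   a.txt
        hello   a.txt
        hadoop  a.txt

    MapReduce then groups these pairs by key, so the reducer receives each word together with all the file names it was seen in, duplicates included.
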
    Reducer:
    package cn.tedu.invert;
    
    import java.io.IOException;
    import java.util.HashSet;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class InvertReducer extends Reducer<Text, Text, Text, Text> {
    
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            
            // A HashSet stores no duplicate elements, so repeated file names are dropped
            HashSet<String> set = new HashSet<>();
            for (Text text : values) {
                set.add(text.toString());
            }
            
            // Join the distinct file names into one space-separated string
            StringBuilder sb = new StringBuilder();
            for (String str : set) {
                sb.append(str).append(" ");
            }
            
            context.write(key, new Text(sb.toString()));
        }
    }
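
    One design note: HashSet iteration order is unspecified, so the file names on each output line can come out in a different order across runs. If deterministic, sorted output is preferred, a TreeSet is a drop-in replacement (a minimal sketch, not part of the original code):

        import java.util.Set;
        import java.util.TreeSet;

        // TreeSet also removes duplicates, but iterates in sorted order,
        // so the file-name list for each word is stable across runs
        Set<String> set = new TreeSet<>();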

    Driver:

    package cn.tedu.invert;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class InvertDriver {
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "InvertIndex");
            job.setJarByClass(InvertDriver.class);
            job.setMapperClass(InvertMapper.class);
            job.setReducerClass(InvertReducer.class);
            
            // Both map and reduce emit Text keys and Text values,
            // so setting the final output classes is enough
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
    
            // HDFS input and output paths (host/port belong to the author's cluster)
            FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.74.129:9000/text/invert"));
            FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.74.129:9000/result/invert_result"));
    
            // Exit with a nonzero status if the job fails
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
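
    To run the job, package the classes into a jar and submit it with the hadoop command (the jar name "invert.jar" is an assumption; the input/output paths are taken from the driver):

        hadoop jar invert.jar cn.tedu.invert.InvertDriver

    Note that the output directory must not already exist, or the job will fail at startup.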

    Result:
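
    The original result screenshot is not reproduced here. With the hypothetical input files above, the output would look roughly like this (keys are sorted by the MapReduce shuffle; the order of file names after each word comes from the HashSet and may vary):

        hadoop      a.txt c.txt d.txt
        hello       a.txt b.txt
        mapreduce   b.txt d.txt
        of          c.txt
        world       a.txt c.txt d.txt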
