zoukankan      html  css  js  c++  java
  • Hadoop 倒排索引-分布式作业二

    import java.io.IOException;
    import java.util.Map;
    import java.util.StringTokenizer;
    import java.util.TreeMap;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * MapReduce job that builds an inverted index: for every whitespace-separated
     * token in the input files it produces one output line listing the files the
     * token occurs in and how often, e.g. {@code hello    a.txt:2 ,b.txt:1 ,}.
     *
     * <p>Data flow:
     * <ul>
     *   <li>map: {@code (offset, line) -> (token, "file:1")}</li>
     *   <li>combine: {@code (token, ["file:n", ...]) -> (token, "file:sum")} per file</li>
     *   <li>reduce: {@code (token, ["file:n", ...]) -> (token, "file:sum ,file:sum ,")}</li>
     * </ul>
     *
     * <p>The key is the bare token in every stage and the combiner's input and
     * output have the same shape, so the result does not depend on how many times
     * the framework chooses to run the combiner.
     */
    public class InvertedIndex {

    	/** Separates the file name from its occurrence count inside a posting value. */
    	private static final String COUNT_SEPARATOR = ":";

    	/** Text written after every posting on the final output line. */
    	private static final String POSTING_SUFFIX = " ,";

    	/** Input path used when no command-line argument is given. */
    	private static final String DEFAULT_INPUT_PATH = "input";

    	/** Output path used when fewer than two command-line arguments are given. */
    	private static final String DEFAULT_OUTPUT_PATH = "output";

    	/**
    	 * Sums the counts of all {@code "file:count"} postings, grouped by file name.
    	 *
    	 * @param postings posting values received for a single token
    	 * @return per-file totals, ordered by file name so the output is deterministic
    	 * @throws IllegalArgumentException if a posting has no separator
    	 * @throws NumberFormatException if the count part of a posting is not an integer
    	 */
    	private static Map<String, Integer> sumCountsByFile(Iterable<Text> postings) {
    		Map<String, Integer> totals = new TreeMap<String, Integer>();
    		for (Text posting : postings) {
    			String entry = posting.toString();
    			// Split on the LAST separator: the count never contains one, the file name might.
    			int separatorAt = entry.lastIndexOf(COUNT_SEPARATOR);
    			if (separatorAt < 0) {
    				throw new IllegalArgumentException("Malformed posting (expected file:count): " + entry);
    			}
    			String fileName = entry.substring(0, separatorAt);
    			int count = Integer.parseInt(entry.substring(separatorAt + COUNT_SEPARATOR.length()));
    			Integer previous = totals.get(fileName);
    			totals.put(fileName, previous == null ? count : previous + count);
    		}
    		return totals;
    	}

    	/**
    	 * Emits {@code (token, "file:1")} for every whitespace-separated token of an input line.
    	 */
    	public static class InversedIndexMapper extends Mapper<Object, Text, Text, Text> {

    		private final Text outKey = new Text();
    		private final Text outVal = new Text();

    		/**
    		 * @param key     input key; not used
    		 * @param value   one line of input text
    		 * @param context task context; supplies the input split and receives the output
    		 * @throws IOException          if writing to the context fails
    		 * @throws InterruptedException if the task is interrupted while writing
    		 */
    		@Override
    		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    			// The file name is the same for every token of this record, so build the value once.
    			String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
    			outVal.set(fileName + COUNT_SEPARATOR + 1);

    			StringTokenizer tokens = new StringTokenizer(value.toString());
    			while (tokens.hasMoreTokens()) {
    				// The whole token is the key, so tokens containing ':' are not truncated.
    				outKey.set(tokens.nextToken());
    				context.write(outKey, outVal);
    			}
    		}

    		/** Logs once when this map task has processed all of its records. */
    		@Override
    		protected void cleanup(Context context) throws IOException, InterruptedException {
    			System.out.println("Map phase finished ...");
    		}
    	}

    	/**
    	 * Pre-aggregates postings of one token: emits a single {@code (token, "file:sum")}
    	 * per file. Keys are passed through unchanged and the output format equals the
    	 * input format, so running this zero, one or several times yields the same result.
    	 */
    	public static class InversedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    		private final Text outVal = new Text();

    		/**
    		 * @param key     the token
    		 * @param values  {@code "file:count"} postings for the token
    		 * @param context receives one merged posting per file
    		 * @throws IOException          if writing to the context fails
    		 * @throws InterruptedException if the task is interrupted while writing
    		 */
    		@Override
    		public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    			for (Map.Entry<String, Integer> posting : sumCountsByFile(values).entrySet()) {
    				outVal.set(posting.getKey() + COUNT_SEPARATOR + posting.getValue());
    				context.write(key, outVal);
    			}
    		}

    		/** Logs once when a combiner run has processed all of its keys. */
    		@Override
    		protected void cleanup(Context context) throws IOException, InterruptedException {
    			System.out.println("Combine phase finished ...");
    		}

    	}

    	/**
    	 * Produces the final index line for a token: every file's total count,
    	 * each formatted as {@code file:count} followed by {@code " ,"}.
    	 */
    	public static class InversedIndexReducer extends Reducer<Text, Text, Text, Text> {

    		/**
    		 * @param key     the token
    		 * @param values  {@code "file:count"} postings for the token (combined or not)
    		 * @param context receives one {@code (token, postings)} record
    		 * @throws IOException          if writing to the context fails
    		 * @throws InterruptedException if the task is interrupted while writing
    		 */
    		@Override
    		public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    			StringBuilder line = new StringBuilder();
    			// Merge here as well: the combiner is an optimisation and may not have run.
    			for (Map.Entry<String, Integer> posting : sumCountsByFile(values).entrySet()) {
    				line.append(posting.getKey())
    						.append(COUNT_SEPARATOR)
    						.append(posting.getValue())
    						.append(POSTING_SUFFIX);
    			}
    			context.write(key, new Text(line.toString()));
    		}

    		/** Logs once when this reduce task has processed all of its keys. */
    		@Override
    		protected void cleanup(Context context) throws IOException, InterruptedException {
    			System.out.println("Reduce phase finished ...");
    		}
    	}

    	/**
    	 * Configures and runs the job, then exits with status 0 on success and 1 on failure.
    	 *
    	 * @param args optional: {@code args[0]} is the input path (default {@code "input"}),
    	 *             {@code args[1]} is the output path (default {@code "output"})
    	 * @throws IOException            if the job cannot be set up or submitted
    	 * @throws InterruptedException   if waiting for the job is interrupted
    	 * @throws ClassNotFoundException if a configured job class cannot be loaded
    	 */
    	public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
    		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

    		Configuration conf = new Configuration();
    		Job job = Job.getInstance(conf, "index inverted");

    		job.setJarByClass(InvertedIndex.class);
    		job.setMapperClass(InversedIndexMapper.class);
    		job.setCombinerClass(InversedIndexCombiner.class);
    		job.setReducerClass(InversedIndexReducer.class);
    		job.setMapOutputKeyClass(Text.class);
    		job.setMapOutputValueClass(Text.class);
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(Text.class);

    		job.setNumReduceTasks(1);

    		FileInputFormat.addInputPath(job, new Path(inputPath));
    		FileOutputFormat.setOutputPath(job, new Path(outputPath));

    		boolean succeeded = job.waitForCompletion(true);
    		if (succeeded) {
    			System.out.println("All job finished ...");
    		}
    		// Report failure to the calling shell instead of always exiting with 0.
    		System.exit(succeeded ? 0 : 1);
    	}

    }
    

      

  • 相关阅读:
    HttpRunner3.X
    基于C++的ByteBuf封装
    关于matlab的配色
    关于样本方差的无偏估计
    使用Python求解Nonogram
    菜鸡的一些力扣记录
    LeetCode链表练习
    C语言中的链表
    Python中的链表简介
    Nebula Graph 源码解读系列 | Vol.03 Planner 的实现
  • 原文地址:https://www.cnblogs.com/shenbingyu/p/4940676.html
Copyright © 2011-2022 走看看