package com.sogou.web.selector.updana.wapPc; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import com.sogou.web.selector.wapcoverage.GBKOutputFormat; public class URLTitle extends Configured implements Tool { private static class KeyPartitioner extends Partitioner<TextPair, Text>{ @Override public int getPartition(TextPair key, Text value, int numPartitions) { // TODO Auto-generated method stub return (key.getFirst().hashCode()&Integer.MAX_VALUE)%numPartitions; } } private static class GroupPartitioner extends WritableComparator{ protected GroupPartitioner() { super(TextPair.class,true); } @Override public int compare(WritableComparable a, WritableComparable b) { // TODO Auto-generated method stub TextPair t1=(TextPair)a; TextPair t2=(TextPair)b; return t1.getFirst().compareTo(t2.getFirst()); } } public int run(String[] args) throws Exception { // TODO Auto-generated method stub Job job = new Job(this.getConf(), "URL_Title_Analysis"); //设置运行job job.setJarByClass(this.getClass()); //设置Map相关内容 job.setMapperClass(WapPCMapper.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); //设子reduce job.setReducerClass(WapPcReducer.class); job.setOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); //设置输出入格式文件 job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(GBKOutputFormat.class); //设置分区和分组 job.setPartitionerClass(KeyPartitioner.class); job.setGroupingComparatorClass(GroupPartitioner.class); 
System.exit(job.waitForCompletion(true) ? 0 : 1); return 0; } public static void main(String[] args) throws Exception { Tool UrlTitle = new URLTitle(); ToolRunner.run(UrlTitle, args); } }
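// Extracts the titles that are identical between the URL/title records of two input files, A and B, and outputs the URLs associated with the titles of interest.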