zoukankan      html  css  js  c++  java
  • hadoop 不同URLTitle文件提取关联URL

    package com.sogou.web.selector.updana.wapPc;
    
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    import com.sogou.web.selector.wapcoverage.GBKOutputFormat;
    
    /**
     * Driver for the "URL_Title_Analysis" MapReduce job.
     *
     * <p>The job reads text input, runs {@code WapPCMapper} (emitting
     * {@code TextPair} keys with {@code Text} values) and {@code WapPcReducer}
     * (emitting {@code Text}/{@code Text}), and writes through
     * {@code GBKOutputFormat}. Records are partitioned and grouped by the first
     * component of the {@code TextPair} key only, so every record sharing that
     * first component reaches the same {@code reduce()} call.
     *
     * <p>Usage: {@code URLTitle [generic options] [<input>... <output>]}. When
     * fewer than two positional arguments are given, the input/output paths are
     * left exactly as found in the configuration (for example supplied through
     * {@code -D} generic options).
     */
    public class URLTitle extends Configured implements Tool {

        /**
         * Routes map output to a reducer based solely on the first component of
         * the composite key.
         */
        private static class KeyPartitioner extends Partitioner<TextPair, Text> {

            /**
             * @param key composite map output key; only {@code getFirst()} is used
             * @param value map output value (ignored)
             * @param numPartitions number of reduce partitions
             * @return partition index in {@code [0, numPartitions)}
             */
            @Override
            public int getPartition(TextPair key, Text value, int numPartitions) {
                // Clear the sign bit so the modulo result can never be negative.
                return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
            }
        }

        /**
         * Grouping comparator: two keys belong to the same reduce group when
         * their first components compare equal.
         */
        private static class GroupPartitioner extends WritableComparator {

            /** Registers {@code TextPair} as the key class and asks for key instances to be created. */
            protected GroupPartitioner() {
                super(TextPair.class, true);
            }

            /**
             * Compares two {@code TextPair} keys by their first component only.
             *
             * @return the result of {@code first.compareTo(otherFirst)}
             */
            @Override
            @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
            public int compare(WritableComparable a, WritableComparable b) {
                TextPair left = (TextPair) a;
                TextPair right = (TextPair) b;
                return left.getFirst().compareTo(right.getFirst());
            }
        }

        /**
         * Configures and runs the job, blocking until it finishes.
         *
         * @param args positional arguments left after generic-option parsing; if
         *     at least two are present, all but the last are added as input
         *     paths and the last is used as the output path
         * @return {@code 0} if the job succeeded, {@code 1} otherwise
         * @throws Exception if job setup or execution fails
         */
        @Override
        public int run(String[] args) throws Exception {
            Job job = new Job(this.getConf(), "URL_Title_Analysis");
            // Locate the job jar from this class.
            job.setJarByClass(this.getClass());

            // Map side.
            job.setMapperClass(WapPCMapper.class);
            job.setMapOutputKeyClass(TextPair.class);
            job.setMapOutputValueClass(Text.class);

            // Reduce side.
            job.setReducerClass(WapPcReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // Input / output formats.
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(GBKOutputFormat.class);

            // Partition and group on the first key component only.
            job.setPartitionerClass(KeyPartitioner.class);
            job.setGroupingComparatorClass(GroupPartitioner.class);

            // Positional paths are optional so that invocations which configure
            // the paths purely through the Configuration keep working unchanged.
            if (args != null && args.length >= 2) {
                int outputIndex = args.length - 1;
                for (int i = 0; i < outputIndex; i++) {
                    FileInputFormat.addInputPath(job, new Path(args[i]));
                }
                FileOutputFormat.setOutputPath(job, new Path(args[outputIndex]));
            }

            // Return the status instead of calling System.exit() here, so the
            // tool stays usable when embedded in or chained from other code.
            return job.waitForCompletion(true) ? 0 : 1;
        }

        /**
         * Command-line entry point; the process exit code is the value returned
         * by {@link #run(String[])}.
         */
        public static void main(String[] args) throws Exception {
            Tool urlTitle = new URLTitle();
            System.exit(ToolRunner.run(urlTitle, args));
        }
    }

    该程序读取 A、B 两个 URL/Title 文件,找出两者中 Title 相同的记录,并输出这些 Title 所关联的 URL。

  • 相关阅读:
    Segmentation fault (core dumped)
    Missing separate debuginfos, use: debuginfo-install
    Qt学习资源
    Qt学习过程中遇到的问题
    深入浅出MFC--第一章
    MVC – 3.EF(Entity Framework)
    MVC基础知识 – 2.新语法
    js获取url参数值(HTML之间传值)
    解决IIS7、IIS7.5中时间格式显示的问题
    web.config详解 -- asp.net夜话之十一
  • 原文地址:https://www.cnblogs.com/csxf/p/3768503.html
Copyright © 2011-2022 走看看