zoukankan      html  css  js  c++  java
  • hadoop 不同URLTitle文件提取关联URL

    package com.sogou.web.selector.updana.wapPc;
    
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    import com.sogou.web.selector.wapcoverage.GBKOutputFormat;
    
    /**
     * Driver for the "URL_Title_Analysis" MapReduce job.
     *
     * <p>Wires {@code WapPCMapper} and {@code WapPcReducer} together with a
     * composite {@code TextPair} map-output key. Records are partitioned and
     * grouped on the first component of that key only, so every record sharing
     * the same first component reaches a single {@code reduce()} call.
     *
     * <p>Usage: {@code URLTitle [generic options] <input path>... <output path>}.
     * When no positional arguments are given, the input/output paths are left
     * to whatever the job configuration already provides.
     */
    public class URLTitle extends Configured implements Tool {

        /** Status returned by {@link #run(String[])} when the arguments are malformed. */
        private static final int EXIT_USAGE = 2;

        /**
         * Routes a record to a reducer based solely on the first component of
         * its {@code TextPair} key.
         */
        private static class KeyPartitioner extends Partitioner<TextPair, Text> {

            /**
             * @param key           composite map-output key; only {@code getFirst()} is consulted
             * @param value         map-output value (not used for partitioning)
             * @param numPartitions number of reduce partitions
             * @return partition index in {@code [0, numPartitions)}
             */
            @Override
            public int getPartition(TextPair key, Text value, int numPartitions) {
                // Clear the sign bit so a negative hash code cannot produce a negative index.
                return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
            }
        }

        /**
         * Grouping comparator (despite the name, this is not a partitioner):
         * two keys belong to the same reduce group when their first components
         * compare equal.
         */
        private static class GroupPartitioner extends WritableComparator {

            protected GroupPartitioner() {
                // 'true' asks WritableComparator to create key instances for deserialization.
                super(TextPair.class, true);
            }

            /**
             * Compares two {@code TextPair} keys by their first component only.
             */
            @Override
            @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
            public int compare(WritableComparable a, WritableComparable b) {
                TextPair left = (TextPair) a;
                TextPair right = (TextPair) b;
                return left.getFirst().compareTo(right.getFirst());
            }
        }

        /**
         * Configures and runs the job, blocking until it finishes.
         *
         * @param args positional arguments left after generic-option parsing:
         *             one or more input paths followed by the output path, or
         *             empty to rely on paths already present in the configuration
         * @return {@code 0} if the job succeeded, {@code 1} if it failed,
         *         {@code 2} if the arguments are malformed
         * @throws Exception if job setup or submission fails
         */
        @Override
        public int run(String[] args) throws Exception {
            // A single positional argument cannot be split into input + output.
            if (args.length == 1) {
                System.err.println(
                        "Usage: URLTitle [generic options] <input path>... <output path>");
                return EXIT_USAGE;
            }

            Job job = new Job(this.getConf(), "URL_Title_Analysis");
            // Locate the job jar through this driver class.
            job.setJarByClass(this.getClass());

            // Map side.
            job.setMapperClass(WapPCMapper.class);
            job.setMapOutputKeyClass(TextPair.class);
            job.setMapOutputValueClass(Text.class);

            // Reduce side.
            job.setReducerClass(WapPcReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // Input and output formats.
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(GBKOutputFormat.class);

            // Partition and group on the first key component.
            job.setPartitionerClass(KeyPartitioner.class);
            job.setGroupingComparatorClass(GroupPartitioner.class);

            // Every argument but the last is an input path; the last is the output path.
            // With no positional arguments the configuration is left untouched.
            if (args.length >= 2) {
                int outputIndex = args.length - 1;
                for (int i = 0; i < outputIndex; i++) {
                    FileInputFormat.addInputPath(job, new Path(args[i]));
                }
                FileOutputFormat.setOutputPath(job, new Path(args[outputIndex]));
            }

            // Return the status instead of calling System.exit() here, so the
            // Tool contract holds and callers other than main() can use run().
            return job.waitForCompletion(true) ? 0 : 1;
        }

        /**
         * Command-line entry point; exits the JVM with the status reported by
         * {@link #run(String[])}.
         */
        public static void main(String[] args) throws Exception {
            Tool urlTitle = new URLTitle();
            System.exit(ToolRunner.run(urlTitle, args));
        }
    }

    该程序读取 A、B 两个包含 URL 和 Title 的文件,找出两个文件中相同的 Title,并输出这些 Title 所关联的 URL。

  • 相关阅读:
    使用Python创建TCP代理之工业时代造轮子
    CVE-2020-0796 SMB远程代码执行漏洞(分析、验证及加固)
    Oracle 找到引起账户锁定的IP
    【OGG 故障处理】OGG-01031
    【OGG 故障处理】OGG-01028
    【OGG 故障处理】 丢失归档恢复
    19C imp 导入合并表空间
    CentOS 7 配置VNCServer
    ORA-3136 问题处理
    HugePages概述--翻译自19C文档
  • 原文地址:https://www.cnblogs.com/csxf/p/3768503.html
Copyright © 2011-2022 走看看