  • 036 Website UV Analysis

    I: Preparation

    1. Dimensions used for the count

      guid

      tracktime

      province

    2. Key and value design

      key: date + province_guid

      value: NullWritable

    3. Case analysis

      This means that no matter how many times a given person in a given province visits the site on a given day, it is recorded as only one visit.

      UV (unique visitors): the total number of distinct people who visit the pages → deduplicate the users by their user ID (the guid).
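
      A small runnable sketch of what this key looks like for one hypothetical log record (the field values are made up; the real parsing is done in the WebUvCountMapper class in the complete program below):

        public class UvKeySketch {
            public static void main(String[] args) {
                // hypothetical field values taken from one tracking-log line
                String date = "2015-08-28 18";            // leading part of the trackTime field
                String provinceId = "20";                 // province id field
                String guid = "8f5b20a0-ae9b-4c0b-9c77";  // visitor guid
                // the Mapper emits: key = date + "\t" + provinceId + "_" + guid, value = NullWritable.get()
                String mapOutputKey = date + "\t" + provinceId + "_" + guid;
                System.out.println(mapOutputKey);  // 2015-08-28 18	20_8f5b20a0-ae9b-4c0b-9c77
            }
        }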

    II: Program

    1. Map program

      (See the WebUvCountMapper class in the complete program under item 5 below.)

    2. Reduce program

      (See the WebUvCountReducer class in the complete program under item 5 below.)

    3. Results
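
      Based on the Reducer's output types (Text key, IntWritable value), each result line is the date/province part of the key followed by its UV count, with the fields separated by tabs; a hypothetical example line:

        2015-08-28 18    20    1276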

    4. Key points to understand

      1) How the deduplication works

        The key has the form date + province_guid.

        When the guid is the same, identical keys are grouped together during the shuffle's grouping step, and the value is NullWritable, so no value data is carried at all.

        So by the time the data reaches reduce, it has already been deduplicated (see the sketch below).

      2) NullWritable.get()

        NullWritable cannot be instantiated directly (its constructor is private); NullWritable.get() returns the single shared singleton instance, which is why the Mapper writes NullWritable.get() instead of creating a new object.
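
      A minimal local sketch of the deduplication described in point 1) above (plain Java, not Hadoop code; the keys are made up): grouping identical keys, which is what the shuffle does, leaves one entry per distinct date/province/guid combination.

        import java.util.TreeSet;

        public class DedupSketch {
            public static void main(String[] args) {
                // map output keys: u001 visited three times, u002 once (same day, same province)
                String[] mapOutputKeys = {
                        "2015-08-28 18\t20_u001",
                        "2015-08-28 18\t20_u001",
                        "2015-08-28 18\t20_u001",
                        "2015-08-28 18\t20_u002"
                };
                // the shuffle groups identical keys; a set models that grouping
                TreeSet<String> grouped = new TreeSet<String>();
                for (String k : mapOutputKeys) {
                    grouped.add(k);
                }
                // each distinct key reaches reduce() exactly once, so UV here is 2, not 4
                System.out.println("UV = " + grouped.size());
            }
        }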

    5. Complete program

package com.senior.network;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WebUvCount extends Configured implements Tool {

    // Mapper: emits one record per valid log line,
    // key = date + "\t" + provinceId + "_" + guid, value = NullWritable
    public static class WebUvCountMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Text mapOutputKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String lineValue = value.toString();
            String[] strs = lineValue.split("\t");
            // skip malformed lines with fewer than 30 fields and count them
            if (30 > strs.length) {
                context.getCounter("webPvMapper_counter", "length_LT_30").increment(1L);
                return;
            }
            String guidValue = strs[5];             // field 5: guid (visitor id)
            if (StringUtils.isEmpty(guidValue)) {
                return;
            }
            String trackTimeValue = strs[17];       // field 17: trackTime
            if (StringUtils.isEmpty(trackTimeValue)) {
                return;
            }
            String dateValue = trackTimeValue.substring(0, 13);  // leading date part of trackTime
            String provinceIdValue = strs[23];      // field 23: province id

            // validate that the province id is numeric; otherwise skip the record
            try {
                Integer.valueOf(provinceIdValue);
            } catch (Exception e) {
                return;
            }

            mapOutputKey.set(dateValue + "\t" + provinceIdValue + "_" + guidValue);
            context.write(mapOutputKey, NullWritable.get());
        }
    }

    // Reducer: each distinct key represents one unique visitor for a date/province,
    // so counting reduce() calls per date/province gives the UV
    public static class WebUvCountReducer extends Reducer<Text, NullWritable, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();
        private Map<String, Integer> dateMap;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            dateMap = new HashMap<String, Integer>();
        }

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            // the key has the form date + "\t" + provinceId + "_" + guid; keep the date/province part
            String date = key.toString().split("_")[0];
            if (dateMap.containsKey(date)) {
                dateMap.put(date, dateMap.get(date) + 1);
            } else {
                dateMap.put(date, 1);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // write the accumulated UV count for each date/province
            Set<String> dateSet = dateMap.keySet();
            for (String date : dateSet) {
                outputKey.set(date);
                outputValue.set(dateMap.get(date));
                context.write(outputKey, outputValue);
            }
        }
    }

    // Driver
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(WebUvCount.class);

        // input
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // output
        Path outPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outPath);

        // map
        job.setMapperClass(WebUvCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // reduce
        job.setReducerClass(WebUvCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // submit
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    // main
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // compress the map output with Snappy
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        // input and output paths are hard-coded, so command-line arguments are ignored
        args = new String[]{
                "hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/inputWebData",
                "hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/outputWebData6"
        };
        // pass conf so the compression settings actually take effect
        int status = ToolRunner.run(conf, new WebUvCount(), args);
        System.exit(status);
    }
}
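
    To run the job, package the class into a jar and submit it with the hadoop command, e.g. hadoop jar webUvCount.jar com.senior.network.WebUvCount (the jar name here is hypothetical). Because main() hard-codes the HDFS input and output paths, no command-line arguments are needed. Note that the per-date/province totals are accumulated in the reducer's in-memory dateMap and only written out in cleanup(), which relies on the job's default single reducer and a modest number of distinct date/province keys.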