zoukankan      html  css  js  c++  java
  • 一个典型的MapReduce实例------webcount(网站统计访客信息)

    统计某一特定网站每个小时的访客人数(访问次数)

    所用版本:hadoop2.6.5

    数据样式如下:

    111.111.111.111 - - [16/Dec/2012:05:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    111.111.111.111 - - [16/Dec/2012:05:33:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    111.111.111.111 - - [16/Dec/2012:05:34:45 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    111.111.111.111 - - [16/Dec/2012:05:34:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    111.111.111.111 - - [16/Dec/2012:09:34:55 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    111.111.111.111 - - [16/Dec/2012:10:23:30 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    111.111.111.111 - - [16/Dec/2012:10:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"

    辅助类

     1 package com.trendwise.software;
     2 
     3 import java.text.SimpleDateFormat; 
     4 import java.util.Date; 
     5 import java.io.DataInput; import java.io.DataOutput; 
     6 import java.io.IOException; 
     7 import org.apache.hadoop.io.WritableComparable; 
     8 
     9 public class DateWritable implements WritableComparable<DateWritable>{
    10     private final static SimpleDateFormat formatter = new SimpleDateFormat( "yyyy-MM-dd' T 'HH:mm:ss.SSS" ); 
    11     private Date date; 
    12     public Date getDate() { 
    13         return date; 
    14     } 
    15     public void setDate( Date date ) { 
    16         this.date = date; 
    17     } 
    18 
    19     @Override
    20     public void readFields(DataInput in) throws IOException {
    21         date = new Date( in.readLong() );         
    22     }
    23 
    24     @Override
    25     public void write(DataOutput out) throws IOException {
    26         out.writeLong( date.getTime() );         
    27     }
    28 
    29     @Override
    30     public int compareTo(DateWritable o) {
    31         return date.compareTo( o.getDate() ); 
    32     }
    33     
    34     public String toString() { 
    35         return formatter.format( date); 
    36     }     
    37 }

    mapper 映射特定年份中每月每天每个时辰的访客数

     1 package com.trendwise.software;
     2 
     3 import java.io.IOException;
     4 import java.util.Calendar;
     5 import org.apache.hadoop.io.IntWritable;
     6 import org.apache.hadoop.io.LongWritable;
     7 import org.apache.hadoop.io.Text;
     8 import org.apache.hadoop.mapreduce.Mapper;
     9 
    10 public class LogMapper extends Mapper<LongWritable, Text, DateWritable, IntWritable> { 
    11     public static DateWritable dates = new DateWritable(); 
    12     public final static IntWritable two = new IntWritable(1); 
    13     public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 
    14         String text = value.toString(); 
    15         // Get the date and time 
    16         int openBracket = text.indexOf( '[' ); 
    17         int closeBracket = text.indexOf( ']' ); 
    18         if( openBracket != -1 && closeBracket != -1 ) { 
    19             // Read the date 
    20             String dateString = text.substring( text.indexOf( '[' ) + 1, text. indexOf( ']' ) ); 
    21             // Build a date object from a string of the form: 16/Dec/2012:05:32:50 -0500 
    22             int index = 0; 
    23             int nextIndex = dateString.indexOf( '/' ); 
    24             int day = Integer.parseInt( dateString.substring(index, nextIndex) );
    25             
    26             index = nextIndex; nextIndex = dateString.indexOf( '/', index+1 ); 
    27             String month = dateString.substring( index+1, nextIndex ); 
    28             index = nextIndex; 
    29             nextIndex = dateString.indexOf( ':', index ); 
    30             int year = Integer.parseInt(dateString.substring(index + 1, nextIndex)); 
    31             index = nextIndex; nextIndex = dateString.indexOf( ':', index+1 ); 
    32             int hour = Integer.parseInt(dateString.substring(index + 1, nextIndex)); 
    33             // Build a calendar object for this date 
    34             Calendar calendar = Calendar.getInstance(); 
    35             calendar.set( Calendar.DATE, day );
    36             calendar.set( Calendar.YEAR, year ); 
    37             calendar.set( Calendar.HOUR, hour ); 
    38             calendar.set( Calendar.MINUTE, 0 ); 
    39             calendar.set( Calendar.SECOND, 0 ); 
    40             calendar.set( Calendar.MILLISECOND, 0 ); 
    41             if( month.equalsIgnoreCase( "dec" ) ) { 
    42                 calendar.set( Calendar.MONTH, Calendar.DECEMBER ); 
    43             } 
    44             else if( month.equalsIgnoreCase( "nov" ) ) { 
    45                 calendar.set( Calendar.MONTH, Calendar.NOVEMBER ); 
    46             } 
    47             else if( month.equalsIgnoreCase( "oct" ) ) { 
    48                 calendar.set( Calendar.MONTH, Calendar.OCTOBER ); 
    49             }
    50             else if( month.equalsIgnoreCase( "sep" ) ) { 
    51                 calendar.set( Calendar.MONTH, Calendar.SEPTEMBER ); 
    52             } 
    53             else if( month.equalsIgnoreCase( "aug" ) ) { 
    54                 calendar.set( Calendar.MONTH, Calendar.AUGUST ); 
    55             } 
    56             else if( month.equalsIgnoreCase( "jul" ) ) { 
    57                 calendar.set( Calendar.MONTH, Calendar.JULY ); 
    58             } 
    59             else if( month.equalsIgnoreCase( "jun" ) ) {
    60                 calendar.set( Calendar.MONTH, Calendar.JUNE ); 
    61             } 
    62             else if( month.equalsIgnoreCase( "may" ) ) {
    63                 calendar.set( Calendar.MONTH, Calendar.MAY ); 
    64             } 
    65             else if( month.equalsIgnoreCase( "apr" ) ) { 
    66                 calendar.set( Calendar.MONTH, Calendar.APRIL ); 
    67             } 
    68             else if( month.equalsIgnoreCase( "mar" ) ) { 
    69                 calendar.set( Calendar.MONTH, Calendar.MARCH ); 
    70             } 
    71             else if( month.equalsIgnoreCase( "feb" ) ) { 
    72                 calendar.set( Calendar.MONTH, Calendar.FEBRUARY ); 
    73             } 
    74             else if( month.equalsIgnoreCase( "jan" ) ) { 
    75                 calendar.set( Calendar.MONTH, Calendar.JANUARY ); 
    76             } 
    77             
    78             dates.setDate( calendar.getTime() ); 
    79             context.write(dates, two); 
    80             
    81         }
    82     }
    83 }

    reducer 汇总一个时辰内访客人数

     1 package com.trendwise.software;
     2 
     3 import java.io.IOException;
     4 import org.apache.hadoop.io.IntWritable;
     5 import org.apache.hadoop.mapreduce.Reducer;
     6  
     7 public class  LogReducer extends Reducer<DateWritable, IntWritable, DateWritable, IntWritable> {
     8     @Override
     9     public void reduce( DateWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { 
    10     
    11         int countn = 0; 
    12         for(IntWritable v :values){ 
    13             countn += v.get(); 
    14         }     
    15         context.write(key, new IntWritable( countn) ); 
    16     } 
    17 }

    driver 配置信息,程序入口

     1 package com.trendwise.software;
     2 
     3 import java.io.IOException;
     4 import org.apache.hadoop.conf.Configuration;
     5 import org.apache.hadoop.fs.Path;
     6 import org.apache.hadoop.io.IntWritable;
     7 import org.apache.hadoop.mapreduce.Job;
     8 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
     9 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    10 
    11 public class Driver {
    12     
    13     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 
    14                 
    15         String in = args[0];
    16         String out = args[1];
    17         int unitmb =Integer.valueOf(args[2]);                
    18         int nreducer = Integer.valueOf(args[3]);
    19         
    20         Configuration conf = new Configuration();                
    21         conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(unitmb * 1024 * 1024));
    22         conf.set("mapred.min.split.size", String.valueOf(unitmb * 1024 * 1024));
    23         conf.set("mapreduce.input.fileinputformat.split.minsize.per.node", String.valueOf(unitmb * 1024 * 1024));
    24         conf.set("mapreduce.input.fileinputformat.split.minsize.per.rack", String.valueOf(unitmb * 1024 * 1024));
    25                 
    26         Job job = new Job(conf);        
    27         FileInputFormat.addInputPath(job, new Path(in));
    28         FileOutputFormat.setOutputPath(job, new Path(out));            
    29         job.setMapperClass(LogMapper.class); 
    30         job.setReducerClass(LogReducer.class); 
    31         job.setCombinerClass(LogReducer.class); 
    32         job.setNumReduceTasks(nreducer);
    33         job.setMapOutputKeyClass(DateWritable.class);
    34         job.setMapOutputValueClass(IntWritable.class);    
    35         job.setOutputKeyClass(DateWritable.class); 
    36         job.setOutputValueClass(IntWritable.class);
    37         job.setJarByClass(Driver.class);
    38         job.waitForCompletion(true);    
    39                     
    40     }     
    41 }

    command

    result

  • 相关阅读:
    Web Ajax入门一讲
    Delphi – 我的代码之简单五子棋
    闲话 纪念我的4520G
    Delphi 我的代码之窗体移动
    破文 黑客游戏
    破文 OD常用断点
    Web 简单的开始 – Ajax + XML +DOM
    工具 – XMLSPY 和 UModel 商业版 2010v12.0有注册机
    API InterlockedCompareExchange用法
    软件工程 设计模式学习之策略模式Strategy
  • 原文地址:https://www.cnblogs.com/learn21cn/p/6132528.html
Copyright © 2011-2022 走看看