zoukankan      html  css  js  c++  java
  • Hadoop小程序数据筛选

      1、有一批路由日志,需要提取MAC地址和时间,删除其他内容。

      日志内容格式如下:

    Apr 15 10:04:42 hostapd: wlan0: STA 14:7D:C5:9E:84
    Apr 15 10:04:43 hostapd: wlan0: STA 14:7D:C5:9E:85
    Apr 15 10:04:44 hostapd: wlan0: STA 14:7D:C5:9E:86
    Apr 15 10:04:45 hostapd: wlan0: STA 14:7D:C5:9E:87
    Apr 15 10:04:46 hostapd: wlan0: STA 14:7D:C5:9E:88
    Apr 15 10:04:47 hostapd: wlan0: STA 14:7D:C5:9E:89
    Apr 15 10:04:48 hostapd: wlan0: STA 14:7D:C5:9E:14
    Apr 15 10:04:49 hostapd: wlan0: STA 14:7D:C5:9E:24
    Apr 15 10:04:52 hostapd: wlan0: STA 14:7D:C5:9E:34
    Apr 15 10:04:32 hostapd: wlan0: STA 14:7D:C5:9E:44
    Apr 15 10:04:22 hostapd: wlan0: STA 14:7D:C5:9E:54

      筛选后的内容格式为:

    Apr 15 10:04:42 14:7D:C5:9E:84
    Apr 15 10:04:43 14:7D:C5:9E:85
    Apr 15 10:04:44 14:7D:C5:9E:86
    Apr 15 10:04:45 14:7D:C5:9E:87
    Apr 15 10:04:46 14:7D:C5:9E:88
    Apr 15 10:04:47 14:7D:C5:9E:89
    Apr 15 10:04:48 14:7D:C5:9E:14
    Apr 15 10:04:49 14:7D:C5:9E:24
    Apr 15 10:04:52 14:7D:C5:9E:34
    Apr 15 10:04:32 14:7D:C5:9E:44
    Apr 15 10:04:22 14:7D:C5:9E:54

      

      2、算法思路

      源文件——》Mapper(分割原始数据、输出所需数据、处理异常数据)——》输出到HDFS

      3、编写程序

    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    
    /**
     * MapReduce job that filters router log lines down to their timestamp and MAC address.
     *
     * <p>Each input line is expected to look like
     * {@code Apr 15 10:04:42 hostapd: wlan0: STA 14:7D:C5:9E:84}; the job emits
     * {@code Apr 15 10:04:42 14:7D:C5:9E:84} for it. Lines with too few fields are
     * not emitted and are counted in {@link Counter#LINESKIP}.
     *
     * <p>Usage: {@code App_1 <input path> <output path>}.
     */
    public class App_1 extends Configured implements Tool{
        /** Counters reported by this job. */
        enum Counter{
            LINESKIP,// number of malformed input lines that were skipped
        }

        /**
         * Mapper that extracts month, day, time and MAC address from one log line.
         *
         * <p>Type parameters {@code <LongWritable, Text, NullWritable, Text>}: the input
         * key/value pair is handed in by the input format (per the original author's
         * note, the key is the offset of the line's first character and the value is
         * the whole line); the output key is {@link NullWritable} and the output value
         * is the filtered line.
         */
        public static class RouterMapper extends Mapper<LongWritable,Text,NullWritable,Text>{

            /** Zero-based index of the MAC address among the whitespace-separated fields. */
            private static final int MAC_FIELD = 6;

            /** Reused output value; context.write is called before the next set(). */
            private final Text out = new Text();

            /**
             * Emits {@code "<month> <day> <time> <mac>"} for a well-formed line, otherwise
             * increments {@link Counter#LINESKIP} and emits nothing.
             *
             * @param key     input key (unused)
             * @param value   one line of the router log
             * @param context task context used to write output and update counters
             * @throws IOException          if writing the output record fails
             * @throws InterruptedException if the task is interrupted while writing
             */
            @Override
            public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException{
                // Split on runs of whitespace rather than a single space: syslog-style
                // timestamps pad single-digit days with an extra space ("Apr  5 ..."),
                // which would otherwise produce an empty token and shift every index.
                String[] fields = value.toString().trim().split("\\s+");

                // Explicit length check instead of catching ArrayIndexOutOfBoundsException.
                if(fields.length <= MAC_FIELD){
                    context.getCounter(Counter.LINESKIP).increment(1);
                    return;
                }

                String month = fields[0];
                String day = fields[1];
                String time = fields[2];
                String mac = fields[MAC_FIELD];

                // The emitted types must match the mapper's declared output types.
                out.set(month+" "+day+" "+time+" "+mac);
                context.write(NullWritable.get(), out);
            }
        }

        /**
         * Configures and runs the job.
         *
         * @param arg0 {@code arg0[0]} is the input path, {@code arg0[1]} is the output path
         * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
         * @throws Exception if job setup or execution fails
         */
        @Override
        public int run(String[] arg0) throws Exception {
            if(arg0 == null || arg0.length < 2){
                System.err.println("Usage: App_1 <input path> <output path>");
                ToolRunner.printGenericCommandUsage(System.err);
                return 2;
            }

            Configuration conf = getConf();
            // NOTE(review): newer Hadoop releases deprecate this constructor in favour of
            // Job.getInstance(conf, name); kept as-is for compatibility with the Hadoop
            // version this file was written against - confirm before switching.
            Job job = new Job(conf,"App_1");// job name
            job.setJarByClass(App_1.class);// class used to locate the job jar
            FileInputFormat.addInputPath(job, new Path(arg0[0]));// input path
            FileOutputFormat.setOutputPath(job, new Path(arg0[1]));// output path

            job.setMapperClass(RouterMapper.class);// RouterMapper does the per-line filtering
            job.setOutputFormatClass(TextOutputFormat.class);

            // Output key/value classes must match RouterMapper's output types.
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);

            // NOTE(review): no reducer class or reduce-task count is set here, so the
            // framework defaults apply - confirm whether a map-only job was intended.
            return job.waitForCompletion(true)?0:1;
        }

        /**
         * Entry point; expects the input path and the output path as arguments.
         *
         * @param args command-line arguments forwarded to {@link ToolRunner}
         * @throws Exception if the job cannot be run
         */
        public static void main(String[] args) throws Exception{
            int res = ToolRunner.run(new Configuration(), new App_1(), args);
            System.exit(res);
        }
    }

      要分析的路由日志文件已上传到HDFS的hdfs://h1:9000/user/coder/in目录中,h1是我的namenode主机名。运行前需要配置两个程序参数:输入路径和输出路径。注意输出目录在运行前必须不存在,否则作业会因输出目录已存在而失败。

      4、运行结束后,可在Eclipse中直接查看结果。

      

  • 相关阅读:
    使用开源GIS克隆一个Google Map
    《3s新闻周刊》No.4:与国产GIS企业一起成长
    Google发布免费的SketchUp
    VB和VBA工程的一些限制
    推荐一个新的RSS阅读站点:抓虾
    USGS如何利用Google Earth
    ESRI今年的Dev Summit的幻灯片
    Google Map创建工具和资源
    《3s新闻周刊》第6期发布,本期话题:海外聚焦――空间信息技术进行式
    《Excel与VBA程序设计》最新进度(>75%)
  • 原文地址:https://www.cnblogs.com/luxh/p/2994522.html
Copyright © 2011-2022 走看看