zoukankan      html  css  js  c++  java
  • 茄子快传数据分析(一)----数据清理

    茄子快传数据分析(一)----数据清理

    茄子快传原理

    流程图: 
    这里写图片描述

    数据

    “events”: “1473367236143u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000027u0001 1473367261933u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000028u0001 1473367280349u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000029u0001 1473367331326u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000030u0001 1473367353310u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000031u0001 1473367387087u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000032u0001 1473367402167u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000033u0001 1473367451994u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000034u0001 1473367474316u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000035u0001 1473367564181u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000036u0001 1473367589527u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000037u0001 1473367610310u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000038u0001 1473367624647u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000039u0001 
1473368004298u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000040u0001 1473368017851u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000041u0001 1473369599067u00010u0001AppLaunchu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000042u0001 1473369622274u00010u0001connectByQRCodeu0001u00010u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u0001u00011609072239570000043u0001 ”,

    数据解析

    使用json在线解析器解析为: 
    “header”: { 
    “cid_sn”: “1501004207EE98AA”, sdn码 
    “mobile_data_type”: “”, 
    “os_ver”: “9”, 操作系统 
    “mac”: “88:1f:a1:03:7d:a8”, 物理地址 
    “resolution”: “2560x1337”, 分辨率 
    “commit_time”: “1473399829041”, 提交时间 
    “sdk_ver”: “103”, sdk版本 
    “device_id_type”: “mac”, 设备id类型 
    “city”: “江门市”, 城市 
    “android_id”: “”, 安卓设备的安卓id 
    “device_model”: “MacBookPro11,1”,设备型号 
    “carrier”: “中国xx”, 运营商 
    “promotion_channel”: “1”, 推广渠道 
    “app_ver_name”: “1.7”, app版本号 
    “imei”: “”, 入网标识(IMEI) 
    “app_ver_code”: “23”, 公司内部版本码 
    “pid”: “pid”, 
    “net_type”: “3”, 网络类型 
    “device_id”: “m.88:1f:a1:03:7d:a8”, 设备id 
    “app_device_id”: “m.88:1f:a1:03:7d:a8”, 
    “release_channel”: “appstore”, 发布渠道 
    “country”: “CN”, 
    “time_zone”: “28800000”, 时区偏移(毫秒,28800000 即 UTC+8) 
    “os_name”: “ios”, 操作系统类型 
    “manufacture”: “apple”, 生产厂家 
    “commit_id”: “fde7ee2e48494b24bf3599771d7c2a78”, 事件标识 
    “app_token”: “XIAONIU_I”, app标识 
    “account”: “none”, 登陆账号 
    “app_id”: “com.appid.xiaoniu”, app组名 
    “build_num”: “YVF6R16303000403”, 编译号 
    “language”: “zh” 系统所使用语言 

    }

    数据清理

    1、 数据清理需求分析 
    release_channel,device_id,city,device_id_type,app_ver_name 这几个字段如果缺失,则过滤 
    将数据整成 字段,字段,字段,…… 这种形式 
    在每条数据中添加一个字段:user_id(值就是mac) 
    2、数据清理代码

    public class AppLogClean {
        public static class MapTask extends Mapper<LongWritable, Text, Text, NullWritable> {
            StringBuilder sb = new StringBuilder();
            Text k = new Text();
            @Override
            protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
                    throws IOException, InterruptedException {
                // 得到每行数据
                String line = value.toString();
                JSONObject ob1 = JSON.parseObject(line);
                JSONObject ob2 = ob1.getJSONObject("header");
    
    
    
                // 关键数据是否有丢失
                // release_channel,device_id,city,device_id_type,app_ver_name
                // 这几个字段如果缺失,则过滤
                if (StringUtils.isBlank(ob2.getString("release_channel")) 
                        || StringUtils.isBlank(ob2.getString("device_id"))
                        || StringUtils.isBlank(ob2.getString("city"))
                        || StringUtils.isBlank(ob2.getString("device_id_type"))
                        || StringUtils.isBlank(ob2.getString("app_ver_name"))
                        || StringUtils.isBlank(ob2.getString("os_name"))
                        || StringUtils.isBlank(ob2.getString("mac"))) {
                    return;
                }
    
                if (ob2.getString("app_ver_name").equals("android")) {
                    if (StringUtils.isBlank(ob2.getString("android_id"))) {
                        return;
                    }
                }
                sb.append(ob2.getString("cid_sn")).append(",");
                sb.append(ob2.getString("mobile_data_type")).append(",");
                sb.append(ob2.getString("os_ver")).append(",");
                sb.append(ob2.getString("mac")).append(",");
                sb.append(ob2.getString("resolution")).append(",");
                sb.append(ob2.getString("commit_time")).append(",");
                sb.append(ob2.getString("sdk_ver")).append(",");
                sb.append(ob2.getString("device_id_type")).append(",");
                sb.append(ob2.getString("city")).append(",");
                sb.append(ob2.getString("android_id")).append(",");
                sb.append(ob2.getString("device_model")).append(",");
                sb.append(ob2.getString("carrier")).append(",");
                sb.append(ob2.getString("promotion_channel")).append(",");
                sb.append(ob2.getString("app_ver_name")).append(",");
                sb.append(ob2.getString("imei")).append(",");
                sb.append(ob2.getString("app_ver_code")).append(",");
                sb.append(ob2.getString("pid")).append(",");
                sb.append(ob2.getString("net_type")).append(",");
                sb.append(ob2.getString("device_id")).append(",");
                sb.append(ob2.getString("app_device_id")).append(",");
                sb.append(ob2.getString("release_channel")).append(",");
                sb.append(ob2.getString("country")).append(",");
                sb.append(ob2.getString("time_zone")).append(",");
                sb.append(ob2.getString("os_name")).append(",");
                sb.append(ob2.getString("manufacture")).append(",");
                sb.append(ob2.getString("commit_id")).append(",");
                sb.append(ob2.getString("app_token")).append(",");
                sb.append(ob2.getString("account")).append(",");
                sb.append(ob2.getString("app_id")).append(",");
                sb.append(ob2.getString("build_num")).append(",");
                sb.append(ob2.getString("language")).append(",");
    
                String uid = ob2.getString("mac");//???
                sb.append(uid);
                k.set(sb.toString());
                context.write(k, NullWritable.get());
                //清除sb的数据
                sb.delete(0, sb.length());
            }
        }
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
    
            //设置map,设置driver,设置输出类型。。。
            job.setJarByClass(AppLogClean.class);
            job.setMapperClass(MapTask.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
    
            FileInputFormat.addInputPath(job, new Path("D:\a\appuserdata\input\20170101"));
            FileOutputFormat.setOutputPath(job, new Path("D:\a\appuserdata\out"));
            //不需要reduce 可以设置为0
            job.setNumReduceTasks(0);
    
            boolean ret = job.waitForCompletion(true);
            //System.exit(ret?0:1);
            System.out.println(ret?"你很优秀":"滚去调bug!");
        }
    }
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94

    3、数据清理完的文件 
    这里写图片描述 
    4、清理完成的数据格式 
    1501004207EE98AA,,22,1c:77:f6:78:f5:75,1080x1920,1473396818952,103,mac,江门市,867830021735040, 
    字段与字段间以逗号隔开

     
  • 相关阅读:
    bzoj2124 等差子序列(树状数组+hash)
    CF817F MEX Queries(线段树上二分)
    [USACO12MAR]摩天大楼里的奶牛(状态压缩DP)
    CF786B Legacy(线段树优化建图)
    绿豆蛙的归宿
    单选错位
    聪聪和可可
    Tyvj1952 Easy
    OSU!
    弱题
  • 原文地址:https://www.cnblogs.com/timssd/p/10659955.html
Copyright © 2011-2022 走看看