zoukankan      html  css  js  c++  java
  • pig笔记

    原谅我只是拿这个当笔记来写了,最近写的就这几个常用的

    1.基本使用

    -- Stage 1: group the raw swipe log by account and run the StandartTrjn UDF
    -- over each account's ordered records.
    -- Input  : comma-separated rows of (account, des, jntime), all chararray.
    -- Output : whatever StandartTrjn emits, flattened, written comma-separated.
    -- NOTE(review): StandartTrjn is a project UDF not shown here; the stage that
    -- loads this output declares it as
    -- (account, des, year, month, day, jnstart, jnend, times) -- confirm.
    REGISTER /home/vlab/ykt/StandartTrjn.jar;
    DEFINE StandartTrjn com.zhangdan.pig.StandartTrjn();
    data_load = load 'yktdata/pid06/pid06_09.csv' using PigStorage(',') as (account:chararray,des:chararray,jntime:chararray);
    data_group = group data_load by account;
    -- one card-swipe record
    data_seq = foreach data_group {
        -- jntime is a chararray, so this is a string sort; presumably the
        -- timestamp format sorts chronologically -- TODO confirm.
        sorted = order data_load by jntime;
        generate flatten(StandartTrjn(sorted));
    };
    store data_seq into 'yktdata/pid06_result09/standart' using PigStorage(',');
    
    
    -- Generate all relationship pairs.
    -- Loads the output stored under 'yktdata/pid06_result09/standart', groups it
    -- by (year, month, day, des), de-duplicates and orders each group by jnstart,
    -- then hands the ordered bag to the GetConnection UDF.
    -- NOTE(review): GetConnection is a project UDF not shown here; the stage that
    -- loads this output declares it as (account1, account2, jntime1, jntime2).
    REGISTER /home/vlab/ykt/GetConnection.jar
    DEFINE GetConnection com.zhangdan.ykt.GetConnection();
    data_load = LOAD 'yktdata/pid06_result09/standart/part-r-*'
        USING PigStorage(',')
        AS (
            account:chararray,
            des:chararray,
            year:chararray,
            month:chararray,
            day:chararray,
            jnstart:chararray,
            jnend:chararray,
            times:int
        );
    data_group = GROUP data_load BY (year, month, day, des);
    data_seq = FOREACH data_group {
        -- drop exact duplicate records within the group before pairing
        deduped = DISTINCT data_load;
        by_start = ORDER deduped BY jnstart;
        GENERATE FLATTEN(GetConnection(by_start));
    };
    -- (disabled) data_dis = distinct data_seq;
    STORE data_seq INTO 'yktdata/pid06_result09/connection' USING PigStorage(',');
    -- Count the number of encounters per (account1, account2) pair and store
    -- the pairs ordered by that count, largest first.
    data_load = LOAD 'yktdata/pid06_result09/connection/part-r-*'
        USING PigStorage(',')
        AS (
            account1:chararray,
            account2:chararray,
            jntime1:chararray,
            jntime2:chararray
        );
    data_group = GROUP data_load BY (account1, account2);
    -- cc = number of rows that share the same (account1, account2) key
    data_count = FOREACH data_group GENERATE FLATTEN(group), COUNT(data_load) AS cc;
    data_order = ORDER data_count BY cc DESC;
    STORE data_order INTO 'yktdata/pid06_result09/connectioncount' USING PigStorage(',');

    2.这个是大师姐给我提供的,将两条相连记录合并

    -- Merge every two consecutive swipe records of the same account into one row
    -- and attach the value returned by the getSecondtime UDF for the two times.
    -- Input  : comma-separated rows of (account, des, jntime), all chararray.
    -- Output : (account, des1, jnstart, des2, jnen, resu), comma-separated.
    REGISTER /home/vlab/markovPairsjar/datafu-1.2.0.jar;
    DEFINE MarkovPairs datafu.pig.stats.MarkovPairs();   -- key UDF: pairs up consecutive elements of an ordered bag
    REGISTER /home/vlab/ykt/gettimebysecond.jar;
    DEFINE getSecondtime com.zhangdan.pig.GetTimebySecond();
    data_load = load 'yktdata/pid06_10.csv' using PigStorage(',') as (account:chararray,des:chararray,jntime:chararray);
    data_group = group data_load by account;
    -- join two consecutive swipes: card number, location, swipe time, next swipe time
    data_seq = foreach data_group {
        -- jntime is a chararray, so this is a string sort; presumably the
        -- timestamp format sorts chronologically -- TODO confirm.
        sorted = order data_load by jntime;
        pair = MarkovPairs(sorted);   -- key step: each record paired with the record that follows it
        generate flatten(pair) as (elem1:TUPLE(account:chararray,des:chararray,jntime:chararray),elem2:TUPLE(account:chararray,des:chararray,jntime:chararray));
    };  -- end of: join two consecutive swipes
    
    -- card number, location, swipe time, next swipe time, time difference
    -- NOTE(review): the stored times are RTRIM'ed but getSecondtime receives the
    -- untrimmed values; the UDF is not shown here -- confirm it tolerates trailing
    -- whitespace. The UDF name and the disabled filter below (resu < 5*60) suggest
    -- resu is in seconds -- confirm.
    -- NOTE(review): 'jnen' looks like a typo for 'jnend'; left unchanged because
    -- later statements outside this view may reference the field by name.
    data_long = foreach data_seq {
         generate elem1.account as account,elem1.des as des1,RTRIM(elem1.jntime) as jnstart,elem2.des as des2,RTRIM(elem2.jntime) as jnen,getSecondtime(elem1.jntime,elem2.jntime) as resu;
    };
    --data_result = filter data_long by resu<5*60;
    
    
    --data_result = foreach data_seq generate flatten(elem1),flatten(elem2);
    store data_long into 'yktdata/combine' using PigStorage(',');

    pig对于刚开始处理大量数据的人来讲真的方便好多,对于不擅长写代码的孩纸更是容易不少。

    讲真,掌握一门语言如java或者python,应该可以帮我们得到任意形式的数据,千万不要仅仅依赖pig

  • 相关阅读:
    进入社会,我们要做哪些准备?
    2天玩转单反相机(第二讲)
    Google广告优化与工具
    Ext JS 4:模型剖析
    iphone开发教程(1) iOS大纲
    iPhone objectivec字符串类NSString的使用
    Ext JS 4倒计时:图形和图表
    Ext JS 4倒计时:开发者预览版
    WOSA介绍
    “苹果皮”与知识产权
  • 原文地址:https://www.cnblogs.com/xunyingFree/p/5282137.html
Copyright © 2011-2022 走看看