zoukankan      html  css  js  c++  java
  • (OK) 刘姐实验中的大数据分析—awk—paste—system

    [root@localhost 给同光的数据]# pwd
    /opt/cBPM-设计文档/张同光—设计文档/给同光的数据

    [root@localhost 给同光的数据]# head -5 task-usage.csv


        2070226000000,2070237000000,2916101306,0,38675718,2.193e-05,0,0,0,0,0,,0,0.0002365,,8.215,0.04253,0,0,0
        2070226000000,2070300000000,4665712499,305,460145180,0.04279,0.06628,0.07739,0.001984,0.002632,0.06641,,0.0003786,0.4438,,5.141,0.01595,0,0,0.01657
        2070226000000,2070300000000,4665712499,1605,32064642,0.01996,0.06,0.07727,0.007782,0.008438,0.0603,,0.0003805,0.07349,,2.237,0.006193,0,0,0.03137
        2070226000000,2070300000000,4665896876,310,905033,0.01558,0.06702,0.07727,0.0047,0.005348,0.06714,,0.0003691,0.03802,,5.537,0.01843,0,0,0.01036
        2070226000000,2070300000000,6371483842,190,2854214373,0.0008802,0.000865,0.001848,0.0001869,0.0002813,0.001354,,0,0.02026,,6.742,0.02581,0,0,0.000124
    
    

    [root@localhost 给同光的数据]# head -5 task-event.csv


        2070226325733,,6397857072,512,,0,sd97khHSDKGM3BF42qtSNY39C4ZR1IRQWe3h9vSy4mU=,0,1,0.01562,0.03979,0.000309,0
        2070226331535,,3996125377,36,2097345596,5,/ADqb6ab3/Bxrz3dyZaDPRX7DAfUdiR+JTSVDrtJ+qU=,3,9,0.1257,0.01271,0.0001183,0
        2070226331538,,3996125377,36,,0,/ADqb6ab3/Bxrz3dyZaDPRX7DAfUdiR+JTSVDrtJ+qU=,3,9,0.1257,0.01271,0.0001183,0
        2070227032231,,515042969,8,,5,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.0002155,0
        2070227032234,,515042969,8,,0,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.0002155,0

    +++++++++++++++++++++++++
    -----------------------需求1.
    +++++++++++++++++++++++++
    合并task-usage.csv和task-event.csv文件,生成新文件task.csv169, 新文件由以下字段组成:

    -----------------------文件task-usage.csv中的        $3$4,$1,$6
    job ID(第3列,$3)
    task index(第4列,$4)
    1. start time of the measurement period(第1列,$1)、
    3. mean CPU usage rate(第6列,$6)
    -----------执行下面命令,生成task-usage.csv3416
    # Build "<jobID><taskIndex>,<startTime>,<meanCPU>" in ONE pass.
    # The original needed three commands and two temp files
    # (task-usage.csv34 / task-usage.csv16 + paste); a single awk print
    # produces byte-identical output: $3$4 concatenated, then "," $1 "," $6.
    awk -F, '{ print $3 $4 "," $1 "," $6 }' task-usage.csv > task-usage.csv3416

    -----------------------和 文件task-event.csv中的        $3$4,$9
    4.job ID(第3列,$3)、
    5.task index(第4列,$4)、
    6.priority(第9列,$9),
    -----------执行下面命令,生成task-event.csv349
    # Build "<jobID><taskIndex>,<priority>" in ONE pass, replacing the
    # three-command pipeline (task-event.csv34 / task-event.csv9 + paste)
    # with byte-identical output.
    awk -F, '{ print $3 $4 "," $9 }' task-event.csv > task-event.csv349

    -----------------------各字段仍以空格隔开。合并时需判断:两个文件的job ID和task index一样的(即task-event.csv的$3、$4分别等于task-usage.csv的$3、$4),其余字段才在一行;

    -----------执行下面命令,合并task-usage.csv3416,task-event.csv349,生成task.csv169
    # Join the two extracts on the combined "<jobID><taskIndex>" key.
    # The original scanned the whole w[] array linearly for every usage line
    # (for(a in w) if($1==a) ... break) — O(n*m), the "tens of hours"
    # slowness complained about below.  An associative-array membership test
    # ($1 in prio) is a single hash lookup per line and makes the
    # binary-search workaround in binarysearch.awk unnecessary.
    # NR==FNR replaces the gawk-only ARGIND test, so POSIX awk/mawk work too.
    # Output format is unchanged: "<startTime> <meanCPU> <priority>",
    # space-separated, only for keys present in both files.
    awk -F, 'NR==FNR { prio[$1] = $2; next }
             $1 in prio { print $2, $3, prio[$1] }
    ' task-event.csv349 task-usage.csv3416 > task.csv169


    **********************************************************************
    由于task-event.csv349 和 task-usage.csv3416 两个文件很大,有300M+,实际数据更大,上面算法很低效,计算时间长达几十个小时,因此,使用如下算法生成 task.csv169 文件
    **********************************************************************
    [root@localhost 给同光的数据]# gedit binarysearch.awk

        #!/bin/awk -f
        # date :2010-10-02
        # binary search by awk ,just for fun
        # awk -f binarysearch.awk a b
        #
        # Joins two files on their first comma-separated field.  File "a"
        # (task-event.csv349s: "<jobID><taskIndex>,<priority>") is loaded
        # into memory; each line of file "b"
        # (task-usage.csv3416s: "<jobID><taskIndex>,<startTime>,<meanCPU>")
        # is then matched against it by binary search.
        # PRECONDITION: both inputs MUST be pre-sorted with plain
        # (lexicographic) `sort` — the comparisons below are string
        # comparisons (note the "" concatenations), so the search order has
        # to match `sort` order.  Preparation steps:

        # Reference: http://bbs.chinaunix.net/forum.php?mod=viewthread&tid=1794824
        # sort task-usage.csv3416 > task-usage.csv3416s
        # sort task-event.csv349 > task-event.csv349s
        # cp task-event.csv349s a; cp task-usage.csv3416s b
        # mv task-event.csv349s a; mv task-usage.csv3416s b
        # awk -f binarysearch.awk a b > task.csv169temp
        # sort task.csv169temp > task.csv169

        # First file: slurp every line of "a" into the array (k = line count).
        NR==FNR { a[k++] = $0 }

        # Second file: binary-search the in-memory table for this line's key.
        # NOTE(review): with the default whitespace FS, $1 is the whole
        # comma-only line, so split($1, ...) splits the full record.
        NR>FNR {
            start= 0;end = k-1
            while(start<= end) {
                mid =int(start+ ((end - start)/2))
                # When start and end are both very large (e.g. 2^30
                # elements), the usual mid = int((start+end)/2) can exceed
                # the 2^31-1 integer maximum and overflow to a negative
                # value, so the subtraction form is used instead.
                        # fields of file 1's probe line (task-event.csv349s) go into b[x]
                split(a[mid], b, ",")
                        # fields of file 2's current line (task-usage.csv3416s, "b") go into c[x]
                split($1, c, ",")
                #if($1==b[1]) {print "ok "$1 " was found";break}

                        # compare as strings (the "" forces string context)
                #if(b[1]""==c[1]"") {print c[2],c[3],b[2],b[1],c[1]; break}
                if(b[1]""==c[1]"") {print c[2],c[3],b[2]; break}
                else if (c[1]"" > b[1]"") start = mid+1
                #else if (c[1]"" > b[1]"") {print b[1],c[1]; start = mid+1}
                #else if (sprintf("%s",c[1]) > sprintf("%s",b[1])) {print b[1],c[1]; start = mid+1}
                else end= mid-1

                }
        }


    **********************************************************************


    +++++++++++++++++++++++++
    -----------------------需求2.
    +++++++++++++++++++++++++
    对于合并后的文件task.csv169,进行条件统计:
     
    条件1:以5分钟(task.csv的start time of the measurement period字段($1)间隔差=300000000)为时间间隔,
     
    条件2:按priority字段($3)的取值,分别统计mean CPU usage字段($2)的值:
     
    priority>=9的任务,统计该时间间隔内mean CPU usage rate和的中位数,生成新文件task-usage_normal.csv;
    priority<9的任务,统计该时间间隔内mean CPU usage rate和的中位数,生成新文件task-usage_lower.csv;
     
    生成的新文件,包含字段
    1.autoNo、 5. mean CPU usage rate。

    -----------执行下面命令,生成文件task-usage_normal.csv,task-usage_lower.csv

    //执行之前,先删除相关文件
    [root@localhost 给同光的数据]# rm 20* task-usage_normal.csv temp -f
    [root@localhost 给同光的数据]# rm 20* task-usage_lower.csv temp -f

    //查看开始时间,下面命令输出的 第一行 第一列  即是开始时间,赋值给starttime,根据需要调整steplen
    [root@localhost 给同光的数据]# less task.csv169

    //执行下面命令,生成文件task-usage_normal.csv
    # Bucket mean-CPU values ($2) of task.csv169 into fixed time intervals
    # of $1 and append each interval's median to task-usage_normal.csv.
    # Fixes vs. the original listing:
    #   * the scrape-garbled condition "if($1 <  endtime)<endtime)" is a
    #     syntax error — restored to a plain range test;
    #   * "/* ... */" is NOT awk comment syntax (awk only has #) — replaced;
    #   * a record with $1 >= endtime was consumed by the else-branch and
    #     never bucketed, silently losing one record per interval boundary,
    #     and the final interval was never flushed — the while loop + END
    #     rule fix both;
    #   * plain `sort` orders 2.193e-05 lexicographically; `sort -g` (GNU)
    #     sorts general numeric values so the median line is correct.
    awk '
    # Flush the finished interval (endtime-steplen, endtime]: if any value
    # was bucketed (a file named after endtime exists), append "<lo>~<hi>"
    # and the median line of the numerically sorted bucket to the output.
    function flush_interval(    wc, nlines, midline) {
        if (system("test -f " endtime) != 0)   # no task fell in this interval
            return
        close(endtime)                         # make pending appends visible
        system("sort -g " endtime " > temp")   # -g: numeric, handles e-notation
        wc = "wc -l < temp"
        wc | getline nlines
        close(wc)
        midline = int(nlines / 2 + 1)          # (upper) median line number
        print endtime - steplen "~" endtime >> "task-usage_normal.csv"
        system("awk NR==" midline " temp >> task-usage_normal.csv")
        close("temp")
    }
    BEGIN {
        starttime = 2070226000000   # first $1 of task.csv169 (head -1 task.csv169)
        steplen   = 1000000         # interval width; use 300000000 for the 5-minute spec
        endtime   = starttime + steplen
    }
    {
        # Close every interval the (time-sorted) input has moved past, then
        # bucket the current record — it is no longer dropped at boundaries.
        while ($1 >= endtime) {
            flush_interval()
            endtime += steplen
        }
        # Priority filter.  NOTE(review): the prototype used >= 1; the stated
        # requirement is >= 9 for task-usage_normal.csv and < 9 for
        # task-usage_lower.csv — adjust the threshold accordingly.
        if ($3 >= 1)
            print $2 >> endtime   # bucket file named after the interval end
    }
    END { flush_interval() }      # the last interval also gets its median
    ' task.csv169


    上面代码见下面附件

    源代码下载:aa.txt



    [root@localhost 给同光的数据]# ls
    date.tmp           task-event.csv9     task-usage.csv34                                                                 需求.txt
    task.csv169        task-event.csv.org  task-usage.csv3416                                                               需求.txt~
    task-event.csv     task-usage.csv      task-usage.csv.org
    task-event.csv34   task-usage.csv16    temp.txt
    task-event.csv349  task_usage.csv34
    [root@localhost 给同光的数据]#


    +++++++++++++++++++++++++
    -----------------------需求3.
    +++++++++++++++++++++++++

    cpu_by_time_prio.txt 内容如下,包含数千行,要求 根据 第一列 大小 排序:

        6.220000000000000000e+03 0.000000000000000000e+00 1.582216674804687500e+03 3.727216339111328100e+01 1.667797279357910200e+01 0.000000000000000000e+00 3.194387207031250000e+02 0.000000000000000000e+00 3.024159669876098600e+00 2.463942146301269500e+01 2.162471771240234400e+01 1.739675781250000000e+03 1.963522148132324200e+01 1.394829864501953100e+02
        2.980000000000000000e+03 0.000000000000000000e+00 3.961403503417968700e+02 4.579420471191406200e+01 4.849912643432617200e+01 0.000000000000000000e+00 3.451186523437500000e+02 0.000000000000000000e+00 2.817885160446167000e+00 2.533868217468261700e+01 4.820736694335937500e+01 1.439588256835937500e+03 2.016125297546386700e+01 1.407745971679687500e+02


    方法:

        # Zero-pad column 1 to a fixed width so plain lexicographic sort
        # orders it numerically — done in ONE awk pass instead of the
        # original cut + awk + cut + paste chain and its three temp files;
        # the output is identical ("%04d"-padded column 1, fields 2+ kept).
        # NOTE(review): %04d mis-sorts once values reach 10000; using
        # `sort -g -k1,1 cpu_by_time_prio.txt` instead would avoid the
        # padding entirely (but leaves column 1 unpadded in the result).
        awk '{ printf "%04d", $1; for (i = 2; i <= NF; i++) printf " %s", $i; print "" }' \
            cpu_by_time_prio.txt | sort > cpu_by_time_prio.txt-ok





    ++++++++++++++++++++++++++下面杂项,不用看
    //取中位数
    awk 'BEGIN{
        endtime=2070227000000;

        temp=2070228000000;
        cmd="wc "temp" >tmp";
        system(cmd);
        getline var < "tmp"; split(var,a," "); midline=int(a[1]/2+1);

        cmd="test -f "endtime"";
        ret=system(cmd);
        if(!ret){ print endtime}

        cmd="sort "endtime" >temp";
        system(cmd);
        "wc -l temp" | getline var; split(var,a," "); midline=int(a[1]/2+1);
        print midline;
        cmd="awk NR=="midline" temp > temp.txt";
        system(cmd);
        close("temp");
    }'
    cat temp.txt


    cat task.csv169 | grep 2070249000000
    cat task.csv169 | grep 2070249000000|cut -d' ' -f2|sort

        cmd="awk NR=="midline" temp > temp.txt";
        print midline;
        "sed -n 5p temp" | getline var; print var;

    FILENAME==ARGV[1] {
            array[array_size] = strtonum($0);
            array_size++;
    }
    awk 'BEGIN { "sort 2070235000000 >temp"; "wc -l temp" | getline var; split(var,a," "); midline=int(a[1]/2+1);  print midline}'

    awk 'BEGIN { "sort 2070235000000 >temp"; "wc -l temp" | getline var; split(var,a," "); midline=int(a[1]/2+1);  print midline; NR==midline{print} < temp}'<temp}'

    awk '{i=1;while((getline<"testdata")>0)print $0;}'

    awk 判断文件是否存在
    awk 'BEGIN{a=system("test -f /etc/passwdd");if(a) {print "file is exist."}}'


  • 相关阅读:
    用二重循环打印图形--------矩形 三角形 菱形
    数组的应用
    柳暗花明又一村的———for循环
    E-PUCK机器人-开始
    E-PUCK机器人-软件
    E-PUCK机器人-电池使用
    E-PUCK机器人-硬件
    E-PUCK机器人-FAQ
    E-PUCK机器人-Tiny Bootloader和其他开发工具
    E-PUCK机器人-例子
  • 原文地址:https://www.cnblogs.com/ztguang/p/12647028.html
Copyright © 2011-2022 走看看