zoukankan      html  css  js  c++  java
  • 搜索关键词统计

    #!/bin/bash
    # Search keyword statistics: analyses the word logs found on hosts 171 and 173.

    # Remote directory holding one "<YYYY-MM-DD>.txt" word log per day.
    sourceDir="/export/manager/kmsearch/log/wordlog"
    # Local file listing the daily log names to process, one per line.
    tmpDateFile="/tmp/search_wordlog_tmp.txt"
    # Local working directory for downloaded logs and results.
    tmpSearchWordlog="/tmp/search_wordlog"


    # Work out which daily log files are needed (inclusive date range).
    # All date arithmetic is done in UTC: stepping a local-midnight epoch by
    # 86400 seconds repeats or skips a calendar day in time zones with DST.
    startDate="2015-05-04"
    startTimeStamp=$(date -u -d "$startDate" +%s)

    endDate="2015-12-31"
    endTimeStamp=$(date -u -d "$endDate" +%s)

    # Truncate without writing a stray blank line into the list.
    : > "$tmpDateFile"
    for ((i = startTimeStamp; i <= endTimeStamp; i += 86400)); do
      dateStr=$(date -u -d "@$i" '+%Y-%m-%d')
      printf '%s.txt\n' "$dateStr" >> "$tmpDateFile"
    done
    
    # Download every listed daily log from 10.15.200.171 and 10.15.200.173
    # into $tmpSearchWordlog/171/ and $tmpSearchWordlog/173/ respectively.
    for hostSuffix in 171 173; do
      echo "downloading from ${hostSuffix}..."
      # scp does not create the destination directory; make sure it exists.
      mkdir -p "$tmpSearchWordlog/$hostSuffix"
      # The list is read on fd 3 so that scp/ssh cannot swallow it via stdin.
      while IFS= read -r tmpStr <&3; do
        # The date list may contain blank lines; there is nothing to fetch for them.
        [[ -n "$tmpStr" ]] || continue
        scp "root@10.15.200.${hostSuffix}:${sourceDir}/${tmpStr}" \
          "$tmpSearchWordlog/$hostSuffix/"
      done 3< "$tmpDateFile"
    done
    
    
    # Concatenate all downloaded daily logs into one file, in date order,
    # host 171 before host 173 for each day.
    echo "combine all data... "
    # The output is opened once and truncated to zero bytes. Seeding it with
    # `echo ''` would add a blank record that the later statistics count as an
    # empty keyword.
    while IFS= read -r tmpStr; do
      # The date list may contain blank lines; skip them.
      [[ -n "$tmpStr" ]] || continue
      # cat reports a missing daily log on stderr and carries on with the rest.
      cat -- "$tmpSearchWordlog/171/$tmpStr" "$tmpSearchWordlog/173/$tmpStr"
    done < "$tmpDateFile" > "$tmpSearchWordlog/alldata.txt"
    
    
    # Statistics over all data in a single pass (disabled)
    #cat $tmpSearchWordlog/alldata.txt | awk -F ',' '{print $2","$6}' | sort | uniq | awk -F ',' '{print $1}'| sort | uniq -c | sort -rn | head -100 | awk '{print $1"	"$2" "$3}' > $tmpSearchWordlog/allTop.txt
    #exit
    
    
    # Split the merged log into chunks of 3,000,000 lines (under 200 MB each,
    # per the original note) and count keywords in every chunk in parallel.
    # Abort if the working directory is unreachable: the clean-up below
    # deletes files relative to the current directory.
    cd "$tmpSearchWordlog" || exit 1
    # Remove chunks and per-chunk results left over from a previous run.
    find . -name 'part.alldata.txt*' -exec rm -rf -- {} +
    split -l3000000 alldata.txt part.alldata.txt
    for tmpStr in ./part.alldata.txt*; do
      # An unmatched glob stays literal (e.g. when alldata.txt was empty).
      [[ -e "$tmpStr" ]] || continue
      # Per chunk: keep comma-separated fields 2 and 6, drop duplicate pairs,
      # count how often each field-2 value remains, keep the 900 most frequent
      # and write them as "<count><TAB><word> <word>".
      # NOTE(review): field 2 looks like the keyword and field 6 like a
      # per-searcher identifier - confirm against the log format.
      awk -F ',' '{print $2","$6}' "$tmpStr" \
        | sort \
        | uniq \
        | awk -F ',' '{print $1}' \
        | sort \
        | uniq -c \
        | sort -rn \
        | head -900 \
        | awk '{print $1"\t"$2" "$3}' > "${tmpStr}_Tops.txt" &
    done

    # Block until every background pipeline has finished. A fixed sleep could
    # let the next step read partially written *_Tops.txt files.
    echo 'waiting for the per-chunk counts to finish...'
    wait
    
    # Merge the per-chunk top lists found under the current directory.
    # Outputs:
    #   partsAllTops.txt  - "<keyword><TAB><count>" lines, lower-cased and sorted
    #                       (only the first whitespace-separated word of each
    #                       keyword is kept)
    #   statistic.result  - "<total><TAB><keyword>" lines, highest total first,
    #                       without entries containing "www." or "http:"
    aggregate_keyword_tops() {
      # Gather all per-chunk results as keyword/count pairs.
      find . -name 'part.alldata.txt*_Tops.txt' -exec cat {} + \
        | awk '{print $2 "\t" $1}' \
        | tr 'A-Z' 'a-z' \
        | sort > partsAllTops.txt

      # Sum the counts per keyword and drop URL-like entries. -F makes the
      # patterns literal, so the dot in "www." no longer matches any character.
      awk '{total[$1] += $2} END {for (word in total) print total[word] "\t" word}' \
        partsAllTops.txt \
        | sort -rn \
        | grep -vF -e 'www.' -e 'http:' > statistic.result
    }
    aggregate_keyword_tops
  • 相关阅读:
    Linux下chkconfig命令详解
    几种主流的快照技术
    HANA内存数据库与oracle数据库的性能比较
    计算机网络知识汇总
    bzoj1211: [HNOI2004]树的计数 prufer序列裸题
    1003: [ZJOI2006]物流运输 最短路+dp
    HDU
    2243: [SDOI2011]染色 树链剖分+线段树染色
    bzoj1036: [ZJOI2008]树的统计Count 树链剖分
    bzoj1042: [HAOI2008]硬币购物 dp+容斥
  • 原文地址:https://www.cnblogs.com/bandbandme/p/5156947.html
Copyright © 2011-2022 走看看