zoukankan      html  css  js  c++  java
  • 春晚项目中的相关脚本

    根据昵称爬取id的数据预处理以及格式转换

    #!/bin/bash
    # Join the pre-processed nickname table with the parsed user records and
    # derive a name -> id map from the result.
    #
    # Inputs  (under $root_dir/source_data/, converted in place by dos2unix):
    #   data.dealed   tab-separated, joined on column 1; columns 2 and 3 are
    #                 carried over to the end of every output row
    #   user.out      tab-separated, joined on column 1; columns 2..8 are copied
    #                 (original author's column note: screenName/id/gender/
    #                 location/status_count/friendscount/followerscount/verfied)
    # Outputs (under $root_dir/result_data/, recreated on every run):
    #   user.all      one row per data.dealed row; "null" fills missing fields
    #   name_id.map   columns 1 and 2 of user.all
    #
    # Exit status: non-zero as soon as a required step fails.
    root_dir=$(pwd)
    source_dir="$root_dir/source_data"
    result_dir="$root_dir/result_data"
    out_all_file="$result_dir/user.all"
    out_map="$result_dir/name_id.map"

    die() {
        printf '%s\n' "$*" >&2
        exit 1
    }

    mkdir -p -- "$result_dir" || die "cannot create $result_dir"
    rm -rf -- "$out_all_file" "$out_map"

    ####### put user.out in the directory $root_dir/source_data/ #######
    #### the jar step is disabled; run it by hand when user.out must be rebuilt ####
    #java -cp "$root_dir"/src/weiboApi.jar com.bobo.parser.ParseUserInfo $root_dir
    echo "java processing is done"

    ######### build the joined table ########
    for input in "$source_dir/data.dealed" "$source_dir/user.out"; do
        [ -f "$input" ] || die "missing input file: $input"
        # A stray carriage return would become part of the last field, so
        # normalise the line endings first.
        dos2unix "$input" || printf 'warning: dos2unix failed on %s\n' "$input" >&2
    done

    # join only pairs lines correctly when both inputs are sorted on the join
    # field, so sort copies of them (same byte-wise collation as the join below)
    # instead of relying on the order the files happen to have.
    sorted_dir=$(mktemp -d) || die "mktemp failed"
    trap 'rm -rf -- "$sorted_dir"' EXIT
    LC_ALL=C sort -t $'\t' -k 1,1 "$source_dir/data.dealed" > "$sorted_dir/data.dealed" \
        || die "cannot sort $source_dir/data.dealed"
    LC_ALL=C sort -t $'\t' -k 1,1 "$source_dir/user.out" > "$sorted_dir/user.out" \
        || die "cannot sort $source_dir/user.out"

    # -a 1 keeps data.dealed rows that have no match in user.out.
    LC_ALL=C join -a 1 -t $'\t' -e "null" \
        -o 1.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,1.2,1.3 \
        "$sorted_dir/data.dealed" "$sorted_dir/user.out" > "$out_all_file" \
        || die "join failed"
    echo "generate all_info is done"

    cut -f 1,2 "$out_all_file" > "$out_map" || die "cannot write $out_map"
    echo "generate name_id.map is done"
    UserParser
    #!/bin/bash
    # Flatten every "<first_dir>/<second_dir>.txt" file below ./人气榜 into one
    # table, ./data.dealed: each input line is followed by a tab and the label
    # "<first_dir>    <second_dir>" of the file it came from.
    #
    # Side outputs left in the current directory: data.tmp (all input lines) and
    # category.tmp (one label per input line). All three files are rebuilt on
    # every run.
    tmp_dir=$(pwd)
    root_dir="$tmp_dir/人气榜"
    category_tmpfile="$tmp_dir/category.tmp"
    data_tmpfile="$tmp_dir/data.tmp"
    data_file="$tmp_dir/data.dealed"

    if [ ! -d "$root_dir" ]; then
        printf 'missing input directory: %s\n' "$root_dir" >&2
        exit 1
    fi

    rm -rf -- "$category_tmpfile" "$data_tmpfile" "$data_file"
    # Start from empty files so the steps below still work when nothing matches.
    : > "$category_tmpfile"
    : > "$data_tmpfile"

    # Globs instead of parsing ls output: names are never word-split, and only
    # real *.txt files inside real sub-directories are visited.
    for first_path in "$root_dir"/*/; do
        [ -d "$first_path" ] || continue
        first_dir=${first_path%/}
        first_dir=${first_dir##*/}
        for data_path in "$first_path"*.txt; do
            [ -f "$data_path" ] || continue
            second_dir=${data_path##*/}
            # Strip only the extension, so "a.b.txt" is labelled "a.b".
            second_dir=${second_dir%.txt}
            # awk ends every record with a newline, so a file without a final
            # newline cannot glue its last line onto the next file, and both
            # temp files always receive the same number of lines.
            awk '{ print }' "$data_path" >> "$data_tmpfile"
            CATEGORY_LABEL="$first_dir    $second_dir" \
                awk '{ print ENVIRON["CATEGORY_LABEL"] }' "$data_path" >> "$category_tmpfile"
        done
    done

    ####### remove Windows line endings (^M) before the columns are pasted #######
    dos2unix "$category_tmpfile"
    dos2unix "$data_tmpfile"
    paste -d $'\t' "$data_tmpfile" "$category_tmpfile" > "$data_file"
    dataPre
      1 #!/bin/sh
      2 cur_dir=`pwd`
      3 source_dir="/data/beiyou/minelab/fans_count_list"
      4 result_file=$cur_dir/result.data
      5 source_files="$source_dir/0.data $source_dir/1.data $source_dir/2.data $source_dir/3.data $source_dir/4.data $source_dir/5.data"
      6 for file in $source_files
      7 do
      8 #       echo $file
      9         count=`cat $file|cut -f 3| awk '{for(i=1;i<=NF;i++) print $i}' | sort | uniq | wc -l`
     10         echo $count
     11         echo $count>>$result_file
     12         echo "$file is done"
     13 done
    数组定义以及拆分每一行


     

    #!/bin/bash
    # Daily video word-frequency job.
    #
    # Steps:
    #   1. For every known <site>.retain file in a sub-directory of $source_dir,
    #      write tab-separated columns 2 and 4 (2 and 3 for pptv) to
    #      $pre_dir/<site>.retain.pre and run the SinaPre jar on it, passing
    #      $seg_dir/<site>.retain.seg, the stop-word list and $yinhang_dir/bin/.
    #   2. For every program named in conf/program.list, count the words in
    #      column 2 of the matching lines of ALL files in $seg_dir and write the
    #      counts, highest first, to $word_count_dir/<program>.tf.
    #   3. Collect the first $top_n lines of every .tf file in
    #      $outfile_final_dir/videoWordTf.topn.
    #
    # All four working directories are deleted and recreated on every run.
    root_dir=/data/beiyou/minelab
    #date=$(date -d "0 day ago" +%Y%m%d)
    date=20140119
    # the raw data dir
    source_dir=$root_dir/source_data/Video/$date
    # the predata (extract two columns) dir
    pre_dir=$root_dir/Src/liweibo/source_data/video/$date
    # the segment and remove stopwords dir
    seg_dir=$root_dir/Src/liweibo/result_data/video/$date
    # the total word_count dir
    word_count_dir=$root_dir/Src/liweibo/result_data/videoWordCount/$date
    # the yinhang dir
    yinhang_dir=$root_dir/Src/yinhang
    # the final dir, top n words of every program
    outfile_final_dir=$root_dir/VideoResult/$date
    # how many lines of each .tf file go into the final report
    top_n=20

    # Write column 2 and column $2 of the tab-separated file $1 to
    # $pre_dir/<name>.pre, then run SinaPre on it.
    # NOTE(review): the .pre name depends only on the file name, so the same
    # site file in two sub-directories overwrites the earlier one; confirm each
    # site appears once per day.
    prepare_and_segment() {
        local src=$1 text_col=$2
        local name=${src##*/}
        awk -F'\t' -v col="$text_col" '{ print $2 "\t" $col }' "$src" > "$pre_dir/$name.pre"
        java -cp "$yinhang_dir/forLiWeiBo.jar" SinaPre \
            "$pre_dir/$name.pre" "$seg_dir/$name.seg" \
            "$root_dir/source_data/Common/stopwords.list" "$yinhang_dir/bin/"
        echo "$name datapre and segment is done!"
    }

    rm -rf -- "$pre_dir" "$seg_dir" "$word_count_dir" "$outfile_final_dir"
    # -p: the dated leaf directories usually sit below parents that may not
    # exist yet on a fresh machine.
    mkdir -p -- "$pre_dir" "$seg_dir" "$word_count_dir" "$outfile_final_dir" || {
        echo "cannot create the working directories" >&2
        exit 1
    }

    for retain_file in "$source_dir"/*/*.retain; do
        [ -f "$retain_file" ] || continue
        case ${retain_file##*/} in
            aiqiyi.retain | youku.retain | souhu.retain)
                prepare_and_segment "$retain_file" 4
                ;;
            pptv.retain)
                prepare_and_segment "$retain_file" 3
                ;;
        esac
    done

    echo "begin to calculate word count..."
    # program.list is read as whitespace-separated names, as before, but
    # without exposing the names to pathname expansion.
    mapfile -t program_list < <(
        awk '{ for (i = 1; i <= NF; i++) print $i }' "$root_dir/Src/liweibo/conf/program.list"
    )
    seg_files=("$seg_dir"/*)
    if [ -e "${seg_files[0]}" ]; then
        for program in "${program_list[@]}"; do
            # Count over all segmented files at once. Appending one sorted list
            # per file would repeat words and leave the file unsorted, so the
            # final "head" would only see the first site's words.
            grep -h -e "$program" -- "${seg_files[@]}" \
                | awk -F'\t' '{ n = split($2, words, " "); for (i = 1; i <= n; i++) print words[i] }' \
                | sort | uniq -c | sort -r -n > "$word_count_dir/$program.tf"
        done
    fi

    echo "begin to generate top n..."
    tf_files=("$word_count_dir"/*.tf)
    if [ -e "${tf_files[0]}" ]; then
        head -n "$top_n" -- "${tf_files[@]}" > "$outfile_final_dir/videoWordTf.topn"
    else
        echo "no word count files were produced" >&2
    fi
    视频词频统计
     #!/bin/bash
    # Pre-processing and word segmentation for one day of video data.
    #
    # Usage: <script> TOPN
    #   TOPN  number of top words wanted; it is stored in $topN but not used by
    #         the part of the script shown here.
    #
    # For every known <site>.retain file in a sub-directory of $source_dir, write
    # tab-separated columns 2 and 4 (2 and 3 for pptv) to
    # $pre_dir/<site>.retain.pre and run the SinaPre jar on it, passing
    # $seg_dir/<site>.retain.seg, the stop-word list and $yinhang_dir/bin/.
    # All four working directories are deleted and recreated on every run.
    root_dir=/data/beiyou/minelab
    #date=$(date -d "0 day ago" +%Y%m%d)
    date=20140119
    # the raw data dir
    source_dir=$root_dir/source_data/Video/$date
    # the predata (extract two columns) dir
    pre_dir=$root_dir/Src/liweibo/source_data/video/$date
    # the segment and remove stopwords dir
    seg_dir=$root_dir/Src/liweibo/result_data/video/$date
    # the total word_count dir
    word_count_dir=$root_dir/Src/liweibo/result_data/videoWordCount/$date
    # the yinhang dir
    yinhang_dir=$root_dir/Src/yinhang
    # the final dir, top n words of every program
    outfile_final_dir=$root_dir/VideoResult/$date

    ######## the top num of word #####
    if [ "$#" -ne 1 ]; then
        # Diagnostics belong on stderr so they never mix with normal output.
        echo "error parameters!" >&2
        echo "usage: ${0##*/} TOPN" >&2
        exit 1
    fi
    topN=$1

    # Write column 2 and column $2 of the tab-separated file $1 to
    # $pre_dir/<name>.pre, then run SinaPre on it.
    prepare_and_segment() {
        local src=$1 text_col=$2
        local name=${src##*/}
        awk -F'\t' -v col="$text_col" '{ print $2 "\t" $col }' "$src" > "$pre_dir/$name.pre"
        java -cp "$yinhang_dir/forLiWeiBo.jar" SinaPre \
            "$pre_dir/$name.pre" "$seg_dir/$name.seg" \
            "$root_dir/source_data/Common/stopwords.list" "$yinhang_dir/bin/"
        echo "$name datapre and segment is done!"
    }

    rm -rf -- "$pre_dir" "$seg_dir" "$word_count_dir" "$outfile_final_dir"
    # -p: the dated leaf directories may sit below parents that do not exist yet.
    mkdir -p -- "$pre_dir" "$seg_dir" "$word_count_dir" "$outfile_final_dir" || {
        echo "cannot create the working directories" >&2
        exit 1
    }

    # Glob instead of parsing ls output; anything that is not one of the four
    # known site files is ignored.
    for retain_file in "$source_dir"/*/*.retain; do
        [ -f "$retain_file" ] || continue
        case ${retain_file##*/} in
            aiqiyi.retain | youku.retain | souhu.retain)
                prepare_and_segment "$retain_file" 4
                ;;
            pptv.retain)
                prepare_and_segment "$retain_file" 3
                ;;
        esac
    done
    视频统计1

    vi的一些技巧:

    去除空行

    :1,$g/^$/d

  • 相关阅读:
    Spring AOP
    TestNG配合ant脚本进行单元测试
    TestNG配合catubuter统计单元测试的代码覆盖率
    junit配合catubuter统计单元测试的代码覆盖率
    TestNG离线安装步骤
    spring 整合redis集群中使用@autowire无效问题的解决办法
    @Repository、@Service、@Controller 和 @Component
    用VMware克隆CentOS 6.5如何进行网络设置
    centos 6.5 dhcp桥接方式上网络设置
    centos 6.5 关闭防火墙
  • 原文地址:https://www.cnblogs.com/bobodeboke/p/3523483.html
Copyright © 2011-2022 走看看