zoukankan      html  css  js  c++  java
  • 对节目微博进行强过滤之后的处理

    1,对原始数据.data进行过滤,利用java实现

    package com.bobo.DataPre;
    
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.PrintWriter;
    
    import com.bobo.util.Constants;
    import com.bobo.util.Fenci;
    import com.bobo.util.StopwordsRemover;
    import com.bobo.util.StringUtil;
    import com.bobo.util.UtilityForRemoveAtName;
    
    /**
     * Filters the per-program weibo files and writes, for every post that passes
     * the relevance filter, its first column together with the processed text.
     *
     * <p>Input files are read from {@code Constants.TitleDir} and results are
     * written to {@code Constants.FilterDir}; one input/output pair is handled
     * per entry of {@code Constants.ProgramNameList}.
     */
    public class ProgramDataFilter {

        /**
         * Processes every program listed in {@code Constants.ProgramNameList},
         * pairing it with the keyword array at the same index of
         * {@code Constants.keywordsList}.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            long start = System.currentTimeMillis();
            ProgramDataFilter pre = new ProgramDataFilter();
            for (int i = 0; i < Constants.ProgramNameList.length; i++) {
                String programName = Constants.ProgramNameList[i];
                String[] keywords = Constants.keywordsList[i];
                String inFilePath = Constants.TitleDir + File.separator + programName
                        + ".title.uniqByWeiboId";
                String outFilePath = Constants.FilterDir + File.separator + programName
                        + ".filter.fenci";
                pre.dataSetAndRmStop(inFilePath, outFilePath, programName, keywords);
                long end = System.currentTimeMillis();
                // "start" is taken once before the loop, so the printed value is the
                // time elapsed since the whole run began, in whole seconds.
                System.out.println(programName + "数据预处理,分词、去处停用时、去除@花费的时间为:"
                        + (end - start) / 1000);
            }
        }

        /**
         * Decides whether a post is about the given program.
         *
         * @param weiboText   text of the post
         * @param programName program name that must occur in the text
         * @param filterWords extra keywords for ambiguous program names; a
         *                    {@code null} or empty array means the name alone is enough
         * @return {@code true} when the post is considered related to the program
         */
        public boolean isRelative(String weiboText, String programName,
                String[] filterWords) {
            // The program name itself is mandatory.
            if (!weiboText.contains(programName)) {
                return false;
            }
            // Unambiguous names need no further evidence.
            if (filterWords == null || filterWords.length < 1) {
                return true;
            }

            // NOTE(review): the name is wrapped in two EMPTY strings, so this test
            // repeats the check above and is always true here, which makes the
            // keyword loop below unreachable. Presumably the delimiters (e.g. title
            // marks or '#') were lost - confirm the intended characters before
            // changing it; behaviour is left as it was.
            if (weiboText.contains("" + programName + "")) {
                return true;
            }

            // Name plus at least one keyword.
            for (String keyword : filterWords) {
                if (weiboText.contains(keyword)) {
                    return true;
                }
            }

            return false;
        }

        /**
         * Reads {@code inFilePath} line by line and writes the processed text of
         * every related post to {@code outFilePath}.
         *
         * <p>Only lines with exactly three tab-separated columns are used; the
         * second column is the post text. Each output line is the first column, a
         * tab, and the processed text. Posts whose processed text is empty are
         * dropped. The text goes through {@code UtilityForRemoveAtName.removeName},
         * {@code Fenci.testICTCLAS_ParagraphProcess} and
         * {@code StopwordsRemover.removeStoppingWords}, in that order (presumably
         * @-name removal, word segmentation and stop-word removal - verify against
         * those helpers).
         *
         * <p>Failures are reported on the console and do not propagate.
         *
         * @param inFilePath  file to read
         * @param outFilePath file to (over)write
         * @param programName program name used by the relevance filter
         * @param keywords    keywords used by the relevance filter
         */
        private void dataSetAndRmStop(String inFilePath, String outFilePath,
                String programName, String[] keywords) {
            BufferedReader br = null;
            PrintWriter pw = null;
            Fenci fenci = new Fenci();
            fenci.initial();
            StopwordsRemover stop = new StopwordsRemover();
            stop.setStoppingListSet(stop
                    .loadStoppingListSet("./conf/stopwords.list"));

            try {
                br = new BufferedReader(new FileReader(inFilePath));
                pw = new PrintWriter(new BufferedWriter(new FileWriter(outFilePath)));

                String line;
                while ((line = br.readLine()) != null) {
                    String[] lineArr = line.split("\t");
                    if (lineArr.length != 3) {
                        continue;
                    }
                    String weiboText = lineArr[1];
                    if (StringUtil.isNullOrEmpty(weiboText)) {
                        continue;
                    }
                    if (!isRelative(weiboText, programName, keywords)) {
                        continue;
                    }

                    String fenciString = stop.removeStoppingWords(fenci
                            .testICTCLAS_ParagraphProcess((UtilityForRemoveAtName
                                    .removeName(weiboText))));
                    if (!StringUtil.isNullOrEmpty(fenciString)) {
                        pw.println(lineArr[0] + "\t" + fenciString);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("ProgramDataFilter处理文件出现异常:" + inFilePath);
            } finally {
                // Either stream is still null when opening it failed; closing it
                // unguarded would throw a NullPointerException that hides the
                // original error.
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if (pw != null) {
                    pw.close();
                }
            }
        }

    }
    过滤和分词

    生成的文件格式是

    用户id "\t" 微博文本(两列之间以制表符分隔,微博文本为分词后的结果)

    2,提取每个节目下的用户列表,并将用户的列表和用户的profile进行对应 

    #!/bin/bash

    # For every program, count how many filtered posts each user id has and join
    # that list with the user profile file.
    # Result: $dest_dir/<program>_userid_times_profile.map, one line per user:
    #   user id <TAB> number of posts <TAB> remaining columns of $user_file
    # NOTE: join needs both inputs ordered on the join field, so $user_file must
    # already be sorted by user id with the same collation as `sort` below.

    data_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/data
    user_file=/home/minelab/liweibo/springNightForLargePaper/springNightUser/sina_user.data
    dest_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram

    # Program name = file name up to the first dot; sort -u keeps one entry per
    # program even when the directory holds several files for it.
    program_list=$(ls "$data_dir" | awk -F'.' '{print $1}' | sort -u)

    for program in $program_list
    do
        times_map="$dest_dir/${program}_userid_times.map"
        profile_map="$dest_dir/${program}_userid_times_profile.map"

        rm -f "$profile_map"
        # first column -> occurrences per id -> "id<TAB>count", ordered by id
        awk -F'\t' '{print $1}' "$data_dir/$program.filter.fenci" \
            | sort | uniq -c \
            | awk '{print $2"\t"$1}' \
            | sort > "$times_map"
        join -t $'\t' "$times_map" "$user_file" > "$profile_map"
        rm -f "$times_map"
        echo "$program is done!"
    done
    echo done!
    提取每个节目的用户id列表和用户profile

     3,构建节目id "\t" 评论该节目的用户数 "\t" 评论该节目的用户列表(列之间以制表符分隔,用户id之间以空格分隔)

    #!/bin/bash

    # Builds programid_userlist.map, one line per program:
    #   program id <TAB> number of users <TAB> user ids separated by single spaces
    # Every line of a per-program file counts as one user, so a user who
    # commented on a program several times is counted once (the per-program
    # files hold one line per user id).
    # The columns MUST be tab separated: the sort below and the follow-up Python
    # step both split on tabs.
    program_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram
    inter_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/inter_data
    result_file=$inter_dir/programid_userlist.map
    tmp_file=$inter_dir/programid_userlist.tmp
    program_list=$(ls "$program_dir")

    rm -f "$result_file" "$tmp_file"
    i=1
    for program in $program_list
    do
        # first column of every line, joined with single spaces, no trailing space
        user_list=$(awk -F'\t' '{printf("%s%s", (NR > 1 ? " " : ""), $1)}' "$program_dir/$program")
        line_num=$(awk 'END {print NR}' "$program_dir/$program")
        printf '%s\t%s\t%s\n' "$i" "$line_num" "$user_list" >> "$tmp_file"
        i=$((i + 1))
    done


    # Order by popularity: most users first (numeric, descending, on column 2 only).
    sort -t $'\t' -k 2,2 -r -n "$tmp_file" > "$result_file"
    rm -f "$tmp_file"
    echo "done"
    构建节目—用户矩阵

     4,构建用户id "\t" 该用户评论的节目数目 "\t" 该用户评论的节目列表(列之间以制表符分隔,节目id之间以空格分隔)

    #!/usr/bin/python
    import sys 
    
    def main():
        """Invert the program -> users map into a user -> programs map.

        Reads ``programid_userlist.map`` from the current directory. Each line is
        ``program id<TAB>user count<TAB>user ids separated by spaces``.

        Writes ``tmp`` in the current directory, one line per user:
        ``user id<TAB>number of programs<TAB>program ids separated by spaces``.
        Users and program ids are written in sorted (string) order so that the
        output is reproducible.
        """
        inputfile = "programid_userlist.map"
        outputfile = "tmp"

        user_program = {}
        with open(inputfile, 'r') as fin:
            for line in fin:
                # Only drop the line terminator: a full strip() would also remove
                # the trailing tab of a program that has an empty user list.
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < 3:
                    # No user column at all - nothing to record for this program.
                    continue
                program_id = fields[0].strip()
                # split() without an argument ignores repeated or trailing spaces,
                # so no empty user id is ever recorded.
                for userid in fields[2].split():
                    user_program.setdefault(userid, set()).add(program_id)

        with open(outputfile, 'w') as fout:
            for userid in sorted(user_program):
                programs = sorted(user_program[userid])
                fout.write("%s\t%s\t%s\n" % (userid, len(programs), ' '.join(programs)))
    
    
    # Run the conversion only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == "__main__":
         main()
    构建用户——节目矩阵

     5,简单根据词频统计每个节目下的热门词汇

    #!/bin/sh

    # Word-frequency list per program, most frequent word first.
    # Input : $data_dir/<program>.filter.fenci, second tab-separated column holds
    #         the space-separated words of one post.
    # Output: $result_dir/<program>.topwords, one "word<TAB>count" line per word.
    # NOTE(review): the original header mentions 1000 hot words, but no limit was
    # ever applied; add `head -n 1000` before the redirect if that is wanted.
    data_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/data
    result_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/topWords
    # Program name = file name up to the first dot, each program once.
    file_list=$(ls "$data_dir" | awk -F'.' '{print $1}' | sort -u)

    for file in $file_list
    do
        # One word per line (tr instead of a raw newline inside a sed replacement,
        # which sed rejects), drop empty tokens, then count and rank.
        awk -F'\t' '{print $2}' "$data_dir/$file.filter.fenci" \
            | tr -s ' ' '\n' \
            | sed '/^$/d' \
            | sort | uniq -c | sort -r -n \
            | awk '{print $2"\t"$1}' > "$result_dir/$file.topwords"
        echo "$file is done!"
    done
    echo done!
    topwords

    6,根据出现次数统计每个节目对应用户的标签分布

  • 相关阅读:
    web安全与防御
    网页的分段传输与渲染
    关于promise的详细讲解
    mvc/mvvm小小的总结
    瀑布流布局:从上往下布局方式(——)往同级元素中高度最低的元素后面排列
    页面刷新-导航高亮不变
    safari浏览器会将时间、自动识别为号码(包括电话号码、qq号码全部标注为蓝色)
    fullpage.js配合bootstrap制作响应式网站
    bootstrap ----tooltip
    范围选择器,jquery.range插件使用
  • 原文地址:https://www.cnblogs.com/bobodeboke/p/3836982.html
Copyright © 2011-2022 走看看