1,对原始数据.data进行过滤,利用java实现
package com.bobo.DataPre;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import com.bobo.util.Constants;
import com.bobo.util.Fenci;
import com.bobo.util.StopwordsRemover;
import com.bobo.util.StringUtil;
import com.bobo.util.UtilityForRemoveAtName;

/**
 * Filters the raw per-program weibo files down to the posts that are relevant
 * to each program, then segments the text, removes stop words and strips
 * {@code @user} mentions.
 *
 * <p>For every program in {@code Constants.ProgramNameList} one input file
 * ({@code <program>.title.uniqByWeiboId}) is read and one output file
 * ({@code <program>.filter.fenci}) is written.
 */
public class ProgramDataFilter {

    /**
     * Runs the filter for every configured program and prints the elapsed time
     * after each one.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The timer is started once, so the printed value is the cumulative
        // time since the start of the run, not the time for a single program.
        long start = System.currentTimeMillis();
        ProgramDataFilter pre = new ProgramDataFilter();

        for (int i = 0; i < Constants.ProgramNameList.length; i++) {
            String programName = Constants.ProgramNameList[i];
            String[] keywords = Constants.keywordsList[i];
            String inFilePath = Constants.TitleDir + File.separator
                    + programName + ".title.uniqByWeiboId";
            String outFilePath = Constants.FilterDir + File.separator
                    + programName + ".filter.fenci";

            pre.dataSetAndRmStop(inFilePath, outFilePath, programName, keywords);

            long end = System.currentTimeMillis();
            System.out.println(programName + "数据预处理,分词、去处停用时、去除@花费的时间为:"
                    + (end - start) / 1000);
        }
    }

    /**
     * Decides whether a weibo post is about the given program.
     *
     * @param weiboText   text of the post
     * @param programName name of the program; the post must contain it
     * @param filterWords extra keywords for the program; when empty, the
     *                    program name alone is considered sufficient
     * @return {@code true} if the post contains the program name and either
     *         there are no keywords, or the name appears inside 《》, or at
     *         least one keyword appears in the post
     */
    public boolean isRelative(String weiboText, String programName,
            String[] filterWords) {
        // The post must mention the program name.
        if (!weiboText.contains(programName)) {
            return false;
        }
        // For unambiguous program names the name alone is enough.
        if (filterWords.length < 1) {
            return true;
        }
        // A name wrapped in title marks is treated as an explicit reference.
        if (weiboText.contains("《" + programName + "》")) {
            return true;
        }
        // Otherwise the post must also contain one of the keywords
        // (per the original comment: actor names or the program category).
        for (String keyword : filterWords) {
            if (weiboText.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Step one of the pipeline: keeps the relevant lines of one input file,
     * strips {@code @user} mentions, segments the text and removes stop words,
     * then writes {@code <first column> <processed text>} to the output file.
     *
     * <p>Lines that do not split into exactly three fields, have an empty text
     * field, are not relevant to the program, or become empty after processing
     * are skipped. Failures are reported on stdout/stderr and not rethrown.
     *
     * @param inFilePath  path of the file to read
     * @param outFilePath path of the file to write (overwritten)
     * @param programName program the file belongs to
     * @param keywords    extra keywords passed to {@link #isRelative}
     */
    private void dataSetAndRmStop(String inFilePath, String outFilePath,
            String programName, String[] keywords) {
        Fenci fenci = new Fenci();
        fenci.initial();
        StopwordsRemover stop = new StopwordsRemover();
        stop.setStoppingListSet(stop.loadStoppingListSet("./conf/stopwords.list"));

        BufferedReader br = null;
        PrintWriter pw = null;
        try {
            br = new BufferedReader(new FileReader(inFilePath));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outFilePath)));

            String line;
            while ((line = br.readLine()) != null) {
                // NOTE(review): the field separator appears as a single space
                // here; the surrounding pipeline suggests it may originally
                // have been a tab - confirm against the input files.
                String[] lineArr = line.split(" ");
                if (lineArr.length != 3) {
                    continue;
                }
                String weiboText = lineArr[1];
                if (StringUtil.isNullOrEmpty(weiboText)) {
                    continue;
                }
                if (!isRelative(weiboText, programName, keywords)) {
                    continue;
                }

                String withoutNames = UtilityForRemoveAtName.removeName(weiboText);
                String segmented = fenci.testICTCLAS_ParagraphProcess(withoutNames);
                String fenciString = stop.removeStoppingWords(segmented);
                if (!StringUtil.isNullOrEmpty(fenciString)) {
                    pw.println(lineArr[0] + " " + fenciString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            // NOTE(review): this message names RemoveUrlUtil.java and looks
            // copy-pasted from another class; kept unchanged.
            System.out.println("RemoveUrlUtil.java文件去除链接出现异常");
        } finally {
            // Either stream may still be null if opening a file failed; guard
            // both so that the original error is not masked by a
            // NullPointerException and the other stream still gets closed.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (pw != null) {
                pw.close();
            }
        }
    }
}
生成的文件格式是
用户id “ ” 微博文本
2,提取每个节目下的用户列表,并将用户的列表和用户的profile进行对应
#!/bin/bash
# For every program, count how many filtered posts each user wrote and join
# that count with the user's profile record.
#
# Input : $data_dir/<program>.filter.fenci   (first field = user id)
# Input : $user_file                         (first field = user id)
# Output: $dest_dir/<program>_userid_times_profile.map
#         (user id, number of posts, then the remaining profile fields)

data_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/data
user_file=/home/minelab/liweibo/springNightForLargePaper/springNightUser/sina_user.data
dest_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram

# Iterate over the real input files instead of parsing `ls` output, so that
# unrelated files in $data_dir and program names containing dots are handled.
for fenci_file in "$data_dir"/*.filter.fenci
do
    # An unmatched glob stays literal; skip it.
    [ -e "$fenci_file" ] || continue

    program=$(basename "$fenci_file" .filter.fenci)
    times_map="$dest_dir/${program}_userid_times.map"
    profile_map="$dest_dir/${program}_userid_times_profile.map"

    # user id -> number of posts, sorted by user id as required by join.
    # awk's default field splitting already ignores the leading padding that
    # `uniq -c` emits, so no sed clean-up is needed.
    # NOTE(review): the output separator is written as a single space, as in
    # the original listing; it may originally have been a tab - confirm
    # against $user_file.
    awk '{print $1}' "$fenci_file" \
        | sort \
        | uniq -c \
        | awk '{print $2" "$1}' \
        | sort > "$times_map"

    # NOTE(review): join expects both inputs sorted on the join field;
    # $user_file is assumed to be sorted by user id already - confirm.
    join -t ' ' "$times_map" "$user_file" > "$profile_map"

    rm -f "$times_map"
    echo "$program is done!"
done
echo done!
3,构建节目id" "评论该节目的用户数" "评论该节目的用户列表
#!/bin/bash
# Builds programid_userlist.map with one line per program:
#   program id, number of users who commented on it, list of those user ids
#   (user ids are separated by spaces).
# A user who commented on a program several times is counted once; this relies
# on each input file holding one line per user.
#
# Program ids are assigned 1, 2, 3, ... in the glob (sorted) order of the
# files in $program_dir.

program_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram
inter_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/inter_data
result_file=$inter_dir/programid_userlist.map
tmp_file=$inter_dir/programid_userlist.tmp

rm -f "$result_file" "$tmp_file"

i=1
for program_file in "$program_dir"/*
do
    # Skip an unmatched glob and anything that is not a regular file.
    [ -f "$program_file" ] || continue

    # All user ids of this program on one line, separated by spaces.
    user_list=$(awk '{printf("%s ", $1)}' "$program_file")
    line_num=$(wc -l < "$program_file" | awk '{print $1}')

    # ${user_list% } drops the trailing separator left by the awk printf.
    # NOTE(review): field separators are written as single spaces, as in the
    # original listing; they may originally have been tabs - confirm.
    printf '%s %s %s\n' "$i" "$line_num" "${user_list% }" >> "$tmp_file"
    i=$((i + 1))
done

# Sort by popularity: programs with the most users first.
sort -t ' ' -k 2,2 -n -r "$tmp_file" > "$result_file"
rm -f "$tmp_file"
echo "done"
4,构建用户id" "该用户评论的节目数目" "该用户评论的节目列表
#!/usr/bin/python
# Inverts programid_userlist.map (program -> users) into a user -> programs
# listing.
import sys


def main():
    """Read ``programid_userlist.map`` and write the inverted map to ``tmp``.

    Each input line holds a program id, a user count and the list of user ids
    that commented on the program. Each output line holds a user id, the
    number of distinct programs that user commented on, and those program ids.

    Lines with fewer than three whitespace-separated fields are skipped.
    """
    inputfile = "programid_userlist.map"
    outputfile = "tmp"

    user_program = {}
    with open(inputfile, 'r') as fin:
        for line in fin:
            # Split off the first two fields on any whitespace and keep the
            # rest intact, so every user id on the line is seen whether the
            # columns are separated by tabs or by spaces.
            fields = line.split(None, 2)
            if len(fields) < 3:
                continue
            program_id = fields[0]
            for userid in fields[2].split():
                user_program.setdefault(userid, set()).add(program_id)

    with open(outputfile, 'w') as fout:
        for userid in user_program:
            # Sort so that the output is reproducible between runs.
            programs = sorted(user_program[userid])
            # NOTE(review): separators are written as single spaces, as in the
            # original listing; they may originally have been tabs - confirm.
            fout.write("%s %s %s\n" % (userid, len(programs), ' '.join(programs)))


if __name__ == "__main__":
    main()
5,简单根据词频统计每个节目下的热门词汇
#!/bin/sh
# Ranks the words of every program by raw frequency (hot words per program).
#
# Input : $data_dir/<program>.filter.fenci   (user id followed by the
#         segmented words of one post)
# Output: $result_dir/<program>.topwords     (word, count), most frequent first
# NOTE(review): the original header mentions "1000", but no limit is applied
# anywhere in the pipeline - confirm whether a `head -n 1000` was intended.

data_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/data
result_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/topWords

# Iterate over the real input files instead of parsing `ls` output.
for fenci_file in "$data_dir"/*.filter.fenci
do
    # An unmatched glob stays literal; skip it.
    [ -e "$fenci_file" ] || continue

    file=$(basename "$fenci_file" .filter.fenci)

    # Emit every word after the user id on its own line so that all words of a
    # post are counted, not only the first one. Default awk field splitting
    # handles both tab- and space-separated input.
    awk '{for (i = 2; i <= NF; i++) print $i}' "$fenci_file" \
        | sort \
        | uniq -c \
        | sort -r -n \
        | awk '{print $2" "$1}' > "$result_dir/$file.topwords"

    echo "$file is done!"
done
echo done!
6,根据出现次数统计每个节目对应用户的标签分布