一. 信息检索技术简述
二. TF-IDF模型
1. 概念
1)词w在文档d中的词频tf(Term Frequency),指词w在文档d中出现的频率,即词w在文档d中出现的次数count(w, d)与文档d的总词数size(d)之比:
$\mathrm{tf}(w, d) = \dfrac{\mathrm{count}(w, d)}{\mathrm{size}(d)}$
2)词w在整个文档集合D中的逆向文档频率idf(Inverse Document Frequency),即文档总数n与包含词w的文档数docs(w, D)之比的对数:
$\mathrm{idf}(w) = \log \dfrac{n}{\mathrm{docs}(w, D)}$
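3)由词w[1], …, w[k]组成的查询q与文档d的匹配度tf-idf(q, d),等于各词tf-idf值之和:
$\mathrm{tf\text{-}idf}(q, d) = \sum_{i=1}^{k} \mathrm{tf\text{-}idf}(w_i, d) = \sum_{i=1}^{k} \mathrm{tf}(w_i, d) \cdot \mathrm{idf}(w_i)$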
import os

import nltk
from nltk.corpus import *


def mid_text_dir(corpus_root=r"D:\segfile"):
    """Open the segmented-text directory as an NLTK plaintext corpus.

    Args:
        corpus_root: Directory containing the corpus files. The original
            literal was ``r"D:segfile"`` (a drive-relative path); the raw
            string prefix and the ``D:/segfile/`` path used by the rest of
            the script indicate the separator was lost, so the default now
            points at the directory on the drive root.

    Returns:
        A ``PlaintextCorpusReader`` that matches every file under
        ``corpus_root`` (pattern ``'.*'``).
    """
    return PlaintextCorpusReader(corpus_root, '.*')


def getTextTermFreq(wordlists, path='D:/mi-result/'):
    """Write a per-file term-frequency report for every file in a corpus.

    For each file id in ``wordlists`` a report ``<path><file id>.txt`` is
    written containing a summary line followed by one record per distinct
    word: the word, its raw count and its relative frequency
    (count / total number of words in the file).

    Args:
        wordlists: Corpus reader exposing ``fileids()`` and ``words(fileid)``.
        path: Output directory prefix; it is concatenated directly with the
            file id, so it must end with a path separator.
    """
    if path:
        # The report directory was never created by the script; make sure it
        # exists instead of failing on the first open().
        os.makedirs(path, exist_ok=True)

    for file_id in wordlists.fileids():
        words = wordlists.words(file_id)
        total = len(words)  # hoisted: constant for every record of this file
        vocab = set(words)
        tip = "the text has %d different words and the sum of vocab is %d" % (len(vocab), total)
        print(tip)

        fdist = nltk.FreqDist(words)
        # NOTE(review): records are separated by a single space and the
        # banner ends in a space; these look like newline escapes lost in
        # transit -- confirm the intended report layout before changing them.
        with open(path + file_id + '.txt', 'w+') as report:
            report.write(tip)
            report.write('---------------------- ')
            for w in vocab:
                report.write(w.ljust(25) + str(fdist[w]).ljust(10) + str(fdist[w] / total) + ' ')
        # The summary is printed a second time once the report is on disk.
        print(tip)
# ``os`` is imported here so this section works on its own; the third-party
# packages (jieba, scikit-learn) are imported inside the functions that need
# them.
import os


def getFilelist(path):
    """List the entries of the target corpus directory, skipping dot-files.

    Args:
        path: Directory to scan.

    Returns:
        A tuple ``(filelist, path)``: the names returned by ``os.listdir``
        whose first character is not ``'.'``, and the ``path`` argument
        unchanged.
    """
    filelist = [name for name in os.listdir(path) if not name.startswith('.')]
    return filelist, path


def fenci(argv, path, sFilePath='D:/my_segfile/'):
    """Segment one file with jieba (full mode) and save the tokens.

    The file ``path + argv`` is decoded as GBK, falling back to UTF-8 when
    GBK decoding fails. The tokens are written, space separated, to
    ``<sFilePath>/<argv>-seg.txt``.

    Args:
        argv: Name of the file to segment.
        path: Directory prefix of the input file; concatenated directly with
            ``argv``, so it must end with a path separator.
        sFilePath: Output directory, created when missing.

    Raises:
        UnicodeDecodeError: If the file is neither valid GBK nor valid UTF-8.
    """
    import jieba

    os.makedirs(sFilePath, exist_ok=True)
    filename = argv

    # Read-only access is enough; ``with`` also closes the first handle when
    # the GBK attempt fails part-way through read().
    try:
        with open(path + filename, 'r', encoding='gbk') as src:
            text = src.read()
    except UnicodeDecodeError:
        with open(path + filename, 'r', encoding='utf-8') as src:
            text = src.read()

    result = []
    for seg in jieba.cut(text, cut_all=True):
        # Collapse internal whitespace; a token that was only whitespace
        # becomes the empty string and is dropped.
        seg = ' '.join(seg.split())
        if seg:
            result.append(seg)

    # NOTE(review): the output uses the platform default encoding, as before.
    with open(os.path.join(sFilePath, filename + "-seg.txt"), "w") as dst:
        dst.write(' '.join(result))


def Tfidf(filelist, path='D:/segfile/', result_dir='D:/mi-result/', tfidf_dir='D:/tfidffile'):
    """Compute term counts and TF-IDF weights for a set of documents.

    All files are read first to build the shared vocabulary; each document is
    then represented by the counts of the vocabulary words in a fixed column
    order, and the counts are converted to TF-IDF weights.

    Files written:
        * ``<result_dir>/Word.txt`` -- the vocabulary.
        * ``<result_dir>/Term_frequence_text<h>.txt`` -- raw counts of
          document ``h``.
        * ``<tfidf_dir>/re<i>.txt`` -- TF-IDF weights of document ``i``.

    Args:
        filelist: Names of the documents to read.
        path: Directory prefix of the documents; concatenated directly with
            each name, so it must end with a path separator.
        result_dir: Directory for the vocabulary and count files.
        tfidf_dir: Directory for the TF-IDF weight files.
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    corpus = []
    for ff in filelist:
        with open(path + ff, 'r') as src:
            corpus.append(src.read())

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    x = vectorizer.fit_transform(corpus)
    # Densify once and reuse; the count matrix is also fed straight to the
    # transformer instead of vectorizing the corpus a second time.
    Term_freq = x.toarray()
    print("line of x.toarray is %d" % len(Term_freq))
    print("rows of x.toarray is %d" % len(Term_freq[0]))
    tfidf = transformer.fit_transform(x)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        word = list(vectorizer.get_feature_names_out())
    else:
        word = vectorizer.get_feature_names()

    os.makedirs(result_dir, exist_ok=True)
    with open(os.path.join(result_dir, 'Word.txt'), 'w+') as out:
        out.writelines(w.ljust(20) for w in word)

    # NOTE(review): the empty "" and trailing " " in the records below look
    # like tab/newline escapes lost in transit -- confirm before changing.
    for h, counts in enumerate(Term_freq):
        temp_path = os.path.join(result_dir, "Term_frequence_text%d.txt" % h)
        with open(temp_path, 'w+') as out:
            for term, count in zip(word, counts):
                out.write(term.ljust(25) + "" + str(count) + " ")

    weight = tfidf.toarray()
    os.makedirs(tfidf_dir, exist_ok=True)
    for i, row in enumerate(weight):
        print("--------Writing all the tf-idf in the %d file into %s/re%d.txt--------" % (i, tfidf_dir, i))
        with open("%s/re%d.txt" % (tfidf_dir, i), 'w+') as out:
            for term, value in zip(word, row):
                out.write(term.ljust(25) + "" + str(value) + " ")


if __name__ == "__main__":
    wordlists = mid_text_dir()
    getTextTermFreq(wordlists)
    argv = 'D:/segfile/'
    (allfile, path) = getFilelist(argv)
    for ff in allfile:
        print("Using jieba on %s " % ff)
        fenci(ff, path)
    # NOTE(review): Tfidf reads the files in D:/segfile/, not the "-seg.txt"
    # files fenci just wrote to D:/my_segfile/ -- confirm this is intended.
    Tfidf(allfile)