zoukankan      html  css  js  c++  java
  • 计算两篇文章相似度代码

    # -*- coding:gb2312 -*-
    from gensim import corpora, models, similarities
    from nltk.tokenize import word_tokenize
    from nltk.corpus import brown
    # Load the records from the file 'aaa'.
    #
    # A record is a run of consecutive non-blank lines.  Each line is stripped
    # and appended to the current record followed by a tab, so one record ends
    # up as a single tab-separated string.  A line that is exactly "\n" closes
    # the current record and starts a new one.
    courses = []
    temp = ""
    # `with` guarantees the file handle is closed even if reading fails.
    with open('aaa') as source:
        for line in source:
            if line != "\n":
                temp = temp + line.strip() + "\t"
            else:
                courses.append(temp)
                temp = ""
    # Flush the last record when the file does not end with a blank line;
    # otherwise that record would be silently dropped.
    if temp:
        courses.append(temp)
        temp = ""
    
    # Name of each record: its first tab-separated field with any leading or
    # trailing '#' / '*' characters removed.
    courses_name = [
        record.strip().split('\t')[0].strip('#*')
        for record in courses
    ]
    # Preview the first three names.
    print(courses_name[0:3])
    # Quick check of the normalisation used below: decode a sample string to
    # unicode and lower-case it.
    document = ['#*AD', 'ADdd']
    document = document[0].decode('utf-8').lower()
    print(document)

    # Step 1: tokenise every record into lower-cased tokens.
    # The loop variable is deliberately not named `document`: on Python 2 a
    # list comprehension leaks its loop variable and would overwrite the
    # sample value assigned above.
    texts_tokenized = [[word.lower() for word in word_tokenize(raw_text.decode('utf-8'))]
                       for raw_text in courses]
    print(texts_tokenized[0])

    # Step 2: drop English stopwords.
    from nltk.corpus import stopwords
    # A set gives constant-time membership tests; the plain list would be
    # scanned once per token.
    english_stopwords = set(stopwords.words('english'))
    texts_filtered_stopwords = [[word for word in tokens if word not in english_stopwords]
                                for tokens in texts_tokenized]
    print(texts_filtered_stopwords[0])

    # Step 3: drop tokens that are a single punctuation mark.
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    from nltk.stem.lancaster import LancasterStemmer
    texts_filtered = [[word for word in tokens if word not in english_punctuations]
                      for tokens in texts_filtered_stopwords]
    print(texts_filtered[0])

    # Step 4: reduce every remaining token to its Lancaster stem.
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in tokens] for tokens in texts_filtered]
    print(texts_stemmed[0])

    # Step 5: remove low-frequency stems.
    # Count all stems in a single pass instead of flattening the corpus with
    # sum(..., []) and calling list.count() once per distinct stem, which is
    # quadratic in the corpus size.
    from collections import Counter
    stem_counts = Counter(stem for text in texts_stemmed for stem in text)
    # Stems that occur exactly twice in the whole corpus are removed.
    # NOTE(review): the name `stems_once` suggests a threshold of 1, but the
    # original comment states that count == 2 is intended, so it is kept.
    # Stems occurring only once are NOT removed -- confirm this is wanted.
    stems_once = set(stem for stem, count in stem_counts.items() if count == 2)
    texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
    print(texts)
    # corpora / models / similarities are already imported from gensim at the
    # top of the script; only logging is new here.
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    QUERY_DOC_INDEX = 15  # position in `texts` of the document used as the query
    TOP_N = 10            # number of most similar documents to print

    # Bag-of-words -> TF-IDF -> 10-topic LSI space.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # The LSI model was trained on TF-IDF weighted vectors, so the documents
    # that are indexed must be projected from TF-IDF vectors as well, not
    # from raw term counts.
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

    # Build the query vector with the same bow -> tfidf -> lsi pipeline.
    # Raises IndexError if `texts` has QUERY_DOC_INDEX or fewer documents.
    ml_course = texts[QUERY_DOC_INDEX]
    ml_bow = dictionary.doc2bow(ml_course)
    ml_lsi = lsi[tfidf[ml_bow]]
    print(ml_lsi)

    # Similarity of the query against every indexed document, best first.
    sims = index[ml_lsi]
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # Rank 0 is skipped -- presumably it is the query document itself.
    print(sort_sims[1:TOP_N + 1])
    #print courses_name[23]
    

      

  • 相关阅读:
    13_函数的基本使用简介
    12_goto语句的使用
    11_for语句的使用
    10_switch语句的使用
    09_if条件语句的使用
    08_类型别名(类型声明)
    day-32网络编程
    day-31网络编程
    day-30网络编程
    day-29元类、异常处理
  • 原文地址:https://www.cnblogs.com/ldphoebe/p/5717928.html
Copyright © 2011-2022 走看看