zoukankan      html  css  js  c++  java
  • python 相似语句匹配(非机器学习)

    #coding=utf-8
    
    import xlrd
    import distance
    from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
    import numpy as np
    from scipy.linalg import norm
    
    # Collect the first-column value of every data row from every sheet of the
    # workbook into ``ls`` (the corpus that the similarity filters below scan).
    workbook = xlrd.open_workbook(u'工程师问答.xls')
    sheet_names = workbook.sheet_names()

    ls = []
    for sheet_name in sheet_names:
        sheet1 = workbook.sheet_by_name(sheet_name)
        # Row 0 is skipped (presumably a header row -- confirm against the
        # spreadsheet).  The upper bound is the sheet's real row count rather
        # than a hard-coded 3858, so shorter sheets no longer raise IndexError
        # and longer sheets are read completely.
        for i in range(1, sheet1.nrows):
            row = sheet1.row_values(i)
            ls.append(row[0])
    
    # Query sentence that every corpus entry is compared against.
    target = u'D90的发动机热效率是多少?'
    print(u'目标语句:' + target)
    
    
    # Edit-distance computation
    def edit_distance(s1, s2):
        """Return the Levenshtein distance between ``s1`` and ``s2``.

        Thin wrapper that delegates to ``distance.levenshtein``.
        """
        levenshtein = distance.levenshtein
        return levenshtein(s1, s2)
    
    # Keep the corpus sentences whose edit distance to the target is at most 5,
    # then print each match on its own line.
    results = [sentence for sentence in ls
               if edit_distance(sentence, target) <= 5]
    print(u'1)编辑距离计算,阈值为5')
    for match in results:
        print(match)
    
    # Jaccard coefficient computation
    def jaccard_similarity(s1, s2):
        """Return the count-based Jaccard coefficient of two sentences.

        Both sentences are split into single-character tokens and counted with
        ``CountVectorizer``.  The result is the sum of the per-token minimum
        counts (intersection) divided by the sum of the per-token maximum
        counts (union).
        """
        # Put a space between characters so that splitting on whitespace
        # yields one token per character.
        spaced_pair = [' '.join(list(text)) for text in (s1, s2)]

        # Term-frequency matrix: one row per sentence, one column per token.
        vectorizer = CountVectorizer(tokenizer=lambda text: text.split())
        counts = vectorizer.fit_transform(spaced_pair).toarray()

        # Intersection size: column-wise minimum of the two count rows.
        shared = np.sum(np.min(counts, axis=0))
        # Union size: column-wise maximum of the two count rows.
        combined = np.sum(np.max(counts, axis=0))

        # Multiply by 1.0 first so the division is a float division.
        return 1.0 * shared / combined
    
    # Keep the corpus sentences whose Jaccard coefficient with the target is
    # above 0.6, then print each match on its own line.
    results = [sentence for sentence in ls
               if jaccard_similarity(sentence, target) > 0.6]
    print(u'2)杰卡德系数计算,阈值为0.6')
    for match in results:
        print(match)
    
    
    # TF (term-frequency) similarity computation
    def tf_similarity(s1, s2):
        """Return the cosine similarity of two sentences' character counts.

        Both sentences are split into single-character tokens and counted with
        ``CountVectorizer``; the result is the dot product of the two count
        vectors divided by the product of their norms.

        Returns 0.0 when either sentence contributes no tokens (for example an
        empty string), because the cosine is undefined for a zero vector and
        the plain division would yield NaN together with a RuntimeWarning.
        """
        def add_space(s):
            return ' '.join(list(s))

        # Insert a space between characters so each character is one token.
        s1, s2 = add_space(s1), add_space(s2)
        # Build the term-frequency matrix (one row per sentence).
        cv = CountVectorizer(tokenizer=lambda s: s.split())
        corpus = [s1, s2]
        vectors = cv.fit_transform(corpus).toarray()
        # Cosine similarity, guarding against a zero-length vector.
        denominator = norm(vectors[0]) * norm(vectors[1])
        if denominator == 0:
            return 0.0
        return np.dot(vectors[0], vectors[1]) / denominator
    
    # Keep the corpus sentences whose TF cosine similarity with the target is
    # above 0.7, then print each match on its own line.
    results = [sentence for sentence in ls
               if tf_similarity(sentence, target) > 0.7]
    print(u'3)TF 计算,阈值为0.7')
    for match in results:
        print(match)
    
    
    # TF-IDF similarity coefficient
    def tfidf_similarity(s1, s2):
        """Return the cosine similarity of two sentences' TF-IDF vectors.

        Both sentences are split into single-character tokens and vectorized
        with ``TfidfVectorizer`` fitted on just these two sentences; the result
        is the dot product of the two vectors divided by the product of their
        norms.

        Returns 0.0 when either sentence contributes no tokens (for example an
        empty string), because the cosine is undefined for a zero vector and
        the plain division would yield NaN together with a RuntimeWarning.
        """
        def add_space(s):
            return ' '.join(list(s))

        # Insert a space between characters so each character is one token.
        s1, s2 = add_space(s1), add_space(s2)
        # Build the TF-IDF matrix (one row per sentence).
        cv = TfidfVectorizer(tokenizer=lambda s: s.split())
        corpus = [s1, s2]
        vectors = cv.fit_transform(corpus).toarray()
        # Cosine similarity, guarding against a zero-length vector.
        denominator = norm(vectors[0]) * norm(vectors[1])
        if denominator == 0:
            return 0.0
        return np.dot(vectors[0], vectors[1]) / denominator
    
    # Keep the corpus sentences whose TF-IDF cosine similarity with the target
    # is above 0.6, then print each match on its own line.
    results = [sentence for sentence in ls
               if tfidf_similarity(sentence, target) > 0.6]
    print(u'4)TFIDF 系数,阈值为0.6')
    for match in results:
        print(match)
  • 相关阅读:
    1093 Count PAT's(25 分)
    1089 Insert or Merge(25 分)
    1088 Rational Arithmetic(20 分)
    1081 Rational Sum(20 分)
    1069 The Black Hole of Numbers(20 分)
    1059 Prime Factors(25 分)
    1050 String Subtraction (20)
    根据生日计算员工年龄
    动态获取当前日期和时间
    对计数结果进行4舍5入
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/9989451.html
Copyright © 2011-2022 走看看