zoukankan      html  css  js  c++  java
  • python编辑距离

    import numpy as np
    import json
    import codecs
    
    # Compute the edit distance
    def edit_distance(word1, word2):
        """Return the Levenshtein distance between ``word1`` and ``word2``.

        Insertions, deletions and substitutions each cost 1.  The result is
        read out of a NumPy table created with ``np.zeros``, so it is a NumPy
        floating-point scalar (e.g. ``3.0``) rather than a Python ``int``.
        """
        rows = len(word1)
        cols = len(word2)
        # table[r, c] holds the distance between word1[:r] and word2[:c].
        table = np.zeros((rows + 1, cols + 1))
        # Turning a prefix into the empty sequence (or back) costs its length.
        table[:, 0] = np.arange(rows + 1)
        table[0, :] = np.arange(cols + 1)

        for r, left in enumerate(word1, start=1):
            for c, right in enumerate(word2, start=1):
                mismatch = 0 if left == right else 1
                substitute = table[r - 1, c - 1] + mismatch
                delete = table[r - 1, c] + 1
                insert = table[r, c - 1] + 1
                table[r, c] = min(substitute, delete, insert)
        return table[rows, cols]
    
    
    # 190801
    # Compute a similarity score from the edit distance
    def simility(word1, word2):
        """Return a similarity score derived from the edit distance.

        The score is ``1 - edit_distance(word1, word2) / max(len(word1),
        len(word2))``: 1.0 for identical inputs, 0.0 when the distance equals
        the longer input's length.

        Two empty inputs are treated as identical and yield 1.0.  Without this
        guard the formula would divide zero by zero and produce NaN instead of
        a usable score.
        """
        maxLen = max(len(word1), len(word2))
        if maxLen == 0:
            # Both inputs are empty: nothing differs, and there is no length
            # to normalise by.
            return 1.0
        res = edit_distance(word1, word2)
        return 1 - res * 1.0 / maxLen
    
    # Record ids; only the disabled parsing below would fill this, so it stays empty.
    bianhaos = []
    # One entry per input line, stored exactly as read (line terminator included).
    sub_sens = []
    # NOTE(review): the path below contains no directory separators — presumably
    # the backslashes were lost when this snippet was published; confirm the real
    # location before running.
    with codecs.open(r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoqucodexianbingshi_write_sub.txt','r','utf8') as f:
        # Disabled variant that parsed "<id><-><sentence>" records:
        #   bianhao,sub_sen = line.split('<->')
        #   sub_sen = sub_sen.strip().strip('<b>').strip('<e>')
        #   bianhaos.append(bianhao)
        sub_sens.extend(f)

    count = len(sub_sens)
    # leibie[k] is the cluster label of line k; -1 means "not assigned yet".
    leibie = [-1] * count
    cla = 0
    print(count)

    # Greedy single pass: each still-unlabelled line opens a new cluster and
    # pulls in every other unlabelled line whose similarity to it is >= 0.5.
    for i, sub1 in enumerate(sub_sens):
        if leibie[i] == -1:
            leibie[i] = cla
            for j, sub2 in enumerate(sub_sens):
                if leibie[j] == -1 and simility(sub1, sub2) >= 0.5:
                    leibie[j] = cla
            cla += 1
            # Progress output: index of the line that seeded the cluster.
            print(i)

    print(leibie)
    # Persist the per-line cluster labels as a JSON list.
    with open('leibie05.json', 'w') as f:
        json.dump(leibie, f)
  • 相关阅读:
    阿里云nginx创建多站点
    linux 卸载php mysql apache
    centos php环境搭建
    jquery simple modal
    nodejs 安装express
    nodejs fs.open
    nodejs supervisor
    nodejs 运行
    nodejs shell
    PHP array_pad()
  • 原文地址:https://www.cnblogs.com/yiwoqu/p/11542074.html
Copyright © 2011-2022 走看看