  • Spell-Correction System

    Spell correction

    # Load the vocabulary (one word per line) into a set for O(1) membership checks
    vocab = set([line.rstrip() for line in open('/content/drive/My Drive/data/vocab_data/vocab.txt')])
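    Here vocab.txt is assumed to hold one word per line. A quick sanity check (the probe word is illustrative):

    print(len(vocab))         # vocabulary size
    print('apple' in vocab)   # True only if "apple" appears in vocab.txt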
    

    First, generate the set of all candidate corrections.

    def generate_candidates(word):
      """
      word: the given (misspelled) input
      returns all valid candidates found in the vocabulary
      """
      # Generate every word at edit distance 1:
      # 1. insert 2. delete 3. replace
      # e.g. for "app":
      #      replace: bpp, cpp, ..., abp, acp, ...
      #      insert:  aapp, bapp, ..., appa, appb, ...
      #      delete:  pp, ap
      # assume the 26 lowercase letters
      letters = 'abcdefghijklmnopqrstuvwxyz'

      splits = [(word[:i], word[i:]) for i in range(len(word)+1)]

      # insert operation
      inserts = [L+c+R for L, R in splits for c in letters]
      # delete operation
      deletes = [L+R[1:] for L, R in splits if R]
      # replace operation
      replaces = [L+c+R[1:] for L, R in splits if R for c in letters]

      # all generated candidate words
      candidates = set(inserts + deletes + replaces)

      # filter out words that do not exist in the vocabulary
      return [w for w in candidates if w in vocab]
    
    generate_candidates("apple")
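    The TODO in the scoring loop below asks for a wider candidate pool when no distance-1 candidate is in the vocabulary. A minimal sketch of one option, applying the distance-1 edits twice (edits1 and generate_candidates_d2 are hypothetical helpers, not part of the original code):

    def edits1(word):
      # Hypothetical helper: same edits as generate_candidates, but without
      # the vocabulary filter, so intermediate non-words survive to round two.
      letters = 'abcdefghijklmnopqrstuvwxyz'
      splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
      inserts = [L+c+R for L, R in splits for c in letters]
      deletes = [L+R[1:] for L, R in splits if R]
      replaces = [L+c+R[1:] for L, R in splits if R for c in letters]
      return set(inserts + deletes + replaces)

    def generate_candidates_d2(word):
      # Hypothetical fallback: vocabulary words within edit distance 2.
      # Quadratic in the edit set size, so only practical for short words.
      return list({w for e1 in edits1(word) for w in edits1(e1) if w in vocab})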
    

    Read the corpus

    import nltk
    nltk.download('reuters')
    nltk.download('punkt')
    from nltk.corpus import reuters
    categories = reuters.categories()
    # each sentence comes back already tokenized: a list of word strings
    corpus = reuters.sents(categories=categories)
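    reuters.sents() yields sentences that are already tokenized, which is exactly the shape the bigram counting below expects. A quick look at the data (output shown is illustrative):

    print(corpus[0])     # e.g. a token list like ['ASIAN', 'EXPORTERS', ...]
    print(len(corpus))   # number of sentences in the corpus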
    

    Build the language model: bigram

    term_count = {}     # unigram counts
    bigram_count = {}   # bigram counts
    for doc in corpus:
      doc = ['<s>'] + doc   # sentence-start marker
      # bigram: tokens [i, i+1]
      for i in range(0, len(doc)-1):
        term = doc[i]
        bigram = doc[i:i+2]

        if term in term_count:
          term_count[term] += 1
        else:
          term_count[term] = 1

        # join with a space so ('ab','c') and ('a','bc') map to different keys
        bigram = ' '.join(bigram)
        if bigram in bigram_count:
          bigram_count[bigram] += 1
        else:
          bigram_count[bigram] = 1
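    With these counts, the scoring loop below uses the add-1 (Laplace) smoothed conditional probability p(w | prev) = (count(prev w) + 1) / (count(prev) + V), where V is the vocabulary size. A small helper showing the same computation (bigram_prob is a hypothetical name, not part of the original code):

    def bigram_prob(prev, w):
      # Hypothetical helper: add-1 smoothed p(w | prev) from the counts above
      V = len(term_count)  # vocabulary size
      key = prev + ' ' + w
      if key in bigram_count:
        return (bigram_count[key] + 1.0) / (term_count.get(prev, 0) + V)
      return 1.0 / V

    bigram_prob('<s>', 'The')   # illustrative query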
    

    The probability of the user's typo (the channel model). Under the noisy-channel model, each candidate correction c for a misspelling m is scored by p(c) * p(m|c): the language model supplies p(c), and p(m|c) is estimated here from a list of common misspellings, with the probability mass spread uniformly over each correct word's recorded mistakes.

    channel_prob = {}

    # each line of spell-errors.txt looks like: "correct: mistake1, mistake2, ..."
    for line in open('/content/drive/My Drive/data/vocab_data/spell-errors.txt'):
      items = line.split(":")
      correct = items[0].strip()
      mistakes = [item.strip() for item in items[1].strip().split(",")]
      channel_prob[correct] = {}
      # distribute the probability mass uniformly over the listed mistakes
      for mis in mistakes:
        channel_prob[correct][mis] = 1.0/len(mistakes)
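    The resulting structure maps each correct word to a distribution over its observed misspellings. A quick illustrative lookup (whether 'raining' appears, and with which mistakes, depends on the contents of spell-errors.txt):

    # channel_prob: correct word -> {misspelling: probability}
    print(channel_prob.get('raining'))   # e.g. {'rainning': 0.5, 'raning': 0.5}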
    
    import numpy as np
    
    V = len(term_count)  # unigram vocabulary size, used for add-1 smoothing
    
    file = open('/content/drive/My Drive/data/vocab_data/testdata.txt')
    for line in file:
      items = line.rstrip().split('\t')
      words = items[2].split()
      # e.g. ["I", "loke", "palying"]
      for word in words:
        if word not in vocab:
          # replace word with the correct word
          # Step 1: generate the set of all valid candidates
          candidates = generate_candidates(word=word)

          if len(candidates) < 1:
            continue  # not recommended (this silently skips the word)
            # TODO: generate a larger candidate set, e.g. edit distance 2
          probs = []
          # For each candidate, compute its score:
          # score = p(correct) * p(mistake|correct)
          #       = log p(correct) + log p(mistake|correct)
          # and return the candidate with the highest score.
          for candi in candidates:
            prob = 0.0
            # a. channel probability
            if candi in channel_prob and word in channel_prob[candi]:
              prob += np.log(channel_prob[candi][word])
            else:
              prob += np.log(0.0001)

            # b. language-model probability; '<s>' marks the sentence start
            idx = words.index(word)
            prev = words[idx-1] if idx > 0 else '<s>'
            bigram = prev + ' ' + candi
            if bigram in bigram_count:
              prob += np.log((bigram_count[bigram] + 1.0) /
                             (term_count.get(prev, 0) + V))
            else:
              prob += np.log(1.0 / V)

            probs.append(prob)
          max_idx = probs.index(max(probs))
          print(word, candidates[max_idx])
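    The same scoring can be packaged into a reusable function. A minimal sketch under the same smoothing assumptions (correct_word is a hypothetical helper, not part of the original code):

    def correct_word(word, prev='<s>'):
      # Hypothetical helper: best correction for `word` given the previous
      # token, or the word itself when no vocabulary candidate exists.
      candidates = generate_candidates(word)
      if not candidates:
        return word
      def score(candi):
        channel = channel_prob.get(candi, {}).get(word, 0.0001)
        key = prev + ' ' + candi
        if key in bigram_count:
          lm = (bigram_count[key] + 1.0) / (term_count.get(prev, 0) + V)
        else:
          lm = 1.0 / V
        return np.log(channel) + np.log(lm)
      return max(candidates, key=score)

    correct_word('loke', prev='I')   # illustrative call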
    
    

    To be continued...

  • Original post: https://www.cnblogs.com/TuringEmmy/p/12534966.html