zoukankan      html  css  js  c++  java
  • 拼写检查算法 Golang 版

    最近看了阮一峰的一篇介绍使用贝叶斯推断方法做拼写检查的文章,该文章的易懂程度输于 Google 技术总监写的原文及其优秀的译文。

    说明了啥?越是大师级的人写的文章往往越易懂。所以关于贝叶斯方法我就不解释了,只贴代码。

    我使用golang对照实现了一遍:

    一是为了弄懂其算法细节

    二是不使前段时间看的golang语法忘记

    就像几年前在学校时候对着C版的数据结构书用C#去实现一样。

    package main

    
    
    import (
        "fmt"
        "io/ioutil"
        "regexp"
    )
    
    
    // NWORDS is the word-frequency table used to recognise known words and to
    // rank correction candidates. main assigns it from the training text before
    // any lookup happens.
    var (
        NWORDS map[string]int
    )
    
    
    // alphabet lists the letters that edit1 tries when it generates replacement
    // and insertion candidates.
    const (
        alphabet = "abcdefghijklmnopqrstuvwxyz"
    )
    
    
    // wordPattern matches maximal runs of lowercase ASCII letters. It is compiled
    // once at package initialisation instead of on every call to words.
    var wordPattern = regexp.MustCompile("[a-z]+")

    // words splits text into lowercase words.
    //
    // ASCII uppercase letters are folded to lowercase before matching, so "The"
    // yields "the" rather than the fragment "he". Every byte that is not an ASCII
    // letter acts as a separator. The result is nil when text contains no letters.
    func words(text string) []string {
        lowered := []byte(text)
        for i, c := range lowered {
            if 'A' <= c && c <= 'Z' {
                lowered[i] = c + ('a' - 'A')
            }
        }
        return wordPattern.FindAllString(string(lowered), -1)
    }
    
    
    // train counts how often each word occurs in features.
    //
    // The returned map has one entry per distinct word, and a word that appears
    // k times maps to k. A nil or empty input yields an empty, non-nil map.
    func train(features []string) map[string]int {
        counts := make(map[string]int)
        for _, w := range features {
            // A missing key reads as zero, so the first sighting becomes 1.
            counts[w]++
        }
        return counts
    }
    
    
    func edit1(word string) []string {
        type tuple struct{ a, b string }
        var splits []tuple
        for i := 0; i < len(word)+1; i++ {
            splits = append(splits, tuple{word[:i], word[i:]})
        }
    
    
        var deletes []string
        for _, t := range splits {
            if len(t.b) > 0 {
                deletes = append(deletes, t.a+t.b[1:])
            }
        }
    
    
        var transposes []string
        for _, t := range splits {
            if len(t.b) > 1 {
                transposes = append(transposes, t.a+string(t.b[1])+string(t.b[0])+t.b[2:])
            }
        }
    
    
        var replaces []string
        for _, c := range alphabet {
            for _, t := range splits {
                if len(t.b) > 0 {
                    replaces = append(replaces, t.a+string(c)+t.b[1:])
                }
            }
        }
    
    
        var inserts []string
        for _, c := range alphabet {
            for _, t := range splits {
                inserts = append(inserts, t.a+string(c)+t.b)
            }
        }
    
    
        //concat this slice 
        deletes = append(deletes, transposes...)
        deletes = append(deletes, replaces...)
        deletes = append(deletes, inserts...)
    
    
        return set(deletes)
    }
    
    
    func known_edits2(word string) []string {
        var arr []string
        for _, e1 := range edit1(word) {
            for _, e2 := range edit1(e1) {
                if _, ok := NWORDS[e2]; ok {
                    arr = append(arr, e2)
                }
            }
        }
        return set(arr)
    }
    
    
    func known(words []string) []string {
        var knows []string
        for _, value := range words {
            if _, ok := NWORDS[value]; ok {
                knows = append(knows, value)
            }
        }
        return knows
    }
    
    
    // appendIfMissing returns slice unchanged when it already holds i; otherwise
    // it returns slice with i appended at the end.
    func appendIfMissing(slice []string, i string) []string {
        found := false
        for k := 0; k < len(slice) && !found; k++ {
            found = slice[k] == i
        }
        if found {
            return slice
        }
        return append(slice, i)
    }
    
    
    // set returns the distinct elements of arr in order of first occurrence.
    //
    // Membership is tracked in a map so the pass is linear in len(arr) instead
    // of rescanning the result for every element. The result is nil when arr is
    // empty.
    func set(arr []string) []string {
        seen := make(map[string]struct{}, len(arr))
        var unique []string
        for _, s := range arr {
            if _, dup := seen[s]; dup {
                continue
            }
            seen[s] = struct{}{}
            unique = append(unique, s)
        }
        return unique
    }
    
    
    func correct(word string) string {
        candidates := known([]string{word})
        if len(candidates) <= 0 {
            candidates = known(edit1(word))
            if len(candidates) <= 0 {
                candidates = known(known_edits2(word))
            }
        }
        return max(candidates, NWORDS)
    }
    
    
    // max returns the element of arr with the highest count in dict.
    //
    // Elements missing from dict count as zero. Ties go to the element that
    // appears first in arr, and when no element has a positive count the first
    // element is returned. An empty arr yields "" instead of indexing out of
    // range.
    func max(arr []string, dict map[string]int) string {
        if len(arr) == 0 {
            return ""
        }
        best := arr[0]
        bestCount := 0
        for _, w := range arr {
            // Strict comparison keeps the earliest of equally frequent words.
            if n := dict[w]; n > bestCount {
                best, bestCount = w, n
            }
        }
        return best
    }
    
    
    func main() {
        buf, _ := ioutil.ReadFile("big.txt")
        NWORDS = train(words(string(buf)))
        word := "beford"
        fmt.Println("input:", word, "correct word:", correct(word))
    }

    python 版本只有30行左右,golang对各种集合操作和python对比差了许多。

    python里用set(arr),即可将列表里重复的元素删除。简洁地用 for in 构造列表实在很cool。

  • 相关阅读:
    谈谈我对雾霾的看法
    2016年书单分享
    我的面试心得:面试官视角
    Cesium原理篇:GroundPrimitive
    Cesium原理篇:Batch
    Peter Hessler和他的中国三部曲(上)
    Cesium原理篇:Material
    全球PM25实时可视化
    Cesium原理篇:Property
    次郎的寿司梦
  • 原文地址:https://www.cnblogs.com/solo/p/2746667.html
Copyright © 2011-2022 走看看