zoukankan      html  css  js  c++  java
  • HMM 模型及其 Python 手写实现

    文章参照

    链接:https://www.cnblogs.com/pinard/p/6945257.html

    # coding=utf-8
    import re
    import numpy as np
    
    
    class Hmm(object):
        """Supervised first-order hidden Markov model for sequence labelling.

        The training file holds one ``observation label`` pair per line, with
        blank lines separating sentences.  ``train`` estimates the model by
        relative-frequency counting and ``predict`` decodes the most likely
        label sequence with the Viterbi algorithm.

        Attributes set by ``clean_data``:
            sents: list of sentences, each a list of ``[observation, label]``.
            Q: sorted list of distinct labels (hidden states).
            V: sorted list of distinct observations.

        Attributes set by ``train``:
            pi: initial state probabilities, shape ``(len(Q),)``.
            A: transition probabilities, ``A[i, j] = P(Q[j] | Q[i])``.
            B: emission probabilities, ``B[i, k] = P(V[k] | Q[i])``.
        """

        def __init__(self, train_path):
            """Remember ``train_path`` and load the training corpus from it."""
            self.train_path = train_path
            self.clean_data()

        def clean_data(self):
            """Parse the training file into ``self.sents``, ``self.Q`` and ``self.V``.

            Lines are split on any whitespace, so trailing spaces, tabs and
            CRLF line endings are tolerated.  Blank (or whitespace-only) lines
            end the current sentence; consecutive blank lines and a trailing
            newline at the end of the file do not create empty sentences.

            Raises:
                ValueError: if a non-blank line does not consist of exactly
                    two whitespace-separated fields.
            """
            with open(self.train_path, encoding='utf-8') as f:
                raw = f.read()

            sents = []
            current = []
            for lineno, line in enumerate(raw.splitlines(), 1):
                fields = line.split()
                if not fields:
                    # Sentence boundary; ignore repeated blank lines.
                    if current:
                        sents.append(current)
                        current = []
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        f"{self.train_path}: line {lineno}: expected "
                        f"'observation label', got {line!r}")
                current.append(fields)
            if current:
                sents.append(current)

            self.sents = sents
            self.Q = sorted({label for sent in sents for _, label in sent})  # hidden states
            self.V = sorted({obs for sent in sents for obs, _ in sent})  # observations

        def train(self):
            """Estimate ``pi``, ``A`` and ``B`` from the parsed corpus.

            Counting is done on ``(label, label)`` and ``(label, observation)``
            tuples rather than on concatenated strings, so labels and
            observations may be of any length without becoming ambiguous.

            Raises:
                ValueError: if the corpus contains no sentences.
            """
            from collections import Counter

            if not self.sents:
                raise ValueError('cannot train: the training data is empty')

            # 1. Initial state probabilities pi (rounded to 4 decimal places).
            first_counts = Counter(sent[0][1] for sent in self.sents)
            n_sents = len(self.sents)
            self.pi = np.array(
                [round(first_counts[q] / n_sents, 4) for q in self.Q], dtype=float)

            # 2. Transition matrix A (rounded to 4 decimal places).
            pair_counts = Counter()
            for sent in self.sents:
                tags = [label for _, label in sent]
                pair_counts.update(zip(tags, tags[1:]))
            out_counts = Counter()
            for (prev, _), count in pair_counts.items():
                out_counts[prev] += count
            # A state that was never followed by another one has no outgoing
            # evidence; give it an all-zero row instead of dividing by zero.
            self.A = np.array(
                [[round(pair_counts[(q1, q2)] / out_counts[q1], 4) if out_counts[q1] else 0.0
                  for q2 in self.Q]
                 for q1 in self.Q], dtype=float)

            # 3. Emission matrix B.
            emit_counts = Counter(
                (label, obs) for sent in self.sents for obs, label in sent)
            label_counts = Counter(label for sent in self.sents for _, label in sent)
            self.B = np.array(
                [[emit_counts[(q, v)] / label_counts[q] for v in self.V]
                 for q in self.Q], dtype=float)

        def predict(self, sent):
            """Return the most likely label sequence for ``sent`` (Viterbi).

            Args:
                sent: iterable of observations, e.g. a string of characters.

            Returns:
                The labels of the best path joined into one string; ``''`` for
                empty input.

            Scores are accumulated as log-probabilities so that long inputs do
            not underflow to zero.  An observation that is not in ``self.V``
            contributes the same emission score to every state, so its label
            is decided by the transitions alone instead of raising.
            """
            tokens = list(sent)
            if not tokens:
                return ''

            n_states = len(self.Q)
            obs_index = {v: k for k, v in enumerate(self.V)}
            emissions = np.asarray(self.B, dtype=float)

            # log(0) = -inf is the intended score of an impossible event.
            with np.errstate(divide='ignore'):
                log_pi = np.log(np.asarray(self.pi, dtype=float))
                log_A = np.log(np.asarray(self.A, dtype=float))

            def log_emission(token):
                column = obs_index.get(token)
                if column is None:
                    # Unknown observation: neutral for every state.
                    return np.zeros(n_states)
                with np.errstate(divide='ignore'):
                    return np.log(emissions[:, column])

            steps = len(tokens)
            delta = np.empty((steps, n_states))  # best log-score ending in each state
            psi = np.zeros((steps, n_states), dtype=int)  # best predecessor state

            # 1. Initialise the first time step.
            delta[0] = log_pi + log_emission(tokens[0])
            # 2. Recurrence: scores[j, i] = delta[t-1, i] + log A[i, j].
            for t in range(1, steps):
                scores = delta[t - 1] + log_A.T
                psi[t] = scores.argmax(axis=1)
                delta[t] = scores.max(axis=1) + log_emission(tokens[t])

            # 3. Best final state, then 4. follow the back-pointers.
            state = int(delta[-1].argmax())
            path = [state]
            for t in range(steps - 1, 0, -1):
                state = int(psi[t, state])
                path.append(state)
            path.reverse()
            return ''.join(self.Q[s] for s in path)
    
    
    if __name__ == '__main__':
        # Demo: train on test.txt, tag a sentence character by character with
        # b (word begin) / i (word inside) / o (outside), then cut the sentence
        # into segments according to the predicted tags.
        text = '维特比算法是一个分词方法'
        train_path = 'test.txt'
        hmm = Hmm(train_path)
        hmm.train()
        label = hmm.predict(text)
        # 'bi*' keeps a single-character word (a 'b' with no following 'i') and
        # 'i+' keeps an 'i' run that has no leading 'b', so every character of
        # the input appears in the output; 'bi+|o' would silently drop them.
        print([text[word.start():word.end()] for word in re.finditer(r'bi*|i+|o', label)])
    

    test.txt

    维 b
    特 i
    比 i
    算 b
    法 i
    也 o
    是 o
    寻 b
    找 i
    序 b
    列 i
    最 b
    短 i
    路 b
    径 i
    的 o
    一 b
    个 i
    通 b
    用 i
    方 b
    法 i
    
    同 b
    时 i
    维 b
    特 i
    比 i
    算 b
    法 i
    仅 b
    仅 i
    局 b
    限 i
    于 o
    求 o
    序 b
    列 i
    最 b
    短 i
    路 b
    径 i
    
    如 b
    果 i
    大 b
    家 i
    看 b
    过 i
    之 b
    前 i
    写 o
    的 o
    文 b
    本 i
    挖 b
    掘 i
    的 o
    分 b
    词 i
    原 b
    理 i
    中 o
    的 o
    维 b
    特 i
    比 i
    算 b
    法 i
    View Code
  • 相关阅读:
    Tair分布式key/value存储
    Ehcache详细解读
    专访阿里中间件高级专家沈询
    boost之词法解析器spirit
    快速部署Python应用:Nginx+uWSGI配置详解
    CMake如何执行shell命令
    show engine innodb status 详解
    HTTP Request header
    json python api
    mysql 索引对于select速度提升作用实验
  • 原文地址:https://www.cnblogs.com/xiaoruirui/p/15625057.html
Copyright © 2011-2022 走看看