zoukankan      html  css  js  c++  java
  • 中文分词系列(一)

    关于中文分词的资料网上有很多,大家可以自行了解,今天这里只关注代码怎么写。

    中文分词主要可以归纳为“规则分词”、“统计分词”以及“规则+统计”三个主要派别,今天主要了解“规则分词”中常见的正向、逆向和双向最大化匹配。这三种方法都基于现有词典,所以得先准备一个中文词典,一行一个词(下文代码按空格切分每一行,取第一列作为词、第三列作为词性)。

    一.正向最大化匹配

      描述:

        1.找到词典中最长的词,记下长度L

        2.从“左向右”取长度为L的字符串,查找词典进行匹配:若匹配成功,则将这个词切分出来;若匹配失败,则去掉这个字符串的最后一个字符,将剩下的串作为新的匹配串继续匹配。如此重复下去,直到切完。

    二.逆向最大化匹配

      描述:

        1.找到词典中最长的词,记下长度L

        2.从“右向左”取长度为L的字符串,查找词典进行匹配:若匹配成功,则将这个词切分出来;若匹配失败,则去掉这个字符串最前面的一个字符,将剩下的串作为新的匹配串继续匹配。如此重复下去,直到切完。

    三.双向最大化匹配

      描述:

        1.将正向和逆向最大化匹配的切分结果进行比较,优先取切分词数较少的作为结果;词数相同时,取单字较少的结果。

    四.代码采用python

        1.load 词典

      

     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 
     4 #这是词典路径
     5 dictPath = '../resource/dict.txt'
     6 
     7 def loadDict():
     8     print('load dict...')
     9     dictionary = dict()
    10     maximum = 0
    11     # read resource
    12     with open(dictPath, 'r', encoding='utf8') as f:
    13         for line in f:
    14             line = line.strip()
    15             if not line:
    16                 continue
    17             str = line.split(' ')
    18             dictionary[str[0]] = str[2]
    19             wordLength = len(line)
    20             if wordLength > maximum:
    21                 maximum = wordLength #词典中最长的词的长度
    22     return dictionary, maximum
    View Code

        2.核心方法

     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 
     4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
     5 from word_segmentation.regulation.MaximumMatchMethod import MM
     6 from word_segmentation.regulation.BiDirectctionMatchMethod import BDMM
     7 from word_segmentation.util.LoadDict import loadDict
     8 
     9 class RegulationMatch(object):
    10     def __init__(self):
    11         self.dictionary, self.maximum = loadDict()
    12 
    13     def cut(self, text, method):
    14         #逆向
    15         if method == 'RMM':
    16             return RMM.cut(text, self.dictionary, self.maximum)
    17         #正向
    18         if method == 'MM':
    19             return MM.cut(text, self.dictionary, self.maximum)
    20         #双向
    21         if method == 'BDMM':
    22             return BDMM.cut(text, self.dictionary, self.maximum)
    View Code
     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 
     4 '''
     5 词和词性
     6 '''
     7 class Word(object) :
     8     def __init__(self, token, property):
     9         self.__token = token
    10         self.__property = property
    11     #单词
    12     def getToken(self):
    13         return self.__token
    14     #词性
    15     def getProperty(self):
    16         return self.__property
    View Code
     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 
     4 from word_segmentation.regulation.Word import Word
     5 
     6 '''
     7 正向最大化匹配
     8 MaximumMatchMethod
     9 '''
    10 class MM(object):
    11     def __init__(self):
    12         pass
    13 
    14     @staticmethod
    15     def cut(text, dictionary, maximum):
    16         result = []
    17         textLength = len(text)
    18         start = 0
    19         while textLength > 0:
    20             word = None
    21             for size in range(maximum, 0, -1):
    22                 if textLength - size < 0:
    23                     continue
    24                 piece = text[start:(start + size)]
    25                 if dictionary.__contains__(piece):
    26                     word = piece
    27                     result.append(Word(piece, dictionary.get(piece)))
    28                     textLength -= size
    29                     start += size
    30                     break
    31             if word is None:
    32                 textLength -= 1
    33         return result
    View Code
     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 
     4 from word_segmentation.regulation.Word import Word
     5 
     6 '''
     7 逆向最大化匹配
     8 ReverseMaximumMatchMethod
     9 '''
    10 class RMM(object):
    11     def __init__(self):
    12         pass
    13 
    14     @staticmethod
    15     def cut(text, dictionary, maximum):
    16         result = []
    17         textLength = len(text)
    18         while textLength > 0:
    19             word = None
    20             for size in range(maximum, 0, -1):
    21                 if textLength - size < 0:
    22                     continue
    23                 piece = text[(textLength - size) : textLength]
    24                 if dictionary.__contains__(piece):
    25                     word = piece
    26                     result.append(Word(piece, dictionary.get(piece)))
    27                     textLength -= size
    28                     break
    29             if word is None:
    30                 textLength -= 1
    31         return result[::-1]
    View Code
     1 # -*- coding:utf-8 -*-
     2 
     3 from word_segmentation.regulation.MaximumMatchMethod import MM
     4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
     5 
     6 '''
     7     比较正向最大匹配和逆向最大匹配结果:
     8     1.如果分词数量结果不同,那么取分词数量较少的那个
     9     2.如果分词数量结果相同
    10         a.分词结果相同,可以返回任何一个
    11         b.分词结果不同,返回单字数比较少的那个
    12         c.分词结果不同,单字数相同,返回谁呢(可以返回逆向分词结果)
    13 '''
    14 class BDMM(object):
    15     def __init__(self):
    16         pass
    17 
    18     @staticmethod
    19     def cut(text, dictionary, maximum):
    20         mmResult = MM.cut(text, dictionary, maximum)
    21         rmmResult = RMM.cut(text, dictionary, maximum)
    22         mmSegment = []
    23         rmmSegment = []
    24         for word in mmResult:
    25             mmSegment.append(word.getToken())
    26             # print('token = %s, property = %s' %(word.getToken(), word.getProperty()))
    27         for word in rmmResult:
    28             rmmSegment.append(word.getToken())
    29 
    30         if mmSegment.__len__() < rmmSegment.__len__():
    31             return mmResult
    32         elif mmSegment.__len__() == rmmSegment.__len__():
    33             flag = True
    34             for segment in mmSegment:
    35                 if segment not in rmmSegment:
    36                     flag = False
    37                     break
    38             if flag:
    39                 return mmResult
    40             else:
    41                 mmSingleWords = 0
    42                 rmmSingleWords = 0
    43                 for word in mmSegment:
    44                     if len(word) == 1:
    45                         mmSingleWords += 1
    46                 for word in rmmSegment:
    47                     if len(word) == 1:
    48                         rmmSingleWords += 1
    49                 if mmSingleWords < rmmSingleWords:
    50                     return mmResult
    51                 else:
    52                     return rmmResult
    53         else:
    54             return rmmResult
    View Code
     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 
     4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
     5 import word_segmentation.regulation.MaximumMatchMethod
     6 import word_segmentation.regulation.BiDirectctionMatchMethod
     7 from word_segmentation.regulation.RegulationMatchMthod import RegulationMatch
     8 
     9 def test():
    10     pass
    11 if __name__ == '__main__':
    12     text = '各国有各国的困难…'
    13     print('分词:')
    14     print('各国有各国的困难…')
    15     regulation = RegulationMatch()
    16     mmResult = regulation.cut(text, 'MM')
    17     rmmResult = regulation.cut(text, 'RMM')
    18     bdmmResult = regulation.cut(text, 'BDMM')
    19     mmSegment = []
    20     rmmSegment = []
    21     bdmmSegment = []
    22     for word in mmResult:
    23         mmSegment.append(word.getToken())
    24         #print('token = %s, property = %s' %(word.getToken(), word.getProperty()))
    25     for word in rmmResult:
    26         rmmSegment.append(word.getToken())
    27     for word in bdmmResult:
    28         bdmmSegment.append(word.getToken())
    29 
    30     print('正向匹配: %s'  % mmSegment)
    31     print('逆向匹配: %s'  % rmmSegment)
    32     print('双向匹配: %s' % bdmmSegment)
    View Code
  • 相关阅读:
    MQTT
    群晖搭建webssh
    OSI 协议
    centos7 yum安装ffmpeg,以及ffmpeg的简单用法
    centos7 RTMP直播服务器搭建
    elasticsearch
    H5的storage
    bootstrap 列表組
    eclipse的debug模式下启动不了tomcat
    bootstrap collapse
  • 原文地址:https://www.cnblogs.com/little-horse/p/10344322.html
Copyright © 2011-2022 走看看