中文分词:双向最大匹配算法(BI-MM)
启发式规则:
- 1.如果正反向分词结果词数不同,则取分词数量较少的那个
- 2.如果分词结果词数相同:
  - a. 分词结果相同,说明没有歧义,可返回任意一个
  - b. 分词结果不同,返回其中单字较少的那个
代码实现
# Chinese word segmentation using the bidirectional maximum matching algorithm.
# Dictionary words; init() appends one stripped entry per line of the dictionary file.
words_dic = []
import BMM #引入逆向匹配算法
import fmm #引入正向匹配算法
def init():
    '''
    Read the dictionary file and load it into the module-level ``words_dic``.

    Every line of the file is stripped of surrounding whitespace and added
    to ``words_dic`` in file order.
    :return: None
    '''
    # NOTE(review): this path contains no separators; the backslashes look
    # like they were lost somewhere -- confirm the real location.
    dic_path = r"C:UserslenovoPycharmProjectsfencivenvdicdic.txt"
    with open(dic_path, "r", encoding="utf-8") as dic_input:
        words_dic.extend(line.strip() for line in dic_input)
# Word-cutting routine for the bidirectional maximum matching algorithm.
def _as_word_list(segmentation):
    """Return *segmentation* in a form whose items are whole words.

    The backward-matching ``cut_words`` shown later in this file returns a
    "/"-joined string (presumably that is the ``BMM`` module imported here --
    confirm). Such a string is split back into words so that word counts and
    single-character counts are computed on words rather than characters.
    Any non-string result is returned unchanged.

    Limitation: a "/" that occurs in the sentence itself cannot be told
    apart from the separator.
    """
    if isinstance(segmentation, str):
        return [word for word in segmentation.split("/") if word]
    return segmentation


def cut_words(raw_sentence, words_dic):
    """Segment *raw_sentence* by bidirectional maximum matching.

    The sentence is segmented by both ``BMM.cut_words`` and
    ``fmm.cut_words``, and one of the two results is chosen:

    1. If the word counts differ, the result with fewer words wins.
    2. If the word counts are equal:
       a. identical results are unambiguous, so the forward result is
          returned;
       b. otherwise the result with fewer single-character words wins,
          and the backward result is returned when that count ties too.

    :param raw_sentence: the text to segment
    :param words_dic: the dictionary words, passed through to both matchers
    :return: the chosen segmentation as a sequence of words
    """
    bmm_word_list = _as_word_list(BMM.cut_words(raw_sentence, words_dic))
    fmm_word_list = _as_word_list(fmm.cut_words(raw_sentence, words_dic))

    # Rule 1: different word counts -> keep the shorter segmentation.
    if len(bmm_word_list) != len(fmm_word_list):
        if len(bmm_word_list) < len(fmm_word_list):
            return bmm_word_list
        return fmm_word_list

    # Rule 2a: compare position by position. A membership test would treat
    # e.g. ["ab", "a", "b"] and ["a", "b", "ab"] as the same segmentation.
    if fmm_word_list == bmm_word_list:
        return fmm_word_list

    # Rule 2b: fewer single-character words wins; ties go to the backward result.
    fmm_single = sum(1 for word in fmm_word_list if len(word) == 1)
    bmm_single = sum(1 for word in bmm_word_list if len(word) == 1)
    if bmm_single > fmm_single:
        return fmm_word_list
    return bmm_word_list
def main():
    '''
    Interactive entry point.

    Loads the dictionary, then keeps prompting for a sentence and printing
    its segmentation until an empty line is entered.
    :return: None
    '''
    init()

    def read_sentence():
        # Prompt first, then read one line from standard input.
        print("请输入要分词序列:")
        return input()

    # iter(callable, sentinel) stops as soon as an empty string is read.
    for input_str in iter(read_sentence, ""):
        result = cut_words(input_str, words_dic)
        print("分词结果")
        print(result)
# Start the interactive session only when this file is run as a script.
if __name__ == "__main__":
    main()

# Chinese word segmentation using the backward maximum matching algorithm.
# Dictionary words; init() appends one stripped entry per line of the dictionary file.
words_dic = []
def init():
    '''
    Read the dictionary file and fill the module-level ``words_dic``.

    Each line is stripped of surrounding whitespace and appended to
    ``words_dic`` in file order.
    :return: None
    '''
    # NOTE(review): this path contains no separators; the backslashes look
    # like they were lost somewhere -- confirm the real location.
    dic_path = r"C:UserslenovoPycharmProjectsfencivenvdicdic.txt"
    with open(dic_path, "r", encoding="utf-8") as dic_input:
        for entry in map(str.strip, dic_input):
            words_dic.append(entry)
# Word-cutting routine for the backward (reverse) maximum matching algorithm.
def cut_words(raw_sentence, words_dic):
    """Segment *raw_sentence* by backward maximum matching.

    Starting from the end of the stripped sentence, the longest suffix that
    is a dictionary word is cut off; when no suffix matches, the last single
    character is cut off instead. This repeats until the sentence is used up.

    :param raw_sentence: the text to segment; surrounding whitespace is
        ignored
    :param words_dic: an iterable of dictionary words
    :return: the words in sentence order joined with "/" (an empty string
        for an empty sentence)
    """
    # A set gives constant-time membership tests for the many probes below.
    vocabulary = set(words_dic)

    # Longest dictionary word bounds the window size. Clamp it to at least 1
    # so an empty dictionary (or one holding only empty strings) falls back
    # to one-character cuts instead of raising or looping forever.
    longest = max((len(word) for word in vocabulary), default=0)
    max_length = max(longest, 1)

    sentence = raw_sentence.strip()
    end = len(sentence)  # everything before this index is still unsegmented
    cut_words_list = []
    while end > 0:
        size = min(max_length, end)
        # Shrink the window from the left until it is a dictionary word or
        # only one character remains.
        while size > 1 and sentence[end - size:end] not in vocabulary:
            size -= 1
        cut_words_list.append(sentence[end - size:end])
        end -= size

    # Words were collected back to front; restore sentence order.
    cut_words_list.reverse()
    return "/".join(cut_words_list)
def main():
    '''
    Interactive entry point.

    Loads the dictionary, then repeatedly prompts for a sentence and prints
    its segmentation; an empty input line ends the loop.
    :return: None
    '''
    init()
    keep_going = True
    while keep_going:
        print("请输入您要分词的序列")
        input_str = input()
        # An empty line is the signal to stop.
        keep_going = bool(input_str)
        if keep_going:
            result = cut_words(input_str, words_dic)
            print("分词结果:")
            print(result)
# Start the interactive session only when this file is run as a script.
if __name__ == "__main__":
    main()