# -*- coding: utf-8 -*- # MM # 使用正向最大匹配算法实现中文分词 dic = [] MAX_LENGTH = 5 def init(): """ 读文件 获取中文词典 :return: """ input = open("test.txt") lines = input.readlines() for line in lines: temp = line.split(',') dic.append(temp[0]) for d in dic: print(d) def if_contain(words): """ 判断当前词在词典中是否存在 :param words: :return: """ flag = False for d in dic: if d == words: flag = True break return flag def spl(sentence): """ 正向最大匹配算法的主要实现部分 从后向前切割字符串,直到切割出的子串与词典中的词匹配 :param sentence: :return: """ result = '' words = [] while len(sentence) > 0: except_flag = False for i in range(MAX_LENGTH, 0, -1): temp = sentence[:i] # 中文字符串切割方式 print(i,temp) flag = if_contain(temp) if flag: words.append(temp) sentence = sentence[i:] except_flag = True break if not except_flag: # 判断当前字符串是否在词典中并不存在,若该字符串从头切割到尾都没有词典中的词则认为无法切割并且 # 词典中不存在,此时直接将该词当成切割后的结果加入结果列表 words.append(sentence) break for w in words: result += (w + '/') return result def main(): """ 与用户交互接口 :return: """ init() while True: input_str = input(">") if not input_str: break result = spl(input_str) print("分词结果为:") print(result) if __name__ == "__main__": main()