字典分词 代码(C)
本文地址: http://blog.csdn.net/caroline_wendy
给定字典, 给定一句话, 进行分词.
使用深度遍历(DFS)的方法.
使用一个參数string, 保存当前分支的分词后的句子; 使用一个參数vector, 保存全部可能的组合.
使用一个验证函数, 推断句子能否够分词.
代码:
/* * main.cpp * * Created on: 2014.9.18 * Author: Spike * Copyright (c) 2014年 WCL. All rights reserved. */ /*eclipse cdt, gcc 4.8.1*/ #include <iostream> #include <vector> #include <string> #include <set> using namespace std; bool Match(string s, string m) { int l = m.length(); if (s.substr(0, l) == m) { return true; } return false; } bool Validate(string s, vector<string> &dict) { //1. calculate all alphabets in the query set<char> sc; for (size_t i = 0; i < s.length(); i++) { sc.insert(s[i]); } //2. calculate all alphabets in the dictionary set<char> dc; for (vector<string>::iterator it = dict.begin(); it != dict.end(); it++) { for (size_t i = 0; i < (*it).length(); i++) { dc.insert((*it)[i]); } } for (set<char>::iterator it = sc.begin(); it != sc.end(); it++) { if (dc.find(*it) == dc.end()) { return false; } } return true; } string Split(string s, vector<string> &dict, string cur, vector<string>& list) { if (s.length() == 0) { list.push_back(cur); return s; } for (vector<string>::iterator it = dict.begin(); it != dict.end(); it++) { if (Match(s, *it)) { string tmp = cur; string latter = s.substr(it->length(), s.length() - it->length()); cur += (*it) + "~"; // add current word to cur_str cur += Split(latter, dict, cur, list); // split remaining words cur = tmp; //back to last status } } return "No Result"; } vector<string> SplitWords(string s, vector<string> &dict) { string cur = ""; vector<string> list; if (!Validate(s, dict)) { return list; } Split(s, dict, cur, list); return list; } int main() { vector<string> dict={"程序猿","公务员","员","我","喜","做","程序","一","欢","喜欢","做一个","一个"}; vector<string> words = SplitWords("我喜欢做一个程序猿", dict); for (vector<string>::iterator it=words.begin(); it!=words.end(); it++) { cout<<(*it)<<endl; } return 0; }
简化版本号(没有验证):
/* * main.cpp * * Created on: 2014.9.18 * Author: Spike * Copyright (c) 2014年 WCL. All rights reserved. */ /*eclipse cdt, gcc 4.8.1*/ #include <iostream> #include <vector> #include <string> #include <set> using namespace std; bool Match(string s, string m) { int l = m.length(); if (s.substr(0, l) == m) { return true; } return false; } string Split(string s, vector<string> &dict, string cur, vector<string>& list) { if (s.length() == 0) { list.push_back(cur); return s; } for (vector<string>::iterator it = dict.begin(); it != dict.end(); it++) { if (Match(s, *it)) { string tmp = cur; string latter = s.substr(it->length()); cur += (*it) + " | "; // add current word to cur_str cur += Split(latter, dict, cur, list); // split remaining words cur = tmp; //back to last status } } return "No Result"; } vector<string> SplitWords(string s, vector<string> &dict) { string cur = ""; vector<string> list; Split(s, dict, cur, list); return list; } int main() { vector<string> dict={"程序猿","公务员","员","我","喜","做","程序","一","欢","喜欢","做一个","一个"}; string s = "我喜欢做一个程序猿"; vector<string> words = SplitWords(s, dict); for (vector<string>::iterator it=words.begin(); it!=words.end(); it++) { cout<<(*it)<<endl; } return 0; }
我~喜~欢~做~一个~程序猿~ 我~喜~欢~做~一个~程序~员~ 我~喜~欢~做一个~程序猿~ 我~喜~欢~做一个~程序~员~ 我~喜欢~做~一个~程序猿~ 我~喜欢~做~一个~程序~员~ 我~喜欢~做一个~程序猿~ 我~喜欢~做一个~程序~员~