一个小应用,show 一下代码,主要是为了推荐我非常喜欢的 glog 和 gflags :) 感谢 Google,它们让我的生活更轻松 :)
另外,读取数据库用 otl 相当方便。我用 otl 封装了一个 DBReader,这样处理数据库基本就和处理文本一样,接口完全相同,完全屏蔽了数据库操作,方便了很多。
/** * ============================================================================== * * \file quechaokafei.cc * \autho goldenlock * \Description: 由于标题中 “雀巢 咖啡” 的存在, 造成用户搜”雀巢咖啡“搜不到产品 * 解决办法读取数据库中所有单品对于标题如果发现* ”雀巢*咖啡“ ”咖啡*雀巢“ 则这个词加入到 * blacklist中 当前dict/quechaokafei.txt * * ============================================================================== */ #define private public #define protected public #include <iostream> #include <string> #include <vector> #include <fstream> #include "utils/db_reader.h" #include <algorithm> #include <boost/progress.hpp> #include <glog/logging.h> #include <gflags/gflags.h> #include <tr1/unordered_set> #include "gbk_ch_converter.h" #include "string_help.h" #include "config_help.h" #include "debug_help.h" #include "include/segmentor.h" #include "include/gbk_datrie.h" using namespace std; DEFINE_string(type, "simple", ""); DEFINE_string(config, "read_db.ini", "数据库配置文件,读取其中的title数据"); DEFINE_string(section, "all_title", "读取其中的title数据"); DEFINE_string(o, "dict/quechaokafei.txt", "提取的黑名单词"); DEFINE_int32(min_len, 2, "单个词要求都>=2 当前"); DEFINE_string(prob_dir, "testNgramTitle", "概率分词器的souce dir"); struct QuechaokafeiFunc { typedef std::tr1::unordered_set<string> HashSet; HashSet m_candidates; ch_convert::ChConverter m_converter; segment::ProbSegmentor m_seg; segment::GBK_DATrie_ m_trie; ofstream ofs; QuechaokafeiFunc() : m_seg(FLAGS_prob_dir), m_trie(m_seg.m_seg.m_trie, m_seg.m_seg.m_encoder), ofs(FLAGS_o.c_str()) { } void findQuechaokafei(const vector<string>& vec) { for (size_t i = 0; i < vec.size() - 1; i++) { if (vec[i].size() < FLAGS_min_len * 2 || !m_trie.search(vec[i])) continue; for (size_t j = i + 1; j < vec.size(); j++) { if (vec[j].size() < FLAGS_min_len * 2) continue; if (m_trie.search(vec[j])) { string s1 = vec[i] + vec[j]; string s2 = vec[j] + vec[i]; if (m_trie.search(s1)) { m_candidates.insert(s1); } if (m_trie.search(s2)) { m_candidates.insert(s2); } } } } } void writeResult() { std::copy(m_candidates.begin(), 
m_candidates.end(), ostream_iterator<string>(ofs, "\n")); } template<typename Stream> void operator()(Stream & os) { string key; vector<string> vec; while (!os.eof()) { os >> key; //---规则化处理key key = m_converter.Normalize(key); if (key.empty()) continue; key = filterString2(key); if (key.empty()) continue; m_seg.maxSegment(key, vec); findQuechaokafei(vec); } } }; void run() { DBReader db_reader; db_reader.init(FLAGS_config, FLAGS_section); QuechaokafeiFunc quechaokafei_func; db_reader.process(quechaokafei_func); quechaokafei_func.writeResult(); } int main(int argc, char *argv[]) { FLAGS_logtostderr = true; google::InitGoogleLogging(argv[0]); google::InstallFailureSignalHandler(); int s = google::ParseCommandLineFlags(&argc, &argv, false); boost::progress_timer timer; run(); return 0; }