zoukankan      html  css  js  c++  java
  • 在字典中去掉影响召回的长词

    一个小应用,show一下代码,顺便推荐我非常喜欢的glog和gflags :) 感谢Google,它们让我的生活更轻松 :)
    另外,读取数据库用otl相当方便。我用otl封装了一个DBReader,它提供与处理文本完全相同的接口,完全屏蔽了数据库操作,这样处理数据库基本就和处理文本一样,方便了很多。
     
    /** 
     *  ==============================================================================
     * 
     *          \file   quechaokafei.cc
     *          \author goldenlock
     *          \Description:  由于标题中 “雀巢 咖啡” 的存在, 造成用户搜 “雀巢咖啡” 搜不到产品。
     *                         解决办法: 读取数据库中所有单品, 对于标题如果发现
     *                         “雀巢*咖啡” 或 “咖啡*雀巢”, 则把这个组合词加入到
     *                         blacklist 中, 当前为 dict/quechaokafei.txt
     *
     *  ==============================================================================
     */
    
    #define private public
    #define protected public
    #include <iostream>
    #include <string>
    #include <vector>
    #include <fstream>
    
    #include "utils/db_reader.h"
    #include <algorithm>
    #include <boost/progress.hpp>
    #include <glog/logging.h>
    #include <gflags/gflags.h>
    
    #include <tr1/unordered_set>
    #include "gbk_ch_converter.h"
    #include "string_help.h"
    #include "config_help.h"
    #include "debug_help.h"
    #include "include/segmentor.h"
    #include "include/gbk_datrie.h"
    
    using namespace std;
    // Command-line flags (gflags). Each DEFINE_* creates a FLAGS_<name> variable.
    // --type: not referenced anywhere in the visible code.
    DEFINE_string(type, "simple", "");
    // --config: configuration file name passed to DBReader::init() in run().
    DEFINE_string(config, "read_db.ini", "数据库配置文件,读取其中的title数据");
    // --section: section name passed to DBReader::init() together with --config.
    DEFINE_string(section, "all_title", "读取其中的title数据");
    // --o: path of the output file that receives the extracted blacklist words.
    DEFINE_string(o, "dict/quechaokafei.txt", "提取的黑名单词");
    // --min_len: minimum word length; the code compares byte sizes against
    // min_len * 2 (presumably two bytes per GBK character - confirm).
    DEFINE_int32(min_len, 2, "单个词要求都>=2 当前");
    // --prob_dir: directory handed to the segment::ProbSegmentor constructor.
    DEFINE_string(prob_dir, "testNgramTitle", "概率分词器的souce dir");
    
    struct QuechaokafeiFunc
    {
        typedef std::tr1::unordered_set<string> HashSet;
        HashSet m_candidates;
        ch_convert::ChConverter m_converter;
        segment::ProbSegmentor m_seg;
        segment::GBK_DATrie_ m_trie;
        ofstream ofs;
    
        QuechaokafeiFunc()
        : m_seg(FLAGS_prob_dir), m_trie(m_seg.m_seg.m_trie, m_seg.m_seg.m_encoder), ofs(FLAGS_o.c_str())
        {
    
        }
    
        void findQuechaokafei(const vector<string>& vec)
        {
            for (size_t i = 0; i < vec.size() - 1; i++)
            {
                if (vec[i].size() < FLAGS_min_len * 2 || !m_trie.search(vec[i]))
                    continue;
                for (size_t j = i + 1; j < vec.size(); j++)
                {
                    if (vec[j].size() < FLAGS_min_len * 2)
                        continue;
                    if (m_trie.search(vec[j]))
                    {
                        string s1 = vec[i] + vec[j];
                        string s2 = vec[j] + vec[i];
                        if (m_trie.search(s1))
                        {
                            m_candidates.insert(s1);
                        }
    
                        if (m_trie.search(s2))
                        {
                            m_candidates.insert(s2);
                        }
                    }
    
                }
            }
        }
    
        void writeResult()
        {
            std::copy(m_candidates.begin(), m_candidates.end(), ostream_iterator<string>(ofs, "\n"));
        }
    
        template<typename Stream>
                void operator()(Stream & os)
        {
            string key;
            vector<string> vec;
            while (!os.eof())
            {
                os >> key;
                //---规则化处理key
                key = m_converter.Normalize(key);
                if (key.empty())
                    continue;
                key = filterString2(key);
                if (key.empty())
                    continue;
                m_seg.maxSegment(key, vec);
                findQuechaokafei(vec);
            }
        }
    };
    
    void run()
    {
        DBReader db_reader;
        db_reader.init(FLAGS_config, FLAGS_section);
        QuechaokafeiFunc quechaokafei_func;
        db_reader.process(quechaokafei_func);
        quechaokafei_func.writeResult();
    }
    
    /**
     * Program entry point: sets up glog, parses the gflags command line,
     * and runs the extraction while timing it.
     *
     * @return always 0.
     */
    int main(int argc, char *argv[])
    {
        // Default to logging on stderr; set before flag parsing, so a
        // command-line value can still override it.
        FLAGS_logtostderr = true;
        google::InitGoogleLogging(argv[0]);
        google::InstallFailureSignalHandler();
        // The return value was previously stored in an unused local; it is not
        // needed here. `false` is gflags' remove_flags argument.
        google::ParseCommandLineFlags(&argc, &argv, false);
        // Lives until the end of main, so it spans the whole run() call.
        boost::progress_timer timer;

        run();

        return 0;
    }
    
  • 相关阅读:
    SurfaceView 和 View 区别
    投资学第一章 investments-introduction
    HDU 1879 继续畅通工程 (Prim(普里姆算法)+Kruskal(克鲁斯卡尔))
    多个Activity之间的切换与数据交互
    HDU 4715 Difference Between Primes (打表)
    org.xml.sax.SAXParseException: An invalid XML character (Unicode: 0x0) was found in the CDATA sectio
    用纯jsp实现用户的登录、注册与退出
    Java单态模式
    植物-蔬菜:刺儿菜
    汉语-词语:生活
  • 原文地址:https://www.cnblogs.com/rocketfan/p/1931817.html
Copyright © 2011-2022 走看看