zoukankan      html  css  js  c++  java
  • 在字典中去掉影响召回的长词

    一个小应用,show下代码,顺便推荐下我非常喜欢的glog、gflags:)感谢Google,它们让我的生活更轻松:)
    另外读取数据库用otl相当方便。我用otl封装了一下,写了一个DBReader,这样处理数据库基本就和处理文本一样了:接口完全相同,完全屏蔽了数据库的操作,方便了很多。
     
    /** 
     *  ==============================================================================
     * 
     *          \file   quechaokafei.cc
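     *          \author goldenlock
     *          \Description:  Because product titles contain the segmented form "雀巢 咖啡",
     *                         users searching for "雀巢咖啡" cannot find those products.
     *                         Fix: read every product from the database and, for each title,
     *                         if "雀巢*咖啡" or "咖啡*雀巢" is found, add that word to the
     *                         blacklist (currently dict/quechaokafei.txt).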
     *
     *  ==============================================================================
     */
    
    #define private public
    #define protected public
    #include <iostream>
    #include <string>
    #include <vector>
    #include <fstream>
    
    #include "utils/db_reader.h"
    #include <algorithm>
    #include <boost/progress.hpp>
    #include <glog/logging.h>
    #include <gflags/gflags.h>
    
    #include <tr1/unordered_set>
    #include "gbk_ch_converter.h"
    #include "string_help.h"
    #include "config_help.h"
    #include "debug_help.h"
    #include "include/segmentor.h"
    #include "include/gbk_datrie.h"
    
    using namespace std;
    // Command-line flags (gflags). Each DEFINE_* creates a FLAGS_<name> global.
    // NOTE(review): FLAGS_type is not referenced anywhere in the visible code.
    DEFINE_string(type, "simple", "");
    // Database configuration file; passed to DBReader::init() in run().
    DEFINE_string(config, "read_db.ini", "数据库配置文件,读取其中的title数据");
    // Section name passed to DBReader::init() together with the config file.
    DEFINE_string(section, "all_title", "读取其中的title数据");
    // Output path for the extracted blacklist words (opened by QuechaokafeiFunc).
    DEFINE_string(o, "dict/quechaokafei.txt", "提取的黑名单词");
    // Minimum length required of each component word; the code compares
    // string::size() against min_len * 2.
    DEFINE_int32(min_len, 2, "单个词要求都>=2 当前");
    // Source directory handed to the segment::ProbSegmentor constructor.
    DEFINE_string(prob_dir, "testNgramTitle", "概率分词器的souce dir");
    
    struct QuechaokafeiFunc
    {
        typedef std::tr1::unordered_set<string> HashSet;
        HashSet m_candidates;
        ch_convert::ChConverter m_converter;
        segment::ProbSegmentor m_seg;
        segment::GBK_DATrie_ m_trie;
        ofstream ofs;
    
        QuechaokafeiFunc()
        : m_seg(FLAGS_prob_dir), m_trie(m_seg.m_seg.m_trie, m_seg.m_seg.m_encoder), ofs(FLAGS_o.c_str())
        {
    
        }
    
        void findQuechaokafei(const vector<string>& vec)
        {
            for (size_t i = 0; i < vec.size() - 1; i++)
            {
                if (vec[i].size() < FLAGS_min_len * 2 || !m_trie.search(vec[i]))
                    continue;
                for (size_t j = i + 1; j < vec.size(); j++)
                {
                    if (vec[j].size() < FLAGS_min_len * 2)
                        continue;
                    if (m_trie.search(vec[j]))
                    {
                        string s1 = vec[i] + vec[j];
                        string s2 = vec[j] + vec[i];
                        if (m_trie.search(s1))
                        {
                            m_candidates.insert(s1);
                        }
    
                        if (m_trie.search(s2))
                        {
                            m_candidates.insert(s2);
                        }
                    }
    
                }
            }
        }
    
        void writeResult()
        {
            std::copy(m_candidates.begin(), m_candidates.end(), ostream_iterator<string>(ofs, "\n"));
        }
    
        template<typename Stream>
                void operator()(Stream & os)
        {
            string key;
            vector<string> vec;
            while (!os.eof())
            {
                os >> key;
                //---规则化处理key
                key = m_converter.Normalize(key);
                if (key.empty())
                    continue;
                key = filterString2(key);
                if (key.empty())
                    continue;
                m_seg.maxSegment(key, vec);
                findQuechaokafei(vec);
            }
        }
    };
    
    void run()
    {
        DBReader db_reader;
        db_reader.init(FLAGS_config, FLAGS_section);
        QuechaokafeiFunc quechaokafei_func;
        db_reader.process(quechaokafei_func);
        quechaokafei_func.writeResult();
    }
    
    /**
     * Program entry point: sets up glog and gflags, then runs the extraction.
     *
     * @return always 0.
     */
    int main(int argc, char *argv[])
    {
        // Log to stderr by default; set before flag parsing, so a command-line
        // value can still override it.
        FLAGS_logtostderr = true;
        google::InitGoogleLogging(argv[0]);
        google::InstallFailureSignalHandler();
        // Third argument false: the parsed flags are left in argv. The return
        // value was stored in an unused local before; it is not needed here.
        google::ParseCommandLineFlags(&argc, &argv, false);
        // Lives until the end of main so it spans the whole run() call.
        boost::progress_timer timer;

        run();

        return 0;
    }
    
  • 相关阅读:
    作业day17
    python开发学习day17 (生成器;面向过程;三元表达式;生成式,内置函数)
    作业day16
    python开发学习day16 (三层装饰器;迭代器)
    python开发学习day15 (叠加装饰器;递归调用;匿名函数)
    作业day14
    python开发学习day14 (装饰器;语法糖;装饰器模板)
    python开发学习day13 (函数嵌套;名称空间与作用域;闭包函数)
    作业day12
    Python-简单算法程序
  • 原文地址:https://www.cnblogs.com/rocketfan/p/1931817.html
Copyright © 2011-2022 走看看