zoukankan      html  css  js  c++  java
  • elasticsearch实战 修改IK源码实现词组动态更新

    下载IK源码

    https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0

    选择你对应ik的版本(ps:版本最好一致)

    访问 http://localhost:9200/?pretty 可以查看es版本,我的是6.5.1

    修改源码

    1.创建一个ext包同时增加3个类文件

    DBHelper

    package org.wltea.analyzer.ext;
    
    import org.apache.logging.log4j.Logger;
    import org.elasticsearch.common.logging.Loggers;
    
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * JDBC helper that reads dictionary words from the MySQL table named by {@link #dbTable}.
     *
     * <p>The connection settings are plain static fields that must be assigned before
     * {@link #getKey} is called. The table and column names are concatenated into the SQL
     * text, so they must come from trusted configuration only.
     */
    public class DBHelper {
        Logger logger= Loggers.getLogger(DBHelper.class);


        public static String url = null;
        public static String dbUser = null;
        public static String dbPwd = null;
        public static String dbTable = null;
        /**
         * Time of the last successful load, one entry per distinct query.
         * Keys have the form {@code column#wordType#delete}.
         */
        public static Map<String, Date> lastImportTimeMap = new HashMap<String, Date>();

        static{
            try {
                Class.forName("com.mysql.jdbc.Driver");// load the MySQL JDBC driver
            } catch (Exception e) {
                // No logger is available in a static context (the logger is an instance field).
                e.printStackTrace();
            }
        }

        /**
         * Opens a new connection using the static settings.
         *
         * @return a fresh connection owned by the caller, who must close it
         * @throws Exception if the connection cannot be established
         */
        private Connection getConn() throws Exception {
            return DriverManager.getConnection(url, dbUser, dbPwd);
        }


        /**
         * Reads words from the dictionary table and joins them into one string.
         *
         * <p>Failures are logged and the words collected so far (possibly none) are returned;
         * the last-import time is only advanced after a fully successful read, so a failed
         * cycle is retried with the same time window on the next call.
         *
         * @param key column that holds the word; also used to read the value from each row
         * @param type value matched against {@code word_type} (0 extension word, 1 stop word);
         *             {@code null} disables the filter
         * @param delete {@code true} selects rows with {@code delete_type=1},
         *               {@code false} rows with {@code delete_type=0}
         * @param flag when {@code true}, only rows whose {@code update_time} is later than the
         *             previous successful load of the same query are returned
         * @param synonyStr separator appended after every word; a blank value falls back to ","
         * @return the words, each followed by the separator; empty when nothing was read
         * @throws Exception declared for compatibility with existing callers
         */
        public String getKey(String key, Integer type,boolean delete,boolean flag,String synonyStr) throws Exception {
            StringBuilder data = new StringBuilder();

            // Track the last import per (column, type, delete) combination. Sharing a single
            // entry per column made every query after the first one filter on a time that
            // had just been set, so those queries never returned pre-existing rows.
            String importKey = key + "#" + type + "#" + delete;
            Date lastImportTime = lastImportTimeMap.get(importKey);

            StringBuilder sql = new StringBuilder("select  *  from " + dbTable + " where 1=1");
            if (lastImportTime != null && flag) {
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                sql.append(" and update_time > '" + sdf.format(lastImportTime) + "'");
            }
            sql.append(" and " + key + " !=''");
            if (type != null) {
                sql.append(" and word_type=" + type);
            }
            sql.append(" and delete_type=" + (delete ? 1 : 0));
            // If the logged time differs from local time, check that the JVM time zone
            // matches the time zone of the database server.
            logger.warn("sql==={}",sql.toString());

            String separator = StringUtils.isNotBlank(synonyStr) ? synonyStr : ",";
            // Taken before the query runs so rows changed while it executes are picked up next time.
            Date queryStart = new Date();
            try (Connection connection = getConn();
                 PreparedStatement ps = connection.prepareStatement(sql.toString());
                 ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    String value = rs.getString(key);
                    if (StringUtils.isNotBlank(value)) {
                        data.append(value).append(separator);
                    }
                }
                lastImportTimeMap.put(importKey, queryStart);
            } catch (Exception e) {
                logger.error("failed to load words from table " + dbTable, e);
            }
            return data.toString();
        }

    }

    DBRunnable

    package org.wltea.analyzer.ext;
    
    
    import org.apache.logging.log4j.Logger;
    import org.elasticsearch.common.logging.Loggers;
    import org.wltea.analyzer.dic.Dictionary;
    
    import java.util.Arrays;
    import java.util.List;
    
    /**
     * Periodic task that pulls words from the database through {@link DBHelper} and applies
     * them to the IK {@link Dictionary}: extension words and stop words are added, and rows
     * flagged as deleted are disabled again.
     */
    public class DBRunnable implements Runnable {
        Logger logger = Loggers.getLogger(DBRunnable.class);
        private String wordField;


        /**
         * @param wordField name of the table column that holds the word
         */
        public DBRunnable(String wordField) {
            super();
            this.wordField = wordField;
        }


        /**
         * Runs one reload cycle. Every failure is caught and logged here: an exception that
         * escapes a task scheduled with {@code scheduleAtFixedRate} suppresses all of its
         * later executions, which would silently stop the reloading.
         */
        @Override
        public void run() {
            logger.warn("开始加载词库========");
            try {
                // Resolved inside the try block so a failure here cannot escape run().
                Dictionary dic = Dictionary.getSingleton();
                DBHelper dbHelper = new DBHelper();
                String extWords = dbHelper.getKey(wordField, 0,false,true,",");
                String stopWords = dbHelper.getKey(wordField, 1,false,true,",");
                String extDelWords = dbHelper.getKey(wordField, 0,true,true,",");
                String extStopWords = dbHelper.getKey(wordField, 1,true,true,",");
                if(StringUtils.isNotBlank(extWords)){
                    List<String> extList = Arrays.asList(extWords.split(","));
                    // add the extension words to the main dictionary
                    dic.addWords(extList);
                    logger.warn("加载扩展词成功========");
                    logger.warn("extWords为==={}",extWords);
                }
                if(StringUtils.isNotBlank(stopWords)){
                    List<String> stopList = Arrays.asList(stopWords.split(","));
                    // add the stop words to the stop-word dictionary
                    dic.addStopWords(stopList);
                    logger.warn("加载停用词成功========");
                    logger.warn("stopWords为==={}",stopWords);
                }
                // disable words whose rows are flagged as deleted
                if(StringUtils.isNotBlank(extDelWords)){
                    List<String> removedExtList = Arrays.asList(extDelWords.split(","));
                    dic.disableWords(removedExtList);
                    logger.warn("移除扩展词成功========");
                    logger.warn("extDelWords==={}",extDelWords);
                }
                if(StringUtils.isNotBlank(extStopWords)){
                    List<String> removedStopList = Arrays.asList(extStopWords.split(","));
                    dic.disableStopWords(removedStopList);
                    logger.warn("移除停用词成功========");
                    logger.warn("extStopWords==={}",extStopWords);
                }

            } catch (Exception e) {
                // No "{}" placeholder: the exception is passed as the throwable argument.
                logger.warn("加载扩展词失败========", e);
            }

        }

    }

    StringUtils

    package org.wltea.analyzer.ext;
    
    
    public class StringUtils {
        /**
         * Tells whether a string has no visible content.
         *
         * @param str the string to inspect, may be {@code null}
         * @return {@code true} if {@code str} is {@code null}, empty, or made up solely of
         *         characters for which {@link Character#isWhitespace(char)} is true
         */
        public static boolean isBlank(String str) {
            if (str == null) {
                return true;
            }
            for (char current : str.toCharArray()) {
                if (!Character.isWhitespace(current)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Negation of {@link #isBlank(String)}.
         *
         * @param str the string to inspect, may be {@code null}
         * @return {@code true} if {@code str} contains at least one non-whitespace character
         */
        public static boolean isNotBlank(String str) {
            return !isBlank(str);
        }
    }

    2.Dictionary增加几个方法

    /**
         * Adds a batch of stop words to the in-memory stop-word dictionary.
         *
         * @param words
         *            stop words to add; a {@code null} collection and {@code null} or
         *            blank entries are ignored
         */
        public void addStopWords(Collection<String> words) {
            if (words == null) {
                return;
            }
            for (String word : words) {
                if (word == null) {
                    continue;
                }
                char[] chars = word.trim().toCharArray();
                // A blank entry (e.g. from "a,,b" split on ',') carries no term, so it is
                // skipped instead of being handed to the dictionary as an empty array.
                if (chars.length > 0) {
                    _StopWords.fillSegment(chars);
                }
            }
        }
        /**
         * Disables a batch of stop words in the in-memory stop-word dictionary.
         *
         * @param words
         *            stop words to disable; a {@code null} collection and {@code null} or
         *            blank entries are ignored
         */
        public void disableStopWords(Collection<String> words) {
            if (words == null) {
                return;
            }
            for (String word : words) {
                if (word == null) {
                    continue;
                }
                char[] chars = word.trim().toCharArray();
                // A blank entry (e.g. from "a,,b" split on ',') carries no term, so it is
                // skipped instead of being handed to the dictionary as an empty array.
                if (chars.length > 0) {
                    _StopWords.disableSegment(chars);
                }
            }
        }
        /**
         * Reads {@code jdbc.properties} from the dictionary root and, when
         * {@code autoReloadDic} is {@code true}, copies the connection settings into
         * {@link DBHelper} and schedules a {@link DBRunnable} that runs immediately and then
         * every {@code flushTime} seconds.
         *
         * <p>{@code flushTime} falls back to 60 seconds when it is missing, not a number, or
         * not positive. {@code dbTable} falls back to {@code t_es_ik_dic}.
         *
         * @throws IOException if jdbc.properties cannot be opened or read
         */
        public  void initReloadMysqlWordJob() throws IOException {

            logger.warn("============IKAnalyzer==============");
            Path file = PathUtils.get(getDictRoot(), "jdbc.properties");
            Properties prop = new Properties();
            // Close the stream as soon as the properties are loaded.
            try (FileInputStream in = new FileInputStream(file.toFile())) {
                prop.load(in);
            }
            logger.info("===========load jdbc.properties========");
            for(Object key : prop.keySet()) {
                String name = String.valueOf(key);
                // Never write the database password to the log.
                String shown = "dbPwd".equals(name) ? "******" : prop.getProperty(name);
                logger.info("==========>>" + name + "=" + shown);
            }
            // Trailing whitespace is kept by Properties.load and would otherwise turn "true " into false.
            boolean autoReloadDic = Boolean.parseBoolean(prop.getProperty("autoReloadDic", "").trim());
            if (!autoReloadDic) {
                return;
            }

            // Interval between reloads in seconds, 60 by default.
            int flushTime = 60;
            String flushTimeText = prop.getProperty("flushTime");
            if (flushTimeText != null) {
                try {
                    int parsed = Integer.parseInt(flushTimeText.trim());
                    if (parsed > 0) {
                        flushTime = parsed;
                    } else {
                        // scheduleAtFixedRate rejects a period that is not positive.
                        logger.warn("flushTime must be positive but was " + parsed + ", using " + flushTime + "s");
                    }
                } catch (NumberFormatException e) {
                    logger.warn("invalid flushTime '" + flushTimeText + "' in jdbc.properties, using " + flushTime + "s");
                }
            }

            DBHelper.dbTable = prop.getProperty("dbTable","t_es_ik_dic");
            DBHelper.dbUser = prop.getProperty("dbUser");
            DBHelper.dbPwd = prop.getProperty("dbPwd");
            DBHelper.url = prop.getProperty("dbUrl");
            String wordFieldName = prop.getProperty("wordFieldName");
            ScheduledExecutorService scheduledExecutorService  =  Executors.newSingleThreadScheduledExecutor();
            scheduledExecutorService.scheduleAtFixedRate(new DBRunnable(wordFieldName), 0, flushTime, TimeUnit.SECONDS);
        }

    3.在initial方法启用job

    /**
         * Creates the dictionary singleton on the first call, loads the built-in dictionaries,
         * starts the MySQL reload job and, if enabled in {@code cfg}, the remote dictionary
         * monitors. Later calls return the existing instance unchanged.
         *
         * <p>The method is {@code static synchronized}, so the whole initialisation already
         * runs under the class lock and needs no additional synchronized block.
         *
         * @param cfg analyzer configuration used to build the dictionary
         * @return the dictionary singleton
         */
        public static synchronized Dictionary initial(Configuration cfg) {
            if (singleton != null) {
                return singleton;
            }

            singleton = new Dictionary(cfg);
            singleton.loadMainDict();
            singleton.loadSurnameDict();
            singleton.loadQuantifierDict();
            singleton.loadSuffixDict();
            singleton.loadPrepDict();
            singleton.loadStopWordDict();
            try {
                singleton.initReloadMysqlWordJob();
            } catch (Exception e) {
                // The MySQL job is optional: neither an I/O error nor a malformed setting may
                // abort the initialisation or skip the remote dictionary monitors below.
                logger.error("动态加载mysql词组失败....", e);
            }
            if(cfg.isEnableRemoteDict()){
                // Start the monitor tasks: 10 seconds initial delay, then every 60 seconds.
                for (String location : singleton.getRemoteExtDictionarys()) {
                    pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                }
                for (String location : singleton.getRemoteExtStopWordDictionarys()) {
                    pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                }
            }
            return singleton;
        }

    将ik安装导入es

    1.打包

    2.将zip文件移动到es的plugins文件夹

    解压并重命名为ik

    3.ik目录的config创建一个jdbc.properties文件

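    # 注意:properties文件不支持行尾注释,值后面的"#..."会被当作值的一部分
    # (例如 flushTime 会解析失败,autoReloadDic 会被当作 false),所以注释必须单独成行
    # 数据库连接
    dbUrl=jdbc:mysql://ip:port/database
    # 数据库用户名
    dbUser=user
    # 数据库密码
    dbPwd=password
    # 词库表
    dbTable=md_es_ik_dic
    # 词组字段
    wordFieldName=word
    # 刷新时间 (秒)
    flushTime=5
    # 是否启用
    autoReloadDic=true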

    4.创建数据库表

    DROP TABLE IF EXISTS `md_es_ik_dic`;
    CREATE TABLE `md_es_ik_dic` (
      `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
      `word` varchar(100) DEFAULT '' COMMENT '扩展分词',
      `word_type` varchar(100) DEFAULT '' COMMENT '0:扩展分词  1:停用分词 ',
      `delete_type` tinyint(4) DEFAULT '0' COMMENT '0表示未删除,1表示删除',
      `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
      `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8 COMMENT='词库维护表';

     5.es lib增加一个mysql数据库驱动文件

    6.启动es测试

    get请求es:http://127.0.0.1:9200/_analyze

    {
        "analyzer":"ik_max_word",
        "text":"我是一名小正太"
    }

    分词结果

    {
        "tokens": [
            {
                "token": "我",
                "start_offset": 0,
                "end_offset": 1,
                "type": "CN_CHAR",
                "position": 0
            },
            {
                "token": "是",
                "start_offset": 1,
                "end_offset": 2,
                "type": "CN_CHAR",
                "position": 1
            },
            {
                "token": "一名",
                "start_offset": 2,
                "end_offset": 4,
                "type": "CN_WORD",
                "position": 2
            },
            {
                "token": "一",
                "start_offset": 2,
                "end_offset": 3,
                "type": "TYPE_CNUM",
                "position": 3
            },
            {
                "token": "名",
                "start_offset": 3,
                "end_offset": 4,
                "type": "COUNT",
                "position": 4
            },
            {
                "token": "小",
                "start_offset": 4,
                "end_offset": 5,
                "type": "CN_CHAR",
                "position": 5
            },
            {
                "token": "正",
                "start_offset": 5,
                "end_offset": 6,
                "type": "CN_CHAR",
                "position": 6
            },
            {
                "token": "太",
                "start_offset": 6,
                "end_offset": 7,
                "type": "CN_CHAR",
                "position": 7
            }
        ]
    }

    如果我们希望“小正太”也被分成一个词,在数据库的词库表中新增一条记录即可

    es日志打印

    再次测试分词结果

    {
        "tokens": [
            {
                "token": "我",
                "start_offset": 0,
                "end_offset": 1,
                "type": "CN_CHAR",
                "position": 0
            },
            {
                "token": "是",
                "start_offset": 1,
                "end_offset": 2,
                "type": "CN_CHAR",
                "position": 1
            },
            {
                "token": "一名",
                "start_offset": 2,
                "end_offset": 4,
                "type": "CN_WORD",
                "position": 2
            },
            {
                "token": "一",
                "start_offset": 2,
                "end_offset": 3,
                "type": "TYPE_CNUM",
                "position": 3
            },
            {
                "token": "名",
                "start_offset": 3,
                "end_offset": 4,
                "type": "COUNT",
                "position": 4
            },
            {
                "token": "小正太",
                "start_offset": 4,
                "end_offset": 7,
                "type": "CN_WORD",
                "position": 5
            }
        ]
    }

    可以看到小正太分成了一个词

    可能遇到的问题

    启动报错:Plugin [analysis-ik] was built for Elasticsearch version 6.5.0 but version 6.5.1 is running

    因为要求es版本和ik版本要完全一致,可以尝试一下修改ik目录下的plugin-descriptor.properties

    改成es版本

    找不到数据库驱动

    在ik的pom.xml中增加数据库驱动依赖,同时在es的lib目录放入数据库驱动jar

    The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.

      permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";

    修改jre下的lib/security  java.policy

     

    我的是在:/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/jre/lib/security

    增加:

    permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";

    可能会出现当前文件只读  切换为root权限修改即可

     

  • 相关阅读:
    struts2 DMI
    MFC添加背景图片
    c++ 副本构造器
    climits
    Qt中的qreal
    Http概述(一)
    重构学习-重构原则
    QDir的mkdir和mkpath区别
    Qt学习笔记网络(一)
    Qt5 新特性
  • 原文地址:https://www.cnblogs.com/LQBlog/p/10443862.html
Copyright © 2011-2022 走看看