zoukankan      html  css  js  c++  java
  • jieba分词/jieba-analysis(java版)

    简介

    支持的分词模式:
    Search模式,用于对用户查询词分词
    Index模式,用于对索引文档分词
    特性
    支持多种分词模式
    全角统一转成半角
    用户词典功能
    conf 目录有整理的搜狗细胞词库
    出于性能考虑,最新的快照版本去除了词性标注功能;欢迎提交 Pull Request,以更好的方式重新提供该功能。

    简单使用

    获取jieba-analysis

    <dependency>
      <groupId>com.huaban</groupId>
      <artifactId>jieba-analysis</artifactId>
      <version>1.0.2</version>
    </dependency>

    案例

    /**
     * Segments a handful of sample sentences in INDEX mode and prints the
     * resulting token list for each one to standard output.
     */
    @Test
    public void testDemo() {
        String[] samples = {
            "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
            "我不喜欢日本和服。",
            "雷猴回归人间。",
            "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
            "结果婚的和尚未结过婚的"
        };
        JiebaSegmenter jieba = new JiebaSegmenter();
        for (int i = 0; i < samples.length; i++) {
            // Render the token list explicitly, then print it.
            String rendered = jieba.process(samples[i], SegMode.INDEX).toString();
            System.out.println(rendered);
        }
    }

    原文链接:https://github.com/huaban/jieba-analysis

    我的应用

    package com.analysis;
    
    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.util.List;
    import java.util.UUID;
    
    import org.junit.After;
    import org.junit.Before;
    import org.junit.Test;
    
    import com.huaban.analysis.jieba.JiebaSegmenter;
    import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
    import com.huaban.analysis.jieba.SegToken;
    
    /**
     * Demo tests that store jieba segmentation results in the MySQL table
     * {@code t_jieba} and look words up again, counting each hit.
     *
     * <p>Each test gets its own JDBC connection: it is opened in
     * {@link #beforeDemo()} and released in {@link #afterDemo()}. Statements and
     * result sets are scoped to the test method that uses them and are closed
     * with try-with-resources, so nothing leaks when a statement throws.
     */
    public class jiebaTest {

        /** Selects every column of the rows whose {@code name} equals the query word. */
        private static final String SELECT_BY_NAME_SQL = "select * from t_jieba where name = ?";

        /** Stores a new hit counter for the row with the given {@code id}. */
        private static final String UPDATE_TIMES_SQL = "update t_jieba set times = ? where id = ?";

        /** Inserts a word only when no row with the same {@code name} exists yet. */
        private static final String INSERT_IF_ABSENT_SQL =
                "insert into t_jieba (name,cid,c_name,times) select ?,?,?,? from DUAL where not EXISTS(select name from t_jieba where name=?)";

        private Connection con = null;

        /**
         * Opens the database connection used by the test that runs next.
         *
         * @throws Exception if the driver class cannot be loaded or the connection fails
         */
        @Before
        public void beforeDemo() throws Exception {
            Class.forName("com.mysql.jdbc.Driver");
            // NOTE(review): credentials are hard-coded in the URL; acceptable for a local
            // demo only, move them to configuration before reusing this code.
            String url = "jdbc:mysql://localhost:3306/test?user=root&password=root";
            con = DriverManager.getConnection(url);
        }

        /**
         * Closes the connection opened by {@link #beforeDemo()} so that test runs
         * do not accumulate open connections.
         *
         * @throws Exception if closing the connection fails
         */
        @After
        public void afterDemo() throws Exception {
            if (con != null) {
                con.close();
                con = null;
            }
        }

        /**
         * Reads one word from standard input, prints every {@code t_jieba} row whose
         * name equals it, and increments the counter of each printed row by one.
         *
         * @throws Exception if reading the input or any database operation fails
         */
        @Test
        public void getDemo() throws Exception {
            // The reader is intentionally left open: closing it would close System.in.
            BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
            String str = br.readLine();
            if (str == null) {
                // End of input: there is no word to look up.
                return;
            }

            // The update statement is prepared once and reused for every row, instead of
            // replacing (and leaking) the statement that owns the open result set.
            try (PreparedStatement select = con.prepareStatement(SELECT_BY_NAME_SQL);
                    PreparedStatement update = con.prepareStatement(UPDATE_TIMES_SQL)) {
                select.setString(1, str);
                try (ResultSet rs = select.executeQuery()) {
                    while (rs.next()) {
                        int id = rs.getInt(1);
                        // Column 5 is used as the hit counter; presumably the "times"
                        // column of t_jieba - confirm against the table definition.
                        String times = rs.getString(5);
                        System.out.println(id + "--" + rs.getString(2) + "--" + rs.getString(3) + "--"
                                + rs.getString(4) + "--" + times);

                        update.setInt(1, 1 + Integer.parseInt(times));
                        update.setInt(2, id);
                        update.executeUpdate();
                    }
                }
            }
        }

        /**
         * Segments a fixed list of book titles in INDEX mode and inserts every
         * non-blank token into {@code t_jieba}, skipping words that are already
         * stored. All tokens of one title share a freshly generated id (a UUID
         * without dashes) and start with a counter of {@code "0"}.
         *
         * @throws Exception if any database operation fails
         */
        @Test
        public void addDemo() throws Exception {
            JiebaSegmenter segmenter = new JiebaSegmenter();
            String[] sentences = new String[] { "大话数据结构", "深入浅出设计模式", "JavaEE开发的颠覆者: Spring Boot实战", "java从入门到放弃" };

            try (PreparedStatement insert = con.prepareStatement(INSERT_IF_ABSENT_SQL)) {
                for (String sentence : sentences) {
                    String uuid = UUID.randomUUID().toString().replace("-", "");
                    List<SegToken> list = segmenter.process(sentence, SegMode.INDEX);
                    for (SegToken segToken : list) {
                        String name = segToken.word.trim();
                        if (name.isEmpty()) {
                            // Whitespace-only tokens carry no searchable word.
                            continue;
                        }
                        // Store the trimmed word, i.e. the same value the blank check used.
                        insert.setString(1, name);
                        insert.setString(2, uuid);
                        insert.setString(3, sentence);
                        insert.setString(4, "0");
                        insert.setString(5, name);
                        insert.executeUpdate();
                    }
                }
            }
            System.out.println("插入成功!");
        }

    }
    MyDemo
  • 相关阅读:
    rm
    Linux下解包/打包,压缩/解压命令
    虚拟机安装---vm12+ubuntukylin16.04
    mysql-5.6.41-winx64安装
    tensorflow学习笔记一------下载安装,配置环境(基于ubuntu16.04 pycharm)
    大一上学期C语言学习心得总结
    常见HTTP状态码
    Java语言基础及java核心
    linux下安装JMeter(小白教程)
    Linux下安装JDK(小白教程)
  • 原文地址:https://www.cnblogs.com/bky-lzw/p/7799238.html
Copyright © 2011-2022 走看看