简介
支持分词模式
Search模式,用于对用户查询词分词
Index模式,用于对索引文档分词
特性
支持多种分词模式
全角统一转成半角
用户词典功能
conf 目录下提供了整理好的搜狗细胞词库
出于性能考虑,最新的快照版本去除了词性标注功能;欢迎通过 Pull Request 提供性能更好的词性标注实现。
简单使用
获取jieba-analysis
<dependency> <groupId>com.huaban</groupId> <artifactId>jieba-analysis</artifactId> <version>1.0.2</version> </dependency>
案例
/**
 * Segments a handful of sample sentences in INDEX mode and prints the
 * resulting token list for each one to standard output.
 */
@Test
public void testDemo() {
    JiebaSegmenter segmenter = new JiebaSegmenter();
    String[] sentences = {
        "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
        "我不喜欢日本和服。",
        "雷猴回归人间。",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "结果婚的和尚未结过婚的"
    };
    // Walk the samples in declaration order, one printed line per sentence.
    for (int i = 0; i < sentences.length; i++) {
        String rendered = segmenter.process(sentences[i], SegMode.INDEX).toString();
        System.out.println(rendered);
    }
}
原文链接:https://github.com/huaban/jieba-analysis
我的应用
package com.analysis; import java.io.BufferedReader; import java.io.InputStreamReader; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.util.List; import java.util.UUID; import org.junit.Before; import org.junit.Test; import com.huaban.analysis.jieba.JiebaSegmenter; import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; import com.huaban.analysis.jieba.SegToken; public class jiebaTest { private Connection con = null; private PreparedStatement pstmt = null; /** * 连接 */ @Before public void beforeDemo() throws Exception { Class.forName("com.mysql.jdbc.Driver"); String url = "jdbc:mysql://localhost:3306/test?user=root&password=root"; con = DriverManager.getConnection(url); } /** * 分词查询测试 */ @Test public void getDemo() throws Exception { BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); String str = br.readLine(); String sql = "select * from t_jieba where name = ?"; pstmt = con.prepareStatement(sql); pstmt.setString(1, str); ResultSet rs = pstmt.executeQuery(); while (rs.next()) { System.out.println(rs.getInt(1)+"--"+rs.getString(2)+"--"+rs.getString(3)+"--"+rs.getString(4)+"--"+rs.getString(5)); pstmt.clearParameters(); String sql1 = "update t_jieba set times = ? where id = ?"; pstmt = con.prepareStatement(sql1); pstmt.setInt(1, 1+ new Integer(rs.getString(5))); pstmt.setInt(2, rs.getInt(1)); pstmt.executeUpdate(); } rs.close(); pstmt.close(); } /** * 分词插入测试 */ @Test public void addDemo() throws Exception { String sql = "insert into t_jieba (name,cid,c_name,times) select ?,?,?,? 
from DUAL where not EXISTS(select name from t_jieba where name=?)"; pstmt = con.prepareStatement(sql); JiebaSegmenter segmenter = new JiebaSegmenter(); String[] sentences = new String[] { "大话数据结构", "深入浅出设计模式", "JavaEE开发的颠覆者: Spring Boot实战", "java从入门到放弃" }; for (String sentence : sentences) { //System.out.println(segmenter.process(sentence, SegMode.INDEX).toString()); String uuid = UUID.randomUUID().toString(); uuid = uuid.replace("-", ""); List<SegToken> list = segmenter.process(sentence, SegMode.INDEX); for (SegToken segToken : list) { String name = segToken.word.trim(); if (name != null && !"".equals(name)) { pstmt.setString(1, segToken.word); pstmt.setString(2, uuid); pstmt.setString(3, sentence); pstmt.setString(4, "0"); pstmt.setString(5, segToken.word); pstmt.executeUpdate(); pstmt.clearParameters(); } } } pstmt.close(); System.out.println("插入成功!"); } }