zoukankan      html  css  js  c++  java
  • 编译Ansj之Solr插件

      Ansj是一个比较优秀的中文分词组件,具体情况就不在本文介绍了。ansj作者在其官方代码中,提供了对lucene接口的支持。如果用在Solr下,还需要简单的扩展一下。

    1、基于maven管理

       ansj是基于maven进行开发管理的。我们首先修改一下其pom.xml,具体如下所示:

      

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    	<modelVersion>4.0.0</modelVersion>
    
    	<parent>
    		<groupId>org.ansj</groupId>
    		<artifactId>MavenAccount-aggregator</artifactId>
    		<version>0.0.1</version>
    		<relativePath>../pom.xml</relativePath>
    	</parent>
    
    	<artifactId>ansj_lucene4_plug</artifactId>
    	<version>2.0.2</version>
    	<packaging>jar</packaging>
    
    	<name>ansj_lucene4_plug</name>
    	
     	<properties>
            <solr.version>4.8.0</solr.version>
        </properties>
    
    	<dependencies>
    		<dependency>
    			<groupId>org.ansj</groupId>
    			<artifactId>ansj_seg</artifactId>
    			<version>2.0.5</version>
    			<classifier>min</classifier>
    			<scope>provided</scope>
    		</dependency>
    		
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-core</artifactId>
    			<version>${solr.version}</version>
    			<scope>provided</scope>
    		</dependency>
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-highlighter</artifactId>
    			<version>${solr.version}</version>
    			<scope>provided</scope>
    		</dependency>
    
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-queries</artifactId>
    			<version>${solr.version}</version>
    			<scope>provided</scope>
    		</dependency>
    
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-queryparser</artifactId>
    			<version>${solr.version}</version>
    			<scope>provided</scope>
    		</dependency>
    		
    	   <dependency>
    			<groupId>org.apache.solr</groupId>
    			<artifactId>solr-dataimporthandler</artifactId>
    			<version>${solr.version}</version>
    			<scope>provided</scope>
    	  </dependency>
    
    		<dependency>
    			<groupId>junit</groupId>
    			<artifactId>junit</artifactId>
    			<version>4.4</version>
    			<scope>test</scope>
    		</dependency>
    	</dependencies>
    </project>
    

      其中,代码依赖的配置项:<scope>provided</scope> 表示只用于代码编译阶段。依赖关系整理好以后,写一个TokenizerFactory类,用于solr中配置使用,代码如下:

    package org.ansj.solr;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    
    import org.ansj.lucene.util.AnsjTokenizer;
    import org.ansj.splitWord.analysis.IndexAnalysis;
    import org.ansj.splitWord.analysis.ToAnalysis;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.util.TokenizerFactory;
    import org.apache.lucene.util.AttributeSource.AttributeFactory;
    
    public class AnsjTokenizerFactory extends TokenizerFactory{
        boolean pstemming;
        boolean isQuery;
        private String stopwordsDir;
        public Set<String> filter;  
    
        public AnsjTokenizerFactory(Map<String, String> args) {
            super(args);
            assureMatchVersion();
            isQuery = getBoolean(args, "isQuery", true);
            pstemming = getBoolean(args, "pstemming", false);
            stopwordsDir = get(args,"words");
            addStopwords(stopwordsDir);
        }
        //add stopwords list to filter
        private void addStopwords(String dir) {
            if (dir == null){
                System.out.println("no stopwords dir");
                return;
            }
            //read stoplist
            System.out.println("stopwords: " + dir);
            filter = new HashSet<String>();
            File file = new File(dir); 
            InputStreamReader reader;
            try {
                reader = new InputStreamReader(new FileInputStream(file),"UTF-8");
                BufferedReader br = new BufferedReader(reader); 
                String word = br.readLine();  
                while (word != null) {
                    filter.add(word);
                    word = br.readLine(); 
                }  
            } catch (FileNotFoundException e) {
                System.out.println("No stopword file found");
            } catch (IOException e) {
                System.out.println("stopword file io exception");
            }      
        }
        @Override
        public Tokenizer create(AttributeFactory factory, Reader input) {
            if(isQuery == true){
                //query
                return new AnsjTokenizer(new ToAnalysis(new BufferedReader(input)), input, filter, pstemming);
            } else {
                //index
                return new AnsjTokenizer(new IndexAnalysis(new BufferedReader(input)), input, filter, pstemming);
            }
        }       
    }
    

      pstemming 参数是ansj需要的参数。

      isQuery 是用于判断是查询还是索引,一般搜索index阶段分词比较细,查询的分词比较粗。

    2、编译jar包。

        代码结构如下:

       

      编写mavn编译命令:mvn install -DskipTests=true# 忽略单元测试编译。

      

        执行编译:

    [INFO] Scanning for projects...
    [INFO]                                                                         
    [INFO] ------------------------------------------------------------------------
    [INFO] Building ansj_lucene4_plug 2.0.2
    [INFO] ------------------------------------------------------------------------
    [INFO] 
    [INFO] --- maven-clean-plugin:2.4.1:clean (default-clean) @ ansj_lucene4_plug ---
    [INFO] Deleting R:ansj-segansj_segplugansj_lucene4_plug	arget
    [INFO] 
    [INFO] --- maven-resources-plugin:2.4.3:resources (default-resources) @ ansj_lucene4_plug ---
    [INFO] Using 'UTF-8' encoding to copy filtered resources.
    [INFO] skip non existing resourceDirectory R:ansj-segansj_segplugansj_lucene4_plugsrcmain
    esources
    [INFO] 
    [INFO] --- maven-compiler-plugin:2.3.2:compile (default-compile) @ ansj_lucene4_plug ---
    [INFO] Compiling 5 source files to R:ansj-segansj_segplugansj_lucene4_plug	argetclasses
    [INFO] 
    [INFO] --- maven-resources-plugin:2.4.3:testResources (default-testResources) @ ansj_lucene4_plug ---
    [INFO] Using 'UTF-8' encoding to copy filtered resources.
    [INFO] skip non existing resourceDirectory R:ansj-segansj_segplugansj_lucene4_plugsrc	est
    esources
    [INFO] 
    [INFO] --- maven-compiler-plugin:2.3.2:testCompile (default-testCompile) @ ansj_lucene4_plug ---
    [INFO] Compiling 3 source files to R:ansj-segansj_segplugansj_lucene4_plug	arget	est-classes
    [INFO] 
    [INFO] --- maven-surefire-plugin:2.7.1:test (default-test) @ ansj_lucene4_plug ---
    [INFO] Tests are skipped.
    [INFO] 
    [INFO] --- maven-jar-plugin:2.3.1:jar (default-jar) @ ansj_lucene4_plug ---
    [INFO] Building jar: R:ansj-segansj_segplugansj_lucene4_plug	argetansj_lucene4_plug-2.0.2.jar
    [INFO] 
    [INFO] --- maven-install-plugin:2.3.1:install (default-install) @ ansj_lucene4_plug ---
    [INFO] Installing R:ansj-segansj_segplugansj_lucene4_plug	argetansj_lucene4_plug-2.0.2.jar to C:UsersGCZX-016.m2
    epositoryorgansjansj_lucene4_plug2.0.2ansj_lucene4_plug-2.0.2.jar
    [INFO] Installing R:ansj-segansj_segplugansj_lucene4_plugpom.xml to C:UsersGCZX-016.m2
    epositoryorgansjansj_lucene4_plug2.0.2ansj_lucene4_plug-2.0.2.pom
    [INFO] ------------------------------------------------------------------------
    [INFO] BUILD SUCCESS
    [INFO] ------------------------------------------------------------------------
    [INFO] Total time: 8.149s
    [INFO] Finished at: Tue May 05 15:29:19 CST 2015
    [INFO] Final Memory: 27M/245M
    [INFO] ------------------------------------------------------------------------
    

      

      

      

  • 相关阅读:
    拷贝某文件至某位置
    Java对象的序列化和反序列
    常见的RuntimeException异常有哪些
    array数组增删元素
    失眠怎么办
    构造函数和函数区别(关键的new操作符)
    匿名函数递归(arguments.callee)和命名函数递归
    localeCompare方法在chrome浏览器取值问题
    random()方法
    iframe 父子页面之间取值
  • 原文地址:https://www.cnblogs.com/likehua/p/4479278.html
Copyright © 2011-2022 走看看