zoukankan      html  css  js  c++  java
  • lucene简单使用

    lucene7以上最低要求jdk1.8

    lucene下载地址:

    http://archive.apache.org/dist/lucene/java/
    <dependency>
                <groupId>org.apache.lucene</groupId>
                <artifactId>lucene-core</artifactId>
                <version>6.0.0</version>
            </dependency>

            <dependency>
                <groupId>org.apache.lucene</groupId>
                <artifactId>lucene-highlighter</artifactId>
                <version>6.0.0</version>
            </dependency>

            <!-- http://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-common -->
            <dependency>
                <groupId>org.apache.lucene</groupId>
                <artifactId>lucene-analyzers-common</artifactId>
                <version>6.0.0</version>
            </dependency>

            <!-- http://mvnrepository.com/artifact/org.apache.lucene/lucene-memory -->
            <dependency>
                <groupId>org.apache.lucene</groupId>
                <artifactId>lucene-memory</artifactId>
                <version>6.0.0</version>
            </dependency>

            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.9</version>
            </dependency>

            <!-- http://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
            <dependency>
                <groupId>org.apache.lucene</groupId>
                <artifactId>lucene-queryparser</artifactId>
                <version>6.0.0</version>
            </dependency>
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.6</version>
            </dependency>
            <!-- NOTE(review): version 7.3.0 here mixes with lucene-core 6.0.0 above;
                 Lucene modules must share one version — confirm and align. -->
            <dependency>
                <groupId>org.apache.lucene</groupId>
                <artifactId>lucene-analyzers-smartcn</artifactId>
                <version>7.3.0</version>
            </dependency>
            <!-- IK Analyzer (Chinese word segmentation).
                 Exclude its bundled legacy Lucene jars because this project
                 supplies its own Lucene version and re-implements the
                 analyzer/tokenizer against the newer API.
                 (A second, exclusion-free duplicate of this dependency was
                 removed: in Maven the last declaration wins, so the duplicate
                 silently re-introduced the excluded legacy jars. The duplicate
                 lucene-core declaration was removed for the same reason.) -->
            <dependency>
                <groupId>com.janeluo</groupId>
                <artifactId>ikanalyzer</artifactId>
                <version>2012_u6</version>
                <exclusions>
                    <exclusion>
                        <groupId>org.apache.lucene</groupId>
                        <artifactId>lucene-core</artifactId>
                    </exclusion>
                    <exclusion>
                        <groupId>org.apache.lucene</groupId>
                        <artifactId>lucene-queryparser</artifactId>
                    </exclusion>
                    <exclusion>
                        <groupId>org.apache.lucene</groupId>
                        <artifactId>lucene-analyzers-common</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
    package com.ytkj.lucene;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.*;
    import org.apache.lucene.search.*;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import java.io.File;
    import java.io.IOException;
    
    /**
     * lucene入门程序
     */
    public class LuceneFrist {
        /**
         * 创建索引库
         * @throws Exception
         */
        public static void createIndex()throws Exception{
            //1.创建一个Director对象,指定索引库保存的位置,保存在磁盘中
            Directory directory=FSDirectory.open(new File("E:\lucene\lucenetemp").toPath());
            //2.创建IndexWriter对象
                //创建使用的分词器
            StandardAnalyzer analyzer = new StandardAnalyzer();
            IndexWriter indexWriter=new IndexWriter(directory,new IndexWriterConfig(analyzer));
            //3.读取磁盘上的文件,对应每个文件创建一个文档对象
            File dir=new File("E:\lucene\luceneresource");
            File[] files = dir.listFiles();
            for (File file : files) {
                //文件名称
                String name = file.getName();
                //文件路径
                String path = file.getPath();
                //文件内容
                String content = FileUtils.readFileToString(file, "utf-8");
                //文件大小
                long size = FileUtils.sizeOf(file);
                //创建域 参数:域的名称 文件名称 是否存储
                Field fieldName=new TextField("name",name,Field.Store.YES);
                Field fieldPath=new TextField("path",path,Field.Store.YES);
                Field fieldContent=new TextField("content",content,Field.Store.YES);
                Field fieldSize=new TextField("size",size+"",Field.Store.YES);
                //4.创建文档对象
                Document document=new Document();
                //5.向文档对象中添加域
                document.add(fieldName);
                document.add(fieldPath);
                document.add(fieldContent);
                document.add(fieldSize);
                //6.把文档对象写入索引库
                indexWriter.addDocument(document);
            }
            //7.关闭indexwriter对象
            indexWriter.close();
        }
    
        /**
         * 查询索引库
         * @throws Exception
         */
        public static void searchIndex() throws Exception {
            //1.创建一个Director对象,指定索引库保存的位置
            Directory directory=FSDirectory.open(new File("E:\lucene\lucenetemp").toPath());
            //2.创建indexReader对象
            IndexReader indexReader= DirectoryReader.open(directory);
            //3.创建indexsearcher对象,构造方法中的参数indexReader对象
            IndexSearcher indexSearcher=new IndexSearcher(indexReader);
            //4.创建一个query对象
            Query query=new TermQuery(new Term("content","spring"));
            //5.执行查询,得到一个TopDocs对象 参数:查询对象 查询结果返回的最大记录数
            TopDocs topDocs = indexSearcher.search(query, 10);
            //6.取查询结果的总记录数
            int totalHits = topDocs.totalHits;
            System.out.println("查询结果的总记录数:"+totalHits);
            //7.获取文档列表
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                //取文档id
                int docId = scoreDoc.doc;
                //8.根据id获取文档对象
                Document document = indexSearcher.doc(docId);
                System.out.println(document.get("name"));
                System.out.println(document.get("path"));
                System.out.println(document.get("content"));
                System.out.println(document.get("size"));
            }
            //关闭创建indexReader对象
            indexReader.close();
    
        }
    
        /**
         * 查看分词器的分词效果
         * @throws Exception
         */
        public static  void testTikenStream() throws Exception {
            //创建使用的标准分词器
            StandardAnalyzer analyzer = new StandardAnalyzer();
            //使用分词器对象的tokenStream方法获取tokenStream对象
            TokenStream tokenStream=analyzer.tokenStream("","org.springframework.jdbc.datasource.DataSourceTransactionManager");
            //向tokenstream对象中设置一个引用,相当于一个指针
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            //调用tokenstream的rest方法
            tokenStream.reset();
            //循环遍历tokenStream对象
            while (tokenStream.incrementToken()){
                System.out.println(charTermAttribute.toString());
            }
            //关闭
            tokenStream.close();
    
        }
    
        /**
         * 中文分词器测试
         */
        public static  void testIKAnalyzer() throws Exception {
            String etext = "Analysis is one of the main causes of slow indexing. Simply put, the more you analyze the slower analyze the indexing (in most cases).";
            String chineseText = "张三说的确实在理。";
            /**
             * ikanalyzer 中文分词器 因为Analyzer的createComponents方法API改变了 需要我们自己实现
             * 分析器IKAnalyzer4Lucene7和分词器IKTokenizer4Lucene7
             */
            // IKAnalyzer 细粒度切分
            try (Analyzer ik = new IKAnalyzer();) {
                TokenStream ts = ik.tokenStream("content", etext);
                System.out.println("IKAnalyzer中文分词器 细粒度切分,英文分词效果:");
                doToken(ts);
                ts = ik.tokenStream("content", chineseText);
                System.out.println("IKAnalyzer中文分词器 细粒度切分,中文分词效果:");
                doToken(ts);
            }
    
            // IKAnalyzer 智能切分
            try (Analyzer ik = new IKAnalyzer(true);) {
                TokenStream ts = ik.tokenStream("content", etext);
                System.out.println("IKAnalyzer中文分词器 智能切分,英文分词效果:");
                doToken(ts);
                ts = ik.tokenStream("content", chineseText);
                System.out.println("IKAnalyzer中文分词器 智能切分,中文分词效果:");
                doToken(ts);
            }
        }
        private static void doToken(TokenStream ts) throws IOException {
            ts.reset();
            CharTermAttribute cta = ts.getAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                System.out.print(cta.toString() + "|");
            }
            System.out.println();
            ts.end();
            ts.close();
        }
    
    
        public static void main(String[] args) throws Exception {
            //createIndex();
            //searchIndex();
            //testTikenStream();
            testIKAnalyzer();
        }
    }
  • 相关阅读:
    搜狗输入法用户体验
    Day06
    Day05
    Spark-RDD操作(26个常用函数附实例)
    软件工程培训第五天(hive进阶)
    hive窗口函数
    hive操作(行转列,列转行)
    Hive中使用case then分情况求和
    hive分组排序(rank函数+partiton实现)
    软件工程培训第四天总结,hive的学习
  • 原文地址:https://www.cnblogs.com/yscec/p/11946661.html
Copyright © 2011-2022 走看看