zoukankan      html  css  js  c++  java
  • lucene创建索引

    创建索引.

    1.lucene下载.

    下载地址:http://archive.apache.org/dist/lucene/java/.
    lucene不同版本之间有不小的差别,这里下载的是lucene 4.3.

    2.导入jar包

    打开eclipse,新建dynamic web project.解压下载的lucene压缩包,依次找到下面几个jar包,加到/WebContent/WEB-INF/lib目录下,然后Add to Build Path:

    包名 位置
    lucene-analyzers-common-4.3.0.jar lucene-4.3.0/analysis/common
    lucene-analyzers-smartcn-4.3.0.jar lucene-4.3.0/analysis/smartcn
    lucene-core-4.3.0.jar lucene-4.3.0/core
    lucene-highlighter-4.3.0.jar lucene-4.3.0/highlighter
    lucene-queries-4.3.0.jar lucene-4.3.0/queries
    lucene-queryparser-4.3.0.jar lucene-4.3.0/queryparser

    3.创建索引

    package ac.ucas.lucene;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.IntField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Minimal Lucene 4.3 example that writes two small documents into a
     * filesystem-backed index.
     *
     * <p>Each document carries three stored fields: {@code id} (a
     * {@link StringField}), {@code content} (a {@link TextField}) and
     * {@code num} (an {@link IntField}).
     */
    public class IndexCreate {

        /** Filesystem location of the index directory. */
        private static final String INDEX_PATH = "/Users/yaopan/Documents/eclipseworkspace/test";

        /**
         * Opens (or creates) the index at {@link #INDEX_PATH}, adds two sample
         * documents and commits them.
         *
         * <p>Any {@link IOException} is reported on standard error and aborts the
         * run; the completion message is printed only after a successful commit.
         * The writer and the directory are closed in every case.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            // Standard analyzer is used to tokenize the analyzed "content" field.
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
            // Append to an existing index, or create a new one if none exists.
            indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

            Directory directory = null;
            IndexWriter indexWriter = null;
            try {
                directory = FSDirectory.open(new File(INDEX_PATH));

                // isLocked/unlock are static: call them on the class rather than
                // through a (null) instance reference.
                // NOTE(review): forcibly unlocking is only safe when no other
                // writer is using this index - confirm for real deployments.
                if (IndexWriter.isLocked(directory)) {
                    IndexWriter.unlock(directory);
                }

                indexWriter = new IndexWriter(directory, indexWriterConfig);
                indexWriter.addDocument(buildDocument("abcde", "极客学院", 1));
                indexWriter.addDocument(buildDocument("addff", "LUCENE案例", 2));
                indexWriter.commit();

                System.out.println("index ceate complete!");
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Release the writer first, then the directory it writes to.
                if (indexWriter != null) {
                    try {
                        indexWriter.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if (directory != null) {
                    try {
                        directory.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /**
         * Builds a document with the three stored fields used by this example.
         *
         * @param id      value of the {@code id} field
         * @param content value of the {@code content} field
         * @param num     value of the {@code num} field
         * @return a new document containing the three fields
         */
        private static Document buildDocument(String id, String content, int num) {
            Document doc = new Document();
            doc.add(new StringField("id", id, Store.YES));
            doc.add(new TextField("content", content, Store.YES));
            doc.add(new IntField("num", num, Store.YES));
            return doc;
        }
    }
    

    4.lucene分词器

    {%codeblock lang:java lucene分词器 %}

    
    
    package ac.ucas.lucene;
    
    import java.io.IOException;
    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.analysis.core.SimpleAnalyzer;
    import org.apache.lucene.analysis.core.StopAnalyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;
    
    import sun.dc.pr.PRError;
    
    public class AnalyerStudy {
    
        private static String str = "lucene, 全文检索框架";
        public static void print(Analyzer analyzer){
            StringReader stringReader=new StringReader(str);
            try {
                TokenStream tokenStream=analyzer.tokenStream(str, stringReader);
                tokenStream.reset();
                CharTermAttribute term=tokenStream.getAttribute(CharTermAttribute.class);
                System.out.println("分词技术:"+analyzer.getClass());
                while(tokenStream.incrementToken()){
                    System.out.print(term.toString()+" | ");
                }
                System.out.println("
    ");
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
        public static void main(String[] args) {
            Analyzer analyzer=null;
            //标准分词
            analyzer=new StandardAnalyzer(Version.LUCENE_43);
            print(analyzer);
    
    
            //空格分词
            analyzer =new WhitespaceAnalyzer(Version.LUCENE_43);
            print(analyzer);
    
            //简单分词
            analyzer=new SimpleAnalyzer(Version.LUCENE_43);
            print(analyzer);
    
            //二分法
            analyzer=new CJKAnalyzer(Version.LUCENE_43);
            print(analyzer);
    
            //关键字
            analyzer=new KeywordAnalyzer();
            print(analyzer);
    
            //
            analyzer=new StopAnalyzer(Version.LUCENE_43);
            print(analyzer);
        }
    }
    
    

    {% endcodeblock %}

    分词结果:
    分词技术:class org.apache.lucene.analysis.standard.StandardAnalyzer
    lucene | 全 | 文 | 检 | 索 | 框 | 架 |
    分词技术:class org.apache.lucene.analysis.core.WhitespaceAnalyzer

    lucene, | 全文检索框架 |
    分词技术:class org.apache.lucene.analysis.core.SimpleAnalyzer
    lucene | 全文检索框架 |
    分词技术:class org.apache.lucene.analysis.cjk.CJKAnalyzer
    lucene | 全文 | 文检 | 检索 | 索框 | 框架 |
    分词技术:class org.apache.lucene.analysis.core.KeywordAnalyzer
    lucene, 全文检索框架 |
    分词技术:class org.apache.lucene.analysis.core.StopAnalyzer
    lucene | 全文检索框架 |

    5. 使用luke打开索引

    Luke是一个用于Lucene搜索引擎的,方便开发和诊断的第三方工具,它可以访问现有Lucene的索引.
    luke下载地址:https://github.com/DmitryKey/luke/releases

  • 相关阅读:
    新思路:Exception Handle
    转战github了
    矩阵内积和Schur补
    原子范数及线谱估计
    次梯度方法
    机器学习——推荐系统
    机器学习——异常值检测
    机器学习——聚类分析和主成分分析
    常用不等式集锦
    机器学习——支持向量机(SVM)
  • 原文地址:https://www.cnblogs.com/hainange/p/6153794.html
Copyright © 2011-2022 走看看