zoukankan      html  css  js  c++  java
  • Lucene笔记一

    Lucene就是一个全文检索的工具,建立索引用的,类似于新华字典的目录

    这里使用的是lucene-4.4.0版本,入门代码所需jar包如下图所示(解压lucene-4.4.0后的目录):

    入门代码:

    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.IntField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexableField;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    import org.junit.Test;
    
    /*8
     * luceneDemo
     * 
     */
    public class TestLucene {
        /**
         * 通过lucene 提供的api 对数据建立索引,indexWriter
         * @throws IOException 
         * 
         */
        @Test
        public void testAdd() throws IOException{
            
            //索引在硬盘上面存放的位置..
            Directory directory=FSDirectory.open(new File("D:/INDEX"));
            //lucene 当前使用的版本...
            Version matchVersion=Version.LUCENE_44;
            //分词器...(把一段文本分词)(黑马程序员是高端的培训机构)
            //analzyer 是一个抽象类,具体的切分词规则由子类实现...
            Analyzer analyzer=new StandardAnalyzer(matchVersion);
            
            IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);
            
            //构造索引写入的对象..
            IndexWriter indexWriter=new IndexWriter(directory, config);
            
            //往索引库里面写数据..
            //索引库里面的数据都是document 一个document相当于是一条记录
            //这个document里面的数据相当于索引结构..
            Document document=new Document();
            IndexableField indexableField=new IntField("id",1, Store.YES);
            IndexableField stringfield=new StringField("title","对王召廷的个人评价",Store.YES);
            IndexableField teIndexableField=new TextField("content","风流倜傥有点黄",Store.YES);
            document.add(indexableField);
            document.add(stringfield);
            document.add(teIndexableField);
            //索引库里面接收的数据都是document对象
            indexWriter.addDocument(document);
            indexWriter.close();
        }
        
        /**
         * 对建立的索引进行搜索...
         * 通过indexSearcher 去搜索...
         * @throws IOException 
         */
        @Test
        public void testSearcher() throws IOException{
            
            //索引在硬盘上面存放的位置..
            Directory directory=FSDirectory.open(new File("D:/INDEX"));
            //把索引目录里面的索引读取到IndexReader 当中...
            IndexReader indexReader=DirectoryReader.open(directory);
    //        /构造搜索索引的对象..
            IndexSearcher indexSearcher=new IndexSearcher(indexReader);
            
            //Query 它是一个查询条件对象,它是一个抽象类,不同的查询规则就构造不同的子类...
            Query query=new TermQuery(new Term("title", "对王召廷的个人评价"));
            
            //检索符合query 条件的前面N 条记录..
            //
            TopDocs topDocs=indexSearcher.search(query, 10);
            //返回总记录数...
            System.out.println(topDocs.totalHits);
            
            //存放的都是document 的id
            ScoreDoc scoreDocs []=topDocs.scoreDocs;
            
            for(ScoreDoc scoreDoc:scoreDocs){
                //返回的就是document id
                int docID=scoreDoc.doc;
                //我还需要根据id 检索到对应的document
                Document document=indexSearcher.doc(docID);
                
                System.out.println("id=="+document.get("id"));
                System.out.println("title=="+document.get("title"));
                System.out.println("content=="+document.get("content"));
                
            }
     
        }
        
    }

    原理分析图:

    demo演示: 

    根据入门代码流程提炼工具类代码:

    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * lucene 工具类...
     * @author Administrator
     *
     */
    /**
     * 提炼规则,假设这段代码可以完成一个功能,把这个代码提炼到一个方法里面去,假设这个方法在某个业务罗继承可以共用,那么往上抽取,
     * 假设在其它逻辑层也可以用,提炼到工具类里面去。
     * 
     */
    public class LuceneUtils {
        private static IndexWriter indexWriter=null;
        private static IndexSearcher indexSearcher=null;
        
        
        //索引存放目录..
        private static Directory directory=null;
        
        private static IndexWriterConfig indexWriterConfig=null;
        
        private static Version version=null;
        
        
        private static Analyzer analyzer=null;
        
        static {
            try {
                directory=FSDirectory.open(new File(Constants.URL));
                version=Version.LUCENE_44;
                analyzer=new StandardAnalyzer(version);
                indexWriterConfig=new IndexWriterConfig(version, analyzer);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        /**
         * 
         * @return 返回用于操作索引的对象...
         * @throws IOException
         */
        public static IndexWriter getIndexWriter() throws IOException{
            indexWriter=new IndexWriter(directory, indexWriterConfig);
            return indexWriter;
        }
        /**
         * 返回用于搜索索引的对象...
         * @return
         * @throws IOException 
         */
        public static IndexSearcher  getIndexSearcher() throws IOException{
            
            IndexReader indexReader=DirectoryReader.open(directory);
            indexSearcher=new IndexSearcher(indexReader);
            
            return indexSearcher;
        }
        /**
         * 
         * 返回lucene 当前的版本...
         * @return
         */
        public static Version getVersion() {
            return version;
        }
        /**
         * 
         * 返回lucene 当前使用的分词器..
         * @return
         */
        public static Analyzer getAnalyzer() {
            return analyzer;
        }
        
    }
    public class Constants {
        /**
         * 索引存放的目录
         */
        public static final String URL="d:/indexdir/news";
    }

    bean:

    package cn.itcast.bean;
    
    public class Article {
        private int id;
        
        public int getId() {
            return id;
        }
    
        public void setId(int id) {
            this.id = id;
        }
    
        public String getTitle() {
            return title;
        }
    
        public void setTitle(String title) {
            this.title = title;
        }
    
        public String getContent() {
            return content;
        }
    
        public void setContent(String content) {
            this.content = content;
        }
    
        public String getAuthor() {
            return author;
        }
    
        public void setAuthor(String author) {
            this.author = author;
        }
    
        public String getUrl() {
            return url;
        }
    
        public void setUrl(String url) {
            this.url = url;
        }
    
        private String title;
        
        private String content;
        
        private String author;
        
        private String url;
        
    
    }

    转换工具类:

    package cn.itcast.lucene;
    
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.IntField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.IndexableField;
    
    import cn.itcast.bean.Article;
    
    /*8
     * 对象与索引库document 之间的转化
     * 
     */
    public class ArticleToDocument {
        
        
        public static Document articleToDocument(Article article){
            Document document=new Document();
            IntField idfield=new IntField("id", article.getId(), Store.YES);
            //StringField 对应的值不分词,textField 分词..
            TextField titleField=new TextField("title", article.getTitle(),Store.YES);
            TextField contentField=new TextField("content", article.getContent(),Store.YES);
            //修改这个字段对应的权重值,默认这个值为1f
    //        contentField.setBoost(3f);
            StringField authorField=new StringField("author", article.getAuthor(), Store.YES);
            StringField urlField=new StringField("url", article.getUrl(), Store.YES);
            document.add(idfield);
            document.add(titleField);
            document.add(contentField);
            document.add(authorField);
            document.add(urlField);
            return document;
        }
    
    }

    Dao层:

    package cn.itcast.dao;
    
    import java.io.IOException;
    
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    
    import cn.itcast.bean.Article;
    import cn.itcast.lucene.ArticleToDocument;
    import cn.itcast.uitls.LuceneUtils;
    
    /**
     * 使用lucene 的API 来操作索引库..
     * @author Administrator
     *
     */
    public class LuceneDao {
        
        public void addIndex(Article article) throws IOException{
            IndexWriter indexWriter=LuceneUtils.getIndexWriter();
            Document doc=ArticleToDocument.articleToDocument(article);
            indexWriter.addDocument(doc);
            indexWriter.close();
        }
        
        /**
         * 删除符合条件的记录...
         * @param fieldName
         * @param fieldValue
         * @throws IOException
         */
        public void delIndex(String fieldName,String fieldValue) throws IOException{
            IndexWriter indexWriter=LuceneUtils.getIndexWriter();
            
            //一定要梦想,万一实现了勒
            Term term=new Term(fieldName, fieldValue);
            
            indexWriter.deleteDocuments(term);
            
            indexWriter.close();
        }
        /**
         * 
         * 更新
         * 
         * update table set ?  where condtion
         * @throws IOException 
         * 
         * 
         */
        public void updateIndex(String fieldName,String fieldValue,Article article) throws IOException{
            IndexWriter indexWriter=LuceneUtils.getIndexWriter();
            /**
             * 1:term 设置更新的条件...
             * 
             * 2:设置更新的内容的对象..
             * 
             */
            Term term=new Term(fieldName,fieldValue);
            Document doc=ArticleToDocument.articleToDocument(article);
            /**
             * 
             * 在lucene 里面是先删除符合这个条件term 的记录,在创建一个doc 记录...
             * 
             */
            indexWriter.updateDocument(term, doc);
            indexWriter.close();
        }
        /**
         * 0,10
         * 10,10
         * 20,10
         * @param keywords
         * @throws Exception
         */
        public void findIndex(String keywords,int firstResult,int maxResult) throws Exception{
            
            IndexSearcher indexSearcher=LuceneUtils.getIndexSearcher();
            //第一个条件.. 单字段查询...
    //        Query query=new TermQuery(new Term("title","梦想"))
            
            //select *  from  table where fieldname="" or content=""
            
            String fields []={"title","content"};
            
            //第二种条件:使用查询解析器,多字段。。。 我们需要重新导入一个jar queryParser 的jar... 位置在lucene解压后的queryparser文件夹下
            QueryParser queryParser=new MultiFieldQueryParser(LuceneUtils.getVersion(),fields,LuceneUtils.getAnalyzer());
            
    //        /这个事一个条件..
            Query query=queryParser.parse(keywords);
            
            
            //query 它是一个查询条件,query 是一个抽象类,不同的查询规则构造部同的子类即可
            //检索符合query 条件的前面N 条记录...
            //检索的是索引目录... (总记录数,socreDOC (docID))
            //使用lucene 提供的api 进行操作...
            TopDocs topDocs=indexSearcher.search(query,firstResult+maxResult);
    //        /存放的是docID
            ScoreDoc scoreDocs []=topDocs.scoreDocs;
            //判断:scoreDocs 的length  (实际取出来的数量..) 与 firstResult+maxResult 的值取小值...
            
            //在java jdk 里面提供了一个api
            int endResult=Math.min(scoreDocs.length, firstResult+maxResult);
            
            
            for(int i=firstResult;i<endResult;i++){
    //            /取出来的是docID,这个id 是lucene 自己来维护。
                int docID=scoreDocs[i].doc;
                Document document=indexSearcher.doc(docID);
                System.out.println("id==="+document.get("id"));
                System.out.println("title==="+document.get("title"));
                System.out.println("content==="+document.get("content"));
                System.out.println("url==="+document.get("url"));
                System.out.println("author==="+document.get("author"));
            }
            
        }
    }

    测试类:

    package cn.itcast.junit;
    
    import java.io.IOException;
    
    import org.junit.Test;
    
    import cn.itcast.bean.Article;
    import cn.itcast.dao.LuceneDao;
    
    /**
     * 测试luceneDao
     * @author Administrator
     *
     */
    public class LuceneDaoTest {
        
        private LuceneDao luceneDao=new LuceneDao();
        
        @Test
        public void testCreate() throws IOException{
            for(int i=28;i<=28;i++){
                Article article=new Article();
                article.setId(i);
                article.setTitle("一定要梦想,万一实现了勒");
                article.setContent("矫情我觉得这句话太矫情了矫情矫情矫情矫情矫情矫情");
                article.setUrl("http://www.tianmao.com");
                article.setAuthor("马云");
                luceneDao.addIndex(article);
            }
            
            
        }
        @Test
        public void testsearcher() throws Exception{
    //        article.setTitle("一定要梦想,万一实现了勒");   textfield   分词     标准分词器      
    //        article.setContent("我觉得这句话太矫情了");   textfield   分词    标准分词器
            luceneDao.findIndex("梦想",20,10);
            
        }
        @Test
        public void testdelete() throws IOException{
            String fieldName="title";
            String fieldValue="定";
            luceneDao.delIndex(fieldName, fieldValue);
        }
    
        @Test
        public void testUpdate() throws IOException{
            String fieldName="title";
            String fieldValue="定";
            
            Article article=new Article();
            article.setId(9527);
            article.setTitle("一定要梦想,万一实现了勒");
            article.setContent("我觉得这句话太矫情了");
            article.setUrl("http://www.tianmao.com");
            article.setAuthor("马云");
            
            luceneDao.updateIndex(fieldName, fieldValue, article);
            
        }
        
        
    }

     分词器的流程图:

     关于分词器,网上可以找到很多种类的分词器配合Lucene使用,相关分词规则查看对应说明。

    举例如下:

    Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);//中文单字切分、英文按空格切分成单词

    Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);//二分法分词,中文相连的两个词作为一个索引

    Analyzer analyzer=new IKAnalyzer();//第三方的分词器,对中文支持较好,可以自定义分词单词与停用词

    索引库优化

    package cn.itcast.lucene;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.LogDocMergePolicy;
    import org.apache.lucene.index.MergePolicy;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.IOContext;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;
    import org.junit.Test;
    
    import cn.itcast.uitls.Constants;
    
    public class TestOptimise {
        /*8
         * 优化的第一种方式:通过 IndexWriterConfig 优化设置mergePolicy(合并策略)
         * 
         * 
         */
        public void testoptimise() throws IOException{
            Directory directory=FSDirectory.open(new File(Constants.URL));
            
            Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_44, analyzer);
            
            LogDocMergePolicy mergePolicy=new LogDocMergePolicy();
            
            /**
             * 当这个值越小,更少的内存会被运用当创建索引的时候,搜索的时候越快,创建的时候越慢。
             * 当这个值越大,更多的内存会被运用当创建索引的时候,搜索的时候越慢,创建的时候越快..
             * larger values >10
             * 
             * 2<=smaller<=10
             * 
             */
            //设置合并因子..
            mergePolicy.setMergeFactor(10);
    //        /设置索引的合并策略..
            config.setMergePolicy(mergePolicy);
            IndexWriter indexWriter=new IndexWriter(directory, config);
        }
        
        /**
         * 通过directory 去优化....
         * @throws IOException 
         * 
         */
        @Test
        public void testoptimise2() throws IOException{
            //现在的索引放在硬盘上面...
            Directory directory=FSDirectory.open(new File(Constants.URL));
    //        /通过这个对象吧directory 里面的数据读取到directory1 里面来..
            IOContext ioContext=new IOContext();
            //相办法吧directory 的索引读取到内存当中来...
            Directory directory1=new RAMDirectory(directory,ioContext);
            IndexReader indexReader=DirectoryReader.open(directory1);
            IndexSearcher indexSearcher=new IndexSearcher(indexReader);
            Query query=new TermQuery(new Term("title", "想"));
            TopDocs topDocs=indexSearcher.search(query, 100);
            System.out.println(topDocs.totalHits);
        }
        
        /**
         * 索引文件越大,会影响检索的速度..  (减少索引文件的大小)
         * 
         * 1:排除停用词..
         * 
         */
        public void testoptimise3(){
            
            
        }
        /**
         * 将索引分目盘存放  将数据归类...
         * 
         */
        public void testoptimise4(){
            
            
        }
    }
  • 相关阅读:
    继承性03
    继承性
    Arrays与Math类
    Static关键字
    random模块,time模块,os模块,sys模块
    re模块
    冒泡排序、递归、二分查找
    内置函数
    生成器和生成器表达式
    迭代器
  • 原文地址:https://www.cnblogs.com/lm970585581/p/9410322.html
Copyright © 2011-2022 走看看