zoukankan      html  css  js  c++  java
  • Lucene使用入门

    参考:
    https://blog.csdn.net/u014209975/article/details/50525624
    https://www.cnblogs.com/hanyinglong/p/5395600.html
    http://lucene.apache.org/core/4_0_0/core/overview-summary.html
    https://www.jianshu.com/p/0a2bbe0f4c42

    依赖:

    lucene-analyzers.jar
    lucene-benchmark.jar
    lucene-core.jar
    lucene-highlighter.jar
    lucene-memory.jar
    lucene-parser.jar
    lucene-remote.jar
    lucene-smartcn.jar
    

    实体类:

    package com.h3c.lucence;
    
    import java.io.Serializable;
    
    /**
     * Searchable entity: an id, a type, a "virtual document" holding the text used for
     * full-text search, plus the highlighted summary and relevance score of a search hit.
     */
    public class Entity implements Serializable {

        private static final long serialVersionUID = 3701082756628915138L;

        /** Markup that wraps a highlighted term inside {@link #summary}; stripped when locating the fragment. */
        private static final String HIGHLIGHT_OPEN_TAG = "<SPAN style=\"color:red;\">";

        private static final String HIGHLIGHT_CLOSE_TAG = "</SPAN>";

        /** Marker added where the summary omits text of the virtual document. */
        private static final String ELLIPSIS = "...";

        private Integer id;

        private String type;

        private String virtualDoc;

        private String summary;

        private float score;

        public Integer getId() {
            return id;
        }

        public void setId(Integer id) {
            this.id = id;
        }

        public String getType() {
            return type;
        }

        public void setType(String type) {
            this.type = type;
        }

        /**
         * Returns the virtual document, i.e. the text that is indexed for full-text search.
         *
         * @return the virtual document, or {@code null} if none has been set
         */
        public String getVirtualDoc() {
            if (null == virtualDoc) {
                // TODO build the virtual document from the entity's values: all attributes and
                // their values, for full-text search.
                // Format: field1:value1,field2:value2,...
            }
            return virtualDoc;
        }

        public void setVirtualDoc(String virtualDoc) {
            this.virtualDoc = virtualDoc;
        }

        /**
         * Returns the highlighted summary, decorated with {@code "..."} on each side where the
         * virtual document continues beyond the summary fragment.
         *
         * <p>The fragment is located by stripping the highlight tags from the summary and searching
         * for the remaining text in the virtual document. When there is no virtual document, or the
         * fragment cannot be found in it, the raw summary is returned without any ellipsis.
         *
         * @return the decorated summary, or {@code null} if no summary has been set
         */
        public String getSummary() {
            if (summary == null) {
                return null;
            }
            String document = getVirtualDoc();
            if (document == null) {
                return summary;
            }

            String plainFragment = summary.replace(HIGHLIGHT_OPEN_TAG, "").replace(HIGHLIGHT_CLOSE_TAG, "");
            int start = document.indexOf(plainFragment);
            if (start < 0) {
                // Position unknown, so we cannot tell on which side text was cut off.
                return summary;
            }

            StringBuilder sb = new StringBuilder();
            if (start > 0) {
                sb.append(ELLIPSIS);
            }
            sb.append(summary);
            if (start + plainFragment.length() < document.length()) {
                sb.append(ELLIPSIS);
            }
            return sb.toString();
        }

        public void setSummary(String summary) {
            this.summary = summary;
        }

        public float getScore() {
            return score;
        }

        public void setScore(float score) {
            this.score = score;
        }
    }
    

    Demo类:

    package com.h3c.lucence;
    
    import java.io.Closeable;
    import java.io.File;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Locale;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.PrefixQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.WildcardQuery;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
    import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
    import org.apache.lucene.search.highlight.TokenSources;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class Demo {
        /** lucene索引目录 */
        private static Directory ciIndexDir;
    
        private static final String CI_CONTENT_FLAG = "virtualDoc";
    
        /** 分词分析工具,使用标准分析工具,单个含字和连续的英文单词作为索引。 */
        private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    
        private static Pattern VALID_IPV4_PATTERN = null;
        private static Pattern VALID_IPV6_PATTERN = null;
        private static final String ipv4Pattern = "(([01]?\d\d?|2[0-4]\d|25[0-5])\.){3}([01]?\d\d?|2[0-4]\d|25[0-5])";
        private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";
    
        private static IndexWriter indexWriter;
    
        static {
        	VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
            VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);
            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            try {
                indexWriter = new IndexWriter(getCiIndexDir(), conf);
            } catch (IOException e) {
            	e.printStackTrace();
            }
        }
    
        private static Directory getCiIndexDir() {
            if (null == ciIndexDir) {
                try {
                    ciIndexDir = FSDirectory.open(new File("D://indexs"));
                } catch (IOException e) {
                	e.printStackTrace();
                }
            }
            return ciIndexDir;
        }
    
        private static boolean isIpAddress(String ipAddress) {
            Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
            Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
            return m1.matches() || m2.matches();
        }
    
        private static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
                return true;
            }
            return false;
        }
    
        private static BooleanQuery parseChineseCharacters(String inputString){
        	BooleanQuery query = new BooleanQuery();
        	if(isIpAddress(inputString)){
        		query.add(new TermQuery(new Term(CI_CONTENT_FLAG,inputString)), BooleanClause.Occur.MUST);
        		return query;
        	}
            BooleanQuery fieldQuery = new BooleanQuery();
            boolean isWord = false;
            StringBuilder tempWord = new StringBuilder();
            inputString = inputString.toLowerCase();
        	BooleanQuery booleanQuery = new BooleanQuery();
        	int length = inputString.length();
        	Query termQuery = null;
        	for(int i=0; i<length; i++){
        		char c = inputString.charAt(i);
        		if(c >= 'a' && c <= 'z' || c >= '0' && c <= '9'){//English character
        			isWord = true;
        			tempWord.append(c);
        		}
        		else{//Delimiter or Chinese character
        			isWord = false;
        			if(tempWord.length() > 0){
        				termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()));
    //    				booleanQuery.add(termQuery,BooleanClause.Occur.MUST);
        				booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
        				tempWord = new StringBuilder();
        			}
        		}
        		if(!isWord){
        			termQuery = new TermQuery(new Term(CI_CONTENT_FLAG,String.valueOf(c)));
        			if(isChinese(c)){//Chinese character
    //        			booleanQuery.add(termQuery,BooleanClause.Occur.MUST);
            			booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
        			}
        			else{//Delimiter
            			booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
        			}
    
        		}
        	}
        	if(tempWord.length() > 0){
        		termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()+"*"));
    			booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
    
    			termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + tempWord.toString()));
    			booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
    		}
    
        	// Begin 处理全局字段匹配
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,inputString+"*"));
    		booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
    
    		termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString));
    		booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
    
    		termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString + "*"));
    		booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
    		// End 处理全局字段匹配
    
        	BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
        	fieldQuery.add(clause);
    
            BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
            query.add(fieldClause);
    
            return query;
        }
    
        /**
         * 全文检索
         * @param queryStr
         * @throws Exception
         */
        private static void contentSearch(String queryStr, boolean highlight) throws Exception {
            IndexReader indexReader = null;
            IndexSearcher indexSearcher = null;
            try {
                indexReader = IndexReader.open(getCiIndexDir());
                indexSearcher = new IndexSearcher(indexReader);
    
                //组合查询条件,需要根据业务自己定义
                Query query = parseChineseCharacters(queryStr);
    
    			TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
                if(hits.totalHits > 0) {
    	            if (highlight) {
    	                QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
    	                SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style="color:red;">", "</SPAN>");
    	                Highlighter highlighter = new Highlighter(formatter, scorer);
    	                highlighter
    	                    .setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
    
    	                for (ScoreDoc scoreDoc : hits.scoreDocs) {
    	                    Document doc = indexSearcher.doc(scoreDoc.doc);
    	                    System.out.println(doc.get("virtualDoc"));
    	                    Entity entity = null;
    	                    entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);
    	                    entity.setScore(scoreDoc.score);
    	                }
    	            } else {
    	                for (ScoreDoc scoreDoc : hits.scoreDocs) {
    	                    Document doc = indexSearcher.doc(scoreDoc.doc);
    	                    System.out.println(doc.get("virtualDoc"));
    	                    Entity entity = null;
    	                    entity = convertToEntity(doc);
    	                    entity.setScore(scoreDoc.score);
    	                }
    	            }
                }
            } catch (IOException ioe) {
            	ioe.printStackTrace();
            } finally {
            	close(indexSearcher);
            	close(indexReader);
            }
        }
    
        /**
         * 对实现Closeable接口的统一关闭
         * @param object
         */
        private static void close(Closeable object) {
        	if(null != object) {
        		try {
    				object.close();
    			} catch (IOException e) {
    			}
        	}
        }
    
        /**
         * 实体转换为Doc
         * @param entity
         * @return
         */
        public static Document convertToDocument(Entity entity) {
            Document doc = new Document();
            String virtualDoc = entity.getVirtualDoc();
            //Field.Store.Yes存储,Field.Index.ANALYZED分词
            doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
            return doc;
        }
    
        /**
         * Doc转换为实体
         * @param doc
         * @return
         */
        public static Entity convertToEntity(Document doc) {
        	Entity ci = new Entity();
        	ci.setId(Integer.valueOf(doc.get("id")));
        	ci.setType(doc.get("type"));
            ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
            return ci;
        }
    
        /**
         * 检索Entity,含高亮信息
         * @param doc
         * @param indexReader
         * @param docId
         * @param highlighter
         * @return
         * @throws IOException
         * @throws InvalidTokenOffsetsException
         */
        public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
            throws IOException, InvalidTokenOffsetsException {
    
        	Entity entity = convertToEntity(doc);
            String virtualDoc = entity.getVirtualDoc();
            TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
            String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
            if(highlighterSummary == null){
            	highlighterSummary = virtualDoc;
            }
            entity.setSummary(highlighterSummary);
    
            return entity;
        }
    
        /**
         * 给entity信息增加索引
         * @param entity
         */
        public static void addIndex(Entity entity) {
            try {
            	deleteIndex(entity);
                Document doc = convertToDocument(entity);
                indexWriter.addDocument(doc);
                indexWriter.commit();
            } catch (Exception e) {
               e.printStackTrace();
            }
        }
    
        /**
         * 批量增加索引
         * @param list
         */
        public static void addIndexs(List<Entity> list) {
            try {
            	List<Document> docs = new ArrayList<Document>();
            	deleteIndexs(list);
                for (Entity entity : list) {
                    Document doc = convertToDocument(entity);
                    docs.add(doc);
                }
                indexWriter.addDocuments(docs);
                indexWriter.commit();
            } catch (Exception e) {
            	e.printStackTrace();
            }
        }
    
        /**
         * 给实体信息更新索引
         * @param entity
         */
        public static void updateIndex(Entity entity) {
            try {
                addIndex(entity);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        /**
         * 删除entity列表信息对应的索引
         * @param entity
         */
        public static void deleteIndexs(List<Entity> list) {
            try {
            	int size = list.size();
            	Term[] terms = new Term[size];
            	for(int i=0; i<size; i++) {
            		terms[i] = new Term("id", list.get(i).getId().toString());
            	}
                indexWriter.deleteDocuments(terms);
                indexWriter.commit();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        /**
         * 删除实体信息对应的索引
         * @param entity
         */
        public static void deleteIndex(Entity entity) {
            try {
                indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
                indexWriter.commit();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        /**
         * 删除实体类型对应的所以索引信息
         * @param type
         */
        public static void deleteIndexByType(String type) {
            try {
                indexWriter.deleteDocuments(new Term("type", type));
                indexWriter.commit();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        @Override
        protected void finalize() throws Throwable {
            indexWriter.close();
        }
    
        public static void main(String[] args) throws Exception {
    		String queryStr = "http://mail6c1.shenzhenair.com";
    		contentSearch(queryStr, true);
    	}
    }
    
  • 相关阅读:
    解决linux下fflush(stdin)无效
    《转载》使用Chrome浏览器截取整个网页
    JDK切换版本
    消息队列函数(msgget、msgctl、msgsnd、msgrcv)及其范例
    Oracle 账户
    Oracle linux 安装 相关
    Android高德地图获取当前缩放等级及可视区域四个角的坐标
    Intellij idea 导入项目之后编译错误:无效的源版本:7
    数据库异常整理:org.hibernate.QueryException: could not resolve property: “mStation”
    MySQL(六)多表查询
  • 原文地址:https://www.cnblogs.com/kibana/p/9773865.html
Copyright © 2011-2022 走看看