zoukankan      html  css  js  c++  java
  • Lucene全文检索项目

    POM.xml

     1 <!--Lucene全文检索-->
     2         <dependency>
     3             <groupId>org.apache.lucene</groupId>
     4             <artifactId>lucene-core</artifactId>
     5             <version>${lucene.version}</version>
     6         </dependency>
     7         <dependency>
     8             <groupId>org.apache.lucene</groupId>
     9             <artifactId>lucene-queryparser</artifactId>
    10             <version>${lucene.version}</version>
    11         </dependency>
    12         <dependency>
    13             <groupId>org.apache.lucene</groupId>
    14             <artifactId>lucene-analyzers-common</artifactId>
    15             <version>${lucene.version}</version>
    16         </dependency>
    17         <dependency>
    18             <groupId>org.apache.lucene</groupId>
    19             <artifactId>lucene-highlighter</artifactId>
    20             <version>${lucene.version}</version>
    21         </dependency>
    22         <!--中文分词器-->
    23         <dependency>
    24             <groupId>org.apache.lucene</groupId>
    25             <artifactId>lucene-analyzers-smartcn</artifactId>
    26             <version>${lucene.version}</version>
    27         </dependency>

    LuceneUtil.java

      1 package io.guangsoft.erp.util;
      2 
      3 import org.apache.lucene.analysis.Analyzer;
      4 import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
      5 import org.apache.lucene.document.Document;
      6 import org.apache.lucene.index.*;
      7 import org.apache.lucene.search.IndexSearcher;
      8 import org.apache.lucene.search.Query;
      9 import org.apache.lucene.search.ScoreDoc;
     10 import org.apache.lucene.search.TopDocs;
     11 import org.apache.lucene.search.highlight.*;
     12 import org.apache.lucene.store.Directory;
     13 import org.apache.lucene.store.FSDirectory;
     14 
     15 import java.nio.file.Paths;
     16 import java.util.List;
     17 
     18 public class LuceneUtil {
     19     //索引目录位置
     20     private static final String INDEX_DIR = "/index";
     21     //索引文件存放目录对象
     22     private static Directory directory;
     23     //分词器对象
     24     private static Analyzer analyzer;
     25     //索引写对象,线程安全
     26     private static IndexWriter indexWriter;
     27     //索引读对象,线程安全
     28     private static IndexReader indexReader;
     29     //索引搜索对象,线程安全
     30     private static IndexSearcher indexSearcher;
     31 
     32     static {
     33         try {
     34             directory = FSDirectory.open(Paths.get(INDEX_DIR));
     35             //系统关闭前关闭索引库的流
     36             Runtime.getRuntime().addShutdownHook(new Thread() {
     37                 @Override
     38                 public void run() {
     39                     try {
     40                         if(indexWriter != null) {
     41                             indexWriter.close();
     42                         }
     43                         if(indexReader != null) {
     44                             indexReader.close();
     45                         }
     46                         if(directory != null) {
     47                             directory.close();
     48                         }
     49                     } catch (Exception e) {
     50                         e.printStackTrace();
     51                     }
     52                 }
     53             });
     54         } catch (Exception e) {
     55             e.printStackTrace();
     56         }
     57     }
     58 
     59     //获取分词器
     60     public static Analyzer getAnalyzer() {
     61         if(analyzer == null) {
     62             analyzer = new SmartChineseAnalyzer();
     63         }
     64         return analyzer;
     65     }
     66 
     67     //获取索引Writer
     68     public static IndexWriter getIndexWriter() {
     69         if(indexWriter == null || !indexWriter.isOpen()) {
     70             try {
     71                 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(getAnalyzer());
     72                 indexWriter = new IndexWriter(directory, indexWriterConfig);
     73             } catch (Exception e) {
     74                 e.printStackTrace();
     75             }
     76         }
     77         return indexWriter;
     78     }
     79 
     80     //获取索引Reader
     81     public static IndexReader getIndexReader() {
     82         try {
     83             if(indexReader == null) {
     84                 indexReader = DirectoryReader.open(directory);
     85             } else {
     86                 //对比索引库是否更新,更新则使用更新后的Reader
     87                 IndexReader newIndexReader = DirectoryReader.openIfChanged((DirectoryReader) indexReader);
     88                 if(newIndexReader != null) {
     89                     indexReader.close();
     90                     indexReader = newIndexReader;
     91                 }
     92             }
     93         } catch (Exception e) {
     94             e.printStackTrace();
     95         }
     96         return indexReader;
     97     }
     98 
     99     //获取索引Searcher
    100     public static IndexSearcher getIndexSearcher() {
    101         indexSearcher = new IndexSearcher(getIndexReader());
    102         return indexSearcher;
    103     }
    104 
    105     //打印索引文档(表)
    106     public static void printDocument(Document document) {
    107         System.out.println(document);
    108         List<IndexableField> fieldList = document.getFields();
    109         for(IndexableField field : fieldList) {
    110             System.out.println(field.name() + " : " + field.stringValue());
    111         }
    112     }
    113 
    114     //打印命中文档
    115     public static void printScoreDoc(ScoreDoc scoreDoc) {
    116         int docId = scoreDoc.doc;
    117         System.out.println("文档编号:" + docId);
    118         System.out.println("文档得分:" + scoreDoc.score);
    119         try {
    120             Document document = indexSearcher.doc(docId);
    121             printDocument(document);
    122         } catch (Exception e) {
    123             e.printStackTrace();
    124         }
    125     }
    126 
    127     //打印带得分的命中文档
    128     public static void printTopDocs(TopDocs topDocs) {
    129         int totalHits = topDocs.totalHits;
    130         System.out.println("命中文档总条数:" + totalHits);
    131         System.out.println("命中文档最大分数:" + topDocs.getMaxScore());
    132         ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    133         for(ScoreDoc scoreDoc : scoreDocs) {
    134             printScoreDoc(scoreDoc);
    135         }
    136     }
    137 
    138     //高亮打印命中文档
    139     public static void printTopDocsHighlight(TopDocs topDocs, Query query) {
    140         // 格式化器:参数1:前置标签,参数2:后置标签
    141         Formatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
    142         //打分对象,参数:query里面的条件,条件里面有搜索关键词
    143         Scorer scorer = new QueryScorer(query);
    144         //高亮工具:参数1.需要高亮什么颜色, 参数2.将哪些关键词进行高亮
    145         Highlighter hightlighter = new Highlighter(formatter, scorer);
    146         try {
    147             for(ScoreDoc scoreDoc : topDocs.scoreDocs) {
    148                 Document document = getIndexSearcher().doc(scoreDoc.doc);
    149                 List<IndexableField> fieldList = document.getFields();
    150                 for(IndexableField field : fieldList) {
    151                     String highlightValue = hightlighter.getBestFragment(getAnalyzer(), field.name(), field.stringValue());
    152                     if(highlightValue == null) {
    153                         highlightValue = field.stringValue();
    154                     }
    155                     System.out.println(field.name() + " : " + highlightValue);
    156                 }
    157             }
    158         } catch (Exception e) {
    159             e.printStackTrace();
    160         }
    161     }
    162 
    163 }

    LuceneDAO.java

     1 package io.guangsoft.erp.dao;
     2 
     3 import org.apache.lucene.search.TopDocs;
     4 
     5 import java.util.Map;
     6 
     7 public interface LuceneDAO {
     8 
     9     public void insertDoc(Map<String, String> docMap) throws Exception;
    10 
    11     public void deleteDoc(String id) throws Exception;
    12 
    13     public void updateDoc(Map<String, String> docMap) throws Exception;
    14 
    15     public void insertOrUpdateDoc(Map<String, String> docMap) throws Exception;
    16 
    17     //严格匹配整个字段,可传多个字段
    18     public TopDocs searchDocsByTerm(Map<String, String> termMap) throws Exception;
    19 
    20     //匹配分词后的字段,可传多个字段
    21     public TopDocs searchDocsByParser(Map<String, String> parserMap) throws Exception;
    22 
    23 }

    LuceneDAOImpl.java

      1 package io.guangsoft.erp.dao.impl;
      2 
      3 import io.guangsoft.erp.dao.LuceneDAO;
      4 import io.guangsoft.erp.util.LuceneUtil;
      5 import org.apache.lucene.document.Document;
      6 import org.apache.lucene.document.Field;
      7 import org.apache.lucene.document.FieldType;
      8 import org.apache.lucene.index.IndexOptions;
      9 import org.apache.lucene.index.IndexWriter;
     10 import org.apache.lucene.index.Term;
     11 import org.apache.lucene.queryparser.classic.QueryParser;
     12 import org.apache.lucene.queryparser.classic.QueryParserBase;
     13 import org.apache.lucene.search.*;
     14 
     15 import java.util.Map;
     16 
     17 public class LuceneDAOImpl implements LuceneDAO {
     18 
     19     @Override
     20     public void insertDoc(Map<String, String> docMap) throws Exception {
     21         FieldType fieldType = new FieldType();
     22         //是否存储记录
     23         fieldType.setStored(true);
     24         //文档型索引,只索引文档,不支持打分和位置检索
     25         fieldType.setIndexOptions(IndexOptions.DOCS);
     26         //是否要忽略field的加权基准值,如果为true可以节省内存消耗
     27         //但在打分质量方面会有更高的消耗,也不能使用index-time进行加权操作。
     28         fieldType.setOmitNorms(true);
     29         //是否使用分析器将域值分解成独立的语汇单元流,是否分词
     30         fieldType.setTokenized(true);
     31         //lucene索引库的一条记录
     32         Document document = new Document();
     33         for(Map.Entry<String, String> entry : docMap.entrySet()) {
     34             Field field = new Field(entry.getKey(), entry.getValue(), fieldType);
     35             document.add(field);
     36         }
     37         //保存到索引库
     38         IndexWriter indexWriter = LuceneUtil.getIndexWriter();
     39         indexWriter.addDocument(document);
     40         indexWriter.close();
     41     }
     42 
     43     @Override
     44     public void deleteDoc(String id) throws Exception {
     45         IndexWriter indexWriter = LuceneUtil.getIndexWriter();
     46         Term term = new Term("id", id);
     47         indexWriter.deleteDocuments(term);
     48         indexWriter.forceMergeDeletes();
     49         indexWriter.commit();
     50         indexWriter.close();
     51     }
     52 
     53     @Override
     54     public void updateDoc(Map<String, String> docMap) throws Exception {
     55         FieldType fieldType = new FieldType();
     56         fieldType.setStored(true);
     57         fieldType.setIndexOptions(IndexOptions.DOCS);
     58         fieldType.setOmitNorms(true);
     59         fieldType.setTokenized(true);
     60         Document document = new Document();
     61         for(Map.Entry<String, String> entry : docMap.entrySet()) {
     62             Field field = new Field(entry.getKey(), entry.getValue(), fieldType);
     63             document.add(field);
     64         }
     65         Term term = new Term("id", docMap.get("id"));
     66         IndexWriter indexWriter = LuceneUtil.getIndexWriter();
     67         indexWriter.updateDocument(term, document);
     68         indexWriter.close();
     69     }
     70 
     71     @Override
     72     public void insertOrUpdateDoc(Map<String, String> docMap) throws Exception {
     73         Term term = new Term("id", docMap.get("id"));
     74         TermQuery termQuery = new TermQuery(term);
     75         TopDocs topDocs = LuceneUtil.getIndexSearcher().search(termQuery, 1);
     76         if(topDocs.totalHits == 0) {
     77             insertDoc(docMap);
     78         } else {
     79             updateDoc(docMap);
     80         }
     81     }
     82 
     83     @Override
     84     public TopDocs searchDocsByTerm(Map<String, String> termMap) throws Exception {
     85         BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
     86         for(Map.Entry<String, String> termEntry : termMap.entrySet()) {
     87             Term term = new Term(termEntry.getKey(), termEntry.getValue());
     88             TermQuery termQuery = new TermQuery(term);
     89             booleanQueryBuilder.add(termQuery, BooleanClause.Occur.MUST);
     90         }
     91         BooleanQuery booleanQuery = booleanQueryBuilder.build();
     92         //是否开启特定字段排序
     93         boolean orderFlag = false;
     94         TopDocs topDocs = null;
     95         if(orderFlag) {
     96             Sort sort = new Sort(new SortField[]{new SortField("createTime", SortField.Type.LONG, true)});
     97             topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999, sort);
     98         } else {
     99             topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999);
    100         }
    101         return topDocs;
    102     }
    103 
    104     @Override
    105     public TopDocs searchDocsByParser(Map<String, String> parserMap) throws Exception {
    106         BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
    107         for(Map.Entry<String, String> parserEntry : parserMap.entrySet()) {
    108             QueryParser queryParser = new QueryParser(parserEntry.getKey(), LuceneUtil.getAnalyzer());
    109             queryParser.setDefaultOperator(QueryParserBase.AND_OPERATOR);
    110             Query query = queryParser.parse(parserEntry.getValue());
    111             booleanQueryBuilder.add(query, BooleanClause.Occur.MUST);
    112         }
    113         BooleanQuery booleanQuery = booleanQueryBuilder.build();
    114         //是否开启特定字段排序
    115         boolean orderFlag = false;
    116         TopDocs topDocs = null;
    117         if(orderFlag) {
    118             Sort sort = new Sort(new SortField[]{new SortField("createTime", SortField.Type.LONG, true)});
    119             topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999, sort);
    120         } else {
    121             topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999);
    122         }
    123         return topDocs;
    124     }
    125 
    126 }

    LuceneTest.java

     1 package io.guangsoft.erp;
     2 
     3 import com.alibaba.fastjson.JSONArray;
     4 import com.alibaba.fastjson.JSONObject;
     5 import io.guangsoft.erp.dao.LuceneDAO;
     6 import io.guangsoft.erp.dao.impl.LuceneDAOImpl;
     7 import io.guangsoft.erp.util.LuceneUtil;
     8 import org.apache.lucene.index.Term;
     9 import org.apache.lucene.search.TermQuery;
    10 import org.apache.lucene.search.TopDocs;
    11 import org.junit.Test;
    12 
    13 import java.util.HashMap;
    14 import java.util.Map;
    15 import java.util.stream.Collectors;
    16 
    17 public class LuceneTest {
    18 
    19     LuceneDAO luceneDAO = new LuceneDAOImpl();
    20 
    21     @Test
    22     public void testInsertDoc() throws Exception {
    23         JSONArray jsonArray = JSONArray.parseArray(
    24                 "[{id:'1',name:'李白',desc:'朝辞白帝彩云间'}, " +
    25                         "{id:'2',name:'杜甫',desc:'润物细无声'}, " +
    26                         "{id:'3',name:'苏轼',desc:'大江东去浪淘尽'}]");
    27         for(int i = 0; i < jsonArray.size(); i++) {
    28             JSONObject jsonObject = jsonArray.getJSONObject(i);
    29             Map<String, String> docMap = jsonObject.entrySet().stream().collect(Collectors.toMap(
    30                     Map.Entry :: getKey, entry -> entry.getValue().toString()
    31             ));
    32             luceneDAO.insertDoc(docMap);
    33         }
    34     }
    35 
    36     @Test
    37     public void testSearchDocsByTerm() throws Exception {
    38         Map<String, String> docMap = new HashMap<String, String>();
    39         docMap.put("name", "李白");
    40         TopDocs topDocs = luceneDAO.searchDocsByTerm(docMap);
    41         LuceneUtil.printTopDocs(topDocs);
    42     }
    43 
    44     @Test
    45     public void testSearchDocsByParser() throws Exception {
    46         Map<String, String> docMap = new HashMap<String, String>();
    47         docMap.put("name", "李白");
    48         TopDocs topDocs = luceneDAO.searchDocsByParser(docMap);
    49         LuceneUtil.printTopDocsHighlight(topDocs, new TermQuery(new Term("name", "李白")));
    50     }
    51 
    52     @Test
    53     public void testUpdateDoc() throws Exception {
    54         Map<String, String> docMap = new HashMap<String, String>();
    55         docMap.put("name", "李白");
    56         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
    57         docMap.put("id", "1");
    58         docMap.put("desc", "人生得意须尽欢");
    59         luceneDAO.updateDoc(docMap);
    60         docMap.remove("id");
    61         docMap.remove("desc");
    62         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
    63     }
    64 
    65     @Test
    66     public void testDeleteDoc() throws Exception{
    67         Map<String, String> docMap = new HashMap<String, String>();
    68         docMap.put("id", "1");
    69         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
    70         luceneDAO.deleteDoc("1");
    71         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
    72     }
    73 }
  • 相关阅读:
    【CUDA开发】CUDA面内存拷贝用法总结
    【CUDA开发】CUDA编程接口(一)------一十八般武器
    【CUDA开发】CUDA编程接口(一)------一十八般武器
    【计算机视觉】OPENCV对于有alpha通道的透明背景图片的读取和图片叠加
    【计算机视觉】OPENCV对于有alpha通道的透明背景图片的读取和图片叠加
    【CUDA开发】论CUDA和LAV解码器是否真的实用
    【CUDA开发】论CUDA和LAV解码器是否真的实用
    【VS开发】网络SOCKET编程INADDR_ANY选项
    【VS开发】网络SOCKET编程INADDR_ANY选项
    【FFMPEG】I,P,B帧和PTS,DTS时间戳的关系
  • 原文地址:https://www.cnblogs.com/guanghe/p/10863740.html
Copyright © 2011-2022 走看看