zoukankan      html  css  js  c++  java
  • IKAnalyzer2012FF_u1.jar + Lucene 4.0 简单实例 悟寰轩

      1 import java.io.File;
      2 import java.io.IOException;
      3 import java.io.StringReader;
      4 
      5 import org.apache.lucene.analysis.Analyzer;
      6 import org.apache.lucene.analysis.TokenStream;
      7 import org.apache.lucene.document.Document;
      8 import org.apache.lucene.document.TextField;
      9 import org.apache.lucene.document.Field.Store;
     10 import org.apache.lucene.index.IndexReader;
     11 import org.apache.lucene.index.IndexWriter;
     12 import org.apache.lucene.index.IndexWriterConfig;
     13 import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
     14 import org.apache.lucene.queryparser.classic.ParseException;
     15 import org.apache.lucene.queryparser.classic.QueryParser;
     16 import org.apache.lucene.search.IndexSearcher;
     17 import org.apache.lucene.search.Query;
     18 import org.apache.lucene.search.ScoreDoc;
     19 import org.apache.lucene.search.TopDocs;
     20 import org.apache.lucene.search.TopScoreDocCollector;
     21 import org.apache.lucene.search.highlight.Highlighter;
     22 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
     23 import org.apache.lucene.search.highlight.QueryScorer;
     24 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
     25 import org.apache.lucene.store.Directory;
     26 import org.apache.lucene.store.FSDirectory;
     27 import org.apache.lucene.util.Version;
     28 import org.wltea.analyzer.lucene.IKAnalyzer;
     29 
     30 public class IndexTools {
     31     /**
     32      * 获得indexwriter对象
     33      * 
     34      * @param dir
     35      * @return
     36      * @throws IOException
     37      * @throws Exception
     38      */
     39     private IndexWriter getIndexWriter(Directory dir, Analyzer analyzer) throws IOException {
     40         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
     41         return new IndexWriter(dir, iwc);
     42     }
     43     
     44     /**
     45      * 关闭indexwriter对象
     46      * 
     47      * @throws IOException
     48      * 
     49      * @throws Exception
     50      */
     51     private void closeWriter(IndexWriter indexWriter) throws IOException {
     52         if (indexWriter != null) {
     53             indexWriter.close();
     54         }
     55     }
     56     
     57     /**
     58      * 创建索引
     59      * 
     60      * @throws InvalidTokenOffsetsException
     61      */
     62     public void createIndex() throws InvalidTokenOffsetsException {
     63         String indexPath = "D://luceneindex"; // 建立索引文件的目录
     64         // 默认IKAnalyzer()-false:实现最细粒度切分算法,true:分词器采用智能切分
     65         Analyzer analyzer = new IKAnalyzer(true);
     66         IndexWriter indexWriter = null;
     67         Directory directory = null;
     68         try {
     69             directory = FSDirectory.open(new File(indexPath));
     70             indexWriter = getIndexWriter(directory, analyzer);
     71         } catch (Exception e) {
     72             System.out.println("索引打开异常!");
     73         }
     74         // 添加索引
     75         try {
     76             Document document = new Document();
     77             document.add(new TextField("filename", "标题:起点", Store.YES));
     78             document.add(new TextField("content", "内容:我是一名程序员", Store.YES));
     79             indexWriter.addDocument(document);
     80             Document document1 = new Document();
     81             document1.add(new TextField("filename", "标题:终点", Store.YES));
     82             document1.add(new TextField("content", "内容:我不再只是程序员", Store.YES));
     83             indexWriter.addDocument(document1);
     84             indexWriter.commit();
     85         } catch (IOException e1) {
     86             System.out.println("索引创建异常!");
     87         }
     88         try {
     89             closeWriter(indexWriter);
     90         } catch (Exception e) {
     91             System.out.println("索引关闭异常!");
     92         }
     93     }
     94     
     95     /**
     96      * 搜索
     97      * 
     98      * @throws ParseException
     99      * @throws IOException
    100      * @throws InvalidTokenOffsetsException
    101      */
    102     @SuppressWarnings("deprecation")
    103     public void searchIndex() throws ParseException, IOException, InvalidTokenOffsetsException {
    104         String indexPath = "D://luceneindex"; // 建立索引文件的目录
    105         // 默认IKAnalyzer()-false:实现最细粒度切分算法,true:分词器采用智能切分
    106         Analyzer analyzer = new IKAnalyzer(true);
    107         Directory directory = null;
    108         try {
    109             directory = FSDirectory.open(new File(indexPath));
    110         } catch (Exception e) {
    111             System.out.println("索引打开异常!");
    112         }
    113         IndexReader ireader = null;
    114         IndexSearcher isearcher = null;
    115         try {
    116             ireader = IndexReader.open(directory);
    117         } catch (IOException e) {
    118             System.out.println("打开索引文件!");
    119         }
    120         isearcher = new IndexSearcher(ireader);
    121         String keyword = "程序员";
    122         // 使用QueryParser查询分析器构造Query对象
    123         // eg:单个字段查询
    124         // String fieldName = "content";
    125         // QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
    126         String[] fields = { "filename", "content" };
    127         QueryParser qp = new MultiFieldQueryParser(Version.LUCENE_40, fields, analyzer);
    128         qp.setDefaultOperator(QueryParser.AND_OPERATOR);
    129         Query query = qp.parse(keyword);
    130         // 搜索相似度最高的5条记录
    131         TopDocs topDocs = isearcher.search(query, 25);
    132         System.out.println("命中:" + topDocs.totalHits);
    133         // 输出结果
    134         ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    135         for (int i = 0; i < topDocs.totalHits; i++) {
    136             Document targetDoc = isearcher.doc(scoreDocs[i].doc);
    137             System.out.println("内容:" + targetDoc.toString());
    138         }
    139         // 分页,高亮显示
    140         higherIndex(analyzer, isearcher, query, topDocs);
    141     }
    142     
    143     public static void main(String[] args) {
    144         IndexTools tool = new IndexTools();
    145         try {
    146             tool.searchIndex();
    147         } catch (ParseException e) {
    148             System.out.println("解析错误");
    149         } catch (IOException e) {
    150             System.out.println("读取文件流错误");
    151         } catch (InvalidTokenOffsetsException e) {
    152             System.out.println("查询失败");
    153         }
    154     }
    155     
    156     /**
    157      * 分页,高亮显示
    158      * 
    159      * @param analyzer
    160      * @param isearcher
    161      * @param query
    162      * @param topDocs
    163      * @throws IOException
    164      * @throws InvalidTokenOffsetsException
    165      */
    166     public void higherIndex(Analyzer analyzer, IndexSearcher isearcher, Query query, TopDocs topDocs)
    167             throws IOException, InvalidTokenOffsetsException {
    168         TopScoreDocCollector results = TopScoreDocCollector.create(topDocs.totalHits, false);
    169         isearcher.search(query, results);
    170         // 分页取出指定的doc(开始条数, 取几条)
    171         ScoreDoc[] docs = results.topDocs(1, 2).scoreDocs;
    172         for (int i = 0; i < docs.length; i++) {
    173             Document targetDoc = isearcher.doc(docs[i].doc);
    174             System.out.println("内容:" + targetDoc.toString());
    175         }
    176         // 关键字高亮显示的html标签,需要导入lucene-highlighter-3.5.0.jar
    177         SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
    178         Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
    179         for (int i = 0; i < docs.length; i++) {
    180             Document doc = isearcher.doc(docs[i].doc);
    181             // 标题增加高亮显示
    182             TokenStream tokenStream1 = analyzer.tokenStream("filename", new StringReader(doc.get("filename")));
    183             String title = highlighter.getBestFragment(tokenStream1, doc.get("filename"));
    184             // 内容增加高亮显示
    185             TokenStream tokenStream2 = analyzer.tokenStream("content", new StringReader(doc.get("content")));
    186             String content = highlighter.getBestFragment(tokenStream2, doc.get("content"));
    187             System.out.println(doc.get("filename") + " : " + title + " : " + content);
    188         }
    189     }
    190 }
  • 相关阅读:
    网站常见的反爬虫和应对方法
    网站反爬虫
    webmagic的设计机制及原理-如何开发一个Java爬虫
    腾讯社区搜索架构演进
    搜索引擎评估与互联网用户行为建设
    深层网络搜索核心技术研讨
    Solr与Cassandra二级缓存实践
    .NET的前世今生与将来
    技术人生的职场众生相
    爆栈之旅
  • 原文地址:https://www.cnblogs.com/sunxucool/p/2799805.html
Copyright © 2011-2022 走看看