zoukankan html css js c++ java

lucene学习-2

接下来我会写一个lucene的实例。实际上在搜索引擎上随便搜索下都能找到这样的东西。不过还是写一下吧，这也是我学习的经历。

package com.zhyea.doggie;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class LuceneTest {

    public static void main(String[] args) {
        // 要用来检索的样本文件存储路径
        String docPath = "D:\aqsiqDevelop\workspace3\doggie\WebContent\docs";
        // 索引文件存储路径
        String indexPath = "D:\aqsiqDevelop\workspace3\doggie\WebContent\index";
        try {
            // 分析器，这里使用了标准分析器
            Analyzer analyzer = new StandardAnalyzer();
            // 准备好索引存储目录
            Directory dir = FSDirectory.open(new File(indexPath));
            // 创建IndexWriter（索引写入器）配置，
            // 在配置中指明创建IndexWriter使用的lucene的版本及使用的分析器
            IndexWriterConfig config = new IndexWriterConfig(Version.LATEST,
                    analyzer);
            // 创建IndexWriter（索引写入器），并指明索引存储路径和配置文件
            IndexWriter writer = new IndexWriter(dir, config);
            // 使用IndexWriter（索引写入器）创建索引，这里另外创建一个方法
            addDocuments(docPath, writer);
            
            /* -------------创建索引结束，以下是进行搜索------------ */
            // 创建索引读出器
            IndexReader reader = DirectoryReader.open(dir);
            // 创建搜索器
            IndexSearcher seacher = new IndexSearcher(reader);
            // 创建搜索对象
            Query query = new TermQuery(new Term("content", "杨过"));
            // 执行搜索，并返回结果
            TopDocs topDocs = seacher.search(query, 10000);
            // 展示搜索结果
            Document doc;
            for(ScoreDoc tmp : topDocs.scoreDocs){
                doc = reader.document(tmp.doc);
                System.out.println("书名：" + doc.get("name") 
                                 + "---------------------"
                                 + "路径：" + doc.get("path"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * 遍历样本文本所在的目录，进行分析。 
     * 这里采用的样本文本是金庸的三部小说：神雕、射雕和笑傲江湖。
     * @param docPath
     *            样本文本存储路径
     * @param writer
     *            索引写入器
     * @throws IOException
     */
    private static void addDocuments(String docPath, IndexWriter writer)
            throws IOException {
        File dir = new File(docPath);
        for (File tmp : dir.listFiles()) {
            //创建Document对象，代表一个被索引的基本单元
            Document doc = new Document();
            String fileName = tmp.getName();
            String filePath = tmp.getCanonicalPath();        
            String fileContent = readTxt(tmp);
            //创建Field，并加入Document
            doc.add(new StringField("name", fileName,   Field.Store.YES));
            doc.add(new StringField("path", filePath,   Field.Store.YES));
            doc.add(new TextField("content",fileContent,Field.Store.YES));
            //将Document从内存写入真实目录
            writer.addDocument(doc);
            //提交索引，将索引写入索引文件，这个别忘了
            writer.commit();
        }
    }

    /**
     * 换行标志符
     */
    static final String NEWLINE = System.getProperty("line.separator");

    /**
     * 读取txt文件
     * 
     * @param file
     *            txt文件对象
     * @return
     * @throws IOException
     */
    private static String readTxt(File file) throws IOException {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            StringBuilder builder = new StringBuilder();
            String line;
            while (null != (line = br.readLine())) {
                builder.append(line).append(NEWLINE);
            }
            return builder.toString();
        } finally {
            if (null != br) br.close();
        }
    }
}

执行代码，发现没有任何输出。用 Luke 查看索引目录，发现 content 字段中保存的内容是乱码：

乱码的原因是 FileReader 按平台（工作空间）默认编码读取文件，而 txt 文件的实际编码与之不一致。解决办法是在读取 txt 文件时显式指定文件的实际编码，或者直接把 txt 文件转换成与工作空间默认编码相同的编码。

这里就不写出了。

调整乱码后，再次执行程序，发现还是不能检索出什么东西。再次查看索引目录：

所有的中文字符都被 StandardAnalyzer 拆分成了单字的 Term，索引中并不存在“杨过”这样的双字 Term，所以查询无法命中。这次需要调整分析器，将分析器换成 CJKAnalyzer（它按相邻两个字切分中文）。这次能够检索出结果了：

实际上，影响查询结果的不只是分析器，还有这一句：

new TermQuery(new Term("content", "杨过"));

好了，这些可以留到以后再说。

all。

查看全文

相关阅读:
每日一题20201109（15. 三数之和）
每日一题20201106（169. 多数元素）
每日一题之20201103（941. 有效的山脉数组）
每日一题之20201102（349. 两个数组的交集）
【USACO4.4】追查坏牛奶
 【九省联考2018】一双木棋
 【NOI2013】向量内积
 【HNOI2013】游走
 【ZJOI2008】骑士
 【HEOI2014】大工程

原文地址：https://www.cnblogs.com/amunote/p/4174474.html