zoukankan      html  css  js  c++  java
  • Lucene简单应用

     近期用Lucene做了个比较简单的站内检索,在这里和大家做个交流。全文检索的实现,从检索的数据源来分有两种:一种是数据库,另一种是已生成的文件(doc,html,txt......)。

    无论哪一种方式,实现原理都是一样的。主要分为两大步:

    一、将数据源转换为Lucene索引文件(建立索引),保存到设定目录下

    private static String filePath = "D:\rookie\date\";//文件存放路径
    private static String indexPath = "D:\rookie\source";//索引存放路径

    /**
     * Rebuilds the Lucene index in {@code indexPath} from the {@code .txt} files found
     * directly inside {@code filePath} (sub-directories are not visited).
     *
     * Each file is decoded as GBK and stored as one document with the fields
     * {@code id}, {@code path} and {@code contents}.
     *
     * @param args unused
     * @throws Exception if the source directory cannot be listed, a file cannot be read,
     *                   or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Folder that holds the documents to index.
        File fileDir = new File(filePath);
        // Folder that receives the index files.
        File indexDir = new File(indexPath);

        // List the sources BEFORE opening the writer: listFiles() returns null when the path
        // is missing or not a directory, and opening the writer with create == true would
        // already have wiped the existing index by then.
        File[] textFiles = fileDir.listFiles();
        if (textFiles == null) {
            throw new IOException("Cannot list source directory: " + fileDir.getPath());
        }

        Analyzer luceneAnalyzer = new StandardAnalyzer();
        // Last argument true = recreate the index from scratch.
        // Pass false to append to / update an existing index instead.
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
        long startTime = System.currentTimeMillis();
        try {
            // Add one document per matching file.
            for (int i = 0; i < textFiles.length; i++) {
                File textFile = textFiles[i];
                // Only plain .txt files are indexed.
                if (!textFile.isFile() || !textFile.getName().endsWith(".txt")) {
                    continue;
                }
                String body = FileReaderAll(textFile.getCanonicalPath(), "GBK");
                Document document = new Document();

                // Storing an id per document is strongly recommended so it can later be
                // updated or deleted by term.
                // NOTE(review): "12345" is a placeholder shared by every document; replace it
                // with a real unique key before relying on update/delete by id.
                Field idField = new Field("id", "12345", Field.Store.YES, Field.Index.UN_TOKENIZED);
                Field pathField = new Field("path", textFile.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED);
                Field bodyField = new Field("contents", body, Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS);

                document.add(idField);
                document.add(pathField);
                document.add(bodyField);
                indexWriter.addDocument(document);
            }
            // optimize() optimizes the index.
            indexWriter.optimize();
        } finally {
            // Always close the writer, otherwise the index write lock is left behind.
            indexWriter.close();
        }

        // Report how long indexing took and where the index was written.
        long endTime = System.currentTimeMillis();
        System.out.println("索引已经添加到文档中,共花费了" + (endTime - startTime) + " 毫秒! 索引路径是:" + indexDir.getPath());
    }

    /**
     * Reads a whole text file (html, txt, ...) into a single string.
     *
     * Lines are joined with {@code '\n'} so that the last word of one line and the first
     * word of the next stay separate when the text is tokenized. The stream is closed even
     * when reading fails or the charset name is not supported.
     *
     * @param FileName path of the file to read
     * @param charset  name of the charset used to decode the file, e.g. {@code "GBK"}
     * @return the file content with lines separated by {@code '\n'} and no trailing line
     *         break; an empty string for an empty file
     * @throws IOException if the file cannot be opened or read, or the charset is unsupported
     * @author rookie_d
     */
    public static String FileReaderAll(String FileName, String charset)
            throws IOException {
        FileInputStream in = new FileInputStream(FileName);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
            StringBuilder content = new StringBuilder();
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!firstLine) {
                    content.append('\n');
                }
                content.append(line);
                firstLine = false;
            }
            return content.toString();
        } finally {
            // Closing the underlying stream releases the file handle on every path.
            in.close();
        }
    }

    二、从Lucene索引文件中进行检索

    /**
     * Searches the index for documents whose {@code contents} field matches the
     * (currently hard-coded) query string and returns the stored {@code path} of each hit.
     *
     * Parse and I/O errors are printed and result in the paths collected so far
     * (possibly none) being returned; the method never returns {@code null}.
     *
     * @return list of file path strings, one per matching document
     * @author rookie_d
     */
    public static List luceneSearcher() {

        String queryString = "好"; // text to search for
        // Backslashes must be escaped inside a Java string literal.
        String indexPath = "D:\\rookie\\source"; // directory holding the index
        List list = new ArrayList();
        IndexSearcher searcher = null;
        try {
            searcher = new IndexSearcher(indexPath);
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser qp = new QueryParser("contents", analyzer);
            System.out.println(qp.getField());

            Query query;
            try {
                query = qp.parse(queryString);
                System.out.println(query);
            } catch (org.apache.lucene.queryParser.ParseException e) {
                e.printStackTrace();
                // Without a valid query there is nothing to search for.
                return list;
            }

            Hits hits = searcher.search(query);
            System.out.println(hits.length());
            if (hits.length() > 0) {
                System.out.println("共找到:" + hits.length() + "个结果!");
                // Read every stored field before the searcher is closed in the finally block.
                for (int i = 0; i < hits.length(); i++) {
                    Document document = hits.doc(i);
                    String path = document.get("path");
                    File file = new File(path);
                    list.add(file.getPath());
                }
            } else {
                System.out.println("*****no result find*****");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the index files held by the searcher.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return list;
    }

    在开发过程中遇到了更新索引的小难题,下面也给段转来的代码,作为菜鸟认为这段代码还是比较有用的

    import java.io.IOException;
    import org.apache.lucene.document.Document; 
    import org.apache.lucene.document.Field; 
    import org.apache.lucene.index.IndexWriter; 
    import org.apache.lucene.index.Term; 
    import org.apache.lucene.queryParser.QueryParser; 
    import org.apache.lucene.search.Hits; 
    import org.apache.lucene.search.IndexSearcher; 
    import org.apache.lucene.search.Query; 
    import org.apache.lucene.analysis.standard.StandardAnalyzer; 

    public class UpdateDocument { 

    private static String path = "d:/index"; 


    public static void main(String[] args){ 
    // addIndex(); 
    updateIndex(); 
    search("李四"); 
    search("王五"); 


    public static void addIndex(){ 
    try { 
    IndexWriter write = new IndexWriter(path,new StandardAnalyzer(),true); 

    Document doc = new Document(); 
    doc.add(new Field("id","123456",Field.Store.YES,Field.Index.UN_TOKENIZED)); 
    doc.add(new Field("userName","张三",Field.Store.YES,Field.Index.TOKENIZED)); 
    doc.add(new Field("comefrom","北京",Field.Store.YES,Field.Index.TOKENIZED)); 

    write.addDocument(doc); 

    write.close(); 

    } catch (IOException e) { 
    e.printStackTrace(); 




    public static void updateIndex(){ 
    try { 

    IndexWriter write = new IndexWriter(path,new StandardAnalyzer(),false); 
    Document docNew = new Document(); 
    docNew.add(new Field("id","123456",Field.Store.YES,Field.Index.UN_TOKENIZED)); 
    docNew.add(new Field("userName","王五",Field.Store.YES,Field.Index.TOKENIZED)); 
    Term term = new Term("id","123456"); 
    /** 
    调用updateDocument的方法,传给它一个新的doc来更新数据, 
    Term term = new Term("id","1234567"); 
    先去索引文件里查找id为1234567的Doc,如果有就更新它(如果有多条,最后更新后只有一条)。如果没有就新增. 
    数据库更新的时候,我们可以只针对某个列来更新,而lucene只能针对一行数据更新。 
    */ 
    write.updateDocument(term, docNew); 

    write.close(); //注意在这里一定要关闭write 

    } catch (IOException e) { 
    e.printStackTrace(); 



    public static Query queryParser(String str){ 
    QueryParser queryParser = new QueryParser("userName", new StandardAnalyzer()); 
    try { 
    Query query = queryParser.parse(str); 
    return query; 
    } catch (Exception e) { 
    e.printStackTrace(); 

    return null; 


    public static void search(String str){ 
    try { 
    IndexSearcher search = new IndexSearcher(path); 

    Query query = queryParser(str); 

    Hits hits = search.search(query); 
    if(hits==null){ 
    return; 

    if(hits.length() == 0){ 
    System.out.println(" 没有搜索到'" + str+"'"); 
    return; 

    for (int i = 0; i < hits.length(); i++) { 
    Document doc = hits.doc(i); 
    System.out.println("id = "+hits.id(i)); 
    System.out.println("own id = " + doc.get("id")); 
    System.out.println("userName = "+doc.get("userName")); 
    System.out.println("come from = "+doc.get("comefrom")); 
    System.out.println(""); 


    } catch (Exception e) { 
    e.printStackTrace(); 



    最后再给一段删除索引的代码:

    // Delete the matching documents from the Lucene index.
    File indexDir = new File(indexPath); // directory holding the index files
    File[] textFiles = indexDir.listFiles();
    Analyzer luceneAnalyzer = new StandardAnalyzer();
    // Create a new index only when the directory is missing or empty; otherwise open the
    // existing one, because create == true would discard everything already indexed.
    boolean create = (textFiles == null || textFiles.length <= 0);
    IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, create);
    // Every document whose "id" term equals the news item's id is removed.
    // NOTE(review): assumes news.getId() returns the same String that was indexed as "id".
    Term term = new Term("id", news.getId());
    try {
        indexWriter.deleteDocuments(term);
        indexWriter.optimize(); // optimize() optimizes the index
    } finally {
        // Close on every path so the index write lock is released.
        indexWriter.close();
    }

    在删除和更新索引时要注意:new IndexWriter(indexDir, luceneAnalyzer, false) 的最后一个参数(create)必须为false;若为true,会重新创建索引并清空已有的索引内容。

    关于全文检索的内容还有许多需要学习,写这篇文章来帮助新手和自己来熟悉Lucene,希望对你有一点帮助!

  • 相关阅读:
    python切片操作
    python中的内存管理
    python中x,y交换值的问题
    leetcode6:Zigzag Conversion@Python
    Leetcode4:Median of Two Sorted Arrays@Python
    Leetcode3:Longest Substring Without Repeating Characters@Python
    Leetcode2:Add Two Numbers@Python
    LeetCode344:Reverse String@Python
    支付宝 芝麻信用分过600,你不知道的八个特权
    穷爸爸富爸爸里面说的“现金流游戏”靠谱吗?
  • 原文地址:https://www.cnblogs.com/huideng/p/3979890.html
Copyright © 2011-2022 走看看