zoukankan      html  css  js  c++  java
  • Lucene 4.9索引txt文件

    暂时只是跑起来了,不知道是否正确,困了,睡觉了,改天再弄。搜索那块是分页的,也没仔细弄。。。

    参考着 http://blog.csdn.net/kingskyleader/article/details/8444739

    在data下放了三个txt...

    S:\lucene\data\永生.txt

    S:\lucene\data\1.txt

    S:\lucene\data\2.txt

    永生是本小说,汉语的应该没有英文。

    1.txt 内容: hello

    2.txt 内容: hi hello  哈哈

    程序运行之后控制台打印的信息:

    adding [Ljava.io.File;@3f611531
    adding [Ljava.io.File;@3f611531
    adding [Ljava.io.File;@3f611531
    S:\lucene\data\1.txt
    1407857427736
    S:\lucene\data\2.txt
    1407857444245

    具体改天再研究。

    下面是代码:

    pom:

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>
    
      <groupId>LuceneTest</groupId>
      <artifactId>lucene</artifactId>
      <version>0.0.1-SNAPSHOT</version>
      <packaging>jar</packaging>
    
      <name>lucene</name>
      <url>http://maven.apache.org</url>
    
      <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
      </properties>
    
      <dependencies>
        <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>3.8.1</version>
          <scope>test</scope>
        </dependency>
        
        <!-- lucene -->
        <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>4.9.0</version>
        </dependency>
        
        <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>4.9.0</version>
        </dependency>
        
        <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>4.9.0</version>
        </dependency>
        
        <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-highlighter</artifactId>
        <version>4.9.0</version>
        </dependency>
        
      </dependencies>
    </project>

    建立索引:

    package lucene;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.LongField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class InitIndex {
    
        public void creatIndex() throws IOException {
    
            boolean create = true;
    
            File data = new File("S:\lucene\data");
            File index = new File("S:\lucene\index");
    
            Directory dir = FSDirectory.open(index);
    
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
    
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9,
                    analyzer);
    
            if (create) {
                // Create a new index in the directory, removing any
                // previously indexed documents:
                iwc.setOpenMode(OpenMode.CREATE);
            } else {
                // Add new documents to an existing index:
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            }
    
            IndexWriter iw = new IndexWriter(dir, iwc);
    
            File[] file = data.listFiles();
            FileInputStream fis = null;
            for (File f : file) {
                
                fis = new FileInputStream(f);
    
                Document doc = new Document();
    
                Field pathField = new StringField("path", f.getPath(),
                        Field.Store.YES);
                doc.add(pathField);
    
                doc.add(new LongField("modified", f.lastModified(), Field.Store.YES));
    
                doc.add(new TextField("contents", new BufferedReader(
                        new InputStreamReader(fis, "GBK"))));
    
                if (iw.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can
                    // be there):
                    System.out.println("adding " + file);
                    iw.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so
                    // we use updateDocument instead to replace the old one matching
                    // the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    iw.updateDocument(new Term("path", f.getPath()), doc);
                }
    
            }
            iw.close();
            fis.close();
            
        }
    }

    搜索:

    package lucene;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class Search {
    
        public void query() throws IOException, ParseException {
    
            String queries = "hello";
    
            int hitsPerPage = 10; 
            
            File index = new File("S:\lucene\index");
    
            IndexReader reader = DirectoryReader.open(FSDirectory.open(index));
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
            
            QueryParser parser = new QueryParser(Version.LUCENE_4_9, "contents", analyzer);  
    
            Query query = parser.parse(queries);
            
            TopDocs results = searcher.search(query, 5 * hitsPerPage);
            ScoreDoc[] hits = results.scoreDocs;  
            int numTotalHits = results.totalHits;  
             
            
            int start = 0;  
            int end = Math.min(numTotalHits, hitsPerPage);  
            
            for (int i = start; i < end; i++) {  
          
                Document doc = searcher.doc(hits[i].doc);  
                String path = doc.get("path");  
               System.out.println(path);
               String modified=doc.get("modified");
               System.out.println(modified);
                            
              }  
          
    
          
        
    
        }
    }

    主函数:

    package lucene;
    
    import java.io.IOException;
    
    import org.apache.lucene.queryparser.classic.ParseException;
    
    public class Main {
    
        /**
         * Entry point: builds the index from the data directory, then runs
         * the sample search against it.
         *
         * @param args unused
         * @throws IOException    if indexing or searching fails on I/O
         * @throws ParseException if the sample query cannot be parsed
         */
        public static void main(String[] args) throws IOException, ParseException {
            // Index first so the search below has something to hit.
            new InitIndex().creatIndex();
            new Search().query();
        }
    }
  • 相关阅读:
    【转】寻找最好的笔记软件:海选篇 (v1.0)
    【转】git rebase简介(基本篇)
    【转】学会这13个原则写UI界面文案,用户才能秒懂
    sqlserver巧用row_number和partition by分组取top数据
    使用SQL语句清空数据库所有表的数据
    在 SQL Server 2005 中配置数据库邮件
    SQL compute by 的使用
    SQL Cursor 基本用法
    Sqlserver双机热备文档(无域)
    查询分页的几种Sql写法
  • 原文地址:https://www.cnblogs.com/acehalo/p/3908760.html
Copyright © 2011-2022 走看看