zoukankan      html  css  js  c++  java
  • Lucene:为文本文件创建索引

    需求描述:为某个文件夹A下的所有后缀名为.txt的文件创建索引,索引文件存放于文件夹B下

    开发环境:Lucene 3.4.0 + eclipse indigo + jdk1.7.0,配置如下

    为文件创建索引的是mytest包下的Indexer类,具体代码如下:

     

    View Code
    package mytest;

    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.Version;

    import java.io.File;
    import java.io.FileFilter;
    import java.io.IOException;
    import java.io.FileReader;

    // From chapter 1

    /**
     * Builds a Lucene index from the files that sit directly inside one data
     * directory (sub-directories are skipped, not descended into).
     *
     * <p>Each accepted file becomes one {@link Document} with three fields:
     * {@code contents} (tokenized from the file's text), {@code filename} and
     * {@code fullpath} (both stored and indexed as single, un-analyzed terms).
     *
     * <p>The index is always created from scratch: the writer is opened with
     * {@code create == true}, so any index already present in the target
     * directory is replaced.
     */
    public class Indexer {

        /**
         * Indexes the {@code .txt} files of a hard-coded data directory into a
         * hard-coded index directory and prints how many documents were indexed
         * and how long it took.
         *
         * @param args ignored
         * @throws Exception if the index cannot be opened, written or closed, or
         *                   if the data directory cannot be listed
         */
        public static void main(String[] args) throws Exception {
            String indexDir = "E://eclipse//javaProject//lucene-3.4.0//indexDir"; //1
            String dataDir = "E://eclipse//javaProject//lucene-3.4.0//dataDir"; //2

            long start = System.currentTimeMillis();
            Indexer indexer = new Indexer(indexDir);
            int numIndexed;
            try {
                numIndexed = indexer.index(dataDir, new TextFilesFilter());
            } finally {
                // Always release the writer, even when indexing fails part-way.
                indexer.close();
            }
            long end = System.currentTimeMillis();

            System.out.println("Indexing " + numIndexed + " files took "
                    + (end - start) + " milliseconds");
        }

        /** Directory holding the index files; kept so it can be closed in {@link #close()}. */
        private final Directory directory;

        /** Writer through which every document is added to the index. */
        private final IndexWriter writer;

        /**
         * Opens (and re-creates) the index stored in {@code indexDir}.
         *
         * @param indexDir file-system path of the directory that will hold the index
         * @throws IOException if the directory or the index writer cannot be opened
         */
        public Indexer(String indexDir) throws IOException {
            directory = FSDirectory.open(new File(indexDir));
            writer = new IndexWriter(directory, //3
                    new StandardAnalyzer( //3
                            Version.LUCENE_34), //3
                    true, //3
                    IndexWriter.MaxFieldLength.UNLIMITED); //3
        }

        /**
         * Closes the index writer and then the underlying directory.
         *
         * @throws IOException if either resource fails to close
         */
        public void close() throws IOException {
            try {
                writer.close(); //4
            } finally {
                // The directory was opened by this class, so this class closes it;
                // doing it in finally ensures it happens even if the writer fails.
                directory.close();
            }
        }

        /**
         * Adds every eligible file found directly in {@code dataDir} to the index.
         *
         * <p>A file is eligible when it is not a directory, is not hidden, exists,
         * is readable, and is accepted by {@code filter} (a {@code null} filter
         * accepts everything).
         *
         * @param dataDir path of the directory whose files should be indexed
         * @param filter  optional filter selecting which files to index; may be
         *                {@code null}
         * @return the number of documents reported by the writer after indexing
         * @throws IOException if {@code dataDir} cannot be listed (it does not
         *                     exist, is not a directory, or an I/O error occurred)
         * @throws Exception   if reading a file or adding its document fails
         */
        public int index(String dataDir, FileFilter filter)
                throws Exception {

            File[] files = new File(dataDir).listFiles();
            if (files == null) {
                // listFiles() signals "not a directory / unreadable" with null;
                // fail with a descriptive error instead of a NullPointerException.
                throw new IOException(
                        "Cannot list files in data directory: " + dataDir);
            }

            for (File f : files) {
                if (!f.isDirectory() &&
                        !f.isHidden() &&
                        f.exists() &&
                        f.canRead() &&
                        (filter == null || filter.accept(f))) {
                    indexFile(f);
                }
            }

            return writer.numDocs(); //5
        }

        /** Accepts files whose name ends in {@code .txt}, ignoring case. */
        private static class TextFilesFilter implements FileFilter {
            @Override
            public boolean accept(File path) {
                return path.getName().toLowerCase() //6
                        .endsWith(".txt"); //6
            }
        }

        /**
         * Builds the Lucene document that represents one file.
         *
         * <p>The {@code contents} field is backed by a {@link FileReader}, which
         * decodes the file with the platform default charset. This method does
         * not close the reader; it is handed to Lucene as part of the document.
         *
         * @param f the file to represent
         * @return a document with {@code contents}, {@code filename} and
         *         {@code fullpath} fields
         * @throws Exception if the file cannot be opened or its canonical path
         *                   cannot be resolved
         */
        protected Document getDocument(File f) throws Exception {
            Document doc = new Document();
            doc.add(new Field("contents", new FileReader(f))); //7
            doc.add(new Field("filename", f.getName(), //8
                    Field.Store.YES, Field.Index.NOT_ANALYZED)); //8
            doc.add(new Field("fullpath", f.getCanonicalPath(), //9
                    Field.Store.YES, Field.Index.NOT_ANALYZED)); //9
            return doc;
        }

        /**
         * Logs the file's canonical path to standard output and adds its
         * document to the index.
         *
         * @param f the file to index
         * @throws Exception if the document cannot be built or added
         */
        private void indexFile(File f) throws Exception {
            System.out.println("Indexing " + f.getCanonicalPath());
            Document doc = getDocument(f);
            writer.addDocument(doc); //10
        }
    }
    /*
    #1 Create index in this directory
    #2 Index *.txt files from this directory
    #3 Create Lucene IndexWriter
    #4 Close IndexWriter
    #5 Return number of documents indexed
    #6 Index .txt files only, using FileFilter
    #7 Index file content
    #8 Index file name
    #9 Index file full path
    #10 Add document to Lucene index
    */

     

    程序运行结果如下:

    程序运行结束后,在指定目录下生成索引文件:

    推荐一个自己业余时间开发的网盘搜索引擎,360盘搜www.360panso.com

  • 相关阅读:
    阿里巴巴
    实用得 JS 代码
    C#获得当前插入数据的ID
    “职场五魅”助你成功
    SQL2005导入导出数据库方法集合
    VS05里checkboxlist用JS获取 value值
    sql 去除html标签函数
    百度新闻搜索结果页的采集
    把表中的某个字段格式如:2,3,4的数据分别插入到另一个表中
    jquery 定位元素并获取数据
  • 原文地址:https://www.cnblogs.com/eczhou/p/2257753.html
Copyright © 2011-2022 走看看