zoukankan      html  css  js  c++  java
  • 初识lucene

    lucene的介绍网上有好多,再写一遍可能有点多余了。

    使用lucene之前,有一系列的疑问

    • 为什么lucene就比数据库快?
    • 倒排索引是什么,它是怎么做到快速检索的?
    • lucene的数据结构是什么样的?CPU消耗、内存消耗主要来自哪里?
    • lucene的索引流程以及查询流程是什么样的

    推荐两篇文章,更进一步了解lucene

    可以参考lucene与数据库对比部分

    http://www.chedong.com/tech/lucene.html

    可以参考其中的第一篇和第二篇,对lucene的原理有一个初步的了解

    http://blog.csdn.net/forfuture1978/article/details/5668956

     

    《Lucene 原理与代码分析》看过一点,但是有点难度。

    现在改从《lucene实战》这本书入手。本文使用的lucene版本是4.7,与3.0相比API可能有所区别。

    下面是第一节的例子

     

    package com.mitchz.lucence;
    
    import java.io.File;
    import java.io.FileFilter;
    import java.io.FileReader;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.core.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Command-line tool that indexes the plain-text ({@code .txt}) files found
     * directly inside a data directory into a Lucene index stored on disk.
     *
     * <p>Each file becomes one document with three fields: {@code contents}
     * (tokenized file body), {@code filename} and {@code fullpath} (both stored,
     * not tokenized).
     *
     * @author mitchz
     * @version 1.0
     * @since 2014-04-30
     * @category com.mitchz.lucence
     */
    public class Indexer
    {
    
    	/** Directory that holds the index files; closed together with the writer. */
    	private final Directory directory;
    
    	/** Writer used to add documents to the index. */
    	private final IndexWriter writer;
    
    	/**
    	 * Opens (or creates) the index located at {@code indexDir}.
    	 *
    	 * @param indexDir path of the directory that stores the index
    	 * @throws IOException if the directory or the index writer cannot be opened
    	 */
    	public Indexer(String indexDir) throws IOException
    	{
    		Directory dir = FSDirectory.open(new File(indexDir));
    		IndexWriter created = null;
    		try
    		{
    			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,
    					new SimpleAnalyzer(Version.LUCENE_47));
    			created = new IndexWriter(dir, config);
    		}
    		finally
    		{
    			// Do not leak the directory handle when the writer could not be created.
    			if (created == null)
    			{
    				dir.close();
    			}
    		}
    		this.directory = dir;
    		this.writer = created;
    	}
    
    	/**
    	 * Indexes every regular, visible, readable file directly inside
    	 * {@code dataDir} that is accepted by {@code filter}. Sub-directories are
    	 * not traversed.
    	 *
    	 * @param dataDir directory containing the files to index
    	 * @param filter  optional filter; {@code null} accepts every file
    	 * @return the value of {@code writer.numDocs()} after indexing
    	 * @throws IOException if {@code dataDir} is not a directory or cannot be listed
    	 * @throws Exception   if reading a file or adding it to the index fails
    	 */
    	public int index(String dataDir, FileFilter filter) throws Exception
    	{
    		File[] files = (new File(dataDir)).listFiles();
    		// listFiles() returns null (not an empty array) when the path is not a
    		// directory or an I/O error occurs; fail with a clear message instead
    		// of a NullPointerException.
    		if (files == null)
    		{
    			throw new IOException("Cannot list files in data dir: " + dataDir);
    		}
    		for (File file : files)
    		{
    			if (!file.isDirectory() && !file.isHidden() && file.canRead()
    					&& (filter == null || filter.accept(file)))
    			{
    				indexFile(file);
    			}
    		}
    		return writer.numDocs();
    	}
    
    	/** Accepts only files whose name ends with {@code .txt} (case-insensitive). */
    	private static class TextFilesFilter implements FileFilter
    	{
    
    		@Override
    		public boolean accept(File path)
    		{
    			return path.getName().toLowerCase().endsWith(".txt");
    		}
    	}
    
    	/**
    	 * Builds the Lucene document for a single file.
    	 *
    	 * @param file file to convert
    	 * @return document with {@code contents}, {@code filename} and {@code fullpath} fields
    	 * @throws Exception if the file cannot be opened or its path cannot be resolved
    	 */
    	protected Document getDocument(File file) throws Exception
    	{
    		Document doc = new Document();
    		// NOTE(review): FileReader decodes with the platform default charset, and
    		// the reader is handed to Lucene, which is expected to consume and close
    		// it while the document is added - confirm for non-default encodings.
    		doc.add(new TextField("contents", new FileReader(file)));
    		doc.add(new StringField("filename", file.getName(), Field.Store.YES));
    		doc.add(new StringField("fullpath", file.getCanonicalPath(), Field.Store.YES));
    		return doc;
    	}
    
    	/**
    	 * Logs the file being processed and adds it to the index.
    	 *
    	 * @param file file to index
    	 * @throws Exception if the document cannot be built or added
    	 */
    	protected void indexFile(File file) throws Exception
    	{
    		System.out.println("Indexing " + file.getCanonicalPath());
    		Document doc = getDocument(file);
    		writer.addDocument(doc);
    	}
    
    	/**
    	 * Closes the index writer and then the underlying directory.
    	 *
    	 * @throws IOException if closing either resource fails
    	 */
    	protected void close() throws IOException
    	{
    		try
    		{
    			writer.close();
    		}
    		finally
    		{
    			// The directory was opened by this class, so it is released here too,
    			// even when closing the writer fails.
    			directory.close();
    		}
    	}
    
    	/**
    	 * Entry point: {@code java Indexer <index dir> <data dir>}.
    	 *
    	 * @param args index directory followed by data directory
    	 * @throws Exception if indexing fails
    	 */
    	public static void main(String[] args) throws Exception
    	{
    		if (args.length != 2)
    		{
    			throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
    					+ " <index dir> <data dir>");
    		}
    		String indexDir = args[0];
    		String dataDir = args[1];
    		System.out.println("indexDir:" + indexDir);
    		System.out.println("dataDir:" + dataDir);
    		long start = System.currentTimeMillis();
    		Indexer indexer = new Indexer(indexDir);
    		int numIndexed;
    		try
    		{
    			numIndexed = indexer.index(dataDir, new TextFilesFilter());
    		}
    		finally
    		{
    			indexer.close();
    		}
    		long end = System.currentTimeMillis();
    		System.out.println("Indexing " + numIndexed + " files took " + (end - start)
    				+ " milliseconds");
    	}
    }
    
    package com.mitchz.lucence;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.core.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Command-line tool that runs a query against the {@code contents} field of
     * a Lucene index and prints the {@code filename} of the top matching documents.
     *
     * @author mitchz
     * @version 1.0
     * @since 2014-04-30
     * @category com.mitchz.lucence
     */
    public class Searcher
    {
    
    	/**
    	 * Entry point: {@code java Searcher <index dir> <query>}.
    	 *
    	 * @param args index directory followed by the query string
    	 * @throws IOException    if the index cannot be read
    	 * @throws ParseException if the query string cannot be parsed
    	 */
    	public static void main(String[] args) throws IOException, ParseException
    	{
    		if (args.length != 2)
    		{
    			throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
    					+ " <index dir> <query>");
    		}
    		String indexDir = args[0];
    		String q = args[1];
    		search(indexDir, q);
    	}
    
    	/**
    	 * Parses {@code q} against the {@code contents} field, retrieves at most
    	 * ten hits and prints the total hit count, the search time and the stored
    	 * {@code filename} of each returned hit.
    	 *
    	 * @param indexDir path of the directory that stores the index
    	 * @param q        query string in classic query-parser syntax
    	 * @throws IOException    if the index cannot be opened or read
    	 * @throws ParseException if {@code q} is not a valid query
    	 */
    	public static void search(String indexDir, String q) throws IOException,
    			ParseException
    	{
    		Directory dir = FSDirectory.open(new File(indexDir));
    		try
    		{
    			DirectoryReader dirReader = DirectoryReader.open(dir);
    			try
    			{
    				IndexSearcher is = new IndexSearcher(dirReader);
    				QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
    						new SimpleAnalyzer(Version.LUCENE_47));
    				Query query = parser.parse(q);
    				long start = System.currentTimeMillis();
    				TopDocs hits = is.search(query, 10);
    				long end = System.currentTimeMillis();
    				System.out.println("Found " + hits.totalHits + " document(s) (in "
    						+ (end - start) + " milliseconds) that matched query '" + q + "':");
    				for (ScoreDoc scoreDoc : hits.scoreDocs)
    				{
    					Document doc = is.doc(scoreDoc.doc);
    					System.out.println(doc.get("filename"));
    				}
    			}
    			finally
    			{
    				// Release the reader even when parsing or searching fails.
    				dirReader.close();
    			}
    		}
    		finally
    		{
    			// Release the directory handle opened above.
    			dir.close();
    		}
    	}
    }
    

      

    maven的配置如下:

     

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    	<!-- Maven build descriptor for the Indexer / Searcher example classes. -->
    	<modelVersion>4.0.0</modelVersion>
    
    	<!-- Project coordinates; the artifact is packaged as a plain jar. -->
    	<groupId>com.mitchz</groupId>
    	<artifactId>lucence-test</artifactId>
    	<version>0.0.1-SNAPSHOT</version>
    	<packaging>jar</packaging>
    
    	<name>lucence-test</name>
    	<url>http://maven.apache.org</url>
    
    	<properties>
    		<!-- Source files are read as UTF-8 by the build. -->
    		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    	</properties>
    
    	<dependencies>
    		<!-- Test-scoped only; not needed at runtime. -->
    		<dependency>
    			<groupId>junit</groupId>
    			<artifactId>junit</artifactId>
    			<version>3.8.1</version>
    			<scope>test</scope>
    		</dependency>
    		<!-- The three Lucene modules below are all pinned to 4.7.0, matching the
    		     Version.LUCENE_47 constant used in the Java code. -->
    		<!-- Supplies the org.apache.lucene.index / document / search / store
    		     packages imported by the example classes. -->
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-core</artifactId>
    			<version>4.7.0</version>
    		</dependency>
    		<!-- Supplies org.apache.lucene.analysis.core.SimpleAnalyzer. -->
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-analyzers-common</artifactId>
    			<version>4.7.0</version>
    		</dependency>
    		<!-- Supplies org.apache.lucene.queryparser.classic.QueryParser. -->
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-queryparser</artifactId>
    			<version>4.7.0</version>
    		</dependency>
    	</dependencies>
    </project>
    

     

  • 相关阅读:
    Java学习笔记——继承、接口、多态
    Java学习笔记,前两章总结
    网络攻防第十一周总结
    第十一周网络攻防作业
    第十周网络作业
    第九周网络攻防作业
    第八周网络攻防作业
    第七周网络攻防作业
    第六周网络攻防作业
    第五周网络攻防作业
  • 原文地址:https://www.cnblogs.com/new0801/p/6175974.html
Copyright © 2011-2022 走看看