zoukankan      html  css  js  c++  java
  • lucene做简单的文件索引

    package com.mylucene;
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.Reader;
    import java.nio.CharBuffer;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.lucene.LucenePackage;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class MyLuceneTest {
        
        /**
         * Rebuilds the index in {@code directory} from the given items.
         *
         * <p>All existing documents are deleted first. Each item becomes one document:
         * every declared field of the item is read through its {@code getXxx()} accessor
         * and added as a stored text field named after the Java field. Properties whose
         * value is {@code null} are skipped, because a Lucene field cannot hold a null value.
         *
         * @param analyzer  analyzer used to tokenize the field values
         * @param directory directory the index is written to
         * @param items     items to index
         * @return {@code true} if every item was added and the writer was closed without
         *         error; {@code false} otherwise (the stack trace is printed)
         */
        private boolean buildIndexer(Analyzer analyzer, Directory directory, List<Item> items) {
            IndexWriter iwriter = null;
            boolean success = false;
            try {
                // Configure the writer.
                iwriter = new IndexWriter(directory, new IndexWriterConfig(
                        Version.LUCENE_47, analyzer));
                // Start from an empty index so repeated runs do not accumulate duplicates.
                iwriter.deleteAll();
                for (Item item : items) {
                    Document doc = new Document();
                    for (java.lang.reflect.Field field : item.getClass().getDeclaredFields()) {
                        // Compiler/tool generated fields have no accessor; ignore them.
                        if (field.isSynthetic()) {
                            continue;
                        }
                        String fieldName = field.getName();
                        String getMethodName = "get" + toFirstLetterUpperCase(fieldName);
                        Object value = item.getClass().getMethod(getMethodName).invoke(item);
                        if (value == null) {
                            continue;
                        }
                        doc.add(new Field(fieldName, value.toString(), TextField.TYPE_STORED));
                    }
                    iwriter.addDocument(doc);
                }
                success = true;
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // The writer is null when its constructor threw; closing it then would
                // raise a NullPointerException that hides the original failure.
                if (iwriter != null) {
                    try {
                        iwriter.close();
                    } catch (IOException e) {
                        // close() commits the pending changes, so a failure here means
                        // the index was not written completely.
                        e.printStackTrace();
                        success = false;
                    }
                }
            }
            return success;
        }
        
        /**
         * Searches the index for {@code keyword} across every declared field of {@link Item}.
         *
         * <p>At most 10 hits are returned. Each hit is converted back into an {@link Item}
         * by calling the {@code setXxx(String)} mutator for every field with the value
         * stored in the matching document.
         *
         * <p>Note: this method closes {@code directory} before returning, whether or not
         * the search succeeded.
         *
         * @param analyzer  analyzer used to parse the query text
         * @param directory directory holding the index; closed by this method
         * @param keyword   query text, in Lucene classic query-parser syntax
         * @return the matching items (possibly empty), or {@code null} if the search
         *         failed (the stack trace is printed)
         */
        public List<Item> searchIndexer(Analyzer analyzer, Directory directory, String keyword) {
            DirectoryReader ireader = null;
            List<Item> result = new ArrayList<Item>();
            try {
                // Open the index for reading.
                ireader = DirectoryReader.open(directory);
                IndexSearcher isearcher = new IndexSearcher(ireader);

                // Search across all (non-synthetic) fields declared on Item.
                List<String> fieldNames = new ArrayList<String>();
                for (java.lang.reflect.Field field : Item.class.getDeclaredFields()) {
                    if (!field.isSynthetic()) {
                        fieldNames.add(field.getName());
                    }
                }
                String[] multiFields = fieldNames.toArray(new String[fieldNames.size()]);
                MultiFieldQueryParser parser = new MultiFieldQueryParser(
                        Version.LUCENE_47, multiFields, analyzer);

                // Parse the query text and fetch the top 10 hits.
                Query query = parser.parse(keyword);
                ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;

                for (ScoreDoc hit : hits) {
                    Document hitDoc = isearcher.doc(hit.doc);
                    Item item = new Item();
                    for (String field : multiFields) {
                        String setMethodName = "set" + toFirstLetterUpperCase(field);
                        item.getClass().getMethod(setMethodName, String.class)
                                .invoke(item, hitDoc.get(field));
                    }
                    result.add(item);
                }
            } catch (Exception e) {
                e.printStackTrace();
                return null;
            } finally {
                // The reader is null when opening the index failed; guard against a
                // NullPointerException escaping from this finally block.
                if (ireader != null) {
                    try {
                        ireader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // Closed separately so a failing reader close cannot leak the directory.
                if (directory != null) {
                    try {
                        directory.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            return result;
        }
        
        /**
         * Returns {@code str} with its first character converted to upper case.
         *
         * <p>Used to derive accessor names ({@code "title"} becomes {@code "Title"}), so the
         * conversion is locale-independent: with the default locale, a Turkish environment
         * would turn {@code "id"} into {@code "İd"} and the accessor lookup would fail.
         *
         * @param str the text to capitalize; may be {@code null}
         * @return the capitalized text, or {@code str} itself when it is {@code null} or empty
         */
        public static String toFirstLetterUpperCase(String str) {
            if (str == null || str.length() == 0) {
                return str;
            }
            return str.substring(0, 1).toUpperCase(java.util.Locale.ROOT) + str.substring(1);
        }
        
        /**
         * Demo entry point: indexes every regular file in {@code C:/mylucene} into an
         * on-disk index at {@code c:/lucene}, then searches that index for "中国" and
         * prints the hits.
         *
         * @param args unused
         * @throws Exception if a data file cannot be read or the index directory cannot
         *                   be opened
         */
        public static void main(String[] args) throws Exception {
            System.out.println(LucenePackage.get());
            MyLuceneTest demo = new MyLuceneTest();
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            // Alternative analyzer: new SmartChineseAnalyzer(Version.LUCENE_47)

            File dataFile = new File("C:/mylucene");
            File[] dataFiles = dataFile.listFiles();
            // listFiles() returns null when the path is missing or is not a directory.
            if (dataFiles == null) {
                System.err.println("Cannot list data directory: " + dataFile);
                return;
            }

            List<Item> items = new ArrayList<Item>();
            for (File current : dataFiles) {
                // Sub-directories cannot be opened with a FileReader; skip them.
                if (!current.isFile()) {
                    continue;
                }
                String str = readFileContent(current);
                System.out.println(str);
                items.add(new Item(current.getCanonicalPath(), current.getName(), str));
            }

            // Store the index on disk.
            File file = new File("c:/lucene");
            Directory directory = FSDirectory.open(file);
            if (!demo.buildIndexer(analyzer, directory, items)) {
                System.err.println("Failed to build the index.");
                directory.close();
                return;
            }

            // searchIndexer returns null on failure.
            List<Item> result = demo.searchIndexer(analyzer, directory, "中国");
            if (result == null) {
                System.err.println("Search failed.");
                return;
            }
            for (Item item : result) {
                System.out.println(item.toString());
            }
        }

        /**
         * Reads the whole file as text using the platform default charset.
         *
         * <p>The content is accumulated from the number of characters each read actually
         * returned, so the result is neither truncated to one buffer nor padded with
         * unused buffer slots. The reader is always closed.
         */
        private static String readFileContent(File file) throws IOException {
            Reader reader = new FileReader(file);
            try {
                StringBuilder content = new StringBuilder();
                char[] buffer = new char[8192];
                int count;
                while ((count = reader.read(buffer)) != -1) {
                    content.append(buffer, 0, count);
                }
                return content.toString();
            } finally {
                reader.close();
            }
        }
    }

    package com.mylucene;
    /**
     * Simple mutable bean describing one indexed entry: an identifier, a title and the
     * text content.
     *
     * <p>MyLuceneTest enumerates this class's declared fields reflectively and calls the
     * matching {@code getXxx()} / {@code setXxx(String)} methods, so every field must be
     * a {@code String} with both accessors.
     */
    public class Item {

        private String id;
        private String title;
        private String content;

        /** Creates an item with all properties unset ({@code null}). */
        public Item() {
        }

        /**
         * Creates a fully populated item.
         *
         * @param id      identifier of the entry
         * @param title   title of the entry
         * @param content text content of the entry
         */
        public Item(String id, String title, String content) {
            this.id = id;
            this.title = title;
            this.content = content;
        }

        /** @return the identifier */
        public String getId() {
            return this.id;
        }

        /** @param id the identifier to set */
        public void setId(String id) {
            this.id = id;
        }

        /** @return the title */
        public String getTitle() {
            return this.title;
        }

        /** @param title the title to set */
        public void setTitle(String title) {
            this.title = title;
        }

        /** @return the text content */
        public String getContent() {
            return this.content;
        }

        /** @param content the text content to set */
        public void setContent(String content) {
            this.content = content;
        }

        /** @return the item rendered as {@code [id=...,title=...,content=...]} */
        @Override
        public String toString() {
            return "[id=" + id + ",title=" + title + ",content=" + content + "]";
        }
    }

    这里是将文件的三个属性进行了一下抽象,并且用另一个类去表示。在以前的版本中是运用Reader读取文件,并且在将文件加入索引的时候直接把Reader对象加入,不需要先将文件内容全部读出再进行封装。

    这里的问题是:文件很大的时候内存将会存不下,可能导致内存不足或者数组越界。这里应该还可以像以前的版本一样直接对文件建立索引,我想是我还没有找到好的解决方法,所以应该多研究一下4.8的api。

    

    版权声明:本文为博主原创文章,未经博主同意,不得转载。

  • 相关阅读:
    推荐系统之推荐系统的分类,即分析框架
    问题解决——OpenGL超级宝典 关于gltDrawTorus的错误解决
    SICP 习题 (1.10)解题总结
    [置顶] android 自定义圆角ImageView以及锯齿的处理
    Codeforces Round #199 (Div. 2)
    数组——约瑟夫问题
    素数距离问题_ny_24.java
    [置顶] Guava学习之Splitter
    植物-常见植物:地黄、熟地黄
    植物-常见植物:仙人掌
  • 原文地址:https://www.cnblogs.com/hrhguanli/p/4727078.html
Copyright © 2011-2022 走看看