  • Lucene 3.0.2 search

    1. Lucene term frequencies

    Reposted from: http://mxdxm.iteye.com/blog/989031

    Lucene in Action, true to the "in Action" series, is a thoroughly practical book. It devotes most of its pages to query parsing, result tuning, and applications of Lucene, which makes it a good fit for anyone building full-text search. But Lucene's usefulness is by no means limited to search engines. Had I not recently come across an article on using Lucene for term-frequency and document statistics, I might still be hunting for a tool suited to research work. Lucene in fact easily meets the kinds of requirements posed in an information-retrieval course, for example:

    * Statistics: implement the following functions (a minimal API sketch follows the list) *

    (1) Count a term's document frequency (DF) across the whole collection;

    (2) Count a term's total number of occurrences across the whole collection (collection term frequency);

    (3) Count a term's frequency within a given document (term frequency, TF);

    (4) List the positions at which a term occurs in a given document;

    (5) Count the number of documents in the whole collection.
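
    A minimal sketch (not from the original post) mapping the five statistics above onto the Lucene 3.x IndexReader API; the field name "contents" and the sample term are placeholders:

    package lia.meetlucene;
    
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermDocs;
    import org.apache.lucene.index.TermPositions;
    
    public class TermStatsSketch {
        public static void print(IndexReader reader, String field, String text) throws Exception {
            Term t = new Term(field, text);
            System.out.println("(1) document frequency = " + reader.docFreq(t));
            long collectionTf = 0;
            TermDocs docs = reader.termDocs(t);
            while (docs.next()) {
                System.out.println("(3) tf in doc " + docs.doc() + " = " + docs.freq());
                collectionTf += docs.freq(); // (2) summed over every document
            }
            System.out.println("(2) collection term frequency = " + collectionTf);
            TermPositions tp = reader.termPositions(t);
            while (tp.next()) {
                for (int i = 0; i < tp.freq(); i++) {
                    System.out.println("(4) position = " + tp.nextPosition());
                }
            }
            System.out.println("(5) documents in collection = " + reader.numDocs());
        }
    }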

    Another reference: http://www.360doc.com/content/11/0427/03/1947337_112596569.shtml

    package lia.meetlucene;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.Date;
    
    
    import org.apache.lucene.index.IndexReader;
    
    import org.apache.lucene.index.TermEnum;
    import org.apache.lucene.index.TermPositions;
    
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    
    public class Searchnum {
    
        //static final Log log = LogFactory.getLog(Statistic.class);
    
        public static void printIndex(IndexReader reader) throws Exception {
    /*
            // print the number of documents
             System.out.println(new Date() + "\n");
             System.out.println(reader + "\t该索引共含 " + reader.numDocs() + "篇文档\n");
    
            for (int i = 0; i < reader.numDocs(); i++) {
                 System.out.println("文档" + i + ":" + reader.document(i) + "\n");
            }
    */
            // enumerate the terms, printing <document, term freq, position*> for each
            TermEnum termEnum = reader.terms();
            while (termEnum.next()) {
                 System.out.println("n" + termEnum.term().field() + "域中出现的词语:"
                        + termEnum.term().text());
                 System.out.println(" 出现该词的文档数=" + termEnum.docFreq());
    
                TermPositions termPositions = reader.termPositions(termEnum.term());
                int i = 0;
                int j = 0;
                while (termPositions.next()) {
                     System.out.println("n" + (i++) + "->" + "    文章编号:"
                            + termPositions.doc() + ", 出现次数:"
                            + termPositions.freq() + "    出现位置:");
                    for (j = 0; j < termPositions.freq(); j++)
                         System.out.println("[" + termPositions.nextPosition() + "]");
                     System.out.println("n");
                }
    
            }
    
        }
    /*
        public static void main(String args[]) throws Exception {
            // String index = ReadConfig.getPara("indexdir");
    
            IndexReader reader = IndexReader.open(index);
            printIndex(reader);
    
        }*/
    
        public static void main(String[] args) throws Exception {
            if (args.length != 1) {
                throw new IllegalArgumentException("Usage: java "
                        + Searchnum.class.getName() + " <index dir>");
            }
    
            String indexDir = args[0]; // 1 index directory path
    
            Directory dir = FSDirectory.open(new File(indexDir)); // 2 open the index
            IndexSearcher search = new IndexSearcher(dir);
            IndexReader reader = search.getIndexReader();
            printIndex(reader);
            
        }
    }

     Result:

    contents域中出现的词语:精神
     出现该词的文档数=1
    0->    文章编号:0, 出现次数:1    出现位置:
    [388]
    
    contents域中出现的词语:繁荣
     出现该词的文档数=1
    0->    文章编号:0, 出现次数:3    出现位置:
    [254]
    [353]
    [450]
    
    contents域中出现的词语:给予
     出现该词的文档数=1
    0->    文章编号:0, 出现次数:1    出现位置:
    [85]

    Reference: http://hanyuanbo.iteye.com/blog/812847

    2. Counting term occurrences (the terms depend on the analyzer used at indexing time)

    package lia.meetlucene;
    
    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermDocs;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    public class Searchnumber {
    
        public static void main(String[] args) throws CorruptIndexException,
                IOException {
            String indexDir = args[0]; // 1 index directory path
            String q = args[1]; // 2 the keyword to count
    
            search(indexDir, q);
        }
    
        public static void search(String indexDir, String keyword) {
            try {
                Directory dir = FSDirectory.open(new File(indexDir)); // 3 open the index
                IndexSearcher is = new IndexSearcher(dir, true);
                IndexReader reader = is.getIndexReader();
                int num = reader.numDocs();
                for (int i = 0; i < num; i++) {
                    Document doc = reader.document(i);
                    System.out.println(doc);
                }
    
                Term term = new Term("contents", keyword);
                TermDocs docs = reader.termDocs(term);
                while (docs.next()) {
                    // System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
                    System.out.print("doc num\t" + docs.doc() + "\t");
                    System.out.println("frequency:\t" + docs.freq());
                }
                reader.close();
                is.close();
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

     Result:

    Document<stored,indexed<filename:commentbyme.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\commentbyme.txt>>
    Document<stored,indexed<filename:gettrendweek.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\gettrendweek.txt>>
    Document<stored,indexed<filename:no.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\no.txt>>
    Document<stored,indexed<filename:showuser.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\showuser.txt>>
    Document<stored,indexed<filename:suggestionusermayinst.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\suggestionusermayinst.txt>>
    doc num    0    frequency:    15
    doc num    2    frequency:    2
    doc num    3    frequency:    1
    doc num    4    frequency:    30
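
    Document 1 is absent from the listing simply because the keyword never occurs in it. When only the number of matching documents is wanted, IndexReader.docFreq gives it in one call, without the TermDocs loop; a one-line sketch reusing the reader, field, and keyword from the code above:

    // Sketch: the document frequency of the keyword, without iterating postings.
    int df = reader.docFreq(new Term("contents", keyword));
    System.out.println("document frequency: " + df);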

    3. Counting the number of documents that contain a keyword

    package lia.meetlucene;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.Date;
    
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.CorruptIndexException;
    
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    
    import org.apache.lucene.search.TopScoreDocCollector;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class Searchnumbers {
    
        public static void main(String[] args) throws CorruptIndexException,
                IOException {
            String indexDir = args[0]; // 1 index directory path
            String q = args[1]; // 2 query string to parse
    
            search(indexDir, q);
        }
    
        public static void search(String indexDir, String keyword) {
            try {
                Directory dir = FSDirectory.open(new File(indexDir)); // 3 open the index
                IndexSearcher is = new IndexSearcher(dir, true);
                QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                        new StandardAnalyzer(Version.LUCENE_30));// the Document field the QueryParser runs against
                Query query = parser.parse(keyword);
                TopScoreDocCollector collector = TopScoreDocCollector.create(100,
                        false);
    
                long start = new Date().getTime();
                is.search(query, collector);// run the query; results are gathered in the TopScoreDocCollector
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
    
                System.out.println(hits.length);
                for (int i = 0; i < hits.length; i++) {
                    Document doc = is.doc(hits[i].doc);
                    System.out.println(doc.getField("filename") + "\t"
                            + hits[i].toString());// the filename field of the matching doc
                }
                is.close();
                long end = new Date().getTime();
    
                System.out.println("Found " + collector.getTotalHits()
                        + " document(s) (in " + (end - start)
                        + " milliseconds) that matched query '" + keyword + "'");
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
    
    }

    4. Counting a keyword's occurrences among the indexed terms (same as 2)

    package lia.meetlucene;
    
    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermDocs;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    public class Searchnumber {
    
        public static void main(String[] args) throws CorruptIndexException,
                IOException {
            String indexDir = args[0]; // 1 index directory path
            String q = args[1]; // 2 the keyword to count
    
            search(indexDir, q);
        }
    
        public static void search(String indexDir, String keyword) {
            try {
                Directory dir = FSDirectory.open(new File(indexDir)); // 3 open the index
                IndexSearcher is = new IndexSearcher(dir, true);
                IndexReader reader = is.getIndexReader();
                int num = reader.numDocs();
                for (int i = 0; i < num; i++) {
                    Document doc = reader.document(i);
                    System.out.println(doc);
                }
    
                Term term = new Term("contents", keyword);
                TermDocs docs = reader.termDocs(term);
                while (docs.next()) {
                    System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
                    System.out.print("doc num\t" + docs.doc() + "\t");
                    System.out.println("frequency:\t" + docs.freq());
                }
                reader.close();
                is.close();
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

     Result:

    Document<stored,indexed<filename:texthz.txt> stored,indexed<fullpath:E:\xdj\weibodata\text\texthz.txt>>
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    doc num    0    frequency:    27254

     5. Searching documents for a keyword and printing the matching file names

    package lia.meetlucene;
    
    /**
     * Copyright Manning Publications Co.
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermEnum;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.Version;
    
    import java.io.File;
    import java.io.IOException;
    
    // From chapter 1
    
    /**
     * This code was originally written for Erik's Lucene intro java.net article
     */
    public class Searcher {
    
        public static void main(String[] args) throws IllegalArgumentException,
                IOException, ParseException {
            if (args.length != 2) {
                throw new IllegalArgumentException("Usage: java "
                        + Searcher.class.getName() + " <index dir> <query>");
            }
    
            String indexDir = args[0]; // 1 index directory path
            String q = args[1]; // 2 query string to parse
    
            search(indexDir, q);
        }
    
        public static void search(String indexDir, String q) throws IOException,
                ParseException {
            // ////////////////////////////////////////////////////////////////////////////////
            Directory dir = FSDirectory.open(new File(indexDir)); // 3 open the index
            IndexSearcher is = new IndexSearcher(dir); // 3
            IndexReader reader = is.getIndexReader();
            /*
             * QueryParser parser = new QueryParser(Version.LUCENE_30, // 4 parse the query string
             * "contents", //4 new StandardAnalyzer( //4 Version.LUCENE_30)); //4
             */
            QueryParser parser = new QueryParser(Version.LUCENE_30, // 4 parse the query string
                    "contents", // 4
                     new SmartChineseAnalyzer(Version.LUCENE_30)); // 4
            Query query = parser.parse(q); // 4
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10); // 5 search the index
            // Hits hits = is.search(query);
            long end = System.currentTimeMillis();
    
            System.err.println("Found " + hits.totalHits + // 6记录索引状态
                    " document(s) (in " + (end - start) + // 6
                    " milliseconds) that matched query '" + // 6+reader.docFreq(new
                                                            // Term("雨"))
                    q + "':");
    
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc); // 7 retrieve the matching document
    
                System.out.println(doc.get("fullpath") + "  " + scoreDoc.doc); // 8 display the file name
            }
            // ////////////////////////////////////////////////////////////////////////////////
            is.close(); // 9
        }
    }
    
    /*
     * #1 Parse provided index directory #2 Parse provided query string #3 Open
     * index #4 Parse query #5 Search index #6 Write search stats
     * 
     * #7 Retrieve matching document #8 Display filename #9 Close IndexSearcher
     */

     Result:

    Found 1 document(s) (in 16 milliseconds) that matched query '雨钝':
    E:\xdj\weibodata\text\texthz.txt  0
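
    Because QueryParser runs the query string through SmartChineseAnalyzer, what is actually matched are the analyzed tokens of '雨钝' rather than the literal string. To look up a single indexed token exactly, with no analysis, a TermQuery is an alternative; a minimal sketch (the token text is only an illustration, and `is` is the IndexSearcher from the code above):

    // Sketch: match one indexed token exactly, bypassing the analyzer.
    // Requires import org.apache.lucene.search.TermQuery;
    Query exact = new TermQuery(new Term("contents", "雨"));
    TopDocs exactHits = is.search(exact, 10);
    System.out.println("exact-token hits: " + exactHits.totalHits);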

    6. Sorted output of query results

    package lia.meetlucene;
    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.Field.Index;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriter.MaxFieldLength;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.Searcher;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.TopFieldDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.LockObtainFailedException;
    import org.apache.lucene.util.Version;
    public class Searchnumbers {
        /**
         * Build the index.<br>
         * Four Documents in total, each with two fields, text and size: text holds the content, size is used for sorting.
         * 
         * @throws CorruptIndexException
         * @throws LockObtainFailedException
         * @throws IOException
         */
        private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {
            IndexWriter writer = new IndexWriter(FSDirectory.open(new File("index")), new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
            Document document = new Document();
            document.add(new Field("text", "google", Store.YES, Index.ANALYZED));
            document.add(new Field("size", "1", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
            writer.addDocument(document);
            document = new Document();
            document.add(new Field("text", "google earth apache", Store.YES, Index.ANALYZED));
            document.add(new Field("size", "2", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
            writer.addDocument(document);
            document = new Document();
            document.add(new Field("text", "baidu earth", Store.YES, Index.ANALYZED));
            document.add(new Field("size", "3", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
            writer.addDocument(document);
            document = new Document();
            document.add(new Field("text", "baidu earth apache", Store.YES, Index.ANALYZED));
            document.add(new Field("size", "4", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        }
        /**
         * Lucene 3.0 no longer has a search method that returns Hits; search with the TopDocs-returning method instead.
         * 
         * @param keyword
         *            the keyword to search for
         * @throws CorruptIndexException
         * @throws IOException
         * @throws ParseException
         */
        private static void searchWithTopDocs(String keyword) throws CorruptIndexException, IOException, ParseException {
            QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
            IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
            TopDocs topDocs = searcher.search(parser.parse(keyword), 10);// take the top 10 results, or all of them if there are fewer
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;// the ScoreDocs
            System.out.println("hits:" + topDocs.totalHits);
            for (ScoreDoc scoreDoc : scoreDocs) {
                int docNum = scoreDoc.doc;// document number
                Document doc = searcher.doc(docNum);
                String text = doc.get("text");
                String size = doc.get("size");
                float score = scoreDoc.score;// relevance score
                System.out.println(text + " " + size + " " + score);
            }
        }
        /**
         * Search with the hits sorted; this likewise returns TopFieldDocs rather than Hits.
         * 
         * @param keyword
         *            the keyword to search for
         * @throws CorruptIndexException
         * @throws IOException
         * @throws ParseException
         */
        private static void searchWithSort(String keyword) throws CorruptIndexException, IOException, ParseException {
            QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
            Searcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
            Query query = parser.parse(keyword);
            SortField sortField = new SortField("size", SortField.INT, true);// the field to sort by
            TopFieldDocs topFieldDocs = searcher.search(query, null, 10, new Sort(sortField));// the second argument is a Filter, not needed in this example
            ScoreDoc[] socDocs = topFieldDocs.scoreDocs;
            System.out.println("hits:" + topFieldDocs.totalHits);
            for (ScoreDoc scoreDoc : socDocs) {
                int docNum = scoreDoc.doc;
                Document doc = searcher.doc(docNum);
                String text = doc.get("text");
                String size = doc.get("size");
                float score = scoreDoc.score;// score; not computed here, all values are NaN
                System.out.println(text + " " + size + " " + score);
            }
        }
        public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
            build();
            String keyword = "google";
            searchWithTopDocs(keyword);
            System.out.println("---------");
            searchWithSort(keyword);
        }
    }

     Result:

    hits:2
    google 1 1.287682
    google earth apache 2 0.643841
    ---------
    hits:2
    google earth apache 2 NaN
    google 1 NaN
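
    The NaN scores in the sorted run are expected: when a Sort is supplied, Lucene 3.x skips score computation by default. If scores are wanted alongside the sort order, IndexSearcher.setDefaultFieldSortScoring should re-enable tracking; a hedged sketch against the searchWithSort setup above:

    // Sketch: compute scores even when sorting by a field.
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
    searcher.setDefaultFieldSortScoring(true, true); // track per-doc scores and the max score
    TopFieldDocs topFieldDocs = searcher.search(query, null, 10, new Sort(sortField));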

    7. Highlighting keywords with Lucene

    package lia.meetlucene;
    import java.io.File;
    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.Field.Index;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriter.MaxFieldLength;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleFragmenter;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.LockObtainFailedException;
    import org.apache.lucene.util.Version;
    public class Searchnum {
        /**
         * Since 3.0 Lucene has replaced its old tokenization API with a new one.<br>
         * Using SmartChineseAnalyzer as the example, this method shows how to tokenize text and read back the resulting terms.
         * 
         * @throws Exception
         */
        public static void analysis() throws Exception {
            Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
            String string = "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?";
            StringReader reader = new StringReader(string);
            TokenStream ts = analyzer.tokenStream("", reader);
            TermAttribute termAttribute = ts.getAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                System.out.print(termAttribute.term() + "  ");
            }
            System.out.println();
        }
        /**
         * Build the index.<br>
         * The IndexWriter constructor now requires a Directory argument.
         * 
         * @throws CorruptIndexException
         * @throws LockObtainFailedException
         * @throws IOException
         */
        private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {
            String path = "index";
            IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new SmartChineseAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
            Document document = new Document();
            document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?", Store.YES, Index.ANALYZED));
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        }
        /**
         * Searching likewise no longer has a method that returns Hits.
         * 
         * @param keyword
         * @throws CorruptIndexException
         * @throws IOException
         * @throws ParseException
         * @throws InvalidTokenOffsetsException
         */
        private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
            Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
            QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer);
            IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
            Query query = parser.parse(keyword);
            System.out.println(query);
            TopDocs topDocs = searcher.search(query, 10);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("hits:" + topDocs.totalHits);
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                String text = doc.get("text");
                System.out.println(highlight(text, query, analyzer));
            }
        }
        /**
         * Highlight the keywords.
         * 
         * @param content
         *            the text to highlight
         * @param query
         *            the Query object used for the search
         * @param analyzer
         *            the analyzer
         * @return the highlighted text
         * @throws IOException
         * @throws InvalidTokenOffsetsException
         */
        private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(25));
            String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content);
            return resultString + "...";
        }
        public static void main(String[] args) throws Exception {
            analysis();
            build();
            search("人民币");
        }
    }

     Result:

    中国  人民  银行  采取  了  一  系列  措施  防止  人民币  升值  但是  很  遗憾  这些  措施  在  今天  看来  其  作用  是  微乎其微  的  难道  真  的  就  没有  什么  别的  措施  防止  人民币  再次  疯狂  升值  了  吗  
    text:人民币
    hits:1
    中国人民银行采取了一系列措施防止<b>人民币</b>升值,但是...
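
    getBestFragment returns only the single best 25-character window, hence the trailing "...". The Highlighter also offers getBestFragments for several snippets per document; a sketch under the same setup (the fragment count and separator are arbitrary choices):

    // Sketch: join up to three highlighted fragments instead of printing one.
    TokenStream ts = analyzer.tokenStream("", new StringReader(content));
    String joined = highlighter.getBestFragments(ts, content, 3, "...");
    System.out.println(joined);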

     8. Printing the top 1,000 most frequent terms after tokenization

    package lia.meetlucene;
    
    import java.io.File;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    
    import java.util.List;
    
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermEnum;
    import org.apache.lucene.index.TermPositions;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    public class Searchnumbers {
    
        // static final Log log = LogFactory.getLog(Statistic.class);
    
        public static void printIndex(IndexReader reader) throws Exception {
            /*
             * // print the number of documents
             * System.out.println(new Date() + "\n");
             * System.out.println(reader + "\t该索引共含 " + reader.numDocs() + "篇文档\n");
             * 
             * for (int i = 0; i < reader.numDocs(); i++) { System.out.println("文档"
             * + i + ":" + reader.document(i) + "\n"); }
             */
            // enumerate the terms, collecting <term, freq> pairs
            TermEnum termEnum = reader.terms();
    
            List<Person_Term> listA = new ArrayList<Person_Term>();
    
            while (termEnum.next()) {
                Person_Term pa = new Person_Term();
                pa.setterm(termEnum.term().text());
                TermPositions termPositions = reader.termPositions(termEnum.term());
                // only the first posting is read, so the frequency recorded here is the
                // term's count in the first document that contains it
                termPositions.next();
                pa.setfreq(termPositions.freq());
                listA.add(pa);
            }
            Collections.sort(listA, new Comparator<Person_Term>() {
    
                public int compare(Person_Term arg0, Person_Term arg1) {
    
                    return arg1.getfreq().compareTo(arg0.getfreq());
    
                }
    
            });
            int i = 0;
            for (Person_Term p : listA) {
                i++;
                System.out.println(p.getterm() + "\t" + p.getfreq());
                if (i >= 1000)
                    break;
            }
    
        }
    
        /*
         * public static void main(String args[]) throws Exception { // String index
         * = ReadConfig.getPara("indexdir");
         * 
         * IndexReader reader = IndexReader.open(index); printIndex(reader);
         * 
         * }
         */
    
        public static void main(String[] args) throws Exception {
            String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin"; // 1 index directory path (hardcoded)
            // String indexDir = "E:/xiaodajun/new/lia2e/src/lia/meetlucene";
    
            Directory dir = FSDirectory.open(new File(indexDir)); // 2 open the index
            IndexSearcher search = new IndexSearcher(dir);
            IndexReader reader = search.getIndexReader();
            printIndex(reader);
    
        }
    }

     The code of Person_Term.java:

    package lia.meetlucene;
    
    public class Person_Term implements Comparable<Person_Term> {
    
        private String term;
    
        private Integer freq;
    
        /**
         * @return the term
         */
        public String getterm() {
            return term;
        }
    
        /**
         * @param term
         *            the term to set
         */
        public void setterm(String term) {
            this.term = term;
        }
    
        /**
         * @return the freq
         */
        public Integer getfreq() {
            return freq;
        }
    
        /**
         * @param freq
         *            the freq to set
         */
        public void setfreq(Integer freq) {
            this.freq = freq;
        }
    
        @Override
        public int compareTo(Person_Term arg0) {
            return this.getfreq().compareTo(arg0.getfreq());
        }
    
    }
    

    Output (excerpt):

    文化    191698
    中国    777
    先达    33
    委员    3
    家中    33
    C:\Users\Administrator\Desktop\xdj\weibo\hanziweibo.txt    1
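
    Note that printIndex reads only the first posting of each term (termPositions.next() is called once), so the frequency it records is the term's count in the first document that contains it, not a collection-wide total. If the ranking should cover the whole collection, the inner block can sum over every posting; a hedged sketch of the replacement:

    // Sketch: total frequency of the current term across all documents.
    TermDocs termDocs = reader.termDocs(termEnum.term());
    int total = 0;
    while (termDocs.next()) {
        total += termDocs.freq();
    }
    pa.setfreq(total);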

     9. Multi-condition search (query combined with a filter)

    package lia.meetlucene;
    
    /**
     * Copyright Manning Publications Co.
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.NumberTools;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermEnum;
    import org.apache.lucene.search.Filter;
    import org.apache.lucene.search.FilteredQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.NumericRangeQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.RangeFilter;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.Version;
    
    import java.io.File;
    import java.io.FileFilter;
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.List;
    
    // From chapter 1
    
    /**
     * This code was originally written for Erik's Lucene intro java.net article
     */
    public class Searcherw {
    
        public static void main(String[] args) throws IllegalArgumentException,
                IOException, ParseException {
    
    
            // String indexDir = args[0]; // 1 index directory path
            // String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin";
            String indexDir = "E:/xdj/tengxunsuoying";
            String q = "雨天";// args[1]; // 2 the query string
    
            search(indexDir, q);
        }
    
        public static void search(String indexDir, String q) throws IOException,
                ParseException {
            // ////////////////////////////////////////////////////////////////////////////////
            Directory dir = FSDirectory.open(new File(indexDir)); // 3 open the index
            IndexSearcher is = new IndexSearcher(dir); // 3
            IndexReader reader = is.getIndexReader();
            QueryParser parser = new QueryParser(Version.LUCENE_30, // 4 parse the query string
                    "context", // 4
                    //new StandardAnalyzer(Version.LUCENE_30)); // 4
                    //new CJKAnalyzer(Version.LUCENE_30));
                    new SmartChineseAnalyzer(Version.LUCENE_30));
            
            
            // SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            // Filter filter = new FileFilter("time", sdf.parse("2005-10-1"), sdf.parse("2005-10-30"));
            Filter filter = new RangeFilter("time", "20141001", "20141031", true, true); // term range over the yyyyMMdd time field
            // NumericRangeQuery rangeQuery = NumericRangeQuery.newDoubleRange("carPrice", st, ed, true, true);
    
            Query query = parser.parse(q); // 4
    
            query = new FilteredQuery(query, filter); // search with the filter applied
            long start = System.currentTimeMillis();
    
            // is.search(query, filter, n, sort)
    
            // TopDocs hits = is.search(query, 10, new Sort(new SortField("time", SortField.STRING, true))); // 5 search, sorted by time descending
            TopDocs hits = is.search(query, 10); // 5 search the index
            long end = System.currentTimeMillis();
    
            System.err.println("Found " + hits.totalHits + // 6 report search stats
                    " document(s) (in " + (end - start) + // 6
                    " milliseconds) that matched query '" + // 6
                    q + "':");
    
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc); // 7 retrieve the matching document
    
                System.out.println(doc.get("time") + "  " + doc.get("context")); // 8 display time and content
            }
            // ////////////////////////////////////////////////////////////////////////////////
            is.close(); // 9
        }
            
        
    }
    
    /*
     * A reference snippet for numeric range search (note: it uses the Lucene 4.x
     * DirectoryReader API, unlike the 3.0 code above):
     *
    public List<Document> rangeSearch() {
        List<Document> docList = new ArrayList<Document>();
        Double start = 20.0;
        Double end = 40.0;
        NumericRangeQuery rangeQuery = NumericRangeQuery.newDoubleRange("carPrice", start, end, true, true);
        try {
            directory = FSDirectory.open(new File(LuceneConstant.INDEX_PATH)); // open the index
            IndexReader reader = DirectoryReader.open(directory); // open a reader on the directory
            IndexSearcher search = new IndexSearcher(reader); // initialize the searcher
            TopDocs td = search.search(rangeQuery, 10000); // the docids of matching documents
            for (ScoreDoc doc : td.scoreDocs) {
                docList.add(search.doc(doc.doc));
            }
            reader.close(); // release resources
            directory.close(); // close the directory
        } catch (IOException ex) {
            Logger.getLogger(LuceneDao.class.getName()).log(Level.SEVERE, null, ex);
        }
        return docList;
    }
     */
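
    RangeFilter compares terms lexicographically, which works here only because the time field stores fixed-width yyyyMMdd strings. Had the field been indexed numerically, Lucene 3.0's NumericRangeQuery could express the same date window as a true numeric range; a hedged sketch (the NumericField indexing shown in the comment is an assumption, not part of the code above):

    // Sketch: numeric date range. Assumes the time field was indexed as, e.g.,
    // doc.add(new NumericField("time").setIntValue(20141001)); rather than as a string.
    Query range = NumericRangeQuery.newIntRange("time", 20141001, 20141031, true, true);
    TopDocs rangeHits = is.search(range, 10);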
  • Original post: https://www.cnblogs.com/XDJjy/p/4337663.html