zoukankan      html  css  js  c++  java
  • PyLucene索引DEMO

    # coding:utf-8
    '''
    对doc目录里的所有文件建立索引,索引域主要有name,path,contents
    '''
    import sys, os
    import lucene
    from lucene import SimpleFSDirectory,Document,File, Field,
         StandardAnalyzer, IndexWriter, Version
    from datetime import datetime
    
    
    lucene.initVM() ############1
    print 'lucene',lucene.VERSION
    start = datetime.now()
    indexDir = './index'
    docDir = './doc'
    try :
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) ###########2
        INDEXDIR = SimpleFSDirectory(File(indexDir))  ############3
        indexWriter = IndexWriter(INDEXDIR, analyzer, True,IndexWriter.MaxFieldLength.LIMITED)#####4
        
        for root, dirnames, filenames in os.walk(docDir):
            for filename in filenames:
                print filename
                if not filename.endswith('.txt'):
                    continue
                path = os.path.join(root,filename)
                path = os.path.abspath(os.path.normpath(path))
                with open(path,'r') as c:
                    contents = unicode(c.read(),'utf-8')
                    #print contents
                    
                doc = Document()###5
                nameField = Field('name', filename, Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(nameField)###6
                pathField = Field('path', path, Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(pathField)
                contentsField = Field('contents', contents, Field.Store.NO, Field.Index.ANALYZED)
                doc.add(contentsField)
    
                indexWriter.addDocument(doc, analyzer)####6
    
        indexWriter.optimize()#######7
        indexWriter.close()######8
        end = datetime.now()
        print '建立索引花费时间:', (end-start)
    except Exception,e:
        print e
    
    #下面的内容为解释
    '''
           Field.Store.YES:存储字段值(未分词前的字段值)
           Field.Store.NO:不存储,存储与索引没有关系
           Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损
    
           Field.Index.ANALYZED:分词建索引
           Field.Index.ANALYZED_NO_NORMS:分词建索引,但不保存norms(norms在每个文档的每个索引域上占用一个byte),这样节约空间,代价是索引时的boost和域长度归一化失效
           Field.Index.NOT_ANALYZED:不分词且索引
           Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,同样不保存norms(每个文档的每个索引域可省下一个byte)
    '''
    '''
    流程:
    1  initVM()
    2  StandardAnalyzer
    3  SimpleFSDirectory
    4  IndexWriter
    5  for
          doc = Document()
          doc.add(Field())
          indexWriter.addDocument(doc, analyzer)
    6   indexWriter.optimize()
        indexWriter.close()
    '''
  • 相关阅读:
    继续OI
    [WARNING]考前必读?!
    近些日的总结吧
    续上文
    又是一年NOIP然鹅我考的是高数(虽然我没打并且内容与NOIP无关)(手动滑稽)
    轮船问题(DP基础)
    NOIP2016报零记
    字符数组
    HA-0302 退役
    各种模板(part 2)
  • 原文地址:https://www.cnblogs.com/TianMG/p/3191679.html
Copyright © 2011-2022 走看看