zoukankan      html  css  js  c++  java
  • PyLucene索引DEMO

    # coding:utf-8
    '''
    对doc目录里的所有文件建立索引,索引域主要有name,path,contents
    '''
    import os
    import sys
    from datetime import datetime

    import lucene
    from lucene import (SimpleFSDirectory, Document, File, Field,
                        StandardAnalyzer, IndexWriter, Version)
    
    
    # Index every *.txt file under ./doc into a Lucene index at ./index.
    # Each document gets three fields: name, path (both stored, not analyzed)
    # and contents (analyzed, not stored).

    # Step 1: start the JVM that PyLucene runs on.
    lucene.initVM()
    print('lucene %s' % lucene.VERSION)
    start = datetime.now()
    indexDir = './index'
    docDir = './doc'
    try:
        # Step 2: analyzer used to tokenize the analyzed fields.
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # Step 3: on-disk directory that holds the index files.
        INDEXDIR = SimpleFSDirectory(File(indexDir))
        # Step 4: the third argument (True) asks for a fresh index to be
        # created rather than appending to an existing one.
        indexWriter = IndexWriter(INDEXDIR, analyzer, True,
                                  IndexWriter.MaxFieldLength.LIMITED)
        try:
            for root, dirnames, filenames in os.walk(docDir):
                for filename in filenames:
                    print(filename)
                    # Only plain-text files are indexed.
                    if not filename.endswith('.txt'):
                        continue
                    path = os.path.join(root, filename)
                    path = os.path.abspath(os.path.normpath(path))
                    with open(path, 'r') as c:
                        # Files are expected to be UTF-8 encoded.
                        contents = unicode(c.read(), 'utf-8')

                    # Step 5: build one Lucene document per file.
                    doc = Document()
                    doc.add(Field('name', filename,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED))
                    doc.add(Field('path', path,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED))
                    doc.add(Field('contents', contents,
                                  Field.Store.NO, Field.Index.ANALYZED))

                    # Step 6: hand the document to the writer.
                    indexWriter.addDocument(doc, analyzer)

            # Step 7: merge index segments once everything has been added.
            indexWriter.optimize()
        finally:
            # Step 8: always close the writer, even when indexing failed
            # part-way, so the writer and its index lock are not leaked.
            # Any exception raised here is still reported by the handler
            # below.
            indexWriter.close()
        end = datetime.now()
        print('建立索引花费时间: %s' % (end - start))
    except Exception as e:
        # Top-level boundary of the demo script: report the error and stop.
        print(e)
    
    #下面的内容为解释
    '''
           Field.Store.YES:存储字段值(未分词前的字段值)
           Field.Store.NO:不存储,存储与索引没有关系
           Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损
    
           Field.Index.ANALYZED:分词建索引
           Field.Index.ANALYZED_NO_NORMS:分词建索引,但不保存norms(norms通常为每个文档的每个索引域占用一个byte),这样节约空间,代价是索引期的boost和域长度归一化不再生效
           Field.Index.NOT_ANALYZED:不分词且索引
           Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,且不保存norms(即省去每个文档每个索引域的那一个byte)
    '''
    '''
    流程:
    1  initVM()
    2  StandardAnalyzer
    3  SimpleFSDirectory
    4  IndexWriter
    5  for
          doc = Document()
          doc.add(Field())
          indexWriter.addDocument(doc, analyzer)
    6   indexWriter.optimize()
        indexWriter.close()
    '''
  • 相关阅读:
    面试回忆录(一)
    2013国内IT行业薪资对照表【技术岗】
    腾讯2013笔试题—web前端笔试题 (老题练手)
    Nicholas C. Zakas(JS圣经:JavaScript高级程序设计作者)如何面试前端工程师
    Js中 关于top、clientTop、scrollTop、offsetTop的用法
    JavaScript中的面向对象的讨论(转)
    javascript中的原型理解总结
    关于Javascript语言中this关键字(变量)的用法
    window.clearInterval与window.setInterval的用法(
    JavaScript经典魔力代码
  • 原文地址:https://www.cnblogs.com/TianMG/p/3191679.html
Copyright © 2011-2022 走看看