# coding:utf-8
'''
Build a Lucene index over the .txt files found under the doc directory.

Every indexed document carries three fields: name, path and contents.
'''
import os
import sys
from datetime import datetime

import lucene
from lucene import (SimpleFSDirectory, Document, File, Field,
                    StandardAnalyzer, IndexWriter, Version)

# Step 1: start the embedded JVM; PyLucene requires this before any Lucene
# class is used.
lucene.initVM()
print('%s %s' % ('lucene', lucene.VERSION))

start = datetime.now()
indexDir = './index'
docDir = './doc'

try:
    # Step 2: the analyzer that tokenizes the ANALYZED fields.
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Step 3: the on-disk directory that holds the index.
    INDEXDIR = SimpleFSDirectory(File(indexDir))
    # Step 4: open the writer; the third argument (create=True) replaces any
    # index that already exists in indexDir.
    indexWriter = IndexWriter(INDEXDIR, analyzer, True,
                              IndexWriter.MaxFieldLength.LIMITED)
    try:
        # Step 5: one Lucene document per .txt file, searched recursively.
        for root, dirnames, filenames in os.walk(docDir):
            for filename in filenames:
                # Every file name is echoed, including the ones skipped below.
                print(filename)
                if not filename.endswith('.txt'):
                    continue

                path = os.path.join(root, filename)
                path = os.path.abspath(os.path.normpath(path))

                # Files are expected to be UTF-8; a decoding error aborts the
                # whole run through the outer handler.
                with open(path, 'r') as c:
                    contents = unicode(c.read(), 'utf-8')

                doc = Document()
                # name and path are stored and indexed verbatim (exact match).
                nameField = Field('name', filename,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(nameField)
                pathField = Field('path', path,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(pathField)
                # contents is tokenized for search but not stored in the index.
                contentsField = Field('contents', contents,
                                      Field.Store.NO, Field.Index.ANALYZED)
                doc.add(contentsField)
                indexWriter.addDocument(doc, analyzer)

        # Step 6: merge segments, then close (in the finally clause below).
        indexWriter.optimize()
    finally:
        # Always close the writer so it and its index lock are released even
        # when indexing fails part-way.  Note that close() commits whatever
        # documents were added before the failure.
        indexWriter.close()

    end = datetime.now()
    print('%s %s' % ('建立索引花费时间:', end - start))
except Exception as e:
    # Top-level boundary of the script: report the failure message and let
    # the script finish normally.
    print(e)

# ---------------------------------------------------------------------------
# Explanatory notes
# ---------------------------------------------------------------------------
# Field.Store.YES: store the field value (the original, un-tokenized value).
# Field.Store.NO: do not store the value; storing is independent of indexing.
# Field.Store.COMPRESS: store compressed; meant for long text or binary
#     values, at a performance cost.
# Field.Index.ANALYZED: run the value through the analyzer and index the
#     resulting tokens.
# Field.Index.ANALYZED_NO_NORMS: tokenize and index, but do not keep norms
#     (the one-byte per-field, per-document normalization factor), which
#     saves space.
# Field.Index.NOT_ANALYZED: index the value as a single term, without
#     tokenizing it.
# Field.Index.NOT_ANALYZED_NO_NORMS: index as a single term without
#     tokenizing, and do not keep the one-byte norm.
#
# Workflow:
#   1  initVM()
#   2  StandardAnalyzer
#   3  SimpleFSDirectory
#   4  IndexWriter
#   5  for each file:
#          doc = Document()
#          doc.add(Field())
#          indexWriter.addDocument(doc, analyzer)
#   6  indexWriter.optimize()
#      indexWriter.close()