zoukankan html css js c++ java

PyLucene索引DEMO

# coding:utf-8
'''
对doc目录里的所有文件建立索引，索引域主要有name，path，contents
'''
import os
import sys
from datetime import datetime

import lucene
from lucene import (
    Document,
    Field,
    File,
    IndexWriter,
    SimpleFSDirectory,
    StandardAnalyzer,
    Version,
)


lucene.initVM() ############1
print 'lucene',lucene.VERSION
start = datetime.now()
indexDir = './index'
docDir = './doc'
try :
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) ###########2
    INDEXDIR = SimpleFSDirectory(File(indexDir))  ############3
    indexWriter = IndexWriter(INDEXDIR, analyzer, True,IndexWriter.MaxFieldLength.LIMITED)#####4
    
    for root, dirnames, filenames in os.walk(docDir):
        for filename in filenames:
            print filename
            if not filename.endswith('.txt'):
                continue
            path = os.path.join(root,filename)
            path = os.path.abspath(os.path.normpath(path))
            with open(path,'r') as c:
                contents = unicode(c.read(),'utf-8')
                #print contents
                
            doc = Document()###5
            nameField = Field('name', filename, Field.Store.YES, Field.Index.NOT_ANALYZED)
            doc.add(nameField)###6
            pathField = Field('path', path, Field.Store.YES, Field.Index.NOT_ANALYZED)
            doc.add(pathField)
            contentsField = Field('contents', contents, Field.Store.NO, Field.Index.ANALYZED)
            doc.add(contentsField)

            indexWriter.addDocument(doc, analyzer)####6

    indexWriter.optimize()#######7
    indexWriter.close()######8
    end = datetime.now()
    print '建立索引花费时间：', (end-start)
except Exception,e:
    print e

#下面的内容为解释
'''
       Field.Store.YES:存储字段值（未分词前的字段值）
       Field.Store.NO:不存储,存储与索引没有关系
       Field.Store.COMPRESS:压缩存储,用于长文本或二进制，但性能受损

       Field.Index.ANALYZED:分词建索引
       Field.Index.ANALYZED_NO_NORMS:分词建索引，但不保存norms（norms是每个文档的每个索引字段占用一个byte的加权/长度归一化因子），这样节约存储空间，代价是索引期的boost和字段长度归一化失效
       Field.Index.NOT_ANALYZED:不分词且索引
       Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引，且不保存norms（省去每个字段一个byte的归一化因子）
'''
'''
流程：
1  initVM()
2  StandardAnalyzer
3  SimpleFSDirectory
4  IndexWriter
5  for
      doc = Document()
      doc.add(Field())
      indexWriter.addDocument(doc, analyzer)
6   indexWriter.optimize()
    indexWriter.close()
'''

查看全文

相关阅读:
Filebeat Processors对日志数据的处理
 beats直接给es传输日志，自定义索引名
 Elasticsearch:修改fielddata
Elasticsearch：Elasticsearch中的refresh和flush操作指南
 Elasticsearch创建索引(index)及一个文档(document)
Elasticsearch:如何对PDF文件进行搜索
 C++ 类构造函数 & 析构函数~
学习CSS的好地方：CSS Inspiration -- CSS灵感
 css式样里的content
寄存器与cmp，mov，add，sub，IMUL指令

原文地址：https://www.cnblogs.com/TianMG/p/3191679.html