zoukankan      html  css  js  c++  java
  • 利用whoosh对mongoDB的中文文档建立全文检索

    1、建立索引

    #coding=utf-8
    from __future__ import unicode_literals
    __author__ = 'zh'
    import sys,os
    from whoosh.index import create_in,open_dir
    from whoosh.fields import *
    from jieba.analyse import ChineseAnalyzer
    import pymongo
    import json
    from pymongo.collection import Collection
    from pymongo import database
    
    class CreatIndex:
        """Index the MongoDB ``webdb.pages`` collection into a Whoosh index.

        A page document without an ``indexed`` field is treated as pending.
        Each pending document is added to the Whoosh index and then flagged
        ``indexed: True`` in MongoDB so that it is not picked up again.
        """

        def __init__(self, host='192.168.229.128', port=27017,
                     index_dir="../whoosh_index"):
            """Connect to MongoDB and remember where the index is stored.

            :param host: MongoDB host name or address.
            :param port: MongoDB port.
            :param index_dir: directory holding the Whoosh index files.
            """
            self.index_dir = index_dir
            self.mongoClient = pymongo.MongoClient(host, port)
            self.websdb = pymongo.database.Database(self.mongoClient, 'webdb')
            self.pagesCollection = Collection(self.websdb, 'pages')

        @staticmethod
        def _document_fields(row):
            """Map one MongoDB page document onto the index schema fields.

            :param row: mapping with ``_id``, ``name`` and ``information`` and
                optionally ``publish_time`` and ``location``.
            :returns: dict of keyword arguments for ``writer.add_document``.
            :raises KeyError: if a mandatory key is missing from ``row``.
            """
            publish_time = row.get('publish_time')
            # An empty string or 0 is stored as "no date" in the index.
            if str(publish_time) in ('', '0'):
                publish_time = None

            location = ''
            if 'location' in row:
                location = json.dumps(row['location'])

            # ''.join(...) turns a Python 2 byte string into unicode (the
            # module enables unicode_literals); Whoosh expects unicode text.
            return dict(
                U_id=''.join(str(row['_id'])),
                title=row['name'],
                location=''.join(location),
                publish_time=publish_time,
                content=row['information'],
            )

        def BuiltIndex(self):
            """Index every pending page and commit the result.

            Processing stops at the first document that fails. Whatever was
            added before the failure is still committed, because those
            documents are already flagged as indexed in MongoDB. A summary
            line is printed on every exit path.
            """
            analyzer = ChineseAnalyzer()
            # Index schema.
            schema = Schema(
                U_id=ID(stored=True),
                title=TEXT(stored=True, analyzer=analyzer),
                location=TEXT(stored=True),
                publish_time=DATETIME(stored=True, sortable=True),
                content=TEXT(stored=False, analyzer=analyzer)
            )
            from whoosh.filedb.filestore import FileStorage
            if not os.path.isdir(self.index_dir):
                os.makedirs(self.index_dir)
            storage = FileStorage(self.index_dir)
            # Check for an actual index rather than just the directory, so an
            # empty directory left by an earlier aborted run is recovered.
            if storage.index_exists():
                ix = storage.open_index()
            else:
                ix = storage.create_index(schema)
                print('建立索引文件!')

            writer = ix.writer()
            pending = {'indexed': {'$exists': False}}
            num = 0
            row = None
            try:
                while True:
                    # Reset so the handler never reports a stale document id
                    # when the lookup itself fails.
                    row = None
                    row = self.pagesCollection.find_one(pending)
                    if row is None:
                        print("全部处理完毕")
                        break
                    writer.add_document(**self._document_fields(row))
                    self.pagesCollection.update_one(
                        {"_id": row["_id"]}, {"$set": {"indexed": True}})
                    num += 1
                    print('%s %s' % (row["_id"], "已建立索引!"))
            except Exception as exc:
                failed_id = row["_id"] if row is not None else None
                print('%s %s: %r' % (failed_id, "异常", exc))
            finally:
                # Always commit: documents flagged in MongoDB must also be
                # persisted in the index, and the writer lock must be released.
                writer.commit()
                # NOTE: Cursor.count() is deprecated in newer PyMongo releases
                # (count_documents replaces it); kept for compatibility with
                # the PyMongo version this script was written against.
                print('%s %s %s %s' % (
                    '已处理', num, '共计', self.pagesCollection.find().count()))
    
    # Script entry point: connect to MongoDB and index every page document
    # that does not yet carry an `indexed` field.
    creatindext = CreatIndex()
    creatindext.BuiltIndex()
    View Code

    注:注意编码——源文件需以 UTF-8 保存并声明 #coding=utf-8,同时启用 unicode_literals,保证写入索引的文本均为 unicode。

    2、检索

    from __future__ import unicode_literals
    #coding=utf-8
    __author__ = 'zh'
    # from whoosh.qparser import QueryParser
    from whoosh import qparser,sorting
    # from jieba.analyse import ChineseAnalyzer
    from whoosh.index import open_dir
    from whoosh.query import *
    # import pymongo
    import datetime
    # from pymongo.collection import Collection
    # from pymongo import database
    
    class FullText:
        """Keyword search over an existing Whoosh index."""

        def __init__(self, index_home='whoosh_index'):
            """Open the index in ``index_home`` and create a searcher on it.

            :param index_home: directory holding the Whoosh index files.
            """
            self.index_home = index_home
            self.ix = open_dir(self.index_home)
            self.searcher = self.ix.searcher()

        @staticmethod
        def _result_limit(parameter):
            """Return the hit limit implied by ``page``/``pagesize``.

            :returns: ``page * pagesize`` when both keys are present and
                positive, otherwise ``None`` (no limit).
            """
            if 'page' not in parameter or 'pagesize' not in parameter:
                return None
            page = parameter['page']
            pagesize = parameter['pagesize']
            if page > 0 and pagesize > 0:
                return page * pagesize
            return None

        # Full-text search; currently driven by keywords only.
        def Query(self, parameter):
            """Run a keyword query and return the Whoosh results object.

            :param parameter: dict with
                ``keys`` - non-empty list of schema field names to search
                (one field uses ``QueryParser``, several use
                ``MultifieldParser``);
                ``keywords`` - the query string;
                ``page`` / ``pagesize`` - optional; when both are positive the
                result is limited to the first ``page * pagesize`` hits, so
                the caller slices out the requested page itself;
                ``includeFields`` - optional; when it contains ``location``
                only documents whose ``location`` field contains the term
                ``coordinates`` are returned.
            :returns: the object returned by ``searcher.search``, sorted by
                score and then by ``publish_time`` descending.
            :raises ValueError: if ``keys`` is empty.
            """
            fields = parameter['keys']
            if not fields:
                # Previously this fell through with no parser bound and
                # crashed with UnboundLocalError.
                raise ValueError("parameter['keys'] must name at least one field")
            if len(fields) == 1:
                parser = qparser.QueryParser(fields[0], schema=self.ix.schema)
            else:
                parser = qparser.MultifieldParser(fields, schema=self.ix.schema)
            q = parser.parse(parameter['keywords'])

            scores = sorting.ScoreFacet()
            date = sorting.FieldFacet("publish_time", reverse=True)

            # Paged or full result set; full by default.
            _limit = self._result_limit(parameter)

            # Optional filter on the location field; off by default.
            allow_q = None
            if u'location' in (parameter.get('includeFields') or ()):
                allow_q = qparser.query.Term("location", u"coordinates")

            return self.searcher.search(
                q, limit=_limit, filter=allow_q, sortedby=[scores, date])
    # NOTE(review): `fulltext` is not imported anywhere in this listing;
    # presumably this line belongs to a caller module that imports this file
    # as `fulltext` - confirm before running it here.
    fulltext_query = fulltext.FullText()
    View Code

    注:支持多字段检索、分类、排序等

    whoosh参考

    提供陕西省POI数据(300万条,sqlserver备份文件)
  • 相关阅读:
    Java 8 新特性-菜鸟教程 (8) -Java 8 日期时间 API
    Java 8 新特性-菜鸟教程 (7) -Java 8 Nashorn JavaScript
    Java 8 新特性-菜鸟教程 (6) -Java 8 Optional 类
    心理相关
    matlab和Visio安装
    论文资料搜集整理(研究现状)
    调式相关
    梅花落与折杨柳
    混合高斯模型——学习笔记
    NSCT,非下采样Contourlet变换——学习笔记
  • 原文地址:https://www.cnblogs.com/Micang/p/6346437.html
Copyright © 2011-2022 走看看