zoukankan      html  css  js  c++  java
  • elasticsearch python接口

    Elasticsearch 提供了许多接口用于访问,这里集成了很多常用的 Python 访问方法,如有不正确的地方请指正。

    import functools
    import json
    import os
    import sys

    from elasticsearch import Elasticsearch
    from elasticsearch.exceptions import ConnectionError, NotFoundError
    from elasticsearch.helpers import bulk
    
    
    class ErrorHunter(object):
        """Decorators that turn Elasticsearch client errors into a process exit.

        Each decorator calls the wrapped function, and if the guarded exception
        is raised it writes the error to stderr and terminates the process with
        exit status -1.

        Only the call itself is guarded.  When the wrapped function is a
        generator function, the call merely creates the generator, so errors
        raised later while iterating are NOT intercepted.
        """

        @staticmethod
        def capture_connection_error(func):
            """Wrap *func* so a ``ConnectionError`` prints the error and exits.

            :param func: callable to guard.
            :return: wrapper with the same signature and metadata as *func*;
                it returns whatever *func* returns.
            """
            @functools.wraps(func)
            def _decorator(*args, **kwargs):
                try:
                    return func(*args, **kwargs)
                except ConnectionError as e:
                    sys.stderr.write("%s\n" % e)
                    sys.exit(-1)

            return _decorator

        @staticmethod
        def capture_notfind_error(func):
            """Wrap *func* so a ``NotFoundError`` prints the error and exits.

            :param func: callable to guard.
            :return: wrapper with the same signature and metadata as *func*;
                it returns whatever *func* returns.
            """
            @functools.wraps(func)
            def _decorator(*args, **kwargs):
                try:
                    return func(*args, **kwargs)
                except NotFoundError as e:
                    sys.stderr.write("%s\n" % e)
                    sys.exit(-1)

            return _decorator
    
    
    class ElasticSearchImporter(object):
        """Convenience wrapper around an ``Elasticsearch`` client.

        Provides index existence checks, bulk insert/update, paged search,
        index deletion and JSON-file import.  Most methods log the raw client
        response to stdout.  Methods decorated with ``ErrorHunter`` terminate
        the process (exit status -1) on the corresponding client error.
        """

        # Number of documents sent per bulk insert and fetched per search page.
        batch_search_size = 1000

        def __init__(self, host, port=9200, username=None, password=None):
            """Create the underlying client.

            :param host: Elasticsearch host name or address.
            :param port: HTTP port of the node.
            :param username: optional user name for HTTP basic auth.
            :param password: optional password for HTTP basic auth.

            Authentication is only configured when both *username* and
            *password* are given.
            """
            if not username or not password:
                self.es = Elasticsearch(hosts=[host], port=port)
            else:
                self.es = Elasticsearch(hosts=[host], port=port, http_auth=(username, password))

        @ErrorHunter.capture_connection_error
        def index_is_exist(self, index_name):
            """Return the client's answer to whether *index_name* exists."""
            res = self.es.indices.exists(index=index_name)
            sys.stdout.write("index[%s] exist state is %s\n" % (index_name, res))
            return res

        @ErrorHunter.capture_connection_error
        @ErrorHunter.capture_notfind_error
        def search_size(self, index_name, doc_type):
            """Return the total number of documents matched by a match-all query.

            :param index_name: index to query.
            :param doc_type: document type to query.
            :return: total hit count as an integer.
            """
            res = self.es.search(index=index_name, doc_type=doc_type, body={"query": {"match_all": {}}})
            # Wrap in a 1-tuple so "%" never tries to unpack the response.
            sys.stdout.write("search response is %s\n" % (res,))
            total = res["hits"]["total"]
            # Newer servers report the total as {"value": N, "relation": ...}
            # instead of a bare integer; accept both shapes.
            if isinstance(total, dict):
                total = total["value"]
            return total

        @ErrorHunter.capture_connection_error
        def _create_index(self, index_name):
            """Create *index_name*; HTTP 400 responses are ignored (``ignore=400``)."""
            res = self.es.indices.create(index=index_name, ignore=400)
            sys.stdout.write("response of create index[%s]: %s\n" % (index_name, res))

        def insert_by_batch(self, index_name, doc_type, data_list):
            """Insert *data_list* in slices of ``batch_search_size`` documents.

            :param index_name: target index.
            :param doc_type: document type of the inserted documents.
            :param data_list: sequence of document bodies.

            An empty *data_list* results in no request at all.
            """
            step = self.batch_search_size
            # Stepping by the batch size never produces an empty trailing slice
            # and avoids float division on Python 3.
            for begin_index in range(0, len(data_list), step):
                self.insert_data_list(index_name=index_name, doc_type=doc_type,
                                      data_list=data_list[begin_index:begin_index + step])

        @ErrorHunter.capture_connection_error
        def insert_data_list(self, index_name, doc_type, data_list):
            """Bulk-index every document of *data_list* in a single request.

            The index is created first when it does not exist yet.

            :param index_name: target index.
            :param doc_type: document type of the inserted documents.
            :param data_list: iterable of document bodies.
            """
            if not self.index_is_exist(index_name):
                self._create_index(index_name)

            actions = [
                {
                    "_op_type": "index",
                    "_index": index_name,
                    "_type": doc_type,
                    "_source": d
                }
                for d in data_list
            ]
            res = bulk(self.es, actions)
            # ``bulk`` returns a tuple, so it must be wrapped before formatting.
            sys.stdout.write("response of insert is : %s\n" % (res,))

        @ErrorHunter.capture_connection_error
        @ErrorHunter.capture_notfind_error
        def _search_page(self, index_name, doc_type, offset, size):
            """Fetch one page of hits starting at document *offset*.

            Kept as a regular (non-generator) method so the error-capturing
            decorators actually guard the request.

            :return: the list of raw hit dictionaries of that page.
            """
            res = self.es.search(index=index_name, doc_type=doc_type, from_=offset, size=size)
            return res["hits"]["hits"]

        def search_data(self, index_name, doc_type):
            """Yield ``(_id, _source)`` for every document, fetched page by page.

            :param index_name: index to read.
            :param doc_type: document type to read.

            Yields nothing when the index does not exist.  Client errors are
            handled by the decorated helpers this generator calls.
            """
            if not self.index_is_exist(index_name):
                # A plain return ends a generator; raising StopIteration here
                # becomes a RuntimeError on modern Python (PEP 479).
                return

            total_size = self.search_size(index_name=index_name, doc_type=doc_type)
            page_size = self.batch_search_size
            # ``from_`` is a document offset, not a page number, so advance it
            # by a whole page each time to avoid overlapping results.
            # NOTE(review): Elasticsearch limits from+size paging
            # (index.max_result_window, 10000 by default); larger indices
            # would need the scroll API instead.
            for offset in range(0, total_size, page_size):
                for hit in self._search_page(index_name, doc_type, offset, page_size):
                    yield hit["_id"], hit["_source"]

        @ErrorHunter.capture_connection_error
        @ErrorHunter.capture_notfind_error
        def search_by_body(self, index_name, doc_type, **kwargs):
            """Run a ``match`` query built from *kwargs* and log the response.

            :param index_name: index to query.
            :param doc_type: document type to query.
            :param kwargs: field/value pairs placed under ``query.match``.
            :return: the raw search response (at most one hit, ``size`` is 1).
            """
            body = {
                "query": {
                    "match": kwargs
                },
                "size": 1
            }
            # Keyword arguments keep the call independent of the client's
            # positional parameter order.
            res = self.es.search(index=index_name, doc_type=doc_type, body=body)
            sys.stdout.write("%s\n" % (res,))
            return res

        @ErrorHunter.capture_connection_error
        @ErrorHunter.capture_notfind_error
        def update_data_list(self, index_name, doc_type, id_data_dict):
            """Bulk-write documents under explicit ids.

            :param index_name: target index.
            :param doc_type: document type of the documents.
            :param id_data_dict: mapping of document id to document body; an
                empty or missing mapping is a no-op.
            """
            if not id_data_dict:
                return
            actions = [
                {
                    # No "_op_type" is set, so the bulk helper's default
                    # operation is used rather than a partial "update".
                    "_index": index_name,
                    "_type": doc_type,
                    "_source": d,
                    "_id": id_index
                }
                for id_index, d in id_data_dict.items()
            ]

            res = bulk(self.es, actions)
            # ``bulk`` returns a tuple, so it must be wrapped before formatting.
            sys.stdout.write("%s\n" % (res,))

        @ErrorHunter.capture_connection_error
        @ErrorHunter.capture_notfind_error
        def delete_index_doc(self, index_name):
            """Delete the whole index *index_name* and return the client response."""
            res = self.es.indices.delete(index=index_name)
            sys.stdout.write("%s\n" % (res,))
            return res

        def json_import(self, index_name, doc_type, json_file):
            """Load a JSON file and bulk-insert its content.

            :param index_name: target index.
            :param doc_type: document type of the inserted documents.
            :param json_file: path of a JSON file; its top-level value is
                passed to ``insert_data_list`` as the list of documents.

            Exits the process with status -1 when the file does not exist.
            """
            if not os.path.exists(json_file):
                sys.stderr.write("This json file[%s] is not exist.\n" % json_file)
                sys.exit(-1)
            with open(json_file, "r") as f:
                data_list = json.load(f)
            # The file is no longer needed once parsed, so insert after closing it.
            self.insert_data_list(index_name=index_name, doc_type=doc_type, data_list=data_list)
            sys.stdout.write("Success import json file to ES.\n")
    
    
    def test_search():
        """Manual smoke test: check an index and run one match query against it.

        Talks to an Elasticsearch node on ``localhost``; the results are only
        written to stdout by the importer methods.
        """
        index = "builds_data"
        doc = "data_stat_doc"
        wanted_path = "/var/www/html/ec_latest_builds/DENALI_NGXTRAFFIC/CN_AXF_16Q2/DENALI_NGXTRAFFIC-CN_AXF_16Q2-TnMapDataAccess-linux-centos6x64-2.9.491956-trunk-20170510093257-RC"

        importer = ElasticSearchImporter(host="localhost")
        importer.index_is_exist(index)
        # Match on the ``data_path`` field of the stored documents.
        importer.search_by_body(index_name=index, doc_type=doc, data_path=wanted_path)
    
    
    def test_import():
        """Manual smoke test: import ``sales.json`` (next to this script) into ES.

        Uses the ``sales`` index and ``sales_doc`` document type on a node
        running on ``localhost``.
        """
        script_dir = os.path.dirname(__file__)
        source_file = os.path.join(script_dir, "sales.json")

        importer = ElasticSearchImporter(host="localhost")
        importer.json_import("sales", "sales_doc", source_file)
    
    
    if __name__ == "__main__":
        # Script entry point: runs the JSON import smoke test by default.
        # Alternatives kept for manual use:
        #   import_builds_data("pbf_statistic_autonavi.json")  # not defined in the visible code
        #   test_search()
        test_import()

    Tips:

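    在这里查询数据部分采用分页批量获取数据的方法,并且采用 yield 方式(生成器)返回。如果需要提前终止生成器,直接使用 return 即可;注意自 Python 3.7 起(PEP 479),在生成器内部抛出 StopIteration 异常会被转换为 RuntimeError,因此不应再用抛出 StopIteration 的方式来终止。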

  • 相关阅读:
    [转载]Oracle中TO_DATE()函数用法
    validationEngine
    批处理执行sql语句 osql
    asp.net导出excel
    Oracle nls_sort和nlssort 排序功能介绍
    js中2个等号与3个等号的区别
    【36】第零章 起航
    那些年,我还在学习Ajax
    那些年,我还在学习java
    那些年,我还在学习jquery
  • 原文地址:https://www.cnblogs.com/dasheng-maritime/p/8521722.html
Copyright © 2011-2022 走看看