zoukankan      html  css  js  c++  java
  • Elasticsearch打造全文搜索引擎(二)

    一、Es的文档、索引的CRUD操作

    1. elasticsearch概念

    • 集群:一个或多个节点组织在一起
    • 节点:一个节点是集群中的一个服务器,有一个名字来标识,默认是一个随机的漫画角色的名字
    • 分片:将索引划分为多份的能力,允许水平分割和扩展容量,多个分片响应请求,提高性能和吞吐量。
    • 副本:创建分片的一份或多份的能力,在一个节点失败其余节点可以顶上。
    elasticsearch      mysql
    index(索引)        数据库(database)
    type(类型)         表(table)
    document(文档)     行(row)
    fields(字段)       列(column)

    2.常用属性和类型

    3.内置类型

    4. CRUD操作

    • 索引的初始化操作
    • 指定分片和副本的数量
    • shards一旦设置不能修改
    # 索引初始化
    PUT lagou
    {
      "settings": {
        "index": {
          "number_of_shards": 5, # 分片
          "number_of_replicas": 1 # 备份
        }
      }
    }
    
    GET lagou/_settings
    GET _all/_settings
    GET .kibana,lagou/_settings
    GET _settings
    
    # 修改settings
    PUT lagou/_settings
    {
      "number_of_replicas": 2
    }
    
    # 获取索引信息
    GET _all
    GET lagou
    
    # 新建/保存文档
    # 方式一
    PUT lagou/job/1
    {
      "title": "python爬虫分布式开发",
      "salary_min":15000,
      "city":"北京",
      "company":{
        "name":"百度",
        "company_addr":"北京市软件园"
      },
      "publish_date":"2019-06-15",
      "comments":15
    }
    
    # 新建文档
    # 方式二
    POST lagou/job/
    {
      "title": "python django 开发工程师",
      "salary_min":30000,
      "city":"上海",
      "company":{
        "name":"美团科技",
        "company_addr":"北京市软件园A区"
      },
      "publish_date":"2019-06-15",
      "comments":120
    }
    
    GET lagou/job/1
    GET lagou/job/1?_source=title
    GET lagou/job/1?_source=title,city
    GET lagou/job/1?_source
    
    # 修改文档
    # 方式一
    PUT lagou/job/1
    {
      "title": "python爬虫分布式开发",
      "salary_min":18000,
      "city":"广州",
      "company":{
        "name":"百度",
        "company_addr":"北京市软件园"
      },
      "publish_date":"2019-06-15",
      "comments":15
    }
    
    # 方式二:修改某一字段
    POST lagou/job/1/_update
    {
      "doc": {
        "comments":20
      }
    }
    
    # 删除
    DELETE lagou/job/1
    DELETE lagou/job
    DELETE lagou

    二、mget和bulk操作

    # 批量操作
    
    数据准备
    POST lagou/job1/1
    {
      "title": "python django 开发工程师",
      "salary_min":30000,
      "city":"上海",
      "company":{
        "name":"美团科技",
        "company_addr":"北京市软件园A区"
      },
      "publish_date":"2019-06-15",
      "comments":120
    }
    
    POST lagou/job1/2
    {
      "title": "python django 开发工程师",
      "salary_min":30000,
      "city":"上海",
      "company":{
        "name":"美团科技",
        "company_addr":"北京市软件园A区"
      },
      "publish_date":"2019-06-15",
      "comments":120
    }
    
    POST lagou/job2/1
    {
      "title": "python django 开发工程师",
      "salary_min":30000,
      "city":"上海",
      "company":{
        "name":"美团科技",
        "company_addr":"北京市软件园A区"
      },
      "publish_date":"2019-06-15",
      "comments":120
    }
    
    POST lagou/job2/2
    {
      "title": "python django 开发工程师",
      "salary_min":30000,
      "city":"上海",
      "company":{
        "name":"美团科技",
        "company_addr":"北京市软件园A区"
      },
      "publish_date":"2019-06-15",
      "comments":120
    }
    
    mget批量获取
    GET _mget
    {
      "docs":[
          {"_index":"lagou",
           "_type":"job1",
           "_id":1
          },
          {"_index":"lagou",
           "_type":"job2",
           "_id":2
          }
        ]
    }
    
    GET lagou/_mget
    {
      "docs":[
          {
           "_type":"job1",
           "_id":1
          },
          {
           "_type":"job2",
           "_id":2
          }
        ]
    }
    
    GET lagou/job1/_mget
    {
      "docs":[
          {
           "_id":1
          },
          {
           "_id":2
          }
        ]
    }
    
    GET lagou/job1/_mget
    {
      "ids":[1,2]
    }
    
    bulk批量增删改(index/create/update/delete)
    
    POST _bulk
    {"index":{"_index":"lagou","_type":"job1","_id":"3"}}
    {"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}
    {"index":{"_index":"lagou","_type":"job2","_id":"3"}}
    {"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}
    
    POST _bulk
    {"create":{"_index":"lagou","_type":"job1","_id":"3"}}
    {"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}
    
    POST _bulk
    {"delete":{"_index":"lagou","_type":"job1","_id":"3"}}
    
    POST _bulk
    {"update":{"_index":"lagou","_type":"job1","_id":"3"}}
    {"doc":{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}}
    

    三、mapping映射和查询

    1. mapping映射

    2.倒排索引

    3. 倒排索引待解决的问题

    4. 查询

    5. 操作

    # mapping操作
    
    PUT lagou1
    {
      "mappings":{
        "job":{
          "properties":{
            "title":{
              "type":"text"
            },
            "salary_min":{
              "type":"integer"
            },
            "city":{
              "type":"keyword"
            },
            "company":{
              "properties":{
                "name":{
                  "type":"text"
                },
                "company_addr":{
                  "type":"text"
                },
                "employee_count":{
                  "type":"integer"
                }
            }
          },
          "publish_date":{
            "type":"date",
            "format":"yyyy-MM-dd"
          },
          "comments":{
            "type":"integer"
          }
        }
      }
    }
    }
    
    PUT lagou1/job/1
    {
      "title": "python爬虫分布式开发",
      "salary_min":15000,
      "city":"北京",
      "company":{
        "name":"百度",
        "company_addr":"北京市软件园",
        "employee_count":50
      },
      "publish_date":"2019-06-15",
      "comments":15
    }
    
    # get index mapping
    
    GET lagou1/_mapping
    GET lagou1/_mapping/job
    GET _all/_mapping/job
    
    # 查询
    
    PUT lagou2
    {
      "mappings": {
        "job":{
          "properties":{
            "title":{
              "type": "text",
              "store":true,
              "analyzer": "ik_max_word"
            },
            "company_name": {
              "type": "keyword",
              "store":true
            },
            "desc":{
              "type":"text"
            }, 
            "add_time":{
              "type":"date",
              "format":"yyyy-MM-dd"
            },
            "comments":{
              "type": "integer"
            }
          }
        }
      }
    }
    
    
    POST lagou2/job
    {
      "title":"python django 开发工程师" ,
      "company_name":"美国科技有限公司",
      "desc":"对django的概念熟悉,熟悉python基础知识", 
      "comments":20,
      "add_time":"2017-04-01"  
    }
    
    POST lagou2/job
    {
      "title":"python scrapy redis 分布式爬虫基本" ,
      "company_name":"百度科技有限公司",
      "desc":"对scrapy的概念熟悉,熟悉redis的基本操作",
      "comments":5,
      "add_time":"2017-04-15"  
    } 
    
    POST lagou2/job
    {
      "title":"Elasticsearch打造搜索引擎" ,
      "company_name":"阿里巴巴科技有限公司",
      "desc":"熟悉数据结构算法,熟悉python的基本开发",
      "comments":15,
      "add_time":"2017-06-20"  
    } 
    
    POST lagou2/job
    {
      "title":"python打造推荐引擎系统" ,
      "company_name":"阿里巴巴科技有限公司",
      "desc":"熟悉推荐引擎的原理以及算法、掌握C语言",
      "comments":60,
      "add_time":"2016-10-20"  
    } 
    
    # 简单查询
    #查看分析器解析的结果
    GET _analyze
    {
      "analyzer": "ik_smart",
      "text":"Python网络开发师"
    }
    GET _analyze
    {
      "analyzer": "ik_max_word",
      "text":"Python网络开发师"
    }
    
    #match查询 (分词查询) python 和分布式
    #查询第0-2条的title和company_name字段(desc字段的stored属性不是true),并按comments排序
    GET lagou2/_search
    {
     "stored_fields":["title","company_name","desc"], 
      "query":{
        "match":{
          "title":"python分布式"  
        }
      },
      "from": 0,
      "size": 2,
      "sort": [
        {
          "comments": {
            "order": "desc"
          }
        }
      ]
    }
    
    #查询comments在大于等于10、小于等于20、权重2.0的数据
    GET lagou2/_search
    {
      "query":{  
        "range": {
          "comments": {
            "gte": 10,
            "lte": 20,
            "boost":2.0
          }
        }
      }
    }
    GET lagou2/_search
    {
      "query":{  
        "range": {
          "add_time": {
            "gte": "2017-04-01",
            "lte": "now"
          }
        }
      }
    }
    
    #term查询(不会对查询词做分词处理,直接按原词精确匹配,类似于keyword属性)
    GET lagou2/_search
    {
      "query":{
        "term":{
          "title":"python"  
        }
      }
    }
    #terms 和用match查django分布工程  效果一样
    GET lagou2/_search
    {
      "query":{
        "terms":{
          "title":["django"  ,"分布"  ,"工程"  ]
        }
      }
    }
    
    #match_all
    GET lagou2/_search
    {
      "query":{
        "match_all":{}
      }
    }
     
    #match_phrase 
    #短语查询
    #满足所有词 既有python也有系统,两个词之间最多允许间隔6个位置(slop为最大间距)
    GET lagou2/_search
    {
      "query":{
        "match_phrase": {
          "title": {
            "query": "python系统",
            "slop":6
          }
        }
      }
    }
    
    #multi_match 多字段匹配,title的权重是desc的3倍
    GET lagou2/_search
    {
      "query":{
        "multi_match": { 
          "query": "python系统",
          "fields":["title^3","desc"]
        }
      }
    }
    
    # sort查询
    GET lagou2/_search
    {
      "query": {
        "match_all": {}
      },
      "sort": [
        {
          "comments": {
            "order": "asc"
          }
        }
      ]
    }
    
    # range范围查询
    GET lagou2/_search
    {
      "query": { 
          "range": {
          "comments": {
            "gte": 20,
            "lte": 60,
            "boost":2.0
          }
        }
      }
    }
    
    GET lagou2/_search
    {
      "query": { 
          "range": {
          "add_time": {
            "gte": "2017-06-07",
            "lte": "now"
          }
      }
    }
    }
    
    #wildcard 通配符查询
    GET lagou2/_search
    {
      "query":{  
        "wildcard": {
          "title": {
            "value": "pyth*n",
            "boost": 2
          }
        }
      }
    }
    
    # 组合查询
    #bool 查询
    #用 bool 包括 must should must_not filter来完成
    #格式如下
    #bool:{
    #  "filter":[], #不参与打分
    #  "must":[],  #相当于        (salary=20 and title=Python)
    #  "should":[], #相当于       (salary=20 or title=Python)
    #  "must_not":[], #相当于not
    #}
    
    #建立测试数据
    POST lagou/testjob/_bulk
    {"index":{"_id":1}}
    {"salary":10,"title":"Python"}
    {"index":{"_id":2}}
    {"salary":20,"title":"Scrapy"}
    {"index":{"_id":3}}
    {"salary":30,"title":"Django"}
    {"index":{"_id":4}}
    {"salary":30,"title":"Elasticsearch"}
    
    DELETE lagou/testjob
    
    #简单的过滤查询
    #最简单的filter查询
    #select * from testjob where salary=20
    GET lagou/testjob/_search
    {
      "query":{
        "bool": { 
          "must": {
            "match":{
              "salary":20
            }
          }, 
          "filter":{ 
            "match":{
              "title":"Scrapy"
            }
          }
        }
      }
    }
    #select * from testjob
    #where (salary=20 or title=Python) and salary!=30 and salary!=10
    GET lagou/testjob/_search
    {
      "query":{
        "bool": { 
          "should":[
              {"term":{"salary":20}},
              {"term":{"title":"python"}}
            ],
          "must_not": [
            {"term": {"salary": "30"}},
            {"term": {"salary": "10"}}
          ] 
        }
      }
    }
    
    #where (salary=30 and title="django") or title="python"
    GET lagou/testjob/_search
    {
      "query":{
        "bool": { 
          "should":[
              {"term":{"title":"python"}},
              {"bool": { 
                "must":[
                    {"term":{"salary":30}},
                    {"term":{"title":"django"}}
                  ] 
              }}
            ] 
        }
      }
    }
    
    #测试数据
    POST lagou/testjob2/_bulk
    {"index":{"_id":1}}
    {"tags":["search"]}
    {"index":{"_id":2}}
    {"tags":["search","python"]}
    {"index":{"_id":3}}
    {"other_filed":["some data"]}
    {"index":{"_id":4}}
    {"tags":null}
    {"index":{"_id":5}}
    {"tags":["search",null]}
    
    #处理null空值的方法
    #select tags from testjob2 where tags is not null
    GET lagou/testjob2/_search
    {
      "query": {
        "bool": {
          "filter": {
            "exists": {
              "field": "tags"
            }
          }
        }
      }
    }
    #select tags from testjob2 where tags is null
    GET lagou/testjob2/_search
    {
      "query": {
        "bool": {
          "must_not": {
            "exists": {
              "field": "tags"
            }
          }
        }
      }
    } 

    gitee地址https://gitee.com/zhangyafeii/ArticleSpider_LcvSearch

  • 相关阅读:
    数据结构与算法系列——排序(6)_树形选择排序
    数据结构与算法系列——排序(7)_堆排序
    数据结构与算法系列——排序(5)_简单选择排序
    数据结构与算法系列——排序(4)_Shell希尔排序
    数据结构与算法系列——排序(3)_折半插入排序
    数据结构与算法系列——排序(2)_直接插入排序
    数据结构与算法系列——排序(1)_概述
    Java高级开发_性能优化的细节
    图形推理
    美团点评面试20190515
  • 原文地址:https://www.cnblogs.com/zhangyafei/p/11041766.html
Copyright © 2011-2022 走看看