zoukankan      html  css  js  c++  java
  • term 和 match 查询(二)

    1. 准备数据

    PUT h1/doc/1
    {
      "name": "rose",
      "gender": "female",
      "age": 18,
      "tags": ["白", "漂亮", "高"]
    }
    
    PUT h1/doc/2
    {
      "name": "lila",
      "gender": "female",
      "age": 18,
      "tags": ["黑", "漂亮", "高"]
    }
    
    PUT h1/doc/3
    {
      "name": "john",
      "gender": "male",
      "age": 18,
      "tags": ["黑", "帅", "高"]
    }
    

    运行结果:

    {
      "_index" : "h1",
      "_type" : "doc",
      "_id" : "1",
      "_version" : 1,
      "result" : "created",
      "_shards" : {
        "total" : 2,
        "successful" : 1,
        "failed" : 0
      },
      "_seq_no" : 0,
      "_primary_term" : 1
    }
    

    2. match 查询

    2.1 match 按条件查询

    # 查询性别是男性的结果
    GET h1/doc/_search
    {
      "query": {
        "match": {
          "gender": "male"
        }
      }
    }
    

    查询结果:

    {
      "took" : 59,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 1,
        "max_score" : 0.2876821,
        "hits" : [
          {
            "_index" : "h1",		# 索引
            "_type" : "doc",		# 文档类型
            "_id" : "3",			# 文档唯一 id
            "_score" : 0.2876821,	# 打分机制打出来的分数
            "_source" : {			# 查询结果
              "name" : "john",
              "gender" : "male",
              "age" : 18,
              "tags" : [
                "黑",
                "帅",
                "高"
              ]
            }
          }
        ]
      }
    }
    

    2.2 match_all 查询全部

    # 查询 h1 中所有文档
    GET h1/doc/_search
    {
      "query": {
        "match_all": {}
      }
    }
    

    match_all的值为空,表示没有查询条件,那就是查询全部。就像select * from table_name 一样。

    查询结果:

    {
      "took" : 2,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 3,
        "max_score" : 1.0,
        "hits" : [
          {
            "_index" : "h1",
            "_type" : "doc",
            "_id" : "2",
            "_score" : 1.0,
            "_source" : {
              "name" : "lila",
              "gender" : "female",
              "age" : 18,
              "tags" : [
                "黑",
                "漂亮",
                "高"
              ]
            }
          },
          {
            "_index" : "h1",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 1.0,
            "_source" : {
              "name" : "rose",
              "gender" : "female",
              "age" : 18,
              "tags" : [
                "白",
                "漂亮",
                "高"
              ]
            }
          },
          {
            "_index" : "h1",
            "_type" : "doc",
            "_id" : "3",
            "_score" : 1.0,
            "_source" : {
              "name" : "john",
              "gender" : "male",
              "age" : 18,
              "tags" : [
                "黑",
                "帅",
                "高"
              ]
            }
          }
        ]
      }
    }
    

    2.3 match_phrase 短语查询

    match 查询是散列映射,包含了我们希望搜索的字段和字符串,即只要文档中含有我们希望的那个关键字就会被返回,但这也会带来一些问题。

    es 会将文档中的内容进行拆分,对于英文来说可能没有太大的影响,但是中文短语就不太适用,一旦拆分就会失去原有的含义,比如以下:

    1、准备数据:

    PUT t1/doc/1
    {
      "title": "中国是世界上人口最多的国家"
    }
    
    PUT t1/doc/2
    {
      "title": "美国是世界上军事实力最强大的国家"
    }
    
    PUT t1/doc/3
    {
      "title": "北京是中国的首都"
    }
    

    2、先使用 match 查询含有中国的文档:

    GET t1/doc/_search
    {
      "query": {
        "match": {
          "title": "中国"
        }
      }
    }
    

    查询结果:

    {
      "took" : 5,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 3,
        "max_score" : 0.68324494,
        "hits" : [
          {
            "_index" : "t1",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.68324494,
            "_source" : {
              "title" : "中国是世界上人口最多的国家"
            }
          },
          {
            "_index" : "t1",
            "_type" : "doc",
            "_id" : "3",
            "_score" : 0.5753642,
            "_source" : {
              "title" : "北京是中国的首都"
            }
          },
          {
            "_index" : "t1",
            "_type" : "doc",
            "_id" : "2",
            "_score" : 0.39556286,
            "_source" : {
              "title" : "美国是世界上军事实力最强大的国家"
            }
          }
        ]
      }
    }
    

    发现三篇文档都被返回,与我们的预期有偏差;这是因为 title 中的内容被拆分成一个个单独的字,而 id=2 的文档包含了“国”字也符合,所以也被返回了。es 自带的中文分词处理不太好用,后面可以使用 ik 中文分词器来处理。

    3、match_phrase 查询短语

    不过可以使用 match_phrase 来匹配短语,将上面的 match 换成 match_phrase 试试:

    # 短语查询
    GET t1/doc/_search
    {
      "query": {
        "match_phrase": {
          "title": "中国"
        }
      }
    }
    

    查询结果:

    {
      "took" : 2,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 2,
        "max_score" : 0.5753642,
        "hits" : [
          {
            "_index" : "t1",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.5753642,
            "_source" : {
              "title" : "中国是世界上人口最多的国家"
            }
          },
          {
            "_index" : "t1",
            "_type" : "doc",
            "_id" : "3",
            "_score" : 0.5753642,
            "_source" : {
              "title" : "北京是中国的首都"
            }
          }
        ]
      }
    }
    

    4、slop 间隔查询

    当我们要查询的短语,中间有别的词时,可以使用 slop 来跳过。比如上述要查询 中国世界,这个短语中间被“是”隔开了,这时可以使用 slop 来跳过,相当于正则中的 中国.*?世界

    # 短语查询,查询中国世界,加 slop 
    GET t1/doc/_search
    {
      "query": {
        "match_phrase": {
          "title": {
            "query": "中国世界",
            "slop": 1
          }
        }
      }
    }
    

    查询结果:

    {
      "took" : 4,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 1,
        "max_score" : 0.7445889,
        "hits" : [
          {
            "_index" : "t1",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.7445889,
            "_source" : {
              "title" : "中国是世界上人口最多的国家"
            }
          }
        ]
      }
    }
    

    2.4 match_phrase_prefix 最左前缀查询

    场景:当我们要查询的词只能想起前几个字符时

    # 最左前缀查询,查询名字为 rose 的文档
    GET h1/doc/_search
    {
      "query": {
        "match_phrase_prefix": {
          "name": "ro"
        }
      }
    }
    

    查询结果:

    {
      "took" : 1,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 1,
        "max_score" : 0.2876821,
        "hits" : [
          {
            "_index" : "h1",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.2876821,
            "_source" : {
              "name" : "rose",
              "gender" : "female",
              "age" : 18,
              "tags" : [
                "白",
                "漂亮",
                "高"
              ]
            }
          }
        ]
      }
    }
    

    限制结果集

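    最左前缀查询很费性能,返回的是一个很大的集合,一般很少使用,使用的时候最好对结果集进行限制,max_expansions 参数可以设置最大的前缀扩展数量。注意:max_expansions 限制的是前缀可以扩展成的词条(term)数量,而不是返回的文档数量,所以下面的例子中 fe 只扩展为 female 一个词条,但仍会返回两篇包含 female 的文档: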

    # 最左前缀查询
    GET h1/doc/_search
    {
      "query": {
        "match_phrase_prefix": {
          "gender": {
            "query": "fe",
            "max_expansions": 1
          }
        }
      }
    }
    

    查询结果:

    {
      "took" : 2,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 2,
        "max_score" : 0.2876821,
        "hits" : [
          {
            "_index" : "h1",
            "_type" : "doc",
            "_id" : "2",
            "_score" : 0.2876821,
            "_source" : {
              "name" : "lila",
              "gender" : "female",
              "age" : 18,
              "tags" : [
                "黑",
                "漂亮",
                "高"
              ]
            }
          },
          {
            "_index" : "h1",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.2876821,
            "_source" : {
              "name" : "rose",
              "gender" : "female",
              "age" : 18,
              "tags" : [
                "白",
                "漂亮",
                "高"
              ]
            }
          }
        ]
      }
    }
    

    2.5 multi_match 多字段查询

    1、准备数据:

    # 多字段查询
    PUT t3/doc/1
    {
      "title": "maggie is beautiful girl",
      "desc": "beautiful girl you are beautiful so"
    }
    
    PUT t3/doc/2
    {
      "title": "beautiful beach",
      "desc": "I like basking on the beach,and you? beautiful girl"
    }
    

    2、查询 desc 或 title 字段中包含 beautiful 的文档:

    GET t3/doc/_search
    {
      "query": {
        "multi_match": {
          "query": "beautiful",				# 要查询的词
          "fields": ["desc", "title"]		# 要查询的字段
        }
      }
    }
    

    还可以当做 match_phrase 和 match_phrase_prefix 使用,只需要指定 type 类型即可:

    GET t3/doc/_search
    {
      "query": {
        "multi_match": {
          "query": "gi",
          "fields": ["title"],
          "type": "phrase_prefix"
        }
      }
    }
    
    GET t3/doc/_search
    {
      "query": {
        "multi_match": {
          "query": "girl",
          "fields": ["title"],
          "type": "phrase"
        }
      }
    }
    

    3. term 查询

    3.1 初识 es 的分析器

    term 查询用于精确查询,但是不适用于 text 类型的字段查询。

    在此之前我们先了解 es 的分析机制,默认的标准分析器会对文档进行:

    • 删除大多数的标点符号
    • 将文档拆分为单个词条,称为 token
    • token 转换为小写

    最后保存到倒排索引上,而倒排索引用来查询,如 Beautiful girl 经过分析后是这样的:

    POST _analyze
    {
      "analyzer": "standard",
      "text": "Beautiful girl"
    }
    
    
    # 结果,转换为小写了
    {
      "tokens" : [
        {
          "token" : "beautiful",
          "start_offset" : 0,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 0
        },
        {
          "token" : "girl",
          "start_offset" : 10,
          "end_offset" : 14,
          "type" : "<ALPHANUM>",
          "position" : 1
        }
      ]
    }
    

    3.2 term 查询

    1、准备数据:

    # 创建索引,自定义 mapping,后面会讲到
    PUT t4
    {
      "mappings": {
        "doc":{
          "properties":{
            "t1":{
              "type": "text"    # 定义字段类型为 text
            }
          }
        }
      }
    }
    
    PUT t4/doc/1
    {
      "t1": "Beautiful girl!"
    }
    
    PUT t4/doc/2
    {
      "t1": "sexy girl!"
    }
    

    2、match 查询:

    GET t4/doc/_search
    {
      "query": {
        "match": {
          "t1": "Beautiful girl!"
        }
      }
    }
    

    经过分析后,会得到 beautiful、girl 两个 token,然后再去 t4 索引上去查询,会返回两篇文档:

    {
      "took" : 1,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 2,
        "max_score" : 0.5753642,
        "hits" : [
          {
            "_index" : "t4",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.5753642,
            "_source" : {
              "t1" : "Beautiful girl!"
            }
          },
          {
            "_index" : "t4",
            "_type" : "doc",
            "_id" : "2",
            "_score" : 0.2876821,
            "_source" : {
              "t1" : "sexy girl!"
            }
          }
        ]
      }
    }
    

    3、但是我们只想精确查询包含 Beautiful 的文档,这时就需要使用 term 来精确查询(注意要使用分析后的小写词条 beautiful):

    GET t4/doc/_search
    {
      "query": {
        "term": {
          "t1": "beautiful"
        }
      }
    }
    

    查询结果:

    {
      "took" : 0,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : 1,
        "max_score" : 0.2876821,
        "hits" : [
          {
            "_index" : "t4",
            "_type" : "doc",
            "_id" : "1",
            "_score" : 0.2876821,
            "_source" : {
              "t1" : "Beautiful girl!"
            }
          }
        ]
      }
    }
    

    注意:term 查询不适用于类型是 text 的字段,可以使用 match 查询;另外 Beautiful 经过分析后变为 beautiful,查询时使用 Beautiful 是查询不到的~

    3.3 查询多个

    精确查询多个值:

    GET t4/doc/_search
    {
      "query": {
        "terms": {
          "t1": ["beautiful", "sexy"]
        }
      }
    }
    
  • 相关阅读:
    简单区间dp
    【题解】石子合并
    【2019.7.6】刷题记录
    【题解】大朋友的数字
    【基础】dp系列1
    【题解】垃圾陷阱
    【题解】导弹拦截
    hadoop各组件安装(非专业人士,不定期更新)
    python逼格提升
    python第三十二天-----算法
  • 原文地址:https://www.cnblogs.com/midworld/p/13782857.html
Copyright © 2011-2022 走看看