zoukankan      html  css  js  c++  java
  • (53)ElasticSearch之如何计算相关度分数

      ElasticSearch查询的相关度分数是3部分综合的分数,使用的是TF/IDF算法(Term Frequency & Inverse Document Frequency)

      1、根据Term Frequency(词条出现频率)

      我们查询的文本中的词条在本document中出现了多少次,出现次数越多,相关度越高。 

      例如搜索内容:hello world

      在文档1:hello,I love china.中出现了hello,出现了一次

      在文档2:hello world,how are you!中出现了hello world,相当于出现了两次,所以文档2的相关度分数高于文档1。

      2、根据Inverse Document Frequency(逆文档频率)

      根据查询的文本中的词条在索引的全部文档中出现了多少次,出现的次数越多,相关度越低。

      例如搜索内容:hello world

      在文档1:hello,what are you doing?中hello出现了一次。

      在文档2:I like the world.中world出现了一次。

      按照第1项算,这两个文档的分数是一样的,但是还要比较hello在该索引的所有文档中出现多少次,world在该索引的所有文档中出现多少次,假如hello在索引的所有文档中出现了500次,world出现了100次。那么文档2的相关度分数要高于文档1。

      3、根据Field-length norm(字段长度规约)

      field越长,相关度越低。

      例如搜索内容:hello world,有下面两个文档。

      文档1:{"title":"hello,what's your name?","content":"qwieurowieuolsdjflk"}

      文档2:{"title":"hi,good morning","content":"lkjkljkj....world"}

      在文档1的title字段中搜索到hello,在文档2的content字段中搜索到world,content字段的长度比title字段长,所以文档2的相关度低于文档1。

      4、演示查看分数是如何计算的

      准备数据:

    PUT /lib
    {
        "settings":{
            "number_of_shards":5,
            "number_of_replicas":0
          },
            "mappings":{
                "user":{
                    "properties":{
                        "name":{"type":"text"},
                        "address":{"type":"text"},
                        "age":{"type":"integer"},
                        "interests":{
                          "type":"text"
                        },
                        "birthday":{"type":"date"}
                    }
                }
            }
    }
    put /lib/user/1
    {
        "name":"zhaoliu",
        "address":"hei long jiang sheng tie ling shi",
        "age":50,
        "birthday":"1970-12-12",
        "interests":"xi huang hejiu,duanlian,lvyou"
    }
    
    put /lib/user/2
    {
        "name":"zhaoming",
        "address":"bei jing hai dian qu qing he zhen",
        "age":20,
        "birthday":"1998-10-12",
        "interests":"xi huan hejiu,duanlian,changge"
    }
    
    put /lib/user/3
    {
        "name":"lisi",
        "address":"bei jing hai dian qu qing he zhen",
        "age":23,
        "birthday":"1998-10-12",
        "interests":"xi huan hejiu,duanlian,changge"
    }
    
    put /lib/user/4
    {
        "name":"wangwu",
        "address":"bei jing hai dian qu qing he zhen",
        "age":26,
        "birthday":"1998-10-12",
        "interests":"xi huan biancheng,tingyinyue,lvyou"
    }
    
    put /lib/user/5
    {
        "name":"zhangsan",
        "address":"bei jing chao yang qu",
        "age":29,
        "birthday":"1988-10-12",
        "interests":"xi huan tingyinyue,changge,tiaowu"
    }

      在查询后面添加explain=true

    GET lib/user/_search?explain=true
    {
      "query": {
        "match": {
          "interests": "duanlian,changge"
        }
      }
    }

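      查询结果,可以看到每个匹配词条的分数是idf与tfNorm的乘积(product of),各词条的分数相加(sum of)得到总的分数。注意:结果中带有k1、b参数的公式是ES 5.0之后默认使用的BM25算法,它是对TF/IDF的改进,字段长度的影响体现在tfNorm的fieldLength / avgFieldLength中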

    {
      "took": 8,
      "timed_out": false,
      "_shards": {
        "total": 5,
        "successful": 5,
        "skipped": 0,
        "failed": 0
      },
      "hits": {
        "total": 4,
        "max_score": 1.3862944,
        "hits": [
          {
            "_shard": "[lib][2]",
            "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
            "_index": "lib",
            "_type": "user",
            "_id": "2",
            "_score": 1.3862944,
            "_source": {
              "name": "zhaoming",
              "address": "bei jing hai dian qu qing he zhen",
              "age": 20,
              "birthday": "1998-10-12",
              "interests": "xi huan hejiu,duanlian,changge"
            },
            "_explanation": {
              "value": 1.3862944,
              "description": "sum of:",
              "details": [
                {
                  "value": 0.6931472,
                  "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details": [
                        {
                          "value": 0.6931472,
                          "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "docFreq",
                              "details": []
                            },
                            {
                              "value": 2,
                              "description": "docCount",
                              "details": []
                            }
                          ]
                        },
                        {
                          "value": 1,
                          "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "termFreq=1.0",
                              "details": []
                            },
                            {
                              "value": 1.2,
                              "description": "parameter k1",
                              "details": []
                            },
                            {
                              "value": 0.75,
                              "description": "parameter b",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "avgFieldLength",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "fieldLength",
                              "details": []
                            }
                          ]
                        }
                      ]
                    }
                  ]
                },
                {
                  "value": 0.6931472,
                  "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details": [
                        {
                          "value": 0.6931472,
                          "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "docFreq",
                              "details": []
                            },
                            {
                              "value": 2,
                              "description": "docCount",
                              "details": []
                            }
                          ]
                        },
                        {
                          "value": 1,
                          "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "termFreq=1.0",
                              "details": []
                            },
                            {
                              "value": 1.2,
                              "description": "parameter k1",
                              "details": []
                            },
                            {
                              "value": 0.75,
                              "description": "parameter b",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "avgFieldLength",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "fieldLength",
                              "details": []
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          },
          {
            "_shard": "[lib][4]",
            "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
            "_index": "lib",
            "_type": "user",
            "_id": "3",
            "_score": 0.5753642,
            "_source": {
              "name": "lisi",
              "address": "bei jing hai dian qu qing he zhen",
              "age": 23,
              "birthday": "1998-10-12",
              "interests": "xi huan hejiu,duanlian,changge"
            },
            "_explanation": {
              "value": 0.5753642,
              "description": "sum of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details": [
                        {
                          "value": 0.2876821,
                          "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "docFreq",
                              "details": []
                            },
                            {
                              "value": 1,
                              "description": "docCount",
                              "details": []
                            }
                          ]
                        },
                        {
                          "value": 1,
                          "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "termFreq=1.0",
                              "details": []
                            },
                            {
                              "value": 1.2,
                              "description": "parameter k1",
                              "details": []
                            },
                            {
                              "value": 0.75,
                              "description": "parameter b",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "avgFieldLength",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "fieldLength",
                              "details": []
                            }
                          ]
                        }
                      ]
                    }
                  ]
                },
                {
                  "value": 0.2876821,
                  "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details": [
                        {
                          "value": 0.2876821,
                          "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "docFreq",
                              "details": []
                            },
                            {
                              "value": 1,
                              "description": "docCount",
                              "details": []
                            }
                          ]
                        },
                        {
                          "value": 1,
                          "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "termFreq=1.0",
                              "details": []
                            },
                            {
                              "value": 1.2,
                              "description": "parameter k1",
                              "details": []
                            },
                            {
                              "value": 0.75,
                              "description": "parameter b",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "avgFieldLength",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "fieldLength",
                              "details": []
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          },
          {
            "_shard": "[lib][1]",
            "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
            "_index": "lib",
            "_type": "user",
            "_id": "5",
            "_score": 0.2876821,
            "_source": {
              "name": "zhangsan",
              "address": "bei jing chao yang qu",
              "age": 29,
              "birthday": "1988-10-12",
              "interests": "xi huan tingyinyue,changge,tiaowu"
            },
            "_explanation": {
              "value": 0.2876821,
              "description": "sum of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details": [
                        {
                          "value": 0.2876821,
                          "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "docFreq",
                              "details": []
                            },
                            {
                              "value": 1,
                              "description": "docCount",
                              "details": []
                            }
                          ]
                        },
                        {
                          "value": 1,
                          "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "termFreq=1.0",
                              "details": []
                            },
                            {
                              "value": 1.2,
                              "description": "parameter k1",
                              "details": []
                            },
                            {
                              "value": 0.75,
                              "description": "parameter b",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "avgFieldLength",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "fieldLength",
                              "details": []
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          },
          {
            "_shard": "[lib][3]",
            "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
            "_index": "lib",
            "_type": "user",
            "_id": "1",
            "_score": 0.2876821,
            "_source": {
              "name": "zhaoliu",
              "address": "hei long jiang sheng tie ling shi",
              "age": 50,
              "birthday": "1970-12-12",
              "interests": "xi huang hejiu,duanlian,lvyou"
            },
            "_explanation": {
              "value": 0.2876821,
              "description": "sum of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details": [
                        {
                          "value": 0.2876821,
                          "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "docFreq",
                              "details": []
                            },
                            {
                              "value": 1,
                              "description": "docCount",
                              "details": []
                            }
                          ]
                        },
                        {
                          "value": 1,
                          "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details": [
                            {
                              "value": 1,
                              "description": "termFreq=1.0",
                              "details": []
                            },
                            {
                              "value": 1.2,
                              "description": "parameter k1",
                              "details": []
                            },
                            {
                              "value": 0.75,
                              "description": "parameter b",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "avgFieldLength",
                              "details": []
                            },
                            {
                              "value": 5,
                              "description": "fieldLength",
                              "details": []
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          }
        ]
      }
    }

      5、查看一个文档能否匹配上某个查询

      使用上面的数据,id为2的可以匹配

    GET /lib/user/2/_explain
    {
      "query":{
        "match":{
          "interests":"duanlian,changge"
        }
      }
    }

      查询结果:

    {
      "_index": "lib",
      "_type": "user",
      "_id": "2",
      "matched": true,
      "explanation": {
        "value": 1.3862944,
        "description": "sum of:",
        "details": [
          {
            "value": 0.6931472,
            "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
            "details": [
              {
                "value": 0.6931472,
                "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                "details": [
                  {
                    "value": 0.6931472,
                    "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                    "details": [
                      {
                        "value": 1,
                        "description": "docFreq",
                        "details": []
                      },
                      {
                        "value": 2,
                        "description": "docCount",
                        "details": []
                      }
                    ]
                  },
                  {
                    "value": 1,
                    "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                    "details": [
                      {
                        "value": 1,
                        "description": "termFreq=1.0",
                        "details": []
                      },
                      {
                        "value": 1.2,
                        "description": "parameter k1",
                        "details": []
                      },
                      {
                        "value": 0.75,
                        "description": "parameter b",
                        "details": []
                      },
                      {
                        "value": 5,
                        "description": "avgFieldLength",
                        "details": []
                      },
                      {
                        "value": 5,
                        "description": "fieldLength",
                        "details": []
                      }
                    ]
                  }
                ]
              }
            ]
          },
          {
            "value": 0.6931472,
            "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
            "details": [
              {
                "value": 0.6931472,
                "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                "details": [
                  {
                    "value": 0.6931472,
                    "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                    "details": [
                      {
                        "value": 1,
                        "description": "docFreq",
                        "details": []
                      },
                      {
                        "value": 2,
                        "description": "docCount",
                        "details": []
                      }
                    ]
                  },
                  {
                    "value": 1,
                    "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                    "details": [
                      {
                        "value": 1,
                        "description": "termFreq=1.0",
                        "details": []
                      },
                      {
                        "value": 1.2,
                        "description": "parameter k1",
                        "details": []
                      },
                      {
                        "value": 0.75,
                        "description": "parameter b",
                        "details": []
                      },
                      {
                        "value": 5,
                        "description": "avgFieldLength",
                        "details": []
                      },
                      {
                        "value": 5,
                        "description": "fieldLength",
                        "details": []
                      }
                    ]
                  }
                ]
              }
            ]
          }
        ]
      }
    }

      使用上面的数据,id为10的不能匹配:

    GET /lib/user/10/_explain
    {
      "query":{
        "match":{
          "interests":"duanlian,changge"
        }
      }
    }

      查询结果:

    {
      "_index": "lib",
      "_type": "user",
      "_id": "10",
      "matched": false
    }
  • 相关阅读:
    字典树入门
    Cyclic Nacklace HDU 3746 KMP 循环节
    KMP字符串匹配 模板 洛谷 P3375
    Phone List POJ-3630 字典树 or 暴力
    stringstream istringstream ostringstream 三者的区别
    单词数 HDU 2072 字符串输入控制
    逆序单词 HIhoCoder 1366 字典树
    input框中修改placeholder的样式
    如何使用$.each()与$().each()以及他们的区别
    css解决input的阴影
  • 原文地址:https://www.cnblogs.com/javasl/p/12661972.html
Copyright © 2011-2022 走看看