  • Elasticsearch 07 - Analyzers

    What an analyzer does

    1. Tokenization: split the text into terms.
    2. Improve recall: the proportion of relevant results a search can actually find.

    Analyzer components (character filter → tokenizer → token filter)

    • character filter: pre-processing before tokenization (strips useless characters and tags, converts characters, e.g. & => and, 《Elasticsearch》 => Elasticsearch)

      # html_strip: strip HTML tags; escaped_tags lists the tags to keep (here <a> is preserved)
      PUT index_my_char_filter_html
      {
        "settings": {
          "analysis": {
            "char_filter": {
              "my_char_filter": {
                "type": "html_strip",
                "escaped_tags": ["a"] # a标签不用解析,escaped_tags:需要保留的html标签
              }
            },
            "analyzer": {
              "my_analyzerr_html": {
                "tokenizer": "keyword",
                "char_filter": ["my_char_filter"]
              }
            }
          }
        }
      }
      
      Verify:
      POST index_my_char_filter_html/_analyze
      {
        "analyzer": "my_analyzerr_html",
        "text": "<p>I&apos;m so <a>happy</a>!</p>"
      }
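
      Expected result (a sketch, following the html_strip docs; offsets omitted): the keyword tokenizer returns the whole input as a single token, with the <p> tags stripped to newlines, &apos; decoded, and the escaped <a> tags preserved:

      # expected response (abridged)
      {
        "tokens": [
          { "token": "\nI'm so <a>happy</a>!\n", "position": 0 }
        ]
      }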
      
      
      
      # mapping char filter: replace characters using a mapping table (here Arabic-Hindu digits => ASCII digits)
      PUT index_my_char_filter_mapping
      {
        "settings": {
          "analysis": {
            "analyzer": {
              "my_analyzer": {
                "tokenizer": "keyword",
                "char_filter": [
                  "analyzer_mapping"
                ]
              }
            },
            "char_filter": {
              "analyzer_mapping": {
                "type": "mapping",
                "mappings": [
                  "٠ => 0",
                  "١ => 1",
                  "٢ => 2",
                  "٣ => 3",
                  "٤ => 4",
                  "٥ => 5",
                  "٦ => 6",
                  "٧ => 7",
                  "٨ => 8",
                  "٩ => 9"
                ]
              }
            }
          }
        }
      }
      
      
      Verify:
      POST index_my_char_filter_mapping/_analyze
      {
        "analyzer": "my_analyzer",
        "text": "My license plate is ٢٥٠١٥"
      }
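
      Expected result (sketch): the char filter maps the Arabic-Hindu digits to ASCII before tokenization, so the keyword tokenizer returns a single token:

      # expected response (abridged)
      {
        "tokens": [
          { "token": "My license plate is 25015", "position": 0 }
        ]
      }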
      
      
      # pattern_replace char filter: regex-based replacement
      PUT my_index
      {
        "settings": {
          "analysis": {
            "analyzer": {
              "my_analyzer": {
                "tokenizer": "standard",
                "char_filter": ["my_char_filter"]
              }
            },
            "char_filter": {
              "my_char_filter": {
                "type": "pattern_replace",
                "pattern": "(\d+)-(?=\d)",
                "replacement": "$1_"
              }
            }
          }
        }
      }
      
      Verify:
      POST my_index/_analyze
      {
        "analyzer": "my_analyzer",
        "text": "My credit card is 123-456-789"
      }
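
      Expected tokens (sketch): the regex rewrites 123-456-789 to 123_456_789 before the standard tokenizer runs, so the card number is not split apart:

      # expected tokens (abridged)
      [ "My", "credit", "card", "is", "123_456_789" ]
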
    • tokenizer: splits the pre-processed text into tokens

      # standard is the default tokenizer
      GET _analyze
      {
        "tokenizer" : "standard",
        "filter" : ["lowercase"],
        "text" : "THE Quick FoX JUMPs"
      }
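
      The token filter runs after the tokenizer, so every token comes back lower-cased:

      # expected tokens (abridged)
      [ "the", "quick", "fox", "jumps" ]
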
    • token filter: stop words, tense normalization, case folding, synonym conversion, filler-word handling, etc. For example: has => have, him => he, apples => apple, the/oh/a => dropped. The conditional filter below lowercases only tokens shorter than 5 characters:

      GET /_analyze
      {
        "tokenizer": "standard",
        "filter": [
          {
            "type": "condition",
            "filter": [ "lowercase" ],
            "script": {
              "source": "token.getTerm().length() < 5"
            }
          }
        ],
        "text": "THE QUICK BROWN FOX"
      }
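
      Only tokens shorter than 5 characters satisfy the condition and get lowercased; QUICK and BROWN (exactly 5 characters) keep their case:

      # expected tokens (abridged)
      [ "the", "QUICK", "BROWN", "fox" ]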
      
      # stop-word token filter (the standard analyzer accepts a stopwords list)
      PUT /index_stopword
      {
        "settings": {
          "analysis": {
            "analyzer": {
              "index_stopword_analyzer":{
                "type":"standard",
                "stopwords":"_english_"
              }
            }
          }
        }
      }
      
      GET index_stopword/_analyze
      {
        "analyzer": "index_stopword_analyzer",
        "text": "Teacher Ma is in the restroom"
      }
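
      With the _english_ stop list, "is", "in", and "the" are dropped and the remaining tokens are lowercased:

      # expected tokens (abridged)
      [ "teacher", "ma", "restroom" ]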



    Analyzers

    • standard: the default analyzer. Its Chinese support is poor: it splits Chinese text character by character.

      GET _analyze
      {
        "tokenizer" : "standard",
        "text" : "THE Quick FoX JUMPs"
      }
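
      Without a lowercase filter the standard tokenizer preserves case:

      # expected tokens (abridged)
      [ "THE", "Quick", "FoX", "JUMPs" ]
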
    • Pattern Tokenizer: splits text into terms using a regex that matches the separators.

    • Simple Pattern Tokenizer: uses a regex to match the terms themselves; faster than the Pattern Tokenizer. (Note: the request below actually exercises the built-in simple analyzer, which splits on non-letter characters.)

      GET /index_stopword/_analyze
      {
        "text": "江山如此多娇,小姐姐哪里可以撩",
        "analyzer": "simple"
      }
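
      The simple analyzer splits on any non-letter character and lowercases; CJK ideographs count as letters, so only the comma splits the text (a sketch):

      # expected tokens (abridged)
      [ "江山如此多娇", "小姐姐哪里可以撩" ]
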
    • whitespace: splits on whitespace

    • Custom analyzer (note: the punctuation tokenizer below is defined for illustration but not referenced; the analyzer itself uses standard)
      PUT /index_custom
      {
        "settings": {
          "analysis": {
            "char_filter": {
              "test_char_filter": {
                "type": "mapping",
                "mappings": [
                  "& => and",
                  "| => or"
                ]
              }
            },
            "filter": {
              "test_stopwords": {
                "type": "stop",
                "stopwords": ["is","in","at","the","a","for"]
              }
            },
            "tokenizer": {
              "punctuation": { 
                "type": "pattern",
                "pattern": "[ .,!?]"
              }
            },
            "analyzer": {
              "index_custom_analyzer": {
                "type": "custom",
                "char_filter": [
                  "html_strip",
                  "test_char_filter"
                ],
                "tokenizer": "standard",
                "filter": ["lowercase","test_stopwords"]
              }
            }
          }
        }
      }
      
      GET /index_custom/_analyze
      {
        "text": "Teacher ma & zhang also thinks [mother's friends] is good | nice!!!",
        "analyzer": "index_custom_analyzer"
      }
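
      Expected result (sketch): the mapping turns & into "and" and | into "or", the standard tokenizer drops the brackets and punctuation, lowercase folds the case, and the stop filter removes "is":

      # expected tokens (abridged)
      [ "teacher", "ma", "and", "zhang", "also", "thinks", "mother's", "friends", "good", "or", "nice" ]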

    Specifying the analyzer when creating a mapping

    # specify the analyzer when creating the mapping
    PUT /index_create_mapping_analyzer_c
    {
      "mappings": {
        "properties": {
          "content": {
            "type": "text",
            "analyzer": "standard"
          }
        }
      }
    }
    
    
    # add a field with its analyzer to an existing index's mapping
    PUT /index_create_mapping_analyzer_b/_mapping
    {
      "properties": {
        "column": {
          "type": "text",
         "analyzer": "ik_max_word"
        }
      }
    }
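
    To confirm which analyzer a mapped field resolves to, _analyze also accepts a field parameter (a quick check against the first index above):

    GET /index_create_mapping_analyzer_c/_analyze
    {
      "field": "content",
      "text": "Teacher Ma is in the restroom"
    }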

    Chinese analysis with IK

    # set ik_max_word as the default analyzer for the whole index
    PUT index_ik_max
    {
      "settings": {
        "analysis": {
          "analyzer": {
            "default": {
              "type": "ik_max_word"
            }
          }
        }
      }
    }
    # index with ik_max_word at index time and the coarser ik_smart at search time
    PUT /index_ik_smart
    {
      "mappings": {
          "properties": {
            "text": {
              "type": "text",
              "analyzer": "ik_max_word",
              "search_analyzer": "ik_smart"
            }
        }
      }
    }
    
    GET /index_ik_max/_analyze
    {
      "text": "中华人民共和国国歌",
      "analyzer": "ik_max_word"
    }
    GET /index_ik_smart/_analyze
    {
      "text": "中华人民共和国国歌",
      "analyzer": "ik_smart"
    }
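
    For reference, the outputs usually look like this (exact terms depend on the IK version and its dictionaries): ik_max_word enumerates every dictionary word it can find, while ik_smart picks the coarsest split:

    # ik_max_word, expected tokens (abridged)
    [ "中华人民共和国", "中华人民", "中华", "华人", "人民共和国", "人民", "共和国", "共和", "国", "国歌" ]
    # ik_smart, expected tokens (abridged)
    [ "中华人民共和国", "国歌" ]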

    Installing the IK analyzer

    1. Go to https://github.com/medcl/elasticsearch-analysis-ik/releases
    2. Releases -> Assets -> download the zip file
    3. Create an ik directory under the Elasticsearch plugins directory
    4. Unzip the file into the ik directory
    5. Put the MySQL driver jar into the ik directory (only needed for the MySQL hot-reload setup described below)
    6. Restart Elasticsearch
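
    After the restart, confirm the plugin was picked up (assuming Kibana Dev Tools is available):

    GET /_cat/plugins?v
    # expect a row listing analysis-ik and its version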

    Hot-reloading the IK dictionaries

    1. Clone https://github.com/medcl/elasticsearch-analysis-ik
    2. Build and package the project
    3. Grab the artifact: /target/releases/elasticsearch-analysis-ik-7.6.2.zip
    4. Create an ik directory under the Elasticsearch plugins directory
    5. Unzip elasticsearch-analysis-ik-7.6.2.zip into the ik directory
    6. Put the MySQL driver jar into the ik directory
    7. Restart Elasticsearch
    
    Note: the code changes go in the dic/Dictionary initial method, which is where the main dictionary and the stop-word dictionary are loaded.
    How it works:
    1. When Elasticsearch starts, it starts the modified plugin project.
    2. The project runs a scheduled task that periodically queries MySQL for dictionary entries.
    3. The results are loaded into memory, at the same point where Elasticsearch loads the main-word and stop-word dictionary files on startup.

    Notes on the IK analyzer

    1. The Elasticsearch, Kibana, and IK plugin versions must match.
    2. The latest releases of the three are not always in sync, so avoid grabbing the very newest version.