zoukankan      html  css  js  c++  java
  • es 分词器介绍

    standard 分词器(默认):按照单词切分,去掉标点,并将词项转为小写

    GET _analyze
    {
      "analyzer": "standard",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<NUM>",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "<ALPHANUM>",
          "position" : 9
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "<ALPHANUM>",
          "position" : 10
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "<ALPHANUM>",
          "position" : 11
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "<ALPHANUM>",
          "position" : 12
        }
      ]
    }
    

      simple 分词器:按照非字母的字符切分,非字母字符(如数字 2)会被丢弃,并将词项转为小写

    GET _analyze
    {
      "analyzer": "simple",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    {
      "tokens" : [
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      按照空格切分不做任何处理

    GET _analyze
    {
      "analyzer": "whitespace", 
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "Quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "brawn-foxes",
          "start_offset" : 16,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening.",
          "start_offset" : 62,
          "end_offset" : 70,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      stop 分词器:按照非字母的字符切分并转为小写,同时去掉停用词(如 in、the)

    GET _analyze
    {
      "analyzer": "stop",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    
    
    {
      "tokens" : [
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      不进行切分直接输出

    GET _analyze
    {
      "analyzer": "keyword",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "2 running Quick brawn-foxes leap over lazy dogs in the summer evening.",
          "start_offset" : 0,
          "end_offset" : 70,
          "type" : "word",
          "position" : 0
        }
      ]
    }
    

      pattern 分词器:通过正则表达式进行切分,默认按非单词字符(\W+)切分,并将词项转为小写

    GET _analyze
    {
      "analyzer": "pattern",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 11
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 12
        }
      ]
    }
    

      英语分词器

    GET _analyze
    {
      "analyzer": "english",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<NUM>",
          "position" : 0
        },
        {
          "token" : "run",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "fox",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "lazi",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "dog",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "<ALPHANUM>",
          "position" : 11
        },
        {
          "token" : "even",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "<ALPHANUM>",
          "position" : 12
        }
      ]
    }
    

      使用 standard 分词器处理中文:一个字符一个字符切分(每个汉字作为一个词项)

    POST _analyze
    {
      "analyzer": "standard",
      "text": "他说的确实在理"
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "他",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<IDEOGRAPHIC>",
          "position" : 0
        },
        {
          "token" : "说",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "<IDEOGRAPHIC>",
          "position" : 1
        },
        {
          "token" : "的",
          "start_offset" : 2,
          "end_offset" : 3,
          "type" : "<IDEOGRAPHIC>",
          "position" : 2
        },
        {
          "token" : "确",
          "start_offset" : 3,
          "end_offset" : 4,
          "type" : "<IDEOGRAPHIC>",
          "position" : 3
        },
        {
          "token" : "实",
          "start_offset" : 4,
          "end_offset" : 5,
          "type" : "<IDEOGRAPHIC>",
          "position" : 4
        },
        {
          "token" : "在",
          "start_offset" : 5,
          "end_offset" : 6,
          "type" : "<IDEOGRAPHIC>",
          "position" : 5
        },
        {
          "token" : "理",
          "start_offset" : 6,
          "end_offset" : 7,
          "type" : "<IDEOGRAPHIC>",
          "position" : 6
        }
      ]
    }
    

      

    草都可以从石头缝隙中长出来,更何况你呢
  • 相关阅读:
    Module build failed: Error: Cannot find module 'node-sass'报错问题
    vue element upload
    vue-element-table 分页选中
    两种倒计时
    【LOJ #6076】「2017 山东一轮集训 Day6」三元组(莫比乌斯反演 / 三元环计数)
    【LOJ #6075】「2017 山东一轮集训 Day6」重建(DP)
    【2020省选模拟】题解
    【LOJ #6074】「2017 山东一轮集训 Day6」子序列(矩阵乘法)
    【LOJ #6073】「2017 山东一轮集训 Day5」距离(主席树 / 树链剖分)
    【LOJ #6072】 「2017 山东一轮集训 Day5」苹果树(容斥 / 搜索 / 矩阵树定理)
  • 原文地址:https://www.cnblogs.com/rdchenxi/p/11826407.html
Copyright © 2011-2022 走看看