zoukankan      html  css  js  c++  java
  • es 分词器介绍

    standard 分词器(默认):按照单词切分,并将词项转为小写

    GET _analyze
    {
      "analyzer": "standard",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<NUM>",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "<ALPHANUM>",
          "position" : 9
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "<ALPHANUM>",
          "position" : 10
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "<ALPHANUM>",
          "position" : 11
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "<ALPHANUM>",
          "position" : 12
        }
      ]
    }
    

      按照非字母的字符切分

    GET _analyze
    {
      "analyzer": "simple",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    {
      "tokens" : [
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      按照空格切分不做任何处理

    GET _analyze
    {
      "analyzer": "whitespace", 
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "Quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "brawn-foxes",
          "start_offset" : 16,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening.",
          "start_offset" : 62,
          "end_offset" : 70,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      按照非字母的字符切分,转为小写,并去掉停用词(如 in、the)

    GET _analyze
    {
      "analyzer": "stop",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    
    
    {
      "tokens" : [
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      不进行切分直接输出

    GET _analyze
    {
      "analyzer": "keyword",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "2 running Quick brawn-foxes leap over lazy dogs in the summer evening.",
          "start_offset" : 0,
          "end_offset" : 70,
          "type" : "word",
          "position" : 0
        }
      ]
    }
    

      通过正则表达式方式进行切割,默认按非单词字符(\W+)切割,并转为小写

    GET _analyze
    {
      "analyzer": "pattern",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 11
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 12
        }
      ]
    }
    

      英语分词器

    GET _analyze
    {
      "analyzer": "english",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<NUM>",
          "position" : 0
        },
        {
          "token" : "run",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "fox",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "lazi",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "dog",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "<ALPHANUM>",
          "position" : 11
        },
        {
          "token" : "even",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "<ALPHANUM>",
          "position" : 12
        }
      ]
    }
    

      使用 standard 分词器处理中文:一个字符一个字符切分

    POST _analyze
    {
      "analyzer": "standard",
      "text": "他说的确实在理"
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "他",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<IDEOGRAPHIC>",
          "position" : 0
        },
        {
          "token" : "说",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "<IDEOGRAPHIC>",
          "position" : 1
        },
        {
          "token" : "的",
          "start_offset" : 2,
          "end_offset" : 3,
          "type" : "<IDEOGRAPHIC>",
          "position" : 2
        },
        {
          "token" : "确",
          "start_offset" : 3,
          "end_offset" : 4,
          "type" : "<IDEOGRAPHIC>",
          "position" : 3
        },
        {
          "token" : "实",
          "start_offset" : 4,
          "end_offset" : 5,
          "type" : "<IDEOGRAPHIC>",
          "position" : 4
        },
        {
          "token" : "在",
          "start_offset" : 5,
          "end_offset" : 6,
          "type" : "<IDEOGRAPHIC>",
          "position" : 5
        },
        {
          "token" : "理",
          "start_offset" : 6,
          "end_offset" : 7,
          "type" : "<IDEOGRAPHIC>",
          "position" : 6
        }
      ]
    }
    

      

    草都可以从石头缝隙中长出来,更何况你呢
  • 相关阅读:
    如何快速转载CSDN及博客园中的博客
    Ubuntu18.04连不网 报"有线连接未托管"
    Ubuntu18.04的网络配置
    vim基本操作
    Git更新远程仓库代码到本地(转)
    POJ 3253 Fence Repair
    POJ 2503 Babelfish
    POJ 2002 Squares
    POJ 1840 Eqs
    POJ 3274 Gold Balanced Lineup
  • 原文地址:https://www.cnblogs.com/rdchenxi/p/11826407.html
Copyright © 2011-2022 走看看