zoukankan      html  css  js  c++  java
  • es 分词器介绍

    standard(默认分词器):按照单词切分,去掉标点,并将英文转为小写

    GET _analyze
    {
      "analyzer": "standard",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,分词
          "end_offset" : 1,
          "type" : "<NUM>",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "<ALPHANUM>",
          "position" : 9
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "<ALPHANUM>",
          "position" : 10
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "<ALPHANUM>",
          "position" : 11
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "<ALPHANUM>",
          "position" : 12
        }
      ]
    }
    

      simple:按照非字母的字符切分,非字母字符(如数字 2)会被丢弃,并转为小写

    GET _analyze
    {
      "analyzer": "simple",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    {
      "tokens" : [
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      按照空格切分不做任何处理

    GET _analyze
    {
      "analyzer": "whitespace", 
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "Quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "brawn-foxes",
          "start_offset" : 16,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening.",
          "start_offset" : 62,
          "end_offset" : 70,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      stop:按词切分并去掉停用词(如 in、the)

    GET _analyze
    {
      "analyzer": "stop",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    
    
    {
      "tokens" : [
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 11
        }
      ]
    }
    

      不进行切分直接输出

    GET _analyze
    {
      "analyzer": "keyword",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "2 running Quick brawn-foxes leap over lazy dogs in the summer evening.",
          "start_offset" : 0,
          "end_offset" : 70,
          "type" : "word",
          "position" : 0
        }
      ]
    }
    

      pattern:通过正则表达式进行切分,默认按非单词字符(\W+)切分,并转为小写

    GET _analyze
    {
      "analyzer": "pattern",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "running",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "word",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "word",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "word",
          "position" : 6
        },
        {
          "token" : "lazy",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "word",
          "position" : 7
        },
        {
          "token" : "dogs",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "word",
          "position" : 8
        },
        {
          "token" : "in",
          "start_offset" : 48,
          "end_offset" : 50,
          "type" : "word",
          "position" : 9
        },
        {
          "token" : "the",
          "start_offset" : 51,
          "end_offset" : 54,
          "type" : "word",
          "position" : 10
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "word",
          "position" : 11
        },
        {
          "token" : "evening",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "word",
          "position" : 12
        }
      ]
    }
    

      英语分词器

    GET _analyze
    {
      "analyzer": "english",
      "text": "2 running Quick brawn-foxes leap over lazy dogs in the summer evening."
    }
    
    
    
    
    
    {
      "tokens" : [
        {
          "token" : "2",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<NUM>",
          "position" : 0
        },
        {
          "token" : "run",
          "start_offset" : 2,
          "end_offset" : 9,
          "type" : "<ALPHANUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 10,
          "end_offset" : 15,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brawn",
          "start_offset" : 16,
          "end_offset" : 21,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "fox",
          "start_offset" : 22,
          "end_offset" : 27,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "leap",
          "start_offset" : 28,
          "end_offset" : 32,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 33,
          "end_offset" : 37,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "lazi",
          "start_offset" : 38,
          "end_offset" : 42,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "dog",
          "start_offset" : 43,
          "end_offset" : 47,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "summer",
          "start_offset" : 55,
          "end_offset" : 61,
          "type" : "<ALPHANUM>",
          "position" : 11
        },
        {
          "token" : "even",
          "start_offset" : 62,
          "end_offset" : 69,
          "type" : "<ALPHANUM>",
          "position" : 12
        }
      ]
    }
    

      standard 分词器处理中文时,一个字符一个字符切分

    POST _analyze
    {
      "analyzer": "standard",
      "text": "他说的确实在理"
    }
    
    
    
    {
      "tokens" : [
        {
          "token" : "他",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<IDEOGRAPHIC>",
          "position" : 0
        },
        {
          "token" : "说",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "<IDEOGRAPHIC>",
          "position" : 1
        },
        {
          "token" : "的",
          "start_offset" : 2,
          "end_offset" : 3,
          "type" : "<IDEOGRAPHIC>",
          "position" : 2
        },
        {
          "token" : "确",
          "start_offset" : 3,
          "end_offset" : 4,
          "type" : "<IDEOGRAPHIC>",
          "position" : 3
        },
        {
          "token" : "实",
          "start_offset" : 4,
          "end_offset" : 5,
          "type" : "<IDEOGRAPHIC>",
          "position" : 4
        },
        {
          "token" : "在",
          "start_offset" : 5,
          "end_offset" : 6,
          "type" : "<IDEOGRAPHIC>",
          "position" : 5
        },
        {
          "token" : "理",
          "start_offset" : 6,
          "end_offset" : 7,
          "type" : "<IDEOGRAPHIC>",
          "position" : 6
        }
      ]
    }
    

      

    草都可以从石头缝隙中长出来,更何况你呢
  • 相关阅读:
    Leecode no.22 括号生成
    修改mysql数据库的时区
    Django 路由层之反向解析
    学习 Django 的几个教程网址
    leetcode周赛 242
    AcWing第二次热身赛
    AcWing夏季每日一题--最长公共子序列
    AcWIng夏季每日一题--序列最大收益
    leetcode周赛 241
    第十二届蓝桥杯C++ B组
  • 原文地址:https://www.cnblogs.com/rdchenxi/p/11826407.html
Copyright © 2011-2022 走看看