zoukankan      html  css  js  c++  java
  • 谷粒商城ES自定义词库(十八)

    具体的IK分词可以查看博客:https://www.cnblogs.com/dalianpai/p/12694298.html

    122、全文检索-ElasticSearch-分词-分词&安装ik分词 - 124、全文检索-ElasticSearch-分词-自定义扩展词库

    下载地址:https://github.com/medcl/elasticsearch-analysis-ik

    [root@localhost plugins]# ll
    total 4400
    -rw-r--r-- 1 root root 4504487 Jun 15 13:13 elasticsearch-analysis-ik-7.4.2.zip
    [root@localhost plugins]# unzip
    UnZip 6.00 of 20 April 2009, by Info-ZIP.  Maintained by C. Spieler.  Send
    bug reports using http://www.info-zip.org/zip-bug.html; see README for details.
    
    Usage: unzip [-Z] [-opts[modifiers]] file[.zip] [list] [-x xlist] [-d exdir]
      Default action is to extract files in list, except those in xlist, to exdir;
      file[.zip] may be a wildcard.  -Z => ZipInfo mode ("unzip -Z" for usage).
    
      -p  extract files to pipe, no messages     -l  list files (short format)
      -f  freshen existing files, create none    -t  test compressed archive data
      -u  update files, create if necessary      -z  display archive comment only
      -v  list verbosely/show version info       -T  timestamp archive to latest
      -x  exclude files that follow (in xlist)   -d  extract files into exdir
    modifiers:
      -n  never overwrite existing files         -q  quiet mode (-qq => quieter)
      -o  overwrite files WITHOUT prompting      -a  auto-convert any text files
      -j  junk paths (do not make directories)   -aa treat ALL files as text
      -U  use escapes for all non-ASCII Unicode  -UU ignore any Unicode fields
      -C  match filenames case-insensitively     -L  make (some) names lowercase
      -X  restore UID/GID info                   -V  retain VMS version numbers
      -K  keep setuid/setgid/tacky permissions   -M  pipe through "more" pager
      -O CHARSET  specify a character encoding for DOS, Windows and OS/2 archives
      -I CHARSET  specify a character encoding for UNIX and other archives
    
    See "unzip -hh" or unzip.txt for more help.  Examples:
      unzip data1 -x joe   => extract all files except joe from zipfile data1.zip
      unzip -p foo | more  => send contents of foo.zip via pipe into program more
      unzip -fo foo ReadMe => quietly replace existing ReadMe if archive file newer
    [root@localhost plugins]# unzip elasticsearch-analysis-ik-7.4.2.zip
    Archive:  elasticsearch-analysis-ik-7.4.2.zip
      inflating: elasticsearch-analysis-ik-7.4.2.jar
      inflating: httpclient-4.5.2.jar
      inflating: httpcore-4.4.4.jar
      inflating: commons-logging-1.2.jar
      inflating: commons-codec-1.9.jar
      inflating: plugin-descriptor.properties
      inflating: plugin-security.policy
       creating: config/
      inflating: config/surname.dic
      inflating: config/quantifier.dic
      inflating: config/extra_stopword.dic
      inflating: config/suffix.dic
      inflating: config/extra_single_word_full.dic
      inflating: config/extra_single_word.dic
      inflating: config/preposition.dic
      inflating: config/IKAnalyzer.cfg.xml
      inflating: config/main.dic
      inflating: config/stopword.dic
      inflating: config/extra_main.dic
      inflating: config/extra_single_word_low_freq.dic
    [root@localhost plugins]# ll
    total 5828
    -rw-r--r-- 1 root root  263965 May  6  2018 commons-codec-1.9.jar
    -rw-r--r-- 1 root root   61829 May  6  2018 commons-logging-1.2.jar
    drwxr-xr-x 2 root root     299 Oct  7  2019 config
    -rw-r--r-- 1 root root   54643 Nov  4  2019 elasticsearch-analysis-ik-7.4.2.jar
    -rw-r--r-- 1 root root 4504487 Jun 15 13:13 elasticsearch-analysis-ik-7.4.2.zip
    -rw-r--r-- 1 root root  736658 May  6  2018 httpclient-4.5.2.jar
    -rw-r--r-- 1 root root  326724 May  6  2018 httpcore-4.4.4.jar
    -rw-r--r-- 1 root root    1805 Nov  4  2019 plugin-descriptor.properties
    -rw-r--r-- 1 root root     125 Nov  4  2019 plugin-security.policy
    [root@localhost plugins]# mkdir ik
    [root@localhost plugins]# ll
    total 1428
    -rw-r--r-- 1 root root 263965 May  6  2018 commons-codec-1.9.jar
    -rw-r--r-- 1 root root  61829 May  6  2018 commons-logging-1.2.jar
    drwxr-xr-x 2 root root    299 Oct  7  2019 config
    -rw-r--r-- 1 root root  54643 Nov  4  2019 elasticsearch-analysis-ik-7.4.2.jar
    -rw-r--r-- 1 root root 736658 May  6  2018 httpclient-4.5.2.jar
    -rw-r--r-- 1 root root 326724 May  6  2018 httpcore-4.4.4.jar
    drwxr-xr-x 2 root root      6 Jun 15 13:17 ik
    -rw-r--r-- 1 root root   1805 Nov  4  2019 plugin-descriptor.properties
    -rw-r--r-- 1 root root    125 Nov  4  2019 plugin-security.policy
    [root@localhost plugins]# mv * ik/
    mv: cannot move ‘ik’ to a subdirectory of itself, ‘ik/ik’

    上面 mv 的报错可以忽略(ik 目录无法移动到它自身,其余文件已全部移入 ik/)。重启容器,然后进行查询:

    POST _analyze
    {
      "tokenizer": "standard",
      "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
    }
    
    
    POST _analyze
    {
      "tokenizer": "ik_smart",
      "text": "尚硅谷电商"
    }

    但是有很多的词识别不了,需要自定义词汇表

    先增大内存

    [root@cicd ~]# docker ps -a
    CONTAINER ID        IMAGE                 COMMAND                  CREATED             STATUS              PORTS                                            NAMES
    7ab7bf7aa2e5        kibana:7.4.2          "/usr/local/bin/dumb…"   7 days ago          Up 5 hours          0.0.0.0:5601->5601/tcp                           kibana
    174c44e86f31        elasticsearch:7.4.2   "/usr/local/bin/dock…"   7 days ago          Up 2 minutes        0.0.0.0:9200->9200/tcp, 0.0.0.0:9300->9300/tcp   elasticsearch
    [root@cicd ~]# docker stop 174c44e86f31
    174c44e86f31
    [root@cicd ~]# docker start 174c44e86f31
    174c44e86f31
    [root@cicd ~]# free -m
                  total        used        free      shared  buff/cache   available
    Mem:           7821        3944        2361           9        1515        3605
    Swap:          1639           0        1639
    [root@cicd ~]# docker stop 174c44e86f31
    174c44e86f31
    [root@cicd ~]# docker rm 174c44e86f31
    174c44e86f31
    [root@cicd ~]# docker run --name elasticsearch -p 9200:9200 -p 9300:9300 --privileged=true \
    > -e "discovery.type=single-node" \
    > -e ES_JAVA_OPTS="-Xms512m -Xmx1024m" \
    > -v /mydata/elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml \
    > -v /mydata/elasticsearch/data:/usr/share/elasticsearch/data \
    > -v /mydata/elasticsearch/plugins:/usr/share/elasticsearch/plugins \
    > -d elasticsearch:7.4.2
    aa707f92c246a4878adcb5e6f6e7c98ab55ecbe201fa026d3329178a41fc7791
    [root@cicd ~]# docker ps -a

    再安装nginx,并修改ik分词器的xml配置文件(IKAnalyzer.cfg.xml)

    [root@cicd ~]# cd /mydata/
    [root@cicd mydata]# mkdir nginx
    [root@cicd mydata]# docker pull nginx:1.10
    1.10: Pulling from library/nginx
    6d827a3ef358: Pull complete
    1e3e18a64ea9: Pull complete
    556c62bb43ac: Pull complete
    Digest: sha256:6202beb06ea61f44179e02ca965e8e13b961d12640101fca213efbfd145d7575
    Status: Downloaded newer image for nginx:1.10
    docker.io/library/nginx:1.10
    [root@cicd mydata]# ll
    total 0
    drwxrwxrwx. 5 root root 47 Jun  8 11:35 elasticsearch
    drwxr-xr-x  2 root root  6 Jun 15 13:47 nginx
    [root@cicd mydata]# docker run -p 80:80 --name nginx -d nginx:1.10
    7217ab7d7ad153960b2d1acebffd3fc02527655e2b0888e8c5d1eb0cebb84a05
    [root@cicd mydata]# docker ps -a
    CONTAINER ID        IMAGE                 COMMAND                  CREATED             STATUS              PORTS                                            NAMES
    7217ab7d7ad1        nginx:1.10            "nginx -g 'daemon of…"   5 seconds ago       Up 4 seconds        0.0.0.0:80->80/tcp, 443/tcp                      nginx
    aa707f92c246        elasticsearch:7.4.2   "/usr/local/bin/dock…"   3 minutes ago       Up 3 minutes        0.0.0.0:9200->9200/tcp, 0.0.0.0:9300->9300/tcp   elasticsearch
    7ab7bf7aa2e5        kibana:7.4.2          "/usr/local/bin/dumb…"   7 days ago          Up 5 hours          0.0.0.0:5601->5601/tcp                           kibana
    [root@cicd mydata]# docker container cp nginx:/etc/nginx .
    [root@cicd mydata]# cd nginx/
    [root@cicd nginx]# ll
    total 32
    drwxr-xr-x 2 root root   26 Mar 27  2017 conf.d
    -rw-r--r-- 1 root root 1007 Jan 31  2017 fastcgi_params
    -rw-r--r-- 1 root root 2837 Jan 31  2017 koi-utf
    -rw-r--r-- 1 root root 2223 Jan 31  2017 koi-win
    -rw-r--r-- 1 root root 3957 Jan 31  2017 mime.types
    lrwxrwxrwx 1 root root   22 Jan 31  2017 modules -> /usr/lib/nginx/modules
    -rw-r--r-- 1 root root  643 Jan 31  2017 nginx.conf
    -rw-r--r-- 1 root root  636 Jan 31  2017 scgi_params
    -rw-r--r-- 1 root root  664 Jan 31  2017 uwsgi_params
    -rw-r--r-- 1 root root 3610 Jan 31  2017 win-utf
    [root@cicd nginx]# docker ps -a
    CONTAINER ID        IMAGE                 COMMAND                  CREATED              STATUS              PORTS                                            NAMES
    7217ab7d7ad1        nginx:1.10            "nginx -g 'daemon of…"   About a minute ago   Up About a minute   0.0.0.0:80->80/tcp, 443/tcp                      nginx
    aa707f92c246        elasticsearch:7.4.2   "/usr/local/bin/dock…"   4 minutes ago        Up 4 minutes        0.0.0.0:9200->9200/tcp, 0.0.0.0:9300->9300/tcp   elasticsearch
    7ab7bf7aa2e5        kibana:7.4.2          "/usr/local/bin/dumb…"   7 days ago           Up 5 hours          0.0.0.0:5601->5601/tcp                           kibana
    [root@cicd nginx]# docker stop 7217ab7d7ad1
    7217ab7d7ad1
    [root@cicd nginx]# docker rm 7217ab7d7ad1
    7217ab7d7ad1
    [root@cicd nginx]# cd ..
    [root@cicd mydata]# ll
    total 0
    drwxrwxrwx. 5 root root  47 Jun  8 11:35 elasticsearch
    drwxr-xr-x  3 root root 177 Mar 27  2017 nginx
    [root@cicd mydata]# mv nginx conf
    [root@cicd mydata]# ll
    total 0
    drwxr-xr-x  3 root root 177 Mar 27  2017 conf
    drwxrwxrwx. 5 root root  47 Jun  8 11:35 elasticsearch
    [root@cicd mydata]# mkdir nginx
    [root@cicd mydata]# mv conf/ nginx/
    [root@cicd mydata]# ll
    total 0
    drwxrwxrwx. 5 root root 47 Jun  8 11:35 elasticsearch
    drwxr-xr-x  3 root root 18 Jun 15 13:52 nginx
    [root@cicd mydata]# cd nginx/
    [root@cicd nginx]# ll
    total 0
    drwxr-xr-x 3 root root 177 Mar 27  2017 conf
    [root@cicd nginx]#
    [root@cicd nginx]#
    [root@cicd nginx]#
    [root@cicd nginx]#
    [root@cicd nginx]#
    [root@cicd nginx]# docker run -p 80:80 --name nginx \
    > -v /mydata/nginx/html:/usr/share/nginx/html \
    > -v /mydata/nginx/logs:/var/log/nginx \
    > -v /mydata/nginx/conf/:/etc/nginx \
    > -d nginx:1.10
    7b3ae8abac8219ac43b99e058fed83d93f3e16db015744369477e99ec134cc16
    [root@cicd nginx]# docker ps -l
    CONTAINER ID        IMAGE               COMMAND                  CREATED             STATUS              PORTS                         NAMES
    7b3ae8abac82        nginx:1.10          "nginx -g 'daemon of…"   20 seconds ago      Up 18 seconds       0.0.0.0:80->80/tcp, 443/tcp   nginx
    [root@cicd nginx]# cd html/
    [root@cicd html]# ll
    total 0
    [root@cicd html]# vim index.html
    [root@cicd html]# mkdir es
    [root@cicd html]# cd es
    [root@cicd es]# ll
    total 0
    [root@cicd es]# vim femci.txt
    [root@cicd es]# mv femci.txt fenci.txt
    [root@cicd es]# cd /mydata/elasticsearch/plugins/
    [root@cicd plugins]# cd ik/config/
    [root@cicd config]# ll
    total 8260
    -rw-r--r-- 1 root root 5225922 Oct  7  2019 extra_main.dic
    -rw-r--r-- 1 root root   63188 Oct  7  2019 extra_single_word.dic
    -rw-r--r-- 1 root root   63188 Oct  7  2019 extra_single_word_full.dic
    -rw-r--r-- 1 root root   10855 Oct  7  2019 extra_single_word_low_freq.dic
    -rw-r--r-- 1 root root     156 Oct  7  2019 extra_stopword.dic
    -rw-r--r-- 1 root root     625 Oct  7  2019 IKAnalyzer.cfg.xml
    -rw-r--r-- 1 root root 3058510 Oct  7  2019 main.dic
    -rw-r--r-- 1 root root     123 Oct  7  2019 preposition.dic
    -rw-r--r-- 1 root root    1824 Oct  7  2019 quantifier.dic
    -rw-r--r-- 1 root root     164 Oct  7  2019 stopword.dic
    -rw-r--r-- 1 root root     192 Oct  7  2019 suffix.dic
    -rw-r--r-- 1 root root     752 Oct  7  2019 surname.dic
    [root@cicd config]# vim IKAnalyzer.cfg.xml
    [root@cicd config]# docker ps -a
    CONTAINER ID        IMAGE                 COMMAND                  CREATED             STATUS              PORTS                                            NAMES
    7b3ae8abac82        nginx:1.10            "nginx -g 'daemon of…"   4 minutes ago       Up 4 minutes        0.0.0.0:80->80/tcp, 443/tcp                      nginx
    aa707f92c246        elasticsearch:7.4.2   "/usr/local/bin/dock…"   12 minutes ago      Up 12 minutes       0.0.0.0:9200->9200/tcp, 0.0.0.0:9300->9300/tcp   elasticsearch
    7ab7bf7aa2e5        kibana:7.4.2          "/usr/local/bin/dumb…"   7 days ago          Up 5 hours          0.0.0.0:5601->5601/tcp                           kibana
    [root@cicd config]# docker restart elasticsearch
    elasticsearch
    [root@cicd config]#  

    然后再进行分词(前提是上面编辑 IKAnalyzer.cfg.xml 时,已将 remote_ext_dict 指向 nginx 上的词库文件,例如 http://<nginx地址>/es/fenci.txt)

  • 相关阅读:
    电商交易背景知识合集第二季
    技术高手如何炼成
    #研发解决方案#基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案
    电商交易背景知识合集第一季
    真刀真枪压测:基于TCPCopy的仿真压测方案
    安全基础教育第二季第1集:屡战屡败的找回密码
    #研发解决方案#从宏观到微观——天机与鹰眼联手
    挖坑和踩雷
    我们过去几年做对了哪些事
    小伙伴们手滑集
  • 原文地址:https://www.cnblogs.com/dalianpai/p/13138855.html
Copyright © 2011-2022 走看看