zoukankan      html  css  js  c++  java
  • elasticsearch

    lucene : 倒排索引
    如下: 我 (1:1) {0}  表示第一行出现一次,索引位置为0
    

      

     

    elasticsearch 部署  elasticsearch-2.2.1.zip
    
    192.168.112.101	node1
    192.168.112.102	node2
    192.168.112.103	node3
    
    三台机器,每台机器上都部署。
    
    es不能以root用户启动(因为es可以远程执行脚本,对于主机不安全)
    
    ## 所以三台主机都创建用户
    [root@node2 ~]# useradd sxt
    [root@node2 ~]# echo sxt | passwd --stdin sxt
    [root@node2 ~]# mkdir -p /opt/sxt/es
    [root@node2 ~]# cd /opt/sxt
    
    [root@node1 sxt]# cd /opt/sxt/es/
    [root@node1 es]# ll
    total 28740
    -rw-r--r--. 1 root root 29428075 Sep 10 21:18 elasticsearch-2.2.1.zip
    [root@node1 sxt]# chown sxt:sxt es
    [root@node1 sxt]# su sxt
    [sxt@node1 sxt]$ cd es
    [sxt@node1 es]$ ll
    total 28740
    -rw-r--r--. 1 root root 29428075 Sep 10 21:18 elasticsearch-2.2.1.zip
    [sxt@node1 es]$ unzip elasticsearch-2.2.1.zip 
    [sxt@node1 es]$ vi elasticsearch-2.2.1/config/elasticsearch.yml  ## 修改以下配置项
    cluster.name: bjsxt-es
    node.name: node1
    network.host: 192.168.112.101
    
    discovery.zen.ping.multicast.enabled: false   ## 放在末尾
    discovery.zen.ping.unicast.hosts: ["192.168.112.101","192.168.112.102", "192.168.112.103"]
    discovery.zen.ping_timeout: 120s
    client.transport.ping_timeout: 60s
    
    [sxt@node1 es]$ scp -r elasticsearch-2.2.1 sxt@node2:`pwd`  ## 分发到node2和node3
    [sxt@node1 bin]$ cd /opt/sxt/es/elasticsearch-2.2.1/bin
    [sxt@node1 bin]$ ./elasticsearch   ## node2,node3都启动此命令    
    

    配置json内容的格式化ui
    
    将 02_第二阶段 hadoop体系之离线计算\12_EL SEARCH 搜索引擎\1资料\1资料附件\plugins 文件夹下的 head 上传到如下目录:
    [root@node1 plugins]# pwd
    /opt/sxt/es/elasticsearch-2.2.1/plugins
    [root@node1 plugins]# ll
    total 4
    drwxr-xr-x. 6 sxt sxt 4096 Sep 10 21:41 head  ## 注意权限head 为sxt
    
    [root@node1 plugins]# chown -R sxt:sxt head
    

      

     

    ## 如果不小心以root用户启动,报错,如下。此时需要删除logs文件夹。否则再次以sxt启动也可能失败。
    [root@node1 plugins]# cd /opt/sxt/es/elasticsearch-2.2.1/bin
    [root@node1 bin]# ./elasticsearch
    Exception in thread "main" java.lang.RuntimeException: don't run elasticsearch as root.
    	at org.elasticsearch.bootstrap.Bootstrap.initializeNatives(Bootstrap.java:93)
    	at org.elasticsearch.bootstrap.Bootstrap.setup(Bootstrap.java:144)
    	at org.elasticsearch.bootstrap.Bootstrap.init(Bootstrap.java:285)
    	at org.elasticsearch.bootstrap.Elasticsearch.main(Elasticsearch.java:35)
    
    [root@node1 elasticsearch-2.2.1]# rm -rf logs
    ## 重新启动   ### ctrl+c 结束程序
    [root@node1 elasticsearch-2.2.1]# su sxt
    [sxt@node1 elasticsearch-2.2.1]$ cd /opt/sxt/es/elasticsearch-2.2.1/bin
    [sxt@node1 bin]$ ./elasticsearch
    
    ## 访问页面内容如下;
    http://node2:9200/_plugin/head/
    

    横向扩展sharding切片,纵向扩展搭建ha.
    一般lucene的分片数在索引创建后不可修改,需要在规划时考虑好。(可以给分片做备份,即副本)
    

      

     

     

    通过curl 操作es
    [root@node1 plugins]# curl -XPUT http://192.168.112.101:9200/bjsxt/
    
    如下:创建了lucene分片。粗体代表主分片,普通矩形框表示备分片

    称为创建索引库 (相当于数据库)

      

     

    node3挂掉后,出现短暂的警告,过一会儿又重新调整为如下第二图(达到健康状态了,自动备份了)。
    再次重启node3.过一会如图第三。 * 代表是主。

      

     

    curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
    {
     "first_name" : "bin",
     "age" : 33,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    创建type和document.
    
    [root@node1 plugins]# curl -XPUT http://192.168.112.101:9200/bjsxt/
    {"acknowledged":true}[root@node1 plugins]# curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
    > {
    >  "first_name" : "bin",
    >  "age" : 33,
    >  "about" : "I love to go rock climbing",
    >  "interests": [ "sports", "music" ]
    > }'
    {"_index":"bjsxt","_type":"employee","_id":"AW0brHsbOCeeN2j3g-hG","_version":1,"_shards":{"total":2,"successful":2,"failed":0},"created":true}[root@node1 plugins]# 
    

      

    curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
    {
     "first_name" : "gob bin",
     "age" : 43,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    
    curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
    {
     "first_name" : "pablo2",
     "age" : 33,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ],
     "sex": "man"
    }'
    
    #XPUT 必须给出id 
    curl -XPUT http://192.168.112.101:9200/bjsxt/employee/1 -d '  
    {
     "first_name" : "god bin",
     "last_name" : "pang",
     "age" : 42,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    ## 修改age 44
    curl -XPUT http://192.168.112.101:9200/bjsxt/employee/1 -d '
    {
     "first_name" : "god bin",
     "last_name" : "pang",
     "age" : 44,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    curl -XPOST http://192.168.112.101:9200/bjsxt/employee/1 -d '
    {
     "first_name" : "pablo2",
     "age" : 33,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ],
     "sex": "man"
    }'
    
    ## XPUT,XPOST 都可以做创建和修改。 XPUT 必须给出id,如果id不存在就创建,存在则修改。
    ## XPOST 不必给定id;未给定时由es自动生成id(如上面返回的 AW0brHsbOCeeN2j3g-hG)。
    
    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/1?pretty
    {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "1",
      "_version" : 4,
      "found" : true,
      "_source" : {
        "first_name" : "pablo2",
        "age" : 33,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ],
        "sex" : "man"
      }
    }
    

      

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?q=first_name="bin"
    {"took":31,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":2,"max_score":0.079459734,"hits":[{"_index":"bjsxt","_type":"employee","_id":"AW0brHsbOCeeN2j3g-hG","_score":0.079459734,"_source":
    {
     "first_name" : "bin",
     "age" : 33,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }},{"_index":"bjsxt","_type":"employee","_id":"AW0brvCeOCeeN2j3g-hH","_score":0.01125201,"_source":
    {
     "first_name" : "gob bin",
     "age" : 43,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }}]}}
    

      

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
    > {
    >  "query":
    >   {"match":
    >    {"first_name":"bin"}
    >   }
    > }'
    {
      "took" : 13,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "failed" : 0
      },
      "hits" : {
        "total" : 2,
        "max_score" : 1.0,
        "hits" : [ {
          "_index" : "bjsxt",
          "_type" : "employee",
          "_id" : "AW0brHsbOCeeN2j3g-hG",
          "_score" : 1.0,
          "_source" : {
            "first_name" : "bin",
            "age" : 33,
            "about" : "I love to go rock climbing",
            "interests" : [ "sports", "music" ]
          }
        }, {
          "_index" : "bjsxt",
          "_type" : "employee",
          "_id" : "AW0brvCeOCeeN2j3g-hH",
          "_score" : 0.19178301,
          "_source" : {
            "first_name" : "gob bin",
            "age" : 43,
            "about" : "I love to go rock climbing",
            "interests" : [ "sports", "music" ]
          }
        } ]
      }
    }
    

      

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
    > {
    >  "query":
    >   {"multi_match":
    >    {
    >     "query":"bin",
    >     "fields":["last_name","first_name"],
    >     "operator":"and"
    >    }
    >   }
    > }'
    {
      "took" : 13,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "failed" : 0
      },
      "hits" : {
        "total" : 2,
        "max_score" : 0.5906161,
        "hits" : [ {
          "_index" : "bjsxt",
          "_type" : "employee",
          "_id" : "AW0brHsbOCeeN2j3g-hG",
          "_score" : 0.5906161,
          "_source" : {
            "first_name" : "bin",
            "age" : 33,
            "about" : "I love to go rock climbing",
            "interests" : [ "sports", "music" ]
          }
        }, {
          "_index" : "bjsxt",
          "_type" : "employee",
          "_id" : "AW0brvCeOCeeN2j3g-hH",
          "_score" : 0.058849156,
          "_source" : {
            "first_name" : "gob bin",
            "age" : 43,
            "about" : "I love to go rock climbing",
            "interests" : [ "sports", "music" ]
          }
        } ]
      }
    }
    

      

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
    > {
    >  "query":
    >   {"bool" :
    >    {
    >     "must" : 
    >      {"match":
    >       {"first_name":"bin"}
    >      },
    >     "must" : 
    >      {"match":
    >       {"age":33}
    >      }
    >    }
    >   }
    > }'
    {
      "took" : 10,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "failed" : 0
      },
      "hits" : {
        "total" : 1,
        "max_score" : 1.163388,
        "hits" : [ {
          "_index" : "bjsxt",
          "_type" : "employee",
          "_id" : "AW0brHsbOCeeN2j3g-hG",
          "_score" : 1.163388,
          "_source" : {
            "first_name" : "bin",
            "age" : 33,
            "about" : "I love to go rock climbing",
            "interests" : [ "sports", "music" ]
          }
        } ]
      }
    }
    

      

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
    > {
    >  "query":
    >   {"bool" :
    >    {
    >     "must" : 
    >      {"match":
    >       {"first_name":"bin"}
    >      },
    >     "must_not" : 
    >      {"match":
    >       {"age":33}
    >      }
    >    }
    >   }
    > }'
    {
      "took" : 8,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "failed" : 0
      },
      "hits" : {
        "total" : 1,
        "max_score" : 0.19178301,
        "hits" : [ {
          "_index" : "bjsxt",
          "_type" : "employee",
          "_id" : "AW0brvCeOCeeN2j3g-hH",
          "_score" : 0.19178301,
          "_source" : {
            "first_name" : "gob bin",
            "age" : 43,
            "about" : "I love to go rock climbing",
            "interests" : [ "sports", "music" ]
          }
        } ]
      }
    }
    

      

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
    > {
    >  "query":
    >   {"bool" :
    >    {
    >     "must_not" : 
    >      {"match":
    >       {"first_name":"bin"}
    >      },
    >     "must_not" : 
    >      {"match":
    >       {"age":33}
    >      }
    >    }
    >   }
    > }'
    {
      "took" : 10,
      "timed_out" : false,
      "_shards" : {
        "total" : 5,
        "successful" : 5,
        "failed" : 0
      },
      "hits" : {
        "total" : 0,
        "max_score" : null,
        "hits" : [ ]
      }
    }
    

      

    以集合的方式思考
    

     

    [root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search -d '
    > {
    >  "query":
    >   {"bool" :
    >    {
    >    "must" :
    >     {"term" : 
    >      { "first_name" : "bin" }
    >     }
    >    ,
    >    "must_not" : 
    >     {"range":
    >      {"age" : { "from" : 20, "to" : 33 }
    >     }
    >    }
    >    }
    >   }
    > }'
    {"took":17,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":1,"max_score":0.19178301,"hits":[{"_index":"bjsxt","_type":"employee","_id":"AW0brvCeOCeeN2j3g-hH","_score":0.19178301,"_source":
    {
     "first_name" : "gob bin",
     "age" : 43,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    

      

     

    curl -XPUT 'http://192.168.112.101:9200/test2/' -d'{"settings":{"number_of_replicas":2}}'
    

      

    curl -XPUT 'http://192.168.112.101:9200/test3/' -d'{"settings":{"number_of_shards":3,"number_of_replicas":3}}'
    

      

    file
    segment(段,多个document组成)
    document(一条记录,一个对象实例)
    field(对象的属性)
    term(项,分词之后的词条)
    
    
    
    # yes
    curl -XPUT http://192.168.133.6:9200/bjsxt/
    # yes 
    curl -XDELETE http://192.168.133.6:9200/test2/
    curl -XDELETE http://192.168.133.6:9200/test3/
    
    #document:yes 
    curl -XPOST http://192.168.133.6:9200/bjsxt/employee -d '
    {
     "first_name" : "bin",
     "age" : 33,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    curl -XPOST http://192.168.133.6:9200/bjsxt/employee -d '
    {
     "first_name" : "gob bin",
     "age" : 43,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    curl -XPOST http://192.168.133.6:9200/bjsxt/employee/2 -d '
    {
     "first_name" : "bin",
     "age" : 45,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    
    #add field yes
    
    curl -XPOST http://192.168.133.6:9200/bjsxt/employee -d '
    {
     "first_name" : "pablo2",
     "age" : 33,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ],
     "sex": "man"
    }'
    
    curl -XPOST http://192.168.133.6:9200/bjsxt/employee/1 -d '
    {
     "first_name" : "pablo2",
     "age" : 35,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ],
     "sex": "man"
    }'
    
    
    ----------------------------------------
    
    
    #put:yes
    
    
    curl -XPUT http://192.168.133.6:9200/bjsxt/employee/1 -d '
    {
     "first_name" : "god bin",
     "last_name" : "pang",
     "age" : 42,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    curl -XPUT http://192.168.133.6:9200/bjsxt/employee -d '
    {
     "first_name" : "god bin",
     "last_name" : "bin",
     "age" : 45,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    
    curl -XPUT http://192.168.133.6:9200/bjsxt/employee/2 -d '
    {
     "first_name" : "god bin",
     "last_name" : "bin",
     "age" : 45,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    curl -XPUT http://192.168.133.6:9200/bjsxt/employee/1 -d '
    {
     "first_name" : "god bin",
     "last_name" : "pang",
     "age" : 40,
     "about" : "I love to go rock climbing",
     "interests": [ "sports", "music" ]
    }'
    
    
    
    #根据document的id来获取数据:(?pretty 用于格式化输出,可省略)
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/1?pretty
    
    #根据field来查询数据:
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?q=first_name="bin"
    
    #根据field来查询数据:match
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
    {
     "query":
      {"match":
       {"first_name":"bin"}
      }
    }'
    
    
    
    #对多个field发起查询:multi_match
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
    {
     "query":
      {"multi_match":
       {
        "query":"bin",
        "fields":["last_name","first_name"],
        "operator":"and"
       }
      }
    }'
    
    
    #多个term对多个field发起查询:bool(boolean) 
    # 组合查询,must,must_not,should 
    #  must + must : 交集
    #  must +must_not :差集
    #  should+should  : 并集
    
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
    {
     "query":
      {"bool" :
       {
        "must" : 
         {"match":
          {"first_name":"bin"}
         },
        "must" : 
         {"match":
          {"age":33}
         }
       }
      }
    }'
    
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
    {
     "query":
      {"bool" :
       {
        "must" : 
         {"match":
          {"first_name":"bin"}
         },
        "must_not" : 
         {"match":
          {"age":33}
         }
       }
      }
    }'
    
    
    
    
    
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
    {
     "query":
      {"bool" :
       {
        "must_not" : 
         {"match":
          {"first_name":"bin"}
         },
        "must_not" : 
         {"match":
          {"age":33}
         }
       }
      }
    }'
    
    ##查询first_name=bin,并且年龄不在20岁到33岁之间的(must + must_not:差集)
    
    curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search -d '
    {
     "query":
      {"bool" :
       {
       "must" :
        {"term" : 
         { "first_name" : "bin" }
        }
       ,
       "must_not" : 
        {"range":
         {"age" : { "from" : 20, "to" : 33 }
        }
       }
       }
      }
    }'
    
    
    #修改配置
    curl -XPUT 'http://192.168.133.6:9200/test2/' -d'{"settings":{"number_of_replicas":2}}'
    
    curl -XPUT 'http://192.168.133.6:9200/test3/' -d'{"settings":{"number_of_shards":3,"number_of_replicas":3}}'
    
    curl -XPUT 'http://192.168.133.6:9200/test4/' -d'{"settings":{"number_of_shards":6,"number_of_replicas":4}}'
    
    
    curl -XPOST http://192.168.9.11:9200/bjsxt/person/_mapping -d'
    {
        "person": {
            "properties": {
                "content": {
                    "type": "string",
                    "store": "no",
                    "term_vector": "with_positions_offsets",
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word",
                    "include_in_all": "true",
                    "boost": 8
                }
            }
        }
    }'
    

      

    官网
    https://www.elastic.co/guide/index.html
    https://www.elastic.co/guide/en/elasticsearch/client/index.html
    https://www.elastic.co/guide/en/elasticsearch/client/java-api/index.html
    https://www.elastic.co/guide/en/elasticsearch/client/java-api/2.2/transport-client.html
    

     

    爬取数据,作为document的原始文件。在linux上
    yum install wget  
    ## 如下命令爬取 http://news.cctv.com;并且按照原有网站的url目录存储到data下
    wget -o /tmp/wget.log -P /root/data  --no-parent --no-verbose -m -D news.cctv.com   -N --convert-links --random-wait -A html,HTML,shtml,SHTML http://news.cctv.com
    配置分词器
    https://github.com/medcl/elasticsearch-analysis-ik 
    版本必须与es相对应
    
    elasticsearch-2.2.1.zip 
    elasticsearch-analysis-ik-1.8.0.zip  ## 
    [sxt@node1 ik]$ pwd
    /opt/sxt/es/elasticsearch-2.2.1/plugins/ik  ## 修改如下配置文件
    [sxt@node1 ik]$ cat plugin-descriptor.properties | grep version=
    elasticsearch.version=2.2.1 ## 版本号也修改对应。
    
    ## 启动es.
    
    ## 运行java程序  createIndex
    
    package com.sxt.es;
    
    import java.io.File;
    import java.net.InetAddress;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
    import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
    import org.elasticsearch.action.search.SearchResponse;
    import org.elasticsearch.client.Client;
    import org.elasticsearch.client.Requests;
    import org.elasticsearch.client.transport.TransportClient;
    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.common.text.Text;
    import org.elasticsearch.common.transport.InetSocketTransportAddress;
    import org.elasticsearch.common.xcontent.XContentBuilder;
    import org.elasticsearch.common.xcontent.XContentFactory;
    import org.elasticsearch.index.query.BoolQueryBuilder;
    import org.elasticsearch.index.query.MatchQueryBuilder;
    import org.elasticsearch.index.query.MultiMatchQueryBuilder;
    import org.elasticsearch.index.query.MultiMatchQueryParser;
    import org.elasticsearch.index.query.RangeQueryBuilder;
    import org.elasticsearch.search.SearchHit;
    import org.elasticsearch.search.SearchHits;
    import org.junit.Test;
    import org.springframework.stereotype.Service;
    
    import com.sxt.util.HtmlTool;
    
    @Service
    public class IndexService {
    
    	//存放html文件的目录
    //	public static String DATA_DIR="C:\data\";
    	public static String DATA_DIR="d:\data\";
    	
    	public static Client client;
    
    	static {
    		Settings settings = Settings.settingsBuilder()
    				.put("cluster.name", "bjsxt-es").build();
    		try {
    			client = TransportClient
    					.builder()
    					.settings(settings)
    					.build()
    					.addTransportAddress(
    							new InetSocketTransportAddress(InetAddress
    									.getByName("node1"), 9300))
    					.addTransportAddress(
    							new InetSocketTransportAddress(InetAddress
    									.getByName("node2"), 9300))
    					.addTransportAddress(
    							new InetSocketTransportAddress(InetAddress
    									.getByName("node3"), 9300));
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    	}
    
    	/**
    	 * admin():管理索引库的。client.admin().indices()
    	 * 
    	 * 索引数据的管理:client.prepare
    	 * 
    	 */
    	@Test
    	public void createIndex() throws Exception {
    		IndicesExistsResponse resp = client.admin().indices().prepareExists("bjsxt").execute().actionGet();
    		if(resp.isExists()){
    			client.admin().indices().prepareDelete("bjsxt").execute().actionGet();
    		}
    		client.admin().indices().prepareCreate("bjsxt").execute().actionGet();
    
    		new XContentFactory();
    
    		XContentBuilder builder = XContentFactory.jsonBuilder().startObject()
    				.startObject("htmlbean").startObject("properties")
    				.startObject("title").field("type", "string")
    				.field("store", "yes").field("analyzer", "ik_max_word")
    				.field("search_analyzer", "ik_max_word").endObject()
    				.startObject("content").field("type", "string")
    				.field("store", "yes").field("analyzer", "ik_max_word")
    				.field("search_analyzer", "ik_max_word").endObject()
    //				.startObject("url").field("type", "string")
    //				.field("store", "yes").field("analyzer", "ik_max_word")
    //				.field("search_analyzer", "ik_max_word").endObject()
    				.endObject().endObject().endObject();
    		PutMappingRequest mapping = Requests.putMappingRequest("bjsxt").type("htmlbean").source(builder);
    		client.admin().indices().putMapping(mapping).actionGet();
    
    	}
    	
    	/**
    	 * 把源数据html文件添加到索引库中(构建索引文件)
    	 */
    	@Test
    	public void addHtmlToES(){
    		readHtml(new File(DATA_DIR));
    	}
    	
    	/**
    	 * 遍历数据文件目录d:/data ,递归方法
    	 * @param file
    	 */
    	public void readHtml(File file){
    		if(file.isDirectory()){
    			File[]  fs =file.listFiles();
    			for (int i = 0; i < fs.length; i++) {
    				File f = fs[i];
    				readHtml(f);
    			}
    		}else{
    			HtmlBean bean;
    			try {
    				bean = HtmlTool.parserHtml(file.getPath());
    				if(bean!=null){
    					Map<String, String> dataMap =new HashMap<String, String>();
    					dataMap.put("title", bean.getTitle());
    					dataMap.put("content", bean.getContent());
    					dataMap.put("url", bean.getUrl());
    					//写索引
    					client.prepareIndex("bjsxt", "htmlbean").setSource(dataMap).execute().actionGet();
    				}
    			} catch (Throwable e) {
    				e.printStackTrace();
    			}
    			
    		}
    	}
    	
    	/**
    	 * 搜索
    	 * @param kw
    	 * @param num
    	 * @return
    	 */
    	public PageBean<HtmlBean> search(String kw,int num,int count){
    		PageBean<HtmlBean> wr =new PageBean<HtmlBean>();
    		wr.setIndex(num);
    //		//构建查询条件
    //		MatchQueryBuilder q1 =new MatchQueryBuilder("title", kw);
    //		MatchQueryBuilder q2 =new MatchQueryBuilder("content", kw);
    //		
    //		//构建一个多条件查询对象
    //		BoolQueryBuilder q =new BoolQueryBuilder(); //组合查询条件对象
    //		q.should(q1);
    //		q.should(q2);
    		
    //		RangeQueryBuilder q1 =new RangeQueryBuilder("age");
    //		q1.from(18);
    //		q1.to(40);
    		
    		MultiMatchQueryBuilder q =new MultiMatchQueryBuilder(kw, new String[]{"title","content"});
    		SearchResponse resp=null;
    		if(wr.getIndex()==1){
    			resp = client.prepareSearch("bjsxt")
    					.setTypes("htmlbean")
    					.setQuery(q)
    					.addHighlightedField("title")
    					.addHighlightedField("content")
    					.setHighlighterPreTags("<font color="red">")
    					.setHighlighterPostTags("</font>")
    					.setHighlighterFragmentSize(40)//设置显示结果中一个碎片段的长度
    					.setHighlighterNumOfFragments(5)//设置显示结果中每个结果最多显示碎片段,每个碎片段之间用...隔开
    					.setFrom(0)
    					.setSize(10)
    					.execute().actionGet();
    			
    		}else{
    			wr.setTotalCount(count);
    			resp = client.prepareSearch("bjsxt")
    					.setTypes("htmlbean")
    					.setQuery(q)
    					.addHighlightedField("title")
    					.addHighlightedField("content")
    					.setHighlighterPreTags("<font color="red">")
    					.setHighlighterPostTags("</font>")
    					.setHighlighterFragmentSize(40)
    					.setHighlighterNumOfFragments(5)
    					.setFrom(wr.getStartRow())
    					.setSize(10)
    					.execute().actionGet();
    		}
    		SearchHits hits= resp.getHits();
    		wr.setTotalCount((int)hits.getTotalHits());
    		
    		for(SearchHit hit : hits.getHits()){
    			HtmlBean bean =new HtmlBean();
    			if(hit.getHighlightFields().get("title")==null){//title中没有包含关键字
    				bean.setTitle(hit.getSource().get("title").toString());//获取原来的title(没有高亮的title)
    			}else{
    				bean.setTitle(hit.getHighlightFields().get("title").getFragments()[0].toString());
    			}
    			if(hit.getHighlightFields().get("content")==null){//title中没有包含关键字
    				bean.setContent(hit.getSource().get("content").toString());//获取原来的title(没有高亮的title)
    			}else{
    				StringBuilder sb =new StringBuilder();
    				for(Text text: hit.getHighlightFields().get("content").getFragments()){
    					sb.append(text.toString()+"...");
    				}
    				bean.setContent(sb.toString());
    			}
    			
    			bean.setUrl("http://"+hit.getSource().get("url").toString());
    			wr.setBean(bean);
    			
    		}
    		
    		
    		return wr;
    	}
    	
    	
    //	@Test
    //	public void del(){
    ////		client.admin().indices().prepareDelete("bjsxt").execute().actionGet();
    //		client.admin().indices().prepareDelete("bjsxt2").execute().actionGet();
    //	}
    }
    
    ## 将linux wget 爬取到的数据存放到D:\下。
    ## 运行addHtmlToES()方法,数据文档添加到es中
    
    ## 如下是对项目:ES_SEARCH的演示效果。
    

      

     

    Windows 查看端口和pid,杀死pid
    C:\WINDOWS\system32>netstat -ano | findstr 8080
      TCP    0.0.0.0:8080           0.0.0.0:0              LISTENING       9448
      TCP    [::]:8080              [::]:0                 LISTENING       9448
    
    C:\WINDOWS\system32>taskkill /PID 9448 /F
    

      

  • 相关阅读:
    httpclient用法
    JS逻辑运算符&&与||的妙用
    jackson详解
    MVC +EF+linq 多表联查
    Log4net 集成到MVC+EF框架
    Asp.net中的页面跳转及post数据
    字符串的分割操作
    线程的信号机制
    事件的标准模式
    Java网络编程
  • 原文地址:https://www.cnblogs.com/xhzd/p/11503392.html
Copyright © 2011-2022 走看看