zoukankan      html  css  js  c++  java
  • 【Elasticsearch学习之三】Elasticsearch 搜索引擎案例

    环境
      虚拟机:VMware 10
      Linux版本:CentOS-6.5-x86_64
      客户端:Xshell4
      FTP:Xftp4
      jdk8
      elasticsearch-2.2.0

    第一步:获取数据
    主流搜索引擎,会使用爬虫,来获取网站的html数据,常用的工具有nutch,Python(主流),wget(c语言)
    这里使用wget模拟
    #安装wget
    yum install wget
    #使用wget从文件wget.log爬取数据
    #参数:
    #-o 指定爬取内容输出日志名
    #-P 爬取生成文件目录父目录
    #-m 拷贝
    #-D 列举爬取域名清单
    #-N 不重新检索文件, 除非更新文件时间晚于本地时间
    #--convert-links 根据路径转换成目录
    #--random-wait 随机等待 间隔性爬取 以防止被禁止访问
    #-A 指定文档类型
    #最后指定爬取网站地址

    [cluster@PCS101 /] wget -o /tmp/wget.log -P /root/data --no-parent --no-verbose -m -D news.cctv.com -N --convert-links --random-wait -A html,HTML,shtml,SHTML http://news.cctv.com


    第二步:ES集群安装分词器IK
    注意:必选选择与ES对应的IK版本 https://github.com/medcl/elasticsearch-analysis-ik


    #将ik目录直接ftp上传到目录下/opt/cluster/es/elasticsearch-2.2.1/plugins
    #更改目录所有者
    [cluster@PCS101 plugins]$ chown -R cluster:cluster ik
    修改配置 plugin-descriptor.properties
    elasticsearch.version=2.2.1

    分发至102、103:
    [cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS102:`pwd`
    [cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS103:`pwd`

    第三步:数据抽取:从网页中抽取数据
    HtmlTool.java

    package com.sxt.util;
    
    import java.io.File;
    
    import com.sxt.es.HtmlBean;
    import com.sxt.es.IndexService;
    
    import net.htmlparser.jericho.CharacterReference;
    import net.htmlparser.jericho.Element;
    import net.htmlparser.jericho.HTMLElementName;
    import net.htmlparser.jericho.Source;
    
    public class HtmlTool {
        
        /**
         * 数据过滤清洗 将网页转换成javabean
         * @param path html 文件路径
         */
        public static HtmlBean parserHtml(String path)throws Throwable{
            HtmlBean bean  =new HtmlBean();
            Source source=new Source(new File(path));
            source.fullSequentialParse();
            Element titleElement=source.getFirstElement(HTMLElementName.TITLE);
            if(titleElement==null){
                return null;
            }else{
                String title=CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
                bean.setTitle(title);
            }
            String content =source.getTextExtractor().setIncludeAttributes(true).toString();
            
            String url =path.substring(IndexService.DATA_DIR.length());
            bean.setContent(content);
            bean.setUrl(url);
            return bean;
      }
        
        /**
         * @param args
         */
        public static void main(String[] args) {
            try {
                System.out.println(parserHtml("e:\data\news.cctv.com\2017\05\01\ARTI0k5MFLx2cvzQZffwQcUp170501.shtml").getContent());
            } catch (Throwable e) {
                e.printStackTrace();
            }
        }
    }


    HtmlBean.java

    package com.sxt.es;
    
    public class HtmlBean {
    
        private int id;
        private String title;
        private String content;
        private String url;
        public int getId() {
            return id;
        }
        public void setId(int id) {
            this.id = id;
        }
        public String getTitle() {
            return title;
        }
        public void setTitle(String title) {
            this.title = title;
        }
        public String getContent() {
            return content;
        }
        public void setContent(String content) {
            this.content = content;
        }
        public String getUrl() {
            return url;
        }
        public void setUrl(String url) {
            this.url = url;
        }
        
    }

    第四步:把抽取出来的数据同ES建立索引
    #创建索引库
    IndexService.java::createIndex
    #数据同ES建立索引
    IndexService.java::addHtmlToES

    package com.sxt.es;
    
    import java.io.File;
    import java.net.InetAddress;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
    import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
    import org.elasticsearch.action.search.SearchResponse;
    import org.elasticsearch.client.Client;
    import org.elasticsearch.client.Requests;
    import org.elasticsearch.client.transport.TransportClient;
    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.common.text.Text;
    import org.elasticsearch.common.transport.InetSocketTransportAddress;
    import org.elasticsearch.common.xcontent.XContentBuilder;
    import org.elasticsearch.common.xcontent.XContentFactory;
    import org.elasticsearch.index.query.BoolQueryBuilder;
    import org.elasticsearch.index.query.MatchQueryBuilder;
    import org.elasticsearch.index.query.MultiMatchQueryBuilder;
    import org.elasticsearch.index.query.MultiMatchQueryParser;
    import org.elasticsearch.index.query.RangeQueryBuilder;
    import org.elasticsearch.search.SearchHit;
    import org.elasticsearch.search.SearchHits;
    import org.junit.Test;
    import org.springframework.stereotype.Service;
    
    import com.sxt.util.HtmlTool;
    
    @Service
    public class IndexService {
    
        //存放html文件的目录
        public static String DATA_DIR="e:\data\";
        //定义客户端
        public static Client client;
    
        static {
            //设置连接集群名
            Settings settings = Settings.settingsBuilder().put("cluster.name", "wjy-es").build();
            try {
                //创建连接集群客户端
                client = TransportClient
                        .builder()
                        .settings(settings)
                        .build()
                        .addTransportAddress(
                                new InetSocketTransportAddress(InetAddress
                                        .getByName("134.32.123.101"), 9300))
                        .addTransportAddress(
                                new InetSocketTransportAddress(InetAddress
                                        .getByName("134.32.123.102"), 9300))
                        .addTransportAddress(
                                new InetSocketTransportAddress(InetAddress
                                        .getByName("134.32.123.103"), 9300));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        /**
         * 创建索引库
         * admin():管理索引库的。client.admin().indices()
         * 索引数据的管理:client.prepare
         */
        @Test
        public void createIndex() throws Exception {
            IndicesExistsResponse resp = client.admin().indices().prepareExists("testes").execute().actionGet();
            //存在删除旧的
            if(resp.isExists()){
                client.admin().indices().prepareDelete("testes").execute().actionGet();
            }
            //创建
            client.admin().indices().prepareCreate("testes").execute().actionGet();
    
            new XContentFactory();
    
            XContentBuilder builder = XContentFactory.jsonBuilder()
                    .startObject()
                    .startObject("htmlbean")
                    .startObject("properties")//type
                    .startObject("title")
                    .field("type", "string")//字符串类型
                    .field("store", "yes")//是否保存
                    .field("analyzer", "ik_max_word")//指定分词器
                    .field("search_analyzer", "ik_max_word")//指定搜索依赖的分词器
                    .endObject()
                    .startObject("content")
                    .field("type", "string")
                    .field("store", "yes")
                    .field("analyzer", "ik_max_word")
                    .field("search_analyzer", "ik_max_word")
                    .endObject()
                    .startObject("url")
                    .field("type", "string")
                    .field("store", "yes")
                    .field("analyzer", "ik_max_word")
                    .field("search_analyzer", "ik_max_word")
                    .endObject()
                    .endObject()
                    .endObject()
                    .endObject();
            //创建type
            PutMappingRequest mapping = Requests.putMappingRequest("testes").type("htmlbean").source(builder);
            client.admin().indices().putMapping(mapping).actionGet();
    
        }
        
        /**
         * 把源数据html文件添加到索引库中(构建索引文件)
         */
        @Test
        public void addHtmlToES(){
            readHtml(new File(DATA_DIR));
        }
        
        /**
         * 遍历数据文件目录d:/data ,递归方法
         * @param file
         */
        public void readHtml(File file){
            if(file.isDirectory()){
                File[]  fs =file.listFiles();
                for (int i = 0; i < fs.length; i++) {
                    File f = fs[i];
                    readHtml(f);
                }
            }else{
                HtmlBean bean;
                try {
                    bean = HtmlTool.parserHtml(file.getPath());
                    if(bean!=null){
                        Map<String, String> dataMap =new HashMap<String, String>();
                        dataMap.put("title", bean.getTitle());
                        dataMap.put("content", bean.getContent());
                        dataMap.put("url", bean.getUrl());
                        //写索引
                        client.prepareIndex("testes", "htmlbean").setSource(dataMap).execute().actionGet();
                    }
                } catch (Throwable e) {
                    e.printStackTrace();
                }
                
            }
        }
        
        /**
         * 搜索
         * @param kw
         * @param num
         * @return
         */
        public PageBean<HtmlBean> search(String kw,int num,int count){
            PageBean<HtmlBean> wr =new PageBean<HtmlBean>();
            wr.setIndex(num);
    //        //构建查询条件
    //        MatchQueryBuilder q1 =new MatchQueryBuilder("title", kw);
    //        MatchQueryBuilder q2 =new MatchQueryBuilder("content", kw);
    //        
    //        //构建一个多条件查询对象
    //        BoolQueryBuilder q =new BoolQueryBuilder(); //组合查询条件对象
    //        q.should(q1);
    //        q.should(q2);
            
    //        RangeQueryBuilder q1 =new RangeQueryBuilder("age");
    //        q1.from(18);
    //        q1.to(40);
            
            MultiMatchQueryBuilder q =new MultiMatchQueryBuilder(kw, new String[]{"title","content"});
            SearchResponse resp=null;
            if(wr.getIndex()==1){//第一页
                resp = client.prepareSearch("testes")
                        .setTypes("htmlbean")
                        .setQuery(q)
                        .addHighlightedField("title")
                        .addHighlightedField("content")
                        .setHighlighterPreTags("<font color="red">")
                        .setHighlighterPostTags("</font>")
                        .setHighlighterFragmentSize(40)//设置显示结果中一个碎片段的长度
                        .setHighlighterNumOfFragments(5)//设置显示结果中每个结果最多显示碎片段,每个碎片段之间用...隔开
                        .setFrom(0)//从第几个开始
                        .setSize(10)//第一页数量
                        .execute()
                        .actionGet();
                
            }else{
                wr.setTotalCount(count);
                resp = client.prepareSearch("testes")
                        .setTypes("htmlbean")
                        .setQuery(q)
                        .addHighlightedField("title")//高亮显示
                        .addHighlightedField("content")
                        .setHighlighterPreTags("<font color="red">")//红色显示
                        .setHighlighterPostTags("</font>")
                        .setHighlighterFragmentSize(40)
                        .setHighlighterNumOfFragments(5)
                        .setFrom(wr.getStartRow())
                        .setSize(10)
                        .execute()
                        .actionGet();
            }
            SearchHits hits= resp.getHits();
            wr.setTotalCount((int)hits.getTotalHits());
            
            for(SearchHit hit : hits.getHits())
            {
                HtmlBean bean =new HtmlBean();
                if(hit.getHighlightFields().get("title")==null)
                {//title中没有包含关键字
                    bean.setTitle(hit.getSource().get("title").toString());//获取原来的title(没有高亮的title)
                }
                else
                {
                    bean.setTitle(hit.getHighlightFields().get("title").getFragments()[0].toString());
                }
                
                if(hit.getHighlightFields().get("content")==null)
                {//content中没有包含关键字
                    bean.setContent(hit.getSource().get("content").toString());//获取原来的content(没有高亮的content)
                }
                else
                {
                    StringBuilder sb =new StringBuilder();
                    for(Text text: hit.getHighlightFields().get("content").getFragments())
                    {
                        sb.append(text.toString()+"...");
                    }
                    bean.setContent(sb.toString());
                }
                
                bean.setUrl("http://"+hit.getSource().get("url").toString());
                wr.setBean(bean);
            }
            
            
            return wr;
        }
        
        
    //    @Test
    //    public void del(){
    ////        client.admin().indices().prepareDelete("testes").execute().actionGet();
    //        client.admin().indices().prepareDelete("testes").execute().actionGet();
    //    }
    }


    第五步:搜索数据

    IndexService.java::search  见上面代码
    PageBean.java

    package com.sxt.es;
    
    import java.util.ArrayList;
    import java.util.List;
    
    
    public class PageBean<T> {
        private int size = 10;//每页显示记录
        private int index = 1;// 当前页号    
        private int totalCount = 0;// 记录总数
        
        private int totalPageCount = 1;// 总页
        
        
        private int[] numbers;//展示页数集合
        protected List<T> list;//要显示到页面的数据集
    
        /**
         * 得到
         * @return
         */
        public int getStartRow() {
    
            return (index - 1) * size;
        }
    
        /**
         * 得到结束记录
         * @return
         */
        public int getEndRow() {
            
            return index * size;
        }
    
        /**
         * @return Returns the size.
         */
        public int getSize() {        
            return size;
        }
    
        /**
         * @param size
         * The size to set.
         */
        public void setSize(int size) {
            if (size > 0) {
                this.size = size;
            }
        }
        /**
         * @return Returns the currentPageNo.
         */
        public int getIndex() {
            if (totalPageCount == 0) {
                
                return 0;
            }
            
            return index;
        }
    
        /**
         * @param currentPageNo
         * The currentPageNo to set.
         */
        public void setIndex(int index) {
            if (index > 0) {
                this.index = index;
            }
        }
    
        /**
         * @return Returns the totalCount.
         */
        public int getTotalCount() {
            return totalCount;
        }
    
        /**
         * @param totalCount
         *  The totalCount to set.
         */
        public void setTotalCount(int totalCount) {
            if (totalCount >= 0) {
                this.totalCount = totalCount;
                setTotalPageCountByRs();//根据总记录数计算总页
            }
        }
    
        
        public int getTotalPageCount() {
            return this.totalPageCount;
        }
    
        /**
         * 根据总记录数计算总页
         */
        private void setTotalPageCountByRs() {
            if (this.size > 0 && this.totalCount > 0 && this.totalCount % this.size == 0) {
                this.totalPageCount = this.totalCount / this.size;
            } else if (this.size > 0 && this.totalCount > 0 && this.totalCount % this.size > 0) {
                this.totalPageCount = (this.totalCount / this.size) + 1;
            } else {
                this.totalPageCount = 0;
            }
            setNumbers(totalPageCount);//获取展示页数集合
        }
    
        public int[] getNumbers() {
            return numbers;
        }
        
        /**
         * 设置显示页数集合
         * @param totalPageCount
         */
        public void setNumbers(int totalPageCount) {
            if(totalPageCount>0){
                //!.当前数组的长度
                int[] numbers = new int[totalPageCount>10?10:totalPageCount];//页面要显示的页数集合
                int k =0;
                //
                //1.数组长度<10   1 2 3 4 ....   7
                //2.数组长度>=10
                //     当前页<=6  1 2 3 4    10
                //     当前页>=总页数-5           ......12 13 14 15  
                //     其他                                5  6  7 8   当前页(10)  10  11 12  13
                for(int i = 0;i < totalPageCount;i++){
                    //保证当前页为集合的中�?
                    if((i>=index- (numbers.length/2+1) || i >= totalPageCount-numbers.length) && k<numbers.length){
                        numbers[k] = i+1;
                        k++;
                    }else if(k>=numbers.length){
                        break;
                    }                
                }
                
                this.numbers = numbers;
            }
            
        }
        
        public void setNumbers(int[] numbers) {
            this.numbers = numbers;
        }
    
        public List<T> getList() {
            return list;
        }
    
        public void setList(List<T> list) {
            this.list = list;
        }
    
        public void setBean(T bean){
            if(this.list==null){
                list =new ArrayList<T>();
            }
            list.add(bean);
        }
    
    /*
        public static int getTotalPageCount(int iTotalRecordCount, int iPageSize) {
            if (iPageSize == 0) {
                return 0;
            } else {
                return (iTotalRecordCount % iPageSize) == 0 ? (iTotalRecordCount / iPageSize) : (iTotalRecordCount / iPageSize) + 1;
            }
        }*/
    }

    启动ES_SEARCH web,访问 http://localhost:8080/ES_SEARCH

    输入关键词 搜索:

    项目代码下载连接:https://download.csdn.net/download/cac2020/11015439

  • 相关阅读:
    Saltstack module gem 详解
    Saltstack module freezer 详解
    Saltstack module firewalld 详解
    Saltstack module file 详解
    Saltstack module event 详解
    Saltstack module etcd 详解
    Saltstack module environ 详解
    Saltstack module drbd 详解
    Saltstack module dnsutil 详解
    获取主页_剥离百度
  • 原文地址:https://www.cnblogs.com/cac2020/p/10518548.html
Copyright © 2011-2022 走看看