  [Elasticsearch Notes, Part 3] An Elasticsearch Search Engine Case Study

    Environment
      VM: VMware 10
      Linux: CentOS-6.5-x86_64
      SSH client: Xshell 4
      FTP client: Xftp 4
      JDK 8
      elasticsearch-2.2.0

    Step 1: Acquire the data
    Mainstream search engines use crawlers to fetch HTML pages from websites. Common tools include Nutch, Python crawlers (the most popular), and wget (written in C).
    Here wget is used to simulate a crawl.
    # Install wget
    yum install wget
    # Crawl the site with wget; the log is written to /tmp/wget.log
    # Options:
    # -o  write log messages to the given file
    # -P  directory prefix under which downloaded files are saved
    # -m  mirror mode (recursive download with timestamping)
    # -D  comma-separated list of domains to follow
    # -N  do not re-retrieve a file unless the remote copy is newer than the local one
    # --convert-links  rewrite links in downloaded pages so they work locally
    # --random-wait  wait a random interval between requests to avoid being blocked
    # -A  comma-separated list of accepted file extensions
    # The last argument is the URL of the site to crawl

    [cluster@PCS101 /]$ wget -o /tmp/wget.log -P /root/data --no-parent --no-verbose -m -D news.cctv.com -N --convert-links --random-wait -A html,HTML,shtml,SHTML http://news.cctv.com


    Step 2: Install the IK analyzer on the ES cluster
    Note: you must choose the IK release that matches your ES version: https://github.com/medcl/elasticsearch-analysis-ik


    # Upload the ik directory via FTP to /opt/cluster/es/elasticsearch-2.2.1/plugins
    # Change the owner of the directory
    [cluster@PCS101 plugins]$ chown -R cluster:cluster ik
    Edit plugin-descriptor.properties so that the declared ES version matches the cluster:
    elasticsearch.version=2.2.1

    Distribute to PCS102 and PCS103:
    [cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS102:`pwd`
    [cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS103:`pwd`
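
    After the plugin has been copied to every node, restart the ES instances so the plugin is loaded. As an optional sanity check (not part of the original walkthrough), the sketch below asks the cluster to analyze a sample string with ik_max_word through the ES 2.x Java analyze API; the cluster name and transport address are the same values assumed in IndexService further down, and the class name IkCheck is hypothetical:

    package com.sxt.util;

    import java.net.InetAddress;

    import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
    import org.elasticsearch.client.Client;
    import org.elasticsearch.client.transport.TransportClient;
    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.common.transport.InetSocketTransportAddress;

    public class IkCheck {

        public static void main(String[] args) throws Exception {
            // same cluster name and node address as IndexService (assumptions; adjust to your setup)
            Settings settings = Settings.settingsBuilder().put("cluster.name", "wjy-es").build();
            Client client = TransportClient.builder().settings(settings).build()
                    .addTransportAddress(new InetSocketTransportAddress(
                            InetAddress.getByName("134.32.123.101"), 9300));
            // ask the cluster to analyze a sample string with the IK analyzer;
            // if the plugin is missing, this call fails with an "analyzer not found" error
            AnalyzeResponse resp = client.admin().indices()
                    .prepareAnalyze("中华人民共和国国歌")
                    .setAnalyzer("ik_max_word")
                    .get();
            for (AnalyzeResponse.AnalyzeToken token : resp.getTokens()) {
                System.out.println(token.getTerm());
            }
            client.close();
        }
    }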

    Step 3: Data extraction: extract data from the web pages
    HtmlTool.java

    package com.sxt.util;
    
    import java.io.File;
    
    import com.sxt.es.HtmlBean;
    import com.sxt.es.IndexService;
    
    import net.htmlparser.jericho.CharacterReference;
    import net.htmlparser.jericho.Element;
    import net.htmlparser.jericho.HTMLElementName;
    import net.htmlparser.jericho.Source;
    
    public class HtmlTool {
        
        /**
         * Filter and clean the data: convert an HTML page into a JavaBean
         * @param path path to the HTML file
         */
        public static HtmlBean parserHtml(String path)throws Throwable{
            HtmlBean bean  =new HtmlBean();
            Source source=new Source(new File(path));
            source.fullSequentialParse();
            Element titleElement=source.getFirstElement(HTMLElementName.TITLE);
            if(titleElement==null){
                return null;
            }else{
                String title=CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
                bean.setTitle(title);
            }
            String content =source.getTextExtractor().setIncludeAttributes(true).toString();
            
            String url =path.substring(IndexService.DATA_DIR.length());
            bean.setContent(content);
            bean.setUrl(url);
            return bean;
      }
        
        /**
         * @param args
         */
        public static void main(String[] args) {
            try {
                System.out.println(parserHtml("e:\\data\\news.cctv.com\\2017\\05\\01\\ARTI0k5MFLx2cvzQZffwQcUp170501.shtml").getContent());
            } catch (Throwable e) {
                e.printStackTrace();
            }
        }
    }


    HtmlBean.java

    package com.sxt.es;
    
    public class HtmlBean {
    
        private int id;
        private String title;
        private String content;
        private String url;
        public int getId() {
            return id;
        }
        public void setId(int id) {
            this.id = id;
        }
        public String getTitle() {
            return title;
        }
        public void setTitle(String title) {
            this.title = title;
        }
        public String getContent() {
            return content;
        }
        public void setContent(String content) {
            this.content = content;
        }
        public String getUrl() {
            return url;
        }
        public void setUrl(String url) {
            this.url = url;
        }
        
    }

    Step 4: Index the extracted data in ES
    # Create the index
    IndexService.java::createIndex
    # Index the data into ES
    IndexService.java::addHtmlToES

    package com.sxt.es;
    
    import java.io.File;
    import java.net.InetAddress;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
    import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
    import org.elasticsearch.action.search.SearchResponse;
    import org.elasticsearch.client.Client;
    import org.elasticsearch.client.Requests;
    import org.elasticsearch.client.transport.TransportClient;
    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.common.text.Text;
    import org.elasticsearch.common.transport.InetSocketTransportAddress;
    import org.elasticsearch.common.xcontent.XContentBuilder;
    import org.elasticsearch.common.xcontent.XContentFactory;
    import org.elasticsearch.index.query.BoolQueryBuilder;
    import org.elasticsearch.index.query.MatchQueryBuilder;
    import org.elasticsearch.index.query.MultiMatchQueryBuilder;
    import org.elasticsearch.index.query.MultiMatchQueryParser;
    import org.elasticsearch.index.query.RangeQueryBuilder;
    import org.elasticsearch.search.SearchHit;
    import org.elasticsearch.search.SearchHits;
    import org.junit.Test;
    import org.springframework.stereotype.Service;
    
    import com.sxt.util.HtmlTool;
    
    @Service
    public class IndexService {
    
        // directory where the crawled HTML files are stored
        public static String DATA_DIR="e:\\data\\";
        // ES client
        public static Client client;
    
        static {
            // set the name of the cluster to connect to
            Settings settings = Settings.settingsBuilder().put("cluster.name", "wjy-es").build();
            try {
                // create a client connected to the cluster
                client = TransportClient
                        .builder()
                        .settings(settings)
                        .build()
                        .addTransportAddress(
                                new InetSocketTransportAddress(InetAddress
                                        .getByName("134.32.123.101"), 9300))
                        .addTransportAddress(
                                new InetSocketTransportAddress(InetAddress
                                        .getByName("134.32.123.102"), 9300))
                        .addTransportAddress(
                                new InetSocketTransportAddress(InetAddress
                                        .getByName("134.32.123.103"), 9300));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        /**
         * Create the index.
         * admin() manages indices: client.admin().indices()
         * Index data operations use client.prepare*()
         */
        @Test
        public void createIndex() throws Exception {
            IndicesExistsResponse resp = client.admin().indices().prepareExists("testes").execute().actionGet();
            // if the index already exists, delete the old one
            if(resp.isExists()){
                client.admin().indices().prepareDelete("testes").execute().actionGet();
            }
            // create the index
            client.admin().indices().prepareCreate("testes").execute().actionGet();
    
    
            XContentBuilder builder = XContentFactory.jsonBuilder()
                    .startObject()
                    .startObject("htmlbean")
                    .startObject("properties")//type
                    .startObject("title")
                    .field("type", "string")//字符串类型
                    .field("store", "yes")//是否保存
                    .field("analyzer", "ik_max_word")//指定分词器
                    .field("search_analyzer", "ik_max_word")//指定搜索依赖的分词器
                    .endObject()
                    .startObject("content")
                    .field("type", "string")
                    .field("store", "yes")
                    .field("analyzer", "ik_max_word")
                    .field("search_analyzer", "ik_max_word")
                    .endObject()
                    .startObject("url")
                    .field("type", "string")
                    .field("store", "yes")
                    .field("analyzer", "ik_max_word")
                    .field("search_analyzer", "ik_max_word")
                    .endObject()
                    .endObject()
                    .endObject()
                    .endObject();
            // create the type by putting the mapping
            PutMappingRequest mapping = Requests.putMappingRequest("testes").type("htmlbean").source(builder);
            client.admin().indices().putMapping(mapping).actionGet();
    
        }
        
        /**
         * Add the source HTML files to the index (build the index)
         */
        @Test
        public void addHtmlToES(){
            readHtml(new File(DATA_DIR));
        }
        
        /**
         * Recursively walk the data directory (DATA_DIR)
         * @param file
         */
        public void readHtml(File file){
            if(file.isDirectory()){
                File[]  fs =file.listFiles();
                for (int i = 0; i < fs.length; i++) {
                    File f = fs[i];
                    readHtml(f);
                }
            }else{
                HtmlBean bean;
                try {
                    bean = HtmlTool.parserHtml(file.getPath());
                    if(bean!=null){
                        Map<String, String> dataMap =new HashMap<String, String>();
                        dataMap.put("title", bean.getTitle());
                        dataMap.put("content", bean.getContent());
                        dataMap.put("url", bean.getUrl());
                        // write the document to the index
                        client.prepareIndex("testes", "htmlbean").setSource(dataMap).execute().actionGet();
                    }
                } catch (Throwable e) {
                    e.printStackTrace();
                }
                
            }
        }
        
        /**
         * Search
         * @param kw    search keyword
         * @param num   page number (1-based)
         * @param count total hit count carried over from a previous page (used when paging)
         * @return
         */
        public PageBean<HtmlBean> search(String kw,int num,int count){
            PageBean<HtmlBean> wr =new PageBean<HtmlBean>();
            wr.setIndex(num);
    //        // build the individual query conditions
    //        MatchQueryBuilder q1 =new MatchQueryBuilder("title", kw);
    //        MatchQueryBuilder q2 =new MatchQueryBuilder("content", kw);
    //        
    //        // combine them into a bool query
    //        BoolQueryBuilder q =new BoolQueryBuilder(); // compound query object
    //        q.should(q1);
    //        q.should(q2);
            
    //        RangeQueryBuilder q1 =new RangeQueryBuilder("age");
    //        q1.from(18);
    //        q1.to(40);
            
            MultiMatchQueryBuilder q =new MultiMatchQueryBuilder(kw, new String[]{"title","content"});
            SearchResponse resp=null;
            if(wr.getIndex()==1){// first page
                resp = client.prepareSearch("testes")
                        .setTypes("htmlbean")
                        .setQuery(q)
                        .addHighlightedField("title")
                        .addHighlightedField("content")
                        .setHighlighterPreTags("<font color="red">")
                        .setHighlighterPostTags("</font>")
                        .setHighlighterFragmentSize(40)//设置显示结果中一个碎片段的长度
                        .setHighlighterNumOfFragments(5)//设置显示结果中每个结果最多显示碎片段,每个碎片段之间用...隔开
                        .setFrom(0)//从第几个开始
                        .setSize(10)//第一页数量
                        .execute()
                        .actionGet();
                
            }else{
                wr.setTotalCount(count);
                resp = client.prepareSearch("testes")
                        .setTypes("htmlbean")
                        .setQuery(q)
                        .addHighlightedField("title")//高亮显示
                        .addHighlightedField("content")
                        .setHighlighterPreTags("<font color="red">")//红色显示
                        .setHighlighterPostTags("</font>")
                        .setHighlighterFragmentSize(40)
                        .setHighlighterNumOfFragments(5)
                        .setFrom(wr.getStartRow())
                        .setSize(10)
                        .execute()
                        .actionGet();
            }
            SearchHits hits= resp.getHits();
            wr.setTotalCount((int)hits.getTotalHits());
            
            for(SearchHit hit : hits.getHits())
            {
                HtmlBean bean =new HtmlBean();
                if(hit.getHighlightFields().get("title")==null)
                {// the title does not contain the keyword
                    bean.setTitle(hit.getSource().get("title").toString());// fall back to the original, non-highlighted title
                }
                else
                {
                    bean.setTitle(hit.getHighlightFields().get("title").getFragments()[0].toString());
                }
                
                if(hit.getHighlightFields().get("content")==null)
                {// the content does not contain the keyword
                    bean.setContent(hit.getSource().get("content").toString());// fall back to the original, non-highlighted content
                }
                else
                {
                    StringBuilder sb =new StringBuilder();
                    for(Text text: hit.getHighlightFields().get("content").getFragments())
                    {
                        sb.append(text.toString()+"...");
                    }
                    bean.setContent(sb.toString());
                }
                
                bean.setUrl("http://"+hit.getSource().get("url").toString());
                wr.setBean(bean);
            }
            
            
            return wr;
        }
        
        
    //    @Test
    //    public void del(){
    ////        client.admin().indices().prepareDelete("testes").execute().actionGet();
    //        client.admin().indices().prepareDelete("testes").execute().actionGet();
    //    }
    }
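
    To tie the pieces together, a minimal driver class (hypothetical, not part of the original project) could rebuild the index, load the crawled pages, and run a first search. It only calls the methods defined above; the class name SearchDemo and the sample keyword are assumptions:

    package com.sxt.es;

    public class SearchDemo {

        public static void main(String[] args) throws Exception {
            IndexService service = new IndexService();
            // 1. (re)create the index and its IK-based mapping
            service.createIndex();
            // 2. walk DATA_DIR and index every HTML file
            service.addHtmlToES();
            // 3. first-page search for a sample keyword (hypothetical keyword);
            //    newly indexed documents become searchable after the next refresh (about one second by default)
            PageBean<HtmlBean> page = service.search("新闻", 1, 0);
            System.out.println("total hits: " + page.getTotalCount());
            if (page.getList() != null) {
                for (HtmlBean bean : page.getList()) {
                    System.out.println(bean.getTitle() + " -> " + bean.getUrl());
                }
            }
            IndexService.client.close();
        }
    }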


    Step 5: Search the data

    IndexService.java::search (see the code above)
    PageBean.java

    package com.sxt.es;
    
    import java.util.ArrayList;
    import java.util.List;
    
    
    public class PageBean<T> {
        private int size = 10;// records per page
        private int index = 1;// current page number
        private int totalCount = 0;// total number of records

        private int totalPageCount = 1;// total number of pages


        private int[] numbers;// page numbers to display
        protected List<T> list;// data to render on the page
    
        /**
         * Get the start row of the current page
         * @return
         */
        public int getStartRow() {
    
            return (index - 1) * size;
        }
    
        /**
         * Get the end row of the current page (exclusive)
         * @return
         */
        public int getEndRow() {
            
            return index * size;
        }
    
        /**
         * @return Returns the size.
         */
        public int getSize() {        
            return size;
        }
    
        /**
         * @param size
         * The size to set.
         */
        public void setSize(int size) {
            if (size > 0) {
                this.size = size;
            }
        }
        /**
         * @return Returns the currentPageNo.
         */
        public int getIndex() {
            if (totalPageCount == 0) {
                
                return 0;
            }
            
            return index;
        }
    
        /**
         * @param currentPageNo
         * The currentPageNo to set.
         */
        public void setIndex(int index) {
            if (index > 0) {
                this.index = index;
            }
        }
    
        /**
         * @return Returns the totalCount.
         */
        public int getTotalCount() {
            return totalCount;
        }
    
        /**
         * @param totalCount
         *  The totalCount to set.
         */
        public void setTotalCount(int totalCount) {
            if (totalCount >= 0) {
                this.totalCount = totalCount;
                setTotalPageCountByRs();// compute the total page count from the total record count
            }
        }
    
        
        public int getTotalPageCount() {
            return this.totalPageCount;
        }
    
        /**
         * Compute the total page count from the total record count
         */
        private void setTotalPageCountByRs() {
            if (this.size > 0 && this.totalCount > 0 && this.totalCount % this.size == 0) {
                this.totalPageCount = this.totalCount / this.size;
            } else if (this.size > 0 && this.totalCount > 0 && this.totalCount % this.size > 0) {
                this.totalPageCount = (this.totalCount / this.size) + 1;
            } else {
                this.totalPageCount = 0;
            }
            setNumbers(totalPageCount);// build the list of page numbers to display
        }
    
        public int[] getNumbers() {
            return numbers;
        }
        
        /**
         * Build the array of page numbers to display
         * @param totalPageCount
         */
        public void setNumbers(int totalPageCount) {
            if(totalPageCount>0){
                // length of the page-number window (at most 10 entries)
                int[] numbers = new int[totalPageCount>10?10:totalPageCount];// page numbers shown on the page
                int k =0;
                // 1. fewer than 10 total pages: show them all, e.g. 1 2 3 4 ... 7
                // 2. 10 or more total pages:
                //      current page <= 6:            1 2 3 4 ... 10
                //      current page near the end:    ... 12 13 14 15
                //      otherwise: a window of page numbers roughly centered on the current page
                for(int i = 0;i < totalPageCount;i++){
                    // keep the current page near the middle of the window
                    if((i>=index- (numbers.length/2+1) || i >= totalPageCount-numbers.length) && k<numbers.length){
                        numbers[k] = i+1;
                        k++;
                    }else if(k>=numbers.length){
                        break;
                    }                
                }
                
                this.numbers = numbers;
            }
            
        }
        
        public void setNumbers(int[] numbers) {
            this.numbers = numbers;
        }
    
        public List<T> getList() {
            return list;
        }
    
        public void setList(List<T> list) {
            this.list = list;
        }
    
        public void setBean(T bean){
            if(this.list==null){
                list =new ArrayList<T>();
            }
            list.add(bean);
        }
    
    /*
        public static int getTotalPageCount(int iTotalRecordCount, int iPageSize) {
            if (iPageSize == 0) {
                return 0;
            } else {
                return (iTotalRecordCount % iPageSize) == 0 ? (iTotalRecordCount / iPageSize) : (iTotalRecordCount / iPageSize) + 1;
            }
        }*/
    }
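
    A quick worked example of the paging math (hypothetical, not from the original post): with the default page size of 10, page 3 starts at row 20, and setTotalCount derives both the total page count and the page-number window. The class name PageBeanDemo is an assumption:

    package com.sxt.es;

    public class PageBeanDemo {

        public static void main(String[] args) {
            PageBean<HtmlBean> page = new PageBean<HtmlBean>();
            page.setIndex(3);       // current page number
            page.setTotalCount(95); // 95 hits with size 10 -> 10 pages
            System.out.println(page.getStartRow());       // 20, the offset passed to setFrom()
            System.out.println(page.getEndRow());         // 30 (exclusive end row)
            System.out.println(page.getTotalPageCount()); // 10
            System.out.println(page.getNumbers().length); // up to 10 page numbers to render
        }
    }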

    Start the ES_SEARCH web application and open http://localhost:8080/ES_SEARCH

    Enter a keyword and run a search:

    Project code download link: https://download.csdn.net/download/cac2020/11015439

Original article: https://www.cnblogs.com/cac2020/p/10518548.html