zoukankan      html  css  js  c++  java
  • 仿京东搜索

    仿京东搜索

    项目介绍:基于 Spring Boot 的前后端分离项目,利用 Jsoup 爬虫将京东搜索结果页的商品数据爬取下来,然后将数据存入 Elasticsearch,通过后端配置查询规则实现仿京东搜索。
    功能:实现分页高亮查询
    主要负责:
    1、使用 Jsoup 爬取数据。2、实现数据搜索(条件、精确、分页、高亮搜索)
    项目地址:https://gitee.com/jamer/jingdong-search

    依赖

    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Maven build descriptor for the jd-search demo: a Spring Boot web application that
         crawls pages with jsoup and indexes/searches the results in Elasticsearch. -->
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <!-- Dependencies declared below without a <version> rely on this parent for one. -->
        <parent>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-parent</artifactId>
            <version>2.2.5.RELEASE</version>
            <relativePath/> <!-- lookup parent from repository -->
        </parent>
        <groupId>com.renzhe</groupId>
        <artifactId>jd-search</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <name>jd-search</name>
        <description>Demo project for Spring Boot</description>
    
        <properties>
            <java.version>1.8</java.version>
            <!-- Sets the Elasticsearch library version to 7.6.1 instead of the parent's default.
                 NOTE(review): presumably chosen to match the Elasticsearch server in use; confirm. -->
            <elasticsearch.version>7.6.1</elasticsearch.version>
        </properties>
    
    
        <dependencies>
            <!-- Apache Tika would be the choice for parsing movies/music; no Tika dependency is declared here. -->
            <!-- jsoup: fetches and parses web pages (HTML). -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.11.3</version>
            </dependency>
            <!-- fastjson: JSON serialization library. -->
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.50</version>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-thymeleaf</artifactId>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-web</artifactId>
            </dependency>
    
            <!-- Development-time tooling; runtime scope and optional. -->
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-devtools</artifactId>
                <scope>runtime</scope>
                <optional>true</optional>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-configuration-processor</artifactId>
                <optional>true</optional>
            </dependency>
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <optional>true</optional>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-test</artifactId>
                <scope>test</scope>
            </dependency>
        </dependencies>
    
        <build>
            <plugins>
                <plugin>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-maven-plugin</artifactId>
                    <configuration>
                        <!-- Lombok is excluded from the artifact built by this plugin. -->
                        <excludes>
                            <exclude>
                                <groupId>org.projectlombok</groupId>
                                <artifactId>lombok</artifactId>
                            </exclude>
                        </excludes>
                    </configuration>
                </plugin>
            </plugins>
        </build>
    
    </project>
    

    工具类utils 用来解析网页 

    package com.renzhe.utils;
    
    import com.renzhe.pojo.Content;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.springframework.boot.autoconfigure.condition.ConditionalOnJava;
    import org.springframework.stereotype.Component;
    
    import java.io.IOException;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Crawls the JD search results page and extracts the listed products.
     */
    @Component
    public class HtmlParseUtil {

        /** JD search endpoint; the URL-encoded keyword is appended to it. */
        private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";

        /** Timeout, in milliseconds, for fetching and parsing the page. */
        private static final int TIMEOUT_MILLIS = 30000;

        /**
         * Fetches the JD search page for {@code keyword} and turns every {@code <li>} inside the
         * {@code J_goodsList} element into a {@link Content}.
         *
         * <p>Requires network access. Only the HTML returned by the initial request is parsed, so
         * content loaded later via AJAX is not available.
         *
         * @param keyword search term, e.g. {@code "Java"}; must not be {@code null}
         * @return the parsed products, or an empty list when the page has no
         *         {@code J_goodsList} element
         * @throws IllegalArgumentException if {@code keyword} is {@code null}
         * @throws IOException if the page cannot be fetched or parsed
         */
        public List<Content> parseJD(String keyword) throws IOException {
            if (keyword == null) {
                throw new IllegalArgumentException("keyword must not be null");
            }
            // Encode the keyword so characters such as '&', '#', '%' or spaces cannot
            // break or truncate the query string.
            String url = SEARCH_URL + java.net.URLEncoder.encode(keyword, "UTF-8");

            // Jsoup returns a Document comparable to the browser's DOM document.
            Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);

            List<Content> goodsList = new ArrayList<>();
            Element goodsContainer = document.getElementById("J_goodsList");
            if (goodsContainer == null) {
                // The expected container is absent (e.g. a different page was served):
                // report "no products" instead of failing with a NullPointerException.
                return goodsList;
            }

            // Each <li> below the container is treated as one product entry.
            Elements items = goodsContainer.getElementsByTag("li");
            for (Element el : items) {
                // Image-heavy sites usually lazy-load pictures, so the real image URL is
                // stored in "data-lazy-img" rather than in "src".
                String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
                String price = el.getElementsByClass("p-price").eq(0).text();
                String title = el.getElementsByClass("p-name").eq(0).text();

                Content content = new Content();
                content.setImg(img);
                content.setPrice(price);
                content.setTitle(title);

                goodsList.add(content);
            }
            return goodsList;
        }
    }
    

    pojo类  

    package com.renzhe.pojo;
    
    import lombok.AllArgsConstructor;
    import lombok.Data;
    import lombok.NoArgsConstructor;
    import lombok.ToString;
    
    /**
     * One crawled product entry. Accessors, constructors and {@code toString()} are
     * generated by Lombok from the annotations below.
     */
    @Data
    @ToString
    @NoArgsConstructor
    @AllArgsConstructor
    public class Content {

        /** Product title text. */
        private String title;

        /** Product image URL. */
        private String img;

        /** Product price, kept as the text scraped from the page. */
        private String price;

    }

    service

    package com.renzhe.service;
    
    import com.alibaba.fastjson.JSON;
    import com.renzhe.pojo.Content;
    import com.renzhe.utils.HtmlParseUtil;
    import org.elasticsearch.action.bulk.BulkRequest;
    import org.elasticsearch.action.bulk.BulkResponse;
    import org.elasticsearch.action.index.IndexRequest;
    import org.elasticsearch.action.search.SearchRequest;
    import org.elasticsearch.action.search.SearchResponse;
    import org.elasticsearch.client.RequestOptions;
    import org.elasticsearch.client.RestHighLevelClient;
    import org.elasticsearch.common.text.Text;
    import org.elasticsearch.common.unit.TimeValue;
    import org.elasticsearch.common.xcontent.XContentType;
    import org.elasticsearch.index.query.QueryBuilders;
    import org.elasticsearch.index.query.TermQueryBuilder;
    import org.elasticsearch.search.SearchHit;
    import org.elasticsearch.search.builder.SearchSourceBuilder;
    import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
    import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Service;
    
    import javax.swing.text.Highlighter;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.TimeUnit;
    
    /**
     * Business logic: crawls JD products into the {@code jd_goods} Elasticsearch index and
     * runs paged searches (optionally with highlighting) against it.
     */
    @Service
    public class ContentService {

        /** Index that stores the crawled products. */
        private static final String INDEX_NAME = "jd_goods";

        /** Document field that is queried and highlighted. */
        private static final String TITLE_FIELD = "title";

        @Autowired
        private RestHighLevelClient restHighLevelClient;

        /**
         * Crawls the products for {@code keywords} and bulk-indexes them into {@code jd_goods}.
         *
         * @param keywords search term passed to the crawler
         * @return {@code true} if the bulk request completed without failures; {@code false}
         *         if it reported failures or the crawl produced nothing to index
         * @throws IOException if crawling or the Elasticsearch call fails
         */
        public Boolean parseContent(String keywords) throws IOException {
            List<Content> contents = new HtmlParseUtil().parseJD(keywords);
            if (contents == null || contents.isEmpty()) {
                // A bulk request without any action is rejected by the client, so skip the call.
                return false;
            }

            BulkRequest bulkRequest = new BulkRequest();
            bulkRequest.timeout("2m");
            for (Content content : contents) {
                bulkRequest.add(new IndexRequest(INDEX_NAME)
                        .source(JSON.toJSONString(content), XContentType.JSON));
            }
            BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
            return !bulk.hasFailures();
        }

        /**
         * Searches {@code jd_goods} for documents whose title matches {@code keyword}.
         *
         * @param keyword  term to look up in the title field
         * @param pageNo   1-based page number; values below 1 are treated as 1
         * @param pageSize maximum number of hits per page
         * @return the source map of every hit on the requested page
         * @throws IOException if the Elasticsearch call fails
         */
        public List<Map<String,Object>> searchPage(String keyword,int pageNo,int pageSize) throws IOException{
            SearchResponse searchResponse = executeSearch(newSourceBuilder(keyword, pageNo, pageSize));

            List<Map<String,Object>> list = new ArrayList<>();
            for (SearchHit hit : searchResponse.getHits().getHits()) {
                list.add(hit.getSourceAsMap());
            }
            return list;
        }

        /**
         * Same as {@link #searchPage(String, int, int)}, but the {@code title} of each hit is
         * replaced by its highlighted version, in which matches are wrapped in
         * {@code <span style='color:red'>...</span>}.
         *
         * @param keyword  term to look up in the title field
         * @param pageNo   1-based page number; values below 1 are treated as 1
         * @param pageSize maximum number of hits per page
         * @return the source map of every hit on the requested page, with highlighted titles
         * @throws IOException if the Elasticsearch call fails
         */
        public List<Map<String,Object>> searchPageHighlightBuilder(String keyword,int pageNo,int pageSize) throws IOException{
            SearchSourceBuilder sourceBuilder = newSourceBuilder(keyword, pageNo, pageSize);

            HighlightBuilder highlightBuilder = new HighlightBuilder();
            highlightBuilder.field(TITLE_FIELD);
            highlightBuilder.requireFieldMatch(false);
            highlightBuilder.preTags("<span style='color:red'>");
            highlightBuilder.postTags("</span>");
            sourceBuilder.highlighter(highlightBuilder);

            SearchResponse searchResponse = executeSearch(sourceBuilder);

            List<Map<String,Object>> list = new ArrayList<>();
            for (SearchHit hit : searchResponse.getHits().getHits()) {
                Map<String,Object> sourceAsMap = hit.getSourceAsMap(); // the original document
                HighlightField title = hit.getHighlightFields().get(TITLE_FIELD);
                if (title != null && title.fragments() != null) {
                    // Join the fragments and use the result in place of the plain title.
                    StringBuilder highlighted = new StringBuilder();
                    for (Text fragment : title.fragments()) {
                        highlighted.append(fragment.toString());
                    }
                    sourceAsMap.put(TITLE_FIELD, highlighted.toString());
                }
                list.add(sourceAsMap);
            }
            return list;
        }

        /** Builds the paged term query on the title field that both search methods share. */
        private SearchSourceBuilder newSourceBuilder(String keyword, int pageNo, int pageSize) {
            int page = Math.max(pageNo, 1);

            SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
            // from() expects a zero-based document offset, not a page number.
            sourceBuilder.from((page - 1) * pageSize);
            sourceBuilder.size(pageSize);

            // Exact term lookup: the keyword itself is not analyzed.
            // NOTE(review): on an analyzed text field this only matches single indexed
            // tokens - confirm against the jd_goods mapping/analyzer.
            TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery(TITLE_FIELD, keyword);
            sourceBuilder.query(termQueryBuilder);
            sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
            return sourceBuilder;
        }

        /** Runs the given search definition against the product index. */
        private SearchResponse executeSearch(SearchSourceBuilder sourceBuilder) throws IOException {
            SearchRequest searchRequest = new SearchRequest(INDEX_NAME);
            searchRequest.source(sourceBuilder);
            return restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        }

    }
    

    config

    package com.renzhe.config;
    
    import org.apache.http.HttpHost;
    import org.elasticsearch.client.RestClient;
    import org.elasticsearch.client.RestHighLevelClient;
    import org.springframework.context.annotation.Bean;
    import org.springframework.context.annotation.Configuration;
    
    /**
     * Spring configuration that exposes the Elasticsearch high-level REST client as a bean.
     */
    @Configuration
    public class ElasticSearchClientConfig {

        /**
         * Creates a client that talks to the node at {@code http://127.0.0.1:9200}.
         *
         * @return the client bean
         */
        @Bean
        public RestHighLevelClient restHighLevelClient(){
            HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
            return new RestHighLevelClient(RestClient.builder(localNode));
        }
    }
    

    controller

    package com.renzhe.controller;
    
    import com.renzhe.service.ContentService;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Controller;
    import org.springframework.web.bind.annotation.GetMapping;
    import org.springframework.web.bind.annotation.PathVariable;
    import org.springframework.web.bind.annotation.RestController;
    
    import java.io.IOException;
    import java.util.List;
    import java.util.Map;
    
    /**
     * REST endpoints called by the front end.
     */
    @RestController
    public class ContentController {

        @Autowired
        private ContentService contentService;

        /**
         * Triggers crawling and indexing for the given keywords.
         *
         * @param keywords search term taken from the request path
         * @return the flag returned by {@code ContentService.parseContent}
         * @throws IOException propagated from the service
         */
        @GetMapping("/parse/{keywords}")
        public Boolean parse(@PathVariable("keywords") String keywords) throws IOException {
            final Boolean indexed = contentService.parseContent(keywords);
            // Echo the outcome on standard output.
            System.out.println(indexed);
            return indexed;
        }

        /**
         * Runs the highlighted, paged search.
         *
         * @param keyword  term to search for
         * @param pageNo   requested page number
         * @param pageSize number of hits per page
         * @return the hits produced by {@code ContentService.searchPageHighlightBuilder}
         * @throws Exception propagated from the service
         */
        @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
        public List<Map<String,Object>> search(
                @PathVariable("keyword") String keyword,
                @PathVariable("pageNo") int pageNo,
                @PathVariable("pageSize") int pageSize) throws Exception{
            List<Map<String,Object>> hits =
                    contentService.searchPageHighlightBuilder(keyword, pageNo, pageSize);
            return hits;
        }
    }
    

    application.properties

    server.port=9090
    #关闭thymeleaf的缓存
    spring.thymeleaf.cache=false
    

      

      

      

      

  • 相关阅读:
    RP2833 FPGA对应串口标识
    rp2833 网卡以及串口与接插件位置关系
    环境检测 短信收发的测试
    #A号板测试汇总
    #8号板测试汇总
    #2号板测试汇总
    #6号板问题
    【POJ3045】Cow Acrobats(贪心)
    【HDU1219】AC Me(水题)
    BUPT2017 wintertraining(15) #2 题解
  • 原文地址:https://www.cnblogs.com/jamers-rz/p/14438682.html
Copyright © 2011-2022 走看看