zoukankan      html  css  js  c++  java
  • CVPR论文爬取并进行词云展示

    效果图:

     

     源码如下:

    首先是Python对cvpr论文的爬取部分:爬取的网址为 http://openaccess.thecvf.com

    import  pymysql
    import re
    import requests
    
    # 连接数据库函数
    def insertCvpr(value):
    
        try:
            db = pymysql.connect("localhost", "root", "root", "jiaoli")
            print("数据库连接成功!")
            cur = db.cursor()
            sql = 'INSERT INTO cvpr(title,ab,hotword,pdf) VALUE (%s,%s,%s,%s)'
            cur.execute(sql, value)
            db.commit()
            print("增加数据成功!")
        except pymysql.Error as e:
            print("增加数据失败:  " + str(e))
            db.rollback()
    
        db.close()
    
    
        #开头
        url = "http://openaccess.thecvf.com/ICCV2019.py"
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"}
        res = requests.get(url,headers=headers)
        res.encoding = "utf-8"
        # 先爬取每个论文的网址
        web = re.findall("""<dt class="ptitle"><br><a href="(.*?)">.*?</a></dt>""", res.text, re.S)
        for each in web:
            try:
                each = "http://openaccess.thecvf.com/" + each
                print(each)
                res = requests.get(each, headers=headers, timeout=(3, 7))
                res.encoding = "utf-8"
                # 在各各论文网站中爬取详细信息
                title = re.findall("""<div id="papertitle">(.*?)</div>""", res.text, re.S)
                ab = re.findall("""<div id="abstract" >(.*?)</div>""", res.text, re.S)
                pdf = re.findall("""[<a href="../../(.*?)">pdf</a>]""", res.text, re.S)
                if (len(title) > 0):
                    title = title[0].replace("
    ", "")
                    ab = ab[0].replace("
    ", "")
                    pdf = "http://openaccess.thecvf.com/" + pdf[0]
                    print(title)
                    value = (title, ab, "", pdf)
                    insertCvpr(value)
            except:
                print("闪过")
    爬虫

    然后对爬取的文章进行题目和摘要的关键词分析,分析出词频最高的有效词汇: 

    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;
    import java.util.ArrayList;
    
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    import javax.servlet.http.HttpSession;
    
        //2.将删除改成类名
        /**
         * Servlet implementation class index
         */
        @WebServlet("/input")
        public class input extends HttpServlet{
            private static final long serialVersionUID = 1L;
            
            /**
             * @see HttpServlet#HttpServlet()
             */
            public input() {
                super();
                // TODO Auto-generated constructor stub
            }
    
            /**
             * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
             */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        
        request.setCharacterEncoding("UTF-8");
        response.setContentType("text/html;charset=UTF-8");
        //声明缓冲区
        HttpSession session = request.getSession();
        String url = "jdbc:mysql://localhost:3306/cvpr?&useSSL=false&serverTimezone=UTC&useUnicode=yes&characterEncoding=utf8";        
        
        Connection conn = null;
        PreparedStatement ps = null;
    
        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
            conn = DriverManager.getConnection(url, "root", "root");
            
        } catch (ClassNotFoundException e) {
            response.getWriter().print("加载驱动失败");
        } catch (SQLException e) {
            response.getWriter().print("连接数据库失败");
        }
        
        
        StringBuffer buffer2 = new StringBuffer();
        try {
            Statement stmt = conn.createStatement();
            //1.改sql语句        
                ResultSet rs = stmt.executeQuery("select * from cvpr");
            while (rs.next()) {            
                String title=new String(rs.getString("title"));        
                String ab=new String(rs.getString("ab"));        
                buffer2.append(title);
                buffer2.append(ab);        
            }
            
    }catch (SQLException e) {
            response.getWriter().print("查找失败");
        }    
        String file = buffer2.toString();
        String[] a=file.split("[^a-zA-Z]+");   
        int n=a.length;
        int kind=0,zs=0;  
        Object[][] b=new Object[n][2];
        for(;zs<n;zs++){
            int k=0;
            for(int i=0;i<kind;i++){
                if(((String) b[i][0]).equalsIgnoreCase(a[zs])){
                    b[i][1]=(int)b[i][1]+1;
                    k=1;
                    break;
                }
            }
            if(k==0){
                b[kind][0]=a[zs];
                b[kind][1]=1;
                kind++;
            } 
        }
        int max=0;
        int p=0,q=0;
        String m;
        String[] c=new String[1000];
        int[] d=new int[1000];
        for(int i=0;i<1000;i++)
        {
            c[i]="";
            d[i]=0;
        }
        for(int i=0;i<kind;i++){
            for(int j=0;j<kind;j++){
                if((int)b[j][1]>(int)b[max][1])
                {if(((String) b[j][0]).length()<4)
                    {j=j++;
                    continue;}
                if(((String) b[j][0]).equals("With"))
                    {j++;
                    continue;}
                if(((String) b[j][0]).equals("with"))
                    {j++;
                    continue;}
                if(((String) b[j][0]).equals("that"))
                    {j++;
                    continue;}
                if(((String) b[j][0]).equals("this"))
                    {j++;
                    continue;}
                if(((String) b[j][0]).equals("from"))
                    {j++;
                    continue;}
                if(((String) b[j][0]).equals("which"))
                {j++;
                continue;}
                 else max=j;
                }
                }
            System.out.println(b[max][0]+"出现次数为:"+b[max][1]);
            if(i<30)
            {c[i]=(String) b[max][0];
             d[i]= (int) b[max][1];
             b[max][1]=0;    
            }
            else
            b[max][1]=0;    
    }
    
        session.setAttribute("c",c); 
        session.setAttribute("d",d); 
        request.getRequestDispatcher( "reciyun.jsp").forward(request,response);
    }
    }
    View Code

    然后是展示界面:

    展示界面需要导入echarts.min.js    echarts-wordcloud.min.js两个包

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Document</title>
    </head>
    <body>
    <%String num[] = (String[])session.getAttribute("c");%>
    <%int num2[] = (int[])session.getAttribute("d");%>
    
    <form action="tiaozhuan" method="post">
    <div id = "main" style=" 1200px;height: 800px;"></div>
    </form>
    
    <script type="text/javascript" src = "js/echarts.min.js"></script>
    <script type="text/javascript" src = "js/echarts-wordcloud.min.js"></script>
    <script type="text/javascript">
             
             var worldCloudcharts=echarts.init(document.getElementById('main'));
             var worldCloudoption = {
                     title: {
                         text: 'CVPR热词',
                         x: 'center',
                         textStyle: {
                             fontSize: 23,
                             color:'#FFFFFF'
                         }
     
                     },
                     tooltip: {
                         show: true
                     },
                     series: [{
                         name: 'CVPR热词',
                         type: 'wordCloud',
                         sizeRange: [20, 130],
                         rotationRange: [-45, 90],
                         textPadding: 0,
                         autoSize: {
                             enable: true,
                             minSize: 10
                         },
                         textStyle: {
                             normal: {
                                 color: function() {
                                     return 'rgb(' + [
                                         Math.round(Math.random() * 160),
                                         Math.round(Math.random() * 160),
                                         Math.round(Math.random() * 160)
                                     ].join(',') + ')';
                                 }
                             },
                             emphasis: {
                                 shadowBlur: 10,
                                 shadowColor: '#333'
                             }
                         },
                         data: [{
                             name: "Jayfee",
                             value: 666
                         }, {
                             name: "Nancy",
                             value: 520
                         }]
                     }]
                 };
     
                 var JosnList = [];
     
                 JosnList.push(
                         <%for(int i=0;i<29;i++)
                         {
                         %>
                         {name: "<%=num[i]%>",value: <%=num2[i]%>,url:'tiaozhuan?title=<%=num[i]%>'},
                         <%
                         }
                         %>
                         {name: "<%=num[99]%>",value: <%=num2[29]%>,url:'tiaozhuan?title=<%=num[29]%>'}
                         );
    
             worldCloudoption.series[0].data = JosnList;
             worldCloudcharts.setOption(worldCloudoption);
             worldCloudcharts.on("click",function(e){
                 console.log(e);
                 window.open(e.data.url);
             });
         </script>
    </body>
    </html>
    jsp

    最后对关键词在数据库进行模糊查询把相关文章信息返回jsp中展示:

    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;
    import java.util.ArrayList;
    
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    import javax.servlet.http.HttpSession;
    
    import com.cvpr.lun.user;
    
    //2.将删除改成类名
    /**
     * Servlet implementation class index
     */
    @WebServlet("/tiaozhuan")
    public class tiaozhuan extends HttpServlet{
        private static final long serialVersionUID = 1L;
        
        /**
         * @see HttpServlet#HttpServlet()
         */
        public tiaozhuan() {
            super();
            // TODO Auto-generated constructor stub
        }
    
        /**
         * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
         */
        protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
            
            request.setCharacterEncoding("UTF-8");
            response.setContentType("text/html;charset=UTF-8");
            //声明list
            ArrayList<user> list = new ArrayList();
            //声明缓冲区
            HttpSession session = request.getSession();
            
            String url = "jdbc:mysql://localhost:3306/cvpr?&useSSL=false&serverTimezone=UTC&useUnicode=yes&characterEncoding=utf8";        
            
            Connection conn = null;
            PreparedStatement ps = null;
    
            try {
                Class.forName("com.mysql.cj.jdbc.Driver");
                conn = DriverManager.getConnection(url, "root", "root");
                
            } catch (ClassNotFoundException e) {
                response.getWriter().print("加载驱动失败");
            } catch (SQLException e) {
                response.getWriter().print("连接数据库失败");
            }
            
            String name=request.getParameter("title"); 
            System.out.printf(name);
            
            try {
                Statement stmt = conn.createStatement();
                //1.改sql语句        
                 ResultSet rs = stmt.executeQuery("select * from cvpr where title like '%"+name+"%' ");
                while (rs.next()) {            
                    String title=new String(rs.getString("title"));    
                    String ab=new String(rs.getString("ab"));    
                    String pdf=new String(rs.getString("pdf"));    
                    user use2=new user(title,ab,pdf);
                    list.add(use2);
                    System.out.printf(title);
                }
    
                
        }catch (SQLException e) {
                response.getWriter().print("查找失败");
            }    
            request.setAttribute("list",list);       
            request.getRequestDispatcher( "tiaozhuan.jsp").forward(request,response);
            //***************************************
        }
        protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
            // TODO Auto-generated method stub
            doGet(request, response);
        }
        
        
    }
    servlet
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Document</title>
    </head>
    <body>
        <c:forEach items="${list}" var="use2" >    
            书名:${use2.title}<br/>
            摘要:${use2.ab}<br/>
            连接:<a href =${use2.pdf} >查看原文</a><br/>
        </c:forEach>
    </body>
    </html>
    jsp
  • 相关阅读:
    [转载]instanceof和typeof区别
    【转载】DNN架构
    Delphi实现高性能的Socket通讯服务器(完成端口模型IOCP)
    record, packed record和变体记录
    Delphi操作Word的几个知识点
    WinSock学习笔记6:IOCP完成端口模型
    MyEclipse 常用设置和操作方法
    PAIP.一些流氓软件的流氓营销方法.txt
    qq安全使用指南.txt
    海量数据高性能分页
  • 原文地址:https://www.cnblogs.com/dwx8845/p/12714894.html
Copyright © 2011-2022 走看看