zoukankan      html  css  js  c++  java
  • CVF顶会热词统计

    image-20210621010128271

    image-20210621010148561

    论文链接爬取

    首先是获取网页的html代码,然后通过正则表达式获取论文的下载链接

     /**
      * Fetches {@code requesturl} with an HTTPS GET and returns the response body as text.
      *
      * <p>The body is decoded as UTF-8 and read line by line; the lines are concatenated
      * without their line terminators.
      *
      * @param requesturl absolute {@code https://} URL to fetch (the connection is cast to
      *                   {@code HttpsURLConnection}, so other schemes are not supported)
      * @return the text read so far; an empty string when the URL is malformed or the
      *         request fails before any data arrives. Never {@code null}.
      */
     public static String getHTMLText(String requesturl){
         // Allocated up front so a failed request yields "" instead of a NullPointerException.
         StringBuilder html = new StringBuilder();
         HttpsURLConnection connection = null;
         try {
             connection = (HttpsURLConnection) new URL(requesturl).openConnection();
             // Same limits as getInputStream, so a stalled server cannot block forever.
             connection.setConnectTimeout(10000);
             connection.setReadTimeout(10000);
             connection.setDoInput(true);
             connection.setRequestMethod("GET");
             // try-with-resources closes the reader (and the wrapped streams) on every path.
             try (InputStream body = connection.getInputStream();
                  BufferedReader reader = new BufferedReader(new InputStreamReader(body, "utf-8"))) {
                 String line;
                 while ((line = reader.readLine()) != null) {
                     html.append(line);
                 }
             }
         } catch (IOException e) {
             e.printStackTrace();
         } finally {
             if (connection != null) {
                 connection.disconnect();
             }
         }
         return html.toString();
     }
    //返回的,一个是论文名,一个是论文链接
    public static Map<String,List<String>> getPapers(){
            Map<String,List<String>> urlAndName=new HashMap();
            String htmlStr=GetDataUtil.getHTMLText("https://openaccess.thecvf.com/WACV2021");
            List<String> urls= new ArrayList<>();
            List<String> nameS= new ArrayList<>();
            String url = "";
            Pattern p_url;
            Matcher m_url;
            String regEx_url = "href="([\w\s./:]+?)"";
            p_url = Pattern.compile
                    (regEx_url, Pattern.CASE_INSENSITIVE);
            m_url = p_url.matcher(htmlStr);
            while (m_url.find()) {
                url = m_url.group();
                if (url.contains(".pdf")){
                    url=url.substring(url.indexOf(""")+1);
                    url=url.substring(0,url.indexOf("""));
                    urls.add("https://openaccess.thecvf.com/"+url);
                    if (url.contains("papers")){
                        url=url.substring(url.indexOf("papers")+7);
                    }else if (url.contains("supplemental")){
                        url=url.substring(url.indexOf("supplemental")+13);
                    }
                    nameS.add(url);
                }
            }
            urlAndName.put("name",nameS);
            urlAndName.put("url",urls);
            return urlAndName;
        }
    

    论文爬取

    这里主要进行的是获取pdf论文

    转成txt文本为下一步做准备

    获取pdf

    /**
     * Opens an HTTPS GET connection to {@code requesturl} and hands back the response stream.
     *
     * <p>The connection setup largely duplicates getHTMLText; it was left as a separate method.
     * Connect and read timeouts are both 10 seconds.
     *
     * @param requesturl absolute {@code https://} URL to download
     * @return the open response stream (the caller closes it), or {@code null} when an
     *         {@code IOException} occurs; the stack trace is printed in that case
     */
    public static InputStream getInputStream(String requesturl){
        try {
            HttpsURLConnection connection = (HttpsURLConnection) new URL(requesturl).openConnection();
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);
            connection.setDoInput(true);
            connection.setRequestMethod("GET");
            return connection.getInputStream();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
    
    /**
     * Copies {@code inputStream} into a new file at {@code path}.
     *
     * <p>An existing file is never overwritten: the method prints "文件已存在" and returns.
     * The input stream is always closed, whether or not anything was written. If the copy
     * fails part-way, the incomplete file is deleted so a later call can retry instead of
     * treating the fragment as an existing download.
     *
     * @param inputStream source data; {@code null} (for example a failed download) is
     *                    reported on stderr and otherwise ignored
     * @param path        destination file path
     */
    public static void saveAsPdf(InputStream inputStream,String path){
        if (inputStream == null) {
            System.err.println("saveAsPdf: no input stream, nothing written to " + path);
            return;
        }
        // try-with-resources guarantees the source is closed on every path.
        try (InputStream in = inputStream) {
            File target = new File(path);
            if (target.exists()) {
                System.out.println("文件已存在");
                return;
            }
            boolean complete = false;
            try {
                try (FileOutputStream out = new FileOutputStream(target)) {
                    // Copy in chunks rather than one byte per call.
                    byte[] chunk = new byte[8192];
                    int read;
                    while ((read = in.read(chunk)) != -1) {
                        out.write(chunk, 0, read);
                    }
                }
                complete = true;
            } finally {
                if (!complete) {
                    // Remove the fragment; otherwise the exists() check blocks every retry.
                    target.delete();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    

    转为txt

      /**
         * 传入一个.pdf文件
         * @param file
         * @throws Exception
         */
        public static void readPdf(String file) throws Exception {
            // 是否排序
            boolean sort = false;
            // pdf文件名
            String pdfFile = file;
            // 输入文本文件名称
            String textFile = null;
            // 编码方式
            String encoding = "UTF-8";
            // 开始提取页数
            int startPage = 1;
            // 结束提取页数
            int endPage = Integer.MAX_VALUE;
            // 文件输入流,生成文本文件
            Writer output = null;
            // 内存中存储的PDF Document
            PDDocument document = null;
            try {
                try {
                    // 首先当作一个URL来装载文件,如果得到异常再从本地文件系统//去装载文件
                    URL url = new URL(pdfFile);
                    //注意参数已不是以前版本中的URL.而是File。
                    document = PDDocument.load(new File(pdfFile));
                    // 获取PDF的文件名
                    String fileName = url.getFile();
                    // 以原来PDF的名称来命名新产生的txt文件
                    if (fileName.length() > 4) {
                        File outputFile = new File(fileName.substring(0, fileName.length() - 4)+ ".txt");
                        textFile ="F:\CVF\PDF\"+outputFile.getName();
                    }
                } catch (MalformedURLException e) {
                    // 如果作为URL装载得到异常则从文件系统装载
                    //注意参数已不是以前版本中的URL.而是File。
                    document = PDDocument.load(new File(pdfFile));
                    if (pdfFile.length() > 4) {
                        textFile = pdfFile.substring(0, pdfFile.length() - 4)+ ".txt";
                    }
                }
                // 文件输入流,写入文件倒textFile
                output = new OutputStreamWriter(new FileOutputStream(textFile),encoding);
                // PDFTextStripper来提取文本
                PDFTextStripper stripper = null;
                stripper = new PDFTextStripper();
                // 设置是否排序
                stripper.setSortByPosition(sort);
                // 设置起始页
                stripper.setStartPage(startPage);
                // 设置结束页
                stripper.setEndPage(endPage);
                // 调用PDFTextStripper的writeText提取并输出文本
                stripper.writeText(document, output);
    
                System.out.println(textFile + " 输出成功!");
    
                if (output != null) {
                    // 关闭输出流
                    output.close();
                }
                if (document != null) {
                    // 关闭PDF Document
                    document.close();
                }
            } finally {
                if (output != null) {
                    // 关闭输出流
                    output.close();
                }
                if (document != null) {
                    // 关闭PDF Document
                    document.close();
                }
            }
        }
    

    论文内关键词提取

    这里用到了GitHub上一个开源项目

    项目链接:RAKE-MASTER

    Java运行powershell类

    /**
     * Small helper for launching external commands from Java.
     *
     * <p>Child processes inherit this JVM's standard streams. Nothing here reads the
     * child's output, so leaving it on pipes could stall a child that prints a lot; with
     * inherited streams the output (including errors) shows up on the console instead.
     * Failures to start are reported by printing the exception message; no exception
     * escapes these methods.
     */
    public class RunPowerShell {
        /**
         * Echoes the command to stdout and starts it without waiting for it to finish.
         *
         * @param arstringCommand program name followed by its arguments, one per element
         */
        public void execCommand(String[] arstringCommand) {
            for (int i = 0; i < arstringCommand.length; i++) {
                System.out.print(arstringCommand[i] + " ");
            }
            try {
                new ProcessBuilder(arstringCommand).inheritIO().start();
            } catch (java.io.IOException | RuntimeException e) {
                System.out.println(e.getMessage());
            }
        }

        /**
         * Starts the command and blocks until it exits.
         *
         * <p>The command line is split on whitespace, so arguments that contain spaces
         * are not supported.
         *
         * @param arstringCommand full command line, e.g. {@code "python rake.py in.txt"}
         */
        public void execCommand(String arstringCommand) {
            try {
                java.util.StringTokenizer tokenizer = new java.util.StringTokenizer(arstringCommand);
                String[] command = new String[tokenizer.countTokens()];
                for (int i = 0; tokenizer.hasMoreTokens(); i++) {
                    command[i] = tokenizer.nextToken();
                }
                if (command.length == 0) {
                    throw new IllegalArgumentException("Empty command");
                }
                Process process = new ProcessBuilder(command).inheritIO().start();
                process.waitFor();
            } catch (InterruptedException e) {
                // Restore the flag so callers further up can still see the interruption.
                Thread.currentThread().interrupt();
                System.out.println(e.getMessage());
            } catch (java.io.IOException | RuntimeException e) {
                System.out.println(e.getMessage());
            }
        }

        /** Windows-only demo: opens the calculator, then Notepad. */
        public void cmd(){
            // Open the calculator. "/c" lets the helper shell exit once "start" has launched
            // the program; "/k" would keep the shell running indefinitely.
            String[] arstringCommand = new String[] {
                    "cmd",
                    "/c",
                    "start", // cmd shell command
                    "calc"
            };
            execCommand(arstringCommand);
            // Open Notepad.
            String cmd = "cmd /c start notepad";
            execCommand(cmd);
        }

        public static void main(String[] args){
            new RunPowerShell().cmd();
        }

    }
    
    

    运行rake.py

    // Example invocation of rake.py through the blocking String overload. The three
    // operands are placeholders; judging by their names they stand for the input text
    // path, the word-list (stop-word) file path and the output path.
    new RunPowerShell().execCommand(" python rake.py "+InputPath+" "+分词文件路径+" -o "+输出路径);
    

    数据处理

    循环运行完rake.py后会生成众多txt文件,这一步做的便是读取这些文件然后处理写入数据库

    pojo类

    paper

    package com.keyword.alice.pojo;
    
    import lombok.AllArgsConstructor;
    import lombok.Data;
    import lombok.NoArgsConstructor;
    
    /**
     * Plain data holder for one paper record.
     *
     * <p>Accessors and both constructors are generated by the Lombok annotations below;
     * the pipeline code reads instances through getPapertitle(), getPdf(), getYear()
     * and getType().
     */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    
    public class Paper {
        // Title of the paper; the pipeline also uses it to build file names.
        private String papertitle;
        // Author list — presumably one delimited string; verify against the data source.
        private String authors;
        // Abstract text. Capitalised because "abstract" is a reserved word in Java.
        private String Abstract;
        // Link to the PDF; the pipeline passes it to the downloader as a URL.
        private String pdf;
        // Publication year, kept as text.
        private String year;
        // Conference the paper belongs to (the search page offers WACV, CVPR and ICCV).
        private String type;
    }
    
    

    上个整体一点的代码得了

    public void JustRun(Paper paper) throws Exception {
            String name=paper.getPapertitle();
            String url=paper.getPdf();
            String year=paper.getYear();
            String type=paper.getType();
            List<KeyWords> resList;
            List<String>wordList=new ArrayList<>();
            List<String>urlList=new ArrayList<>();
        //爬取的论文中有空格或者斜杠会影响输出
            try {
                name=name.replace("/","_");
                name=name.replace(" ","_");
                url=url.replace("/../../","/");
                System.out.println(url);
            }catch (Exception e){
    
            }
            GetDataUtil.saveAsPdf(GetDataUtil.getInputStream(url),path+name+".pdf");
            PdfToTxt.readPdf(path+name+".pdf");
            name=name+".txt";
            new RunPowerShell().execCommand(" python rake.py F:\CVF\PDF\"+name+" stopwords.txt -o F:\CVF\TXT\KeyWords_"+name);
    
            try {
                FileReader fr = new FileReader("F:\CVF\TXT\KeyWords_"+name);
                BufferedReader bf = new BufferedReader(fr);
                String str;
                // 按行读取字符串
                while ((str = bf.readLine()) != null) {
                    wordList.add(str);
                    urlList.add(url);
                }
                bf.close();
                fr.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
    
            resList= ToKeyWordsObj.ToObjKeyWords(wordList,urlList,year,type);
            resList.removeIf(keyWords -> keyWords.getValue() < 8);
            for (KeyWords keyWords : resList) {
                keyMapper.addkeyWord(keyWords);
            }
        }
    

    动态sql语句

    其实到这里就已经基本做完了项目了,剩下的就是CRUD了,唯一有难度的便是多条件查询

    papers是数据表名

    <script> select * from papers where 1=1 
                <if test='传入的参数!=null'>and 数据库字段名 like concat('%',#{传入的参数},'%')</if>
    

    这里其实是用的mybatis的注解

    图表联动

    这一段是图表联动,可以直接使用,但是注意变量名称还有元素的绑定

        // Clicking a row of the keyword table highlights the matching word in the cloud.
        // `temp` (declared elsewhere on the page) remembers the word that is currently lit.
        table.on('row(keyW)', function(obj){
            var clicked = obj.data.name;
            var previous = temp;

            // Clear whatever was highlighted by the previous click.
            if (previous !== undefined) {
                mycharts.dispatchAction({
                    type: 'downplay',
                    name: previous
                });
            }

            // A second click on the same row just switches the highlight off.
            if (clicked === previous) {
                temp = undefined;
                return;
            }

            temp = clicked;
            mycharts.dispatchAction({
                type: 'highlight',
                name: clicked
            });

            // Auto-clear after one second. The name is captured locally so a later click
            // on a different row cannot redirect this timer to the wrong word.
            setTimeout(function(){
                mycharts.dispatchAction({
                    type: 'downplay',
                    name: clicked
                });
                if (temp === clicked) {
                    temp = undefined;
                }
            },1000);
        });
    
    
    
        // Chart "mouseover" handler: paints yellow the table row whose position matches
        // the index of the hovered data item.
        function eConsole(param) {
            var hovered = param.dataIndex;
            var rows = $("tbody tr");
            rows.filter(function () {
                // index() is the row's position among its sibling rows.
                return $(this).index() == hovered;
            }).each(function () {
                rows.eq($(this).index()).css("background-color","yellow");
            });
        }
        // Chart "mouseout" handler: removes the background colour from the table row
        // whose position matches the index of the data item the pointer just left.
        function zConsole(param) {
            var left = param.dataIndex;
            var rows = $("tbody tr");
            rows.filter(function () {
                // index() is the row's position among its sibling rows.
                return $(this).index() == left;
            }).each(function () {
                rows.eq($(this).index()).css("background-color","");
            });
        }
    
        // Hovering a word marks the matching table row; leaving it clears the mark.
        mycharts.on("mouseover", eConsole);
        mycharts.on("mouseout", zConsole);
        mycharts.hideLoading();
        // Clicking a word reloads the 'test' table with the papers that contain it.
        mycharts.on('click',function(params){
            table.reload('test',{
                url: 'getUrlList'
                ,where: {
                    word:params.name
                }
                ,cols: [
                            [
                                // Each column width needs its "width" key; a bare number
                                // inside an object literal is a syntax error.
                                {field:'type', width: 80, title: '会议'}
                                ,{field:'year', width: 80, title: '年份'}
                                ,{field:'url', title: '论文链接', sort: true}
                            ]
                        ]
            });
        });
    

    完整页面代码

    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>检索</title>
        <meta name="renderer" content="webkit">
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
        <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
        <link rel="stylesheet" type="text/css" href="https://www.layuicdn.com/layui/css/layui.css" />
        <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.js"></script>
        <script src="../echarts.js"></script>
        <script src="../echarts-wordcloud.js"></script>
        <!-- 注意:如果你直接复制所有代码到本地,上述css路径需要改成你本地的 -->
        <style>
            .layui-input-block {
                margin-left: 80px;
                min-height: 36px;
            }
        </style>
    </head>
    <body>
    
    <div class="layui-row">
        <div class="layui-col-md9">
            <div class="grid-demo grid-demo-bg1"> <div id="main" style="width: 1200px;height: 800px; "></div></div>
        </div>
        <div class="layui-col-md3">
            <div class="grid-demo"><table class="layui-hide" id="keyW" lay-filter="keyW"></table></div>
        </div>
    </div>
    <table class="layui-hide" id="test" lay-filter="test"></table>
    
    <script type="text/html" id="toolbarDemo">
            <div class="layui-form-item">
                <label class="layui-form-label">关键词</label>
                <div class="layui-input-block">
                    <input type="text"  id="keyWord" name="keyWord" autocomplete="off" placeholder="请输入关键词,最多五个关键词,通过,和.分割(‘,’为或 '.'为和)" class="layui-input">
                </div>
            </div>
            <div class="layui-form-item">
                <label class="layui-form-label">检索</label>
                <div class="layui-input-inline">
                    <select name="year" id="year">
                        <option value="">请选择年份</option>
                        <option value="">所有</option>
                        <option value="2021" selected="">2021</option>
                        <option value="2020">2020</option>
                        <option value="2019">2019</option>
                        <option value="2018">2018</option>
                        <option value="2017">2017</option>
                        <option value="2016">2016</option>
                        <option value="2015">2015</option>
                        <option value="2014">2014</option>
                        <option value="2013">2013</option>
                    </select>
                </div>
                <div class="layui-input-inline">
                    <select name="type" id="type">
                        <option value="">请选会议类型</option>
                        <option value="">所有</option>
                        <option value="WACV">WACV</option>
                        <option value="CVPR">CVPR</option>
                        <option value="ICCV">ICCV</option>
                    </select>
                </div>
                <div class="layui-input-inline">
                    <div class="layui-input-inline">
                        <input type="text" name="author" id="author" placeholder="请输入作者" autocomplete="off" class="layui-input">
                    </div>
                </div>
                <div class="layui-input-inline">
                    <button class="layui-btn layui-btn-sm" onclick="change()" >搜索</button>
                    <button class="layui-btn layui-btn-sm" lay-event="getCheckData">获取选中行数据</button>
    <!--                layui-btn-sm-->
                </div>
            </div>
    </script>
    
    <script type="text/html" id="barDemo">
        <a class="layui-btn layui-btn-xs" lay-event="edit">查看本行</a>
        <a class="layui-btn layui-btn-danger layui-btn-xs" lay-event="del">删除</a>
    </script>
    
    
    <script src="https://www.layuicdn.com/layui/layui.js"></script>
    <!-- 注意:如果你直接复制所有代码到本地,上述 JS 路径需要改成你本地的 -->
    
    <script>
        var mycharts = echarts.init(document.getElementById("main"));
        // Word-cloud data returned by the "getList" endpoint.
        var jsonlist = [];
        var option;
        // Source of the mask image, loaded from ../image.json.
        var image1 = "";
        var maskResource = new Image();

        // Draws the chart; when the mask could not be used, the cloud is drawn unmasked
        // rather than leaving the page empty.
        function renderCloud(useMask) {
            if (!useMask) {
                delete option.series[0].maskImage;
            }
            mycharts.setOption(option);
        }

        // Both requests run asynchronously (jQuery's default); the second one is started
        // from the first one's success callback, so `option` is ready before it is used.
        $.ajax({
            url:"getList",
            dataType:"json",
            type:"GET",
            success:function(data){
                jsonlist=data;
                option ={
                    // Title, centred
                    title:{
                        text: 'CVF',
                        left:'center',
                    },
                    // Show a tooltip for each word
                    tooltip:{
                        show:true
                    },

                    series:[
                        {
                            maskImage:maskResource,
                            // Series type
                            type: 'wordCloud',
                            // Font size range
                            sizeRange:[5,80],

                            // The data to draw
                            data:jsonlist,

                            rotationRange:[-45,90],
                            textStyle: {
                                normal:{
                                    // Random colour per word (each channel is 0-10, i.e. near black)
                                    color:function () {
                                        return 'rgb(' + [
                                            Math.round(Math.random() * 10),
                                            Math.round(Math.random() * 10),
                                            Math.round(Math.random() * 10)
                                        ].join(',')+')';

                                    }
                                },

                            },
                            emphasis: {  // Appearance of a highlighted word
                                shadowBlur: 100000,
                                shadowColor: '#333',
                                backgroundColor:'#D92932',
                                show:true,
                                label: {
                                    show: true,
                                    fontSize: '40',
                                    fontWeight: 'bold'
                                },
                                textStyle: {
                                    fontSize:50
                                }

                            },
                            hoverAnimation:true,
                        }
                    ],
                };
                $.ajax({
                    url:"../image.json",
                    dataType:"json",
                    type:"GET",
                    success:function(data){
                        image1=data;
                        // Attach the handlers before assigning src so no load event can be missed.
                        maskResource.onload = function(){
                            renderCloud(true);
                        };
                        maskResource.onerror = function(){
                            renderCloud(false);
                        };
                        maskResource.src=image1;
                    },
                    error:function(){
                        renderCloud(false);
                    },
                })
            },
        })
    
        // Shared handle to layui's table module; assigned again once the module has loaded.
        var table = layui.table;
        layui.use('table', function(){
            table = layui.table;

            // Main result table, filled from the "getPaper" endpoint.
            table.render({
                elem: '#test'
                ,url:'getPaper'
                ,toolbar: '#toolbarDemo' // Enable the header toolbar and bind the template on its left
                ,defaultToolbar: ['filter', 'exports', 'print', { // Custom icon on the right of the toolbar; drop this entry if not needed
                    title: '提示'
                    ,layEvent: 'LAYTABLE_TIPS'
                    ,icon: 'layui-icon-tips'
                }]
                ,title: '用户数据表'
                ,cols: [
                    [
                        // Column widths need their "width" key; a bare number inside an
                        // object literal is a syntax error.
                        {type: 'checkbox', fixed: 'left'}
                    ,{field:'papertitle', title:'论文题目', width: 500, edit: 'text'}
                    ,{field:'authors', title:'作者', width: 500, edit: 'text', sort: true}
                    ,{field:'abstract', title:'摘要', edit: 'text', width: 100}
                    ,{field:'pdf', title:'论文链接', edit: 'text'}
                    ,{field:'year', title:'年份', edit: 'text', width: 80, sort: true}
                    ,{field:'type', title:'会议类型', edit: 'text', width: 120}
                    ,{fixed: 'right', title:'操作', toolbar: '#barDemo', width: 150}
                ]
                ]
                ,page: true
            });

            // Header toolbar events
            table.on('toolbar(test)', function(obj){
                var checkStatus = table.checkStatus(obj.config.id);
                switch(obj.event){
                    case 'getCheckData':
                        var data = checkStatus.data;
                        layer.alert(JSON.stringify(data));
                        break;
                    case 'getCheckLength':
                        var data = checkStatus.data;
                        layer.msg('选中了:'+ data.length + ' 个');
                        break;
                    case 'isAll':
                        layer.msg(checkStatus.isAll ? '全选': '未全选');
                        break;

                    // Custom icon on the right of the toolbar
                    case 'LAYTABLE_TIPS':
                        layer.alert('这是工具栏右侧自定义的一个图标按钮');
                        break;
                };
            });

            // Row tool events
            table.on('tool(test)', function(obj){
                var data = obj.data;
                if(obj.event === 'del'){
                    // No request is sent from this handler; obj.del() is the only action taken.
                    layer.confirm('真的删除行么', function(index){
                        obj.del();
                        layer.close(index);
                    });
                } else if(obj.event === 'edit'){
                    // "View row" opens the paper's PDF link in a new tab.
                    window.open(data.pdf,'_blank');
                }
            });
        });
        // Keyword table (word and frequency), filled from the "getList20" endpoint.
        layui.use('table', function() {
            var table3 = layui.table;

            table3.render({
                elem: '#keyW'
                , url: 'getList20'
                , title: '数据表'
                , cols: [
                    [
                        // No leading comma: it would leave an empty slot as the first column.
                        {field: 'name', title: '词', sort: true}
                        , {field: 'value', title: '词频', width: 80, sort: true}
                    ]
                ]
                , page: true
            });
        })
        // Name of the word currently highlighted through a table click (undefined when none).
        // The former `array` history was only ever compared at fixed positions and grew
        // without bound, so it has been dropped.
        var temp;
        // Clicking a row of the keyword table highlights the matching word in the cloud.
        table.on('row(keyW)', function(obj){
            var clicked = obj.data.name;
            var previous = temp;

            // Clear whatever was highlighted by the previous click.
            if (previous !== undefined) {
                mycharts.dispatchAction({
                    type: 'downplay',
                    name: previous
                });
            }

            // A second click on the same row just switches the highlight off.
            if (clicked === previous) {
                temp = undefined;
                return;
            }

            temp = clicked;
            mycharts.dispatchAction({
                type: 'highlight',
                name: clicked
            });

            // Auto-clear after one second. The name is captured locally so a later click
            // on a different row cannot redirect this timer to the wrong word.
            setTimeout(function(){
                mycharts.dispatchAction({
                    type: 'downplay',
                    name: clicked
                });
                if (temp === clicked) {
                    temp = undefined;
                }
            },1000);
        });
    
    
    
        // Chart "mouseover" handler: paints yellow the table row whose position matches
        // the index of the hovered data item.
        function eConsole(param) {
            var hovered = param.dataIndex;
            var rows = $("tbody tr");
            rows.filter(function () {
                // index() is the row's position among its sibling rows.
                return $(this).index() == hovered;
            }).each(function () {
                rows.eq($(this).index()).css("background-color","yellow");
            });
        }
        // Chart "mouseout" handler: removes the background colour from the table row
        // whose position matches the index of the data item the pointer just left.
        function zConsole(param) {
            var left = param.dataIndex;
            var rows = $("tbody tr");
            rows.filter(function () {
                // index() is the row's position among its sibling rows.
                return $(this).index() == left;
            }).each(function () {
                rows.eq($(this).index()).css("background-color","");
            });
        }
    
        // Hovering a word marks the matching table row; leaving it clears the mark.
        mycharts.on("mouseover", eConsole);
        mycharts.on("mouseout", zConsole);
        mycharts.hideLoading();
        // Clicking a word reloads the 'test' table with the papers that contain it.
        mycharts.on('click',function(params){
            table.reload('test',{
                url: 'getUrlList'
                ,where: {
                    word:params.name
                }
                ,cols: [
                            [
                                // Each column width needs its "width" key; a bare number
                                // inside an object literal is a syntax error.
                                {field:'type', width: 80, title: '会议'}
                                ,{field:'year', width: 80, title: '年份'}
                                ,{field:'url', title: '论文链接', sort: true}
                            ]
                        ]
            });
        });
        // Search button handler: reloads the 'test' table from "getPaper" using the
        // keyword, year, conference type and author currently entered in the toolbar.
        function change(){
            table.reload('test',{
                url: 'getPaper'
                ,where: {
                    keyWord:document.getElementById('keyWord').value
                    ,year:document.getElementById('year').value
                    ,type:document.getElementById('type').value
                    ,author:document.getElementById('author').value
                }
                ,cols: [
                    [
                        // Column widths need their "width" key; a bare number inside an
                        // object literal is a syntax error.
                        {type: 'checkbox', fixed: 'left'}
                        ,{field:'papertitle', title:'论文题目', width: 500, edit: 'text'}
                        ,{field:'authors', title:'作者', width: 500, edit: 'text', sort: true}
                        ,{field:'abstract', title:'摘要', edit: 'text', width: 100}
                        ,{field:'pdf', title:'论文链接', edit: 'text'}
                        ,{field:'year', title:'年份', edit: 'text', width: 80, sort: true}
                        ,{field:'type', title:'会议类型', edit: 'text', width: 120}
                        ,{fixed: 'right', title:'操作', toolbar: '#barDemo', width: 150}
                    ]
                ]
            });
        }
    </script>
    
    </body>
    </html>
    
  • 相关阅读:
    使用图形化技术完成电子相册程序的开发
    本周新学的 GUI绘图技术
    不如今日来说 有关StringBufferr类和字符串格式化的内容~
    PHP的身份证号码工具类
    在网页中嵌入带标识的百度地图
    PHP正则表达式使用详解
    Spicy Chicken GDI in C#
    [C++] Running time and Integer to String
    the difference between const int *, int * const, int const *
    [C# WPF]MoeEroViewer Developing Log
  • 原文地址:https://www.cnblogs.com/L-L-ALICE/p/14910230.html
Copyright © 2011-2022 走看看