论文链接爬取
首先是获取网页的html代码,然后通过正则表达式获取论文的下载链接
/**
 * Downloads the resource at {@code requesturl} over HTTPS and returns its body as text.
 *
 * <p>The body is decoded as UTF-8 and the lines are concatenated without line separators.
 * The connection is cast to {@link HttpsURLConnection}, so only {@code https://} URLs are
 * supported.
 *
 * @param requesturl absolute HTTPS URL to fetch
 * @return the text read so far; an empty string when the URL is malformed or the request
 *         fails before anything was read (the stack trace is printed, nothing is thrown)
 */
public static String getHTMLText(String requesturl){
    StringBuilder html = new StringBuilder();
    HttpsURLConnection connection = null;
    try {
        URL url = new URL(requesturl);
        connection = (HttpsURLConnection) url.openConnection();
        // Same limits as getInputStream, so a stalled server cannot block the crawler forever.
        connection.setConnectTimeout(10000);
        connection.setReadTimeout(10000);
        connection.setDoInput(true);
        connection.setRequestMethod("GET");
        // try-with-resources closes the reader and the underlying stream on every path.
        try (InputStream inputStream = connection.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "utf-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
        }
    } catch (IOException e) {
        // Best effort: report the failure and fall through with whatever was collected.
        e.printStackTrace();
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
    return html.toString();
}
//返回的,一个是论文名,一个是论文链接
public static Map<String,List<String>> getPapers(){
Map<String,List<String>> urlAndName=new HashMap();
String htmlStr=GetDataUtil.getHTMLText("https://openaccess.thecvf.com/WACV2021");
List<String> urls= new ArrayList<>();
List<String> nameS= new ArrayList<>();
String url = "";
Pattern p_url;
Matcher m_url;
String regEx_url = "href="([\w\s./:]+?)"";
p_url = Pattern.compile
(regEx_url, Pattern.CASE_INSENSITIVE);
m_url = p_url.matcher(htmlStr);
while (m_url.find()) {
url = m_url.group();
if (url.contains(".pdf")){
url=url.substring(url.indexOf(""")+1);
url=url.substring(0,url.indexOf("""));
urls.add("https://openaccess.thecvf.com/"+url);
if (url.contains("papers")){
url=url.substring(url.indexOf("papers")+7);
}else if (url.contains("supplemental")){
url=url.substring(url.indexOf("supplemental")+13);
}
nameS.add(url);
}
}
urlAndName.put("name",nameS);
urlAndName.put("url",urls);
return urlAndName;
}
论文爬取
这里主要进行的是获取pdf论文
转成txt文本为下一步做准备
获取pdf
/**
 * Opens an HTTPS GET request and hands back the raw response stream.
 *
 * <p>This overlaps with what getHTMLText above already does; it was kept as a separate
 * helper. Connect and read timeouts are 10 seconds each.
 *
 * @param requesturl absolute HTTPS URL to fetch
 * @return the response body stream, or {@code null} when an {@link IOException} occurred
 *         (the stack trace is printed)
 */
public static InputStream getInputStream(String requesturl){
    try {
        HttpsURLConnection connection = (HttpsURLConnection) new URL(requesturl).openConnection();
        connection.setConnectTimeout(10000);
        connection.setReadTimeout(10000);
        connection.setDoInput(true);
        connection.setRequestMethod("GET");
        return connection.getInputStream();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Copies {@code inputStream} into a new file at {@code path}.
 *
 * <p>Nothing is written when the file already exists or when the stream is {@code null}
 * (which is what getInputStream returns after a failed request). The stream is always
 * consumed: it is closed on every path. If the copy fails half-way the partial file is
 * removed, so that a later call is not fooled by the "already exists" check.
 *
 * @param inputStream source bytes, may be {@code null}
 * @param path        destination file path
 */
public static void saveAsPdf(InputStream inputStream,String path){
    File file = new File(path);
    if (file.exists()) {
        System.out.println("文件已存在");
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return;
    }
    if (inputStream == null) {
        System.err.println("saveAsPdf: no input stream, nothing written to " + path);
        return;
    }
    try (InputStream in = inputStream; FileOutputStream out = new FileOutputStream(file)) {
        // Chunked copy instead of one read()/write() call per byte.
        byte[] chunk = new byte[8192];
        int read;
        while ((read = in.read(chunk)) != -1) {
            out.write(chunk, 0, read);
        }
    } catch (IOException e) {
        e.printStackTrace();
        // Both streams are closed by now, so the truncated file can be deleted.
        if (file.exists() && !file.delete()) {
            System.err.println("saveAsPdf: could not remove incomplete file " + path);
        }
    }
}
转为txt
/**
* 传入一个.pdf文件
* @param file
* @throws Exception
*/
public static void readPdf(String file) throws Exception {
// 是否排序
boolean sort = false;
// pdf文件名
String pdfFile = file;
// 输入文本文件名称
String textFile = null;
// 编码方式
String encoding = "UTF-8";
// 开始提取页数
int startPage = 1;
// 结束提取页数
int endPage = Integer.MAX_VALUE;
// 文件输入流,生成文本文件
Writer output = null;
// 内存中存储的PDF Document
PDDocument document = null;
try {
try {
// 首先当作一个URL来装载文件,如果得到异常再从本地文件系统//去装载文件
URL url = new URL(pdfFile);
//注意参数已不是以前版本中的URL.而是File。
document = PDDocument.load(new File(pdfFile));
// 获取PDF的文件名
String fileName = url.getFile();
// 以原来PDF的名称来命名新产生的txt文件
if (fileName.length() > 4) {
File outputFile = new File(fileName.substring(0, fileName.length() - 4)+ ".txt");
textFile ="F:\CVF\PDF\"+outputFile.getName();
}
} catch (MalformedURLException e) {
// 如果作为URL装载得到异常则从文件系统装载
//注意参数已不是以前版本中的URL.而是File。
document = PDDocument.load(new File(pdfFile));
if (pdfFile.length() > 4) {
textFile = pdfFile.substring(0, pdfFile.length() - 4)+ ".txt";
}
}
// 文件输入流,写入文件倒textFile
output = new OutputStreamWriter(new FileOutputStream(textFile),encoding);
// PDFTextStripper来提取文本
PDFTextStripper stripper = null;
stripper = new PDFTextStripper();
// 设置是否排序
stripper.setSortByPosition(sort);
// 设置起始页
stripper.setStartPage(startPage);
// 设置结束页
stripper.setEndPage(endPage);
// 调用PDFTextStripper的writeText提取并输出文本
stripper.writeText(document, output);
System.out.println(textFile + " 输出成功!");
if (output != null) {
// 关闭输出流
output.close();
}
if (document != null) {
// 关闭PDF Document
document.close();
}
} finally {
if (output != null) {
// 关闭输出流
output.close();
}
if (document != null) {
// 关闭PDF Document
document.close();
}
}
}
论文内关键词提取
这里用到了GitHub上一个开源项目
项目链接:RAKE-MASTER
Java运行powershell类
/**
 * Minimal helper for launching external commands (cmd, python, ...) from Java.
 *
 * <p>Neither method throws: start-up failures are reported on standard output.
 */
public class RunPowerShell {

    /**
     * Echoes the command to standard output and starts it without waiting for it to end.
     *
     * @param arstringCommand executable followed by its arguments
     */
    public void execCommand(String[] arstringCommand) {
        for (int i = 0; i < arstringCommand.length; i++) {
            System.out.print(arstringCommand[i] + " ");
        }
        try {
            Runtime.getRuntime().exec(arstringCommand);
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Runs a whitespace-separated command line and blocks until the process has exited.
     *
     * <p>The child's stdout and stderr are forwarded to this JVM's standard output, so a
     * chatty child cannot stall on a full pipe while this method is waiting for it.
     *
     * @param arstringCommand command line; tokens are split on whitespace, no quoting
     */
    public void execCommand(String arstringCommand) {
        try {
            String[] tokens = arstringCommand.trim().split("\\s+");
            Process process = new ProcessBuilder(tokens)
                    .redirectErrorStream(true)
                    .redirectOutput(ProcessBuilder.Redirect.INHERIT)
                    .start();
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                System.out.println("Command exited with code " + exitCode + ": " + arstringCommand);
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of silently clearing it.
            Thread.currentThread().interrupt();
            System.out.println(e.getMessage());
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    /** Demo: opens the Windows calculator and then Notepad. */
    public void cmd(){
        // Open the calculator. "/c" lets cmd.exe exit after "start" returns;
        // with "/k" the shell would stay alive with nobody to close it.
        String[] arstringCommand = new String[] {
                "cmd",
                "/c",
                "start", // cmd shell command
                "calc"
        };
        execCommand(arstringCommand);
        // Open Notepad. The String overload waits for cmd.exe, so "/c" is required here
        // or the call would never return.
        String cmd = "cmd /c start notepad";
        execCommand(cmd);
    }

    public static void main(String[] args){
        new RunPowerShell().cmd();
    }
}
运行rake.py
// Usage template: runs rake.py on one text file; execCommand(String) waits for the process to exit.
// InputPath, 分词文件路径 and 输出路径 are placeholders for the input text file, the word-list file
// handed to rake.py as its second argument, and the file given to -o; replace them with real paths.
new RunPowerShell().execCommand(" python rake.py "+InputPath+" "+分词文件路径+" -o "+输出路径);
数据处理
循环运行完rake.py后会生成众多txt文件,这一步做的便是读取这些文件然后处理写入数据库
pojo类
paper
package com.keyword.alice.pojo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Plain data holder for one paper record.
 *
 * <p>Accessors and constructors are not written out; they come from the Lombok
 * annotations on the class. The all-args constructor follows the field order below,
 * so reordering fields changes its signature.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Paper {
/** Paper title; JustRun derives the local file name from it. */
private String papertitle;
/** Authors of the paper (presumably one display string; format not visible here). */
private String authors;
/** Abstract text. Capitalised because {@code abstract} is a reserved word in Java. */
private String Abstract;
/** Link to the paper's PDF; JustRun downloads from it. */
private String pdf;
/** Year, kept as text; JustRun passes it on to the keyword records. */
private String year;
/** Type of the paper, presumably the conference name; passed on to the keyword records. */
private String type;
}
上个整体一点的代码得了
public void JustRun(Paper paper) throws Exception {
String name=paper.getPapertitle();
String url=paper.getPdf();
String year=paper.getYear();
String type=paper.getType();
List<KeyWords> resList;
List<String>wordList=new ArrayList<>();
List<String>urlList=new ArrayList<>();
//爬取的论文中有空格或者斜杠会影响输出
try {
name=name.replace("/","_");
name=name.replace(" ","_");
url=url.replace("/../../","/");
System.out.println(url);
}catch (Exception e){
}
GetDataUtil.saveAsPdf(GetDataUtil.getInputStream(url),path+name+".pdf");
PdfToTxt.readPdf(path+name+".pdf");
name=name+".txt";
new RunPowerShell().execCommand(" python rake.py F:\CVF\PDF\"+name+" stopwords.txt -o F:\CVF\TXT\KeyWords_"+name);
try {
FileReader fr = new FileReader("F:\CVF\TXT\KeyWords_"+name);
BufferedReader bf = new BufferedReader(fr);
String str;
// 按行读取字符串
while ((str = bf.readLine()) != null) {
wordList.add(str);
urlList.add(url);
}
bf.close();
fr.close();
} catch (IOException e) {
e.printStackTrace();
}
resList= ToKeyWordsObj.ToObjKeyWords(wordList,urlList,year,type);
resList.removeIf(keyWords -> keyWords.getValue() < 8);
for (KeyWords keyWords : resList) {
keyMapper.addkeyWord(keyWords);
}
}
动态sql语句
其实到这里项目就已经基本做完了,剩下的就是CRUD了,唯一有难度的便是多条件查询
papers是数据表名
<script> select * from papers where 1=1
<if test='传入的参数!=null'>and 数据库字段名 like concat('%',#{传入的参数},'%')</if>
这里其实是用的mybatis的注解
图表联动
这一段是图表联动,可以直接使用,但是注意变量名称还有元素的绑定
// Chart/table linkage: clicking a row of the keyword table flashes the matching word in the
// word cloud for one second; clicking the word that is still highlighted switches it off.
// Expects `table`, `mycharts`, `temp` and `array` to be defined by the surrounding page.
// `temp` holds the word currently highlighted through the table (undefined when none).
table.on('row(keyW)', function(obj){
    var current = obj.data.name;
    var previous = temp;
    array.push(current); // click history
    if (previous !== undefined) {
        mycharts.dispatchAction({
            type: 'downplay',
            name: previous
        });
    }
    if (current === previous) {
        // Same word clicked again while highlighted: leave it switched off.
        temp = undefined;
        return;
    }
    temp = current;
    mycharts.dispatchAction({
        type: 'highlight',
        name: current
    });
    // The timer captures `current`, so a later click cannot redirect it to another word.
    setTimeout(function(){
        mycharts.dispatchAction({
            type: 'downplay',
            name: current
        });
        if (temp === current) {
            temp = undefined;
        }
    }, 1000);
});
// Chart mouse-over handler: if some table row sits at the position given by the hovered
// item's dataIndex, paint the row at that offset of the page-wide "tbody tr" set yellow.
function eConsole(param) {
    var wanted = param.dataIndex; // index of the hovered chart item
    var rows = $("tbody tr");
    rows.each(function(){
        var position = $(this).index(); // position of this row inside its parent
        if (position != wanted) {
            return;
        }
        rows.eq(position).css("background-color","yellow");
    });
};
// Chart mouse-out handler: clears the background colour set by the mouse-over handler,
// using the same row lookup (row position compared with the item's dataIndex).
function zConsole(param) {
    var wanted = param.dataIndex; // index of the chart item the pointer left
    var rows = $("tbody tr");
    rows.each(function(){
        var position = $(this).index(); // position of this row inside its parent
        if (position != wanted) {
            return;
        }
        rows.eq(position).css("background-color","");
    });
};
mycharts.on("mouseover", eConsole); // pointer enters a word
mycharts.on("mouseout", zConsole);  // pointer leaves a word
mycharts.hideLoading();
// Clicking a word reloads the 'test' table with the papers that contain that keyword.
mycharts.on('click', function(params){
    table.reload('test', {
        url: 'getUrlList',
        where: {
            word: params.name
        },
        // Start at the first page; the previous page number may not exist in the new result.
        page: {
            curr: 1
        },
        cols: [
            [
                {field:'type', width: 80, title: '会议'},
                {field:'year', width: 80, title: '年份'},
                {field:'url', title: '论文链接', sort: true}
            ]
        ]
    });
});
完整页面代码
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>检索</title>
<meta name="renderer" content="webkit">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<link rel="stylesheet" type="text/css" href="https://www.layuicdn.com/layui/css/layui.css" />
<script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.js"></script>
<script src="../echarts.js"></script>
<script src="../echarts-wordcloud.js"></script>
<!-- 注意:如果你直接复制所有代码到本地,上述css路径需要改成你本地的 -->
<style>
.layui-input-block {
margin-left: 80px;
min-height: 36px;
}
</style>
</head>
<body>
<div class="layui-row">
<div class="layui-col-md9">
<div class="grid-demo grid-demo-bg1"> <div id="main" style="width: 1200px;height: 800px; "></div></div>
</div>
<div class="layui-col-md3">
<div class="grid-demo"><table class="layui-hide" id="keyW" lay-filter="keyW"></table></div>
</div>
</div>
<table class="layui-hide" id="test" lay-filter="test"></table>
<script type="text/html" id="toolbarDemo">
<div class="layui-form-item">
<label class="layui-form-label">关键词</label>
<div class="layui-input-block">
<input type="text" id="keyWord" name="keyWord" autocomplete="off" placeholder="请输入关键词,最多五个关键词,通过,和.分割(‘,’为或 '.'为和)" class="layui-input">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">检索</label>
<div class="layui-input-inline">
<select name="year" id="year">
<option value="">请选择年份</option>
<option value="">所有</option>
<option value="2021" selected="">2021</option>
<option value="2020">2020</option>
<option value="2019">2019</option>
<option value="2018">2018</option>
<option value="2017">2017</option>
<option value="2016">2016</option>
<option value="2015">2015</option>
<option value="2014">2014</option>
<option value="2013">2013</option>
</select>
</div>
<div class="layui-input-inline">
<select name="type" id="type">
<option value="">请选会议类型</option>
<option value="">所有</option>
<option value="WACV">WACV</option>
<option value="CVPR">CVPR</option>
<option value="ICCV">ICCV</option>
</select>
</div>
<div class="layui-input-inline">
<div class="layui-input-inline">
<input type="text" name="author" id="author" placeholder="请输入作者" autocomplete="off" class="layui-input">
</div>
</div>
<div class="layui-input-inline">
<button class="layui-btn layui-btn-sm" onclick="change()" >搜索</button>
<button class="layui-btn layui-btn-sm" lay-event="getCheckData">获取选中行数据</button>
<!-- layui-btn-sm-->
</div>
</div>
</script>
<script type="text/html" id="barDemo">
<a class="layui-btn layui-btn-xs" lay-event="edit">查看本行</a>
<a class="layui-btn layui-btn-danger layui-btn-xs" lay-event="del">删除</a>
</script>
<script src="https://www.layuicdn.com/layui/layui.js"></script>
<!-- 注意:如果你直接复制所有代码到本地,上述 JS 路径需要改成你本地的 -->
<script>
var mycharts = echarts.init(document.getElementById("main"));
// Word-cloud data as returned by getList
var jsonlist = [];
var option;
// Source of the mask image, loaded from image.json
var image1 = "";
var maskResource = new Image();

// Both requests are asynchronous. The former async:"false" was a non-empty string and
// therefore truthy, so it never made them synchronous; the chart is drawn from the callbacks.
$.ajax({
    url: "getList",
    dataType: "json",
    type: "GET",
    success: function(data){
        jsonlist = data;
        option = {
            // Title, centred
            title: {
                text: 'CVF',
                left: 'center',
            },
            // Show a tooltip for each word
            tooltip: {
                show: true
            },
            series: [
                {
                    maskImage: maskResource,
                    // Series type provided by echarts-wordcloud
                    type: 'wordCloud',
                    // Font size range of the words
                    sizeRange: [5, 80],
                    data: jsonlist,
                    rotationRange: [-45, 90],
                    textStyle: {
                        normal: {
                            // Random colour per word; each channel is drawn from 0..10
                            color: function () {
                                return 'rgb(' + [
                                    Math.round(Math.random() * 10),
                                    Math.round(Math.random() * 10),
                                    Math.round(Math.random() * 10)
                                ].join(',') + ')';
                            }
                        },
                    },
                    emphasis: { // appearance of a highlighted word
                        shadowBlur: 100000,
                        shadowColor: '#333',
                        backgroundColor: '#D92932',
                        show: true,
                        label: {
                            show: true,
                            fontSize: '40',
                            fontWeight: 'bold'
                        },
                        textStyle: {
                            fontSize: 50
                        }
                    },
                    hoverAnimation: true,
                }
            ],
        };

        // Draws the cloud without a mask when the mask image cannot be obtained.
        var renderWithoutMask = function(){
            delete option.series[0].maskImage;
            mycharts.setOption(option);
        };

        $.ajax({
            url: "../image.json",
            dataType: "json",
            type: "GET",
            success: function(data){
                image1 = data;
                // Register the handlers before setting src so no load/error event is missed.
                maskResource.onload = function(){
                    mycharts.setOption(option)
                };
                maskResource.onerror = renderWithoutMask;
                maskResource.src = image1;
            },
            error: renderWithoutMask,
        })
    },
})
var table = layui.table;
// Main paper table: rendering, header toolbar events and per-row tool events.
layui.use('table', function(){
    table = layui.table;
    table.render({
        elem: '#test'
        ,url: 'getPaper'
        ,toolbar: '#toolbarDemo' // header toolbar, bound to the template on the left
        ,defaultToolbar: ['filter', 'exports', 'print', { // custom icon on the right of the toolbar
            title: '提示'
            ,layEvent: 'LAYTABLE_TIPS'
            ,icon: 'layui-icon-tips'
        }]
        ,title: '用户数据表'
        ,cols: [
            [
                {type: 'checkbox', fixed: 'left'}
                ,{field:'papertitle', title:'论文题目', width: 500, edit: 'text'}
                ,{field:'authors', title:'作者', width: 500, edit: 'text', sort: true}
                ,{field:'abstract', title:'摘要', edit: 'text', width: 100}
                ,{field:'pdf', title:'论文链接', edit: 'text'}
                ,{field:'year', title:'年份', edit: 'text', width: 80, sort: true}
                ,{field:'type', title:'会议类型', edit: 'text', width: 120}
                ,{fixed: 'right', title:'操作', toolbar: '#barDemo', width: 150}
            ]
        ]
        ,page: true
    });

    // Header toolbar events
    table.on('toolbar(test)', function(obj){
        var checkStatus = table.checkStatus(obj.config.id);
        var data = checkStatus.data;
        switch(obj.event){
            case 'getCheckData':
                layer.alert(JSON.stringify(data));
                break;
            case 'getCheckLength':
                layer.msg('选中了:'+ data.length + ' 个');
                break;
            case 'isAll':
                layer.msg(checkStatus.isAll ? '全选': '未全选');
                break;
            // Custom icon on the right of the toolbar
            case 'LAYTABLE_TIPS':
                layer.alert('这是工具栏右侧自定义的一个图标按钮');
                break;
        };
    });

    // Row tool events: 'del' removes the row from the rendered table only,
    // 'edit' opens the paper's PDF link in a new tab.
    table.on('tool(test)', function(obj){
        var data = obj.data;
        if(obj.event === 'del'){
            layer.confirm('真的删除行么', function(index){
                obj.del();
                layer.close(index);
            });
        } else if(obj.event === 'edit'){
            window.open(data.pdf,'_blank');
        }
    });
});
// Keyword table next to the word cloud: word and its frequency, loaded from getList20.
layui.use('table', function() {
    var table3 = layui.table;
    table3.render({
        elem: '#keyW'
        , url: 'getList20'
        , title: '数据表'
        , cols: [
            [
                // No leading comma here: it would put an empty slot in front of the columns.
                {field: 'name', title: '词', sort: true}
                , {field: 'value', title: '词频', width: 80, sort: true}
            ]
        ]
        , page: true
    });
})
// Word currently highlighted through the keyword table (undefined when none)
var temp;
// History of clicked words
var array = new Array();
// Chart/table linkage: clicking a row of the keyword table flashes the matching word in the
// word cloud for one second; clicking the word that is still highlighted switches it off.
// Registered inside layui.use so the table module is guaranteed to be available.
layui.use('table', function(){
    layui.table.on('row(keyW)', function(obj){
        var current = obj.data.name;
        var previous = temp;
        array.push(current);
        if (previous !== undefined) {
            mycharts.dispatchAction({
                type: 'downplay',
                name: previous
            });
        }
        if (current === previous) {
            // Same word clicked again while highlighted: leave it switched off.
            temp = undefined;
            return;
        }
        temp = current;
        mycharts.dispatchAction({
            type: 'highlight',
            name: current
        });
        // The timer captures `current`, so a later click cannot redirect it to another word.
        setTimeout(function(){
            mycharts.dispatchAction({
                type: 'downplay',
                name: current
            });
            if (temp === current) {
                temp = undefined;
            }
        }, 1000);
    });
});
// Chart mouse-over handler: if some table row sits at the position given by the hovered
// item's dataIndex, paint the row at that offset of the page-wide "tbody tr" set yellow.
function eConsole(param) {
    var wanted = param.dataIndex; // index of the hovered chart item
    var rows = $("tbody tr");
    rows.each(function(){
        var position = $(this).index(); // position of this row inside its parent
        if (position != wanted) {
            return;
        }
        rows.eq(position).css("background-color","yellow");
    });
};
// Chart mouse-out handler: clears the background colour set by the mouse-over handler,
// using the same row lookup (row position compared with the item's dataIndex).
function zConsole(param) {
    var wanted = param.dataIndex; // index of the chart item the pointer left
    var rows = $("tbody tr");
    rows.each(function(){
        var position = $(this).index(); // position of this row inside its parent
        if (position != wanted) {
            return;
        }
        rows.eq(position).css("background-color","");
    });
};
mycharts.on("mouseover", eConsole); // pointer enters a word
mycharts.on("mouseout", zConsole);  // pointer leaves a word
mycharts.hideLoading();
// Clicking a word reloads the 'test' table with the papers that contain that keyword.
mycharts.on('click', function(params){
    table.reload('test', {
        url: 'getUrlList',
        where: {
            word: params.name
        },
        // Start at the first page; the previous page number may not exist in the new result.
        page: {
            curr: 1
        },
        cols: [
            [
                {field:'type', width: 80, title: '会议'},
                {field:'year', width: 80, title: '年份'},
                {field:'url', title: '论文链接', sort: true}
            ]
        ]
    });
});
// Search button handler: reloads the paper table with the filters entered in the toolbar
// (keyword, year, conference type, author).
function change(){
    table.reload('test', {
        url: 'getPaper'
        ,where: {
            keyWord: document.getElementById('keyWord').value
            ,year: document.getElementById('year').value
            ,type: document.getElementById('type').value
            ,author: document.getElementById('author').value
        }
        // A new search starts at the first page; the old page number may be out of range.
        ,page: {
            curr: 1
        }
        ,cols: [
            [
                {type: 'checkbox', fixed: 'left'}
                ,{field:'papertitle', title:'论文题目', width: 500, edit: 'text'}
                ,{field:'authors', title:'作者', width: 500, edit: 'text', sort: true}
                ,{field:'abstract', title:'摘要', edit: 'text', width: 100}
                ,{field:'pdf', title:'论文链接', edit: 'text'}
                ,{field:'year', title:'年份', edit: 'text', width: 80, sort: true}
                ,{field:'type', title:'会议类型', edit: 'text', width: 120}
                ,{fixed: 'right', title:'操作', toolbar: '#barDemo', width: 150}
            ]
        ]
    });
}
</script>
</body>
</html>