数据挖掘---R语言爬虫(基于Hadley的rvest包)

zoukankan html css js c++ java

数据挖掘---R语言爬虫(基于Hadley的rvest包)

library(rvest)
library(stringr)

#' Scrape one search-result page into a data frame of book records.
#'
#' @param page Page index; it is appended verbatim to `urlwithoutpage`.
#' @param urlwithoutpage Search URL up to (and excluding) the page number.
#' @return A data frame with the columns `title`, `price`, `author`,
#'   `publication_date` and `publishing_house`, one row per matched node.
getdata <- function(page, urlwithoutpage) {
  # Download and parse the page, forcing the GBK encoding.
  web <- read_html(paste0(urlwithoutpage, page), encoding = "GBK")

  # Book titles: split the raw text on space / "：" / "（" / "(" / dash and
  # keep the second piece (NA when there is no second piece).
  # NOTE(review): presumably the raw text starts with a delimiter so that the
  # first piece is empty and the second one is the title itself - confirm.
  # vapply() (unlike sapply()) still yields character(0) on an empty page.
  title_all <- web %>% html_nodes("div ul p.name") %>% html_text()
  title <- vapply(
    strsplit(title_all, split = "[ ：（(—―]"),
    "[",
    FUN.VALUE = character(1),
    2
  )

  # Current price.
  price <- web %>%
    html_nodes("div ul span.search_now_price") %>%
    html_text()

  # Publication info (author / date / publisher in one text node).
  search_book_author <- web %>%
    html_nodes("p.search_book_author") %>%
    html_text()
  # The result is later written as comma-separated text, so ASCII commas inside
  # the field are replaced with full-width Chinese commas.
  search_book_author <- gsub(
    pattern = ",", replacement = "，", search_book_author
  )
  # The author is whatever precedes the first "/".
  author <- vapply(
    strsplit(search_book_author, "/"),
    "[",
    FUN.VALUE = character(1),
    1
  )

  # Backslashes must be doubled inside R string literals: a bare "\d" or "\w"
  # is an unrecognized escape and stops the file from parsing.
  publication_date <- str_extract(
    search_book_author, "\\d{4}-\\d{2}-\\d{2}"
  )
  publishing_house <- str_extract(search_book_author, "\\w*出版社\\w*")

  # NOTE(review): data.frame() errors if the three selectors matched a
  # different number of nodes (e.g. a book without a price) - confirm whether
  # that can happen on the target pages.
  data.frame(title, price, author, publication_date, publishing_house)
}

# Dangdang search URL for the keyword "R语言" (percent-encoded); the page number
# is appended after the trailing page_index= parameter.
dangdang <- "http://search.dangdang.com/?key=r%D3%EF%D1%D4&act=input&ddt-rpm=undefined&page_index="
# Fetch result pages 1 and 2 and stack their rows, starting from an empty
# data frame.
# NOTE(review): the original note said three pages, but the range only covers
# pages 1-2 - confirm which one is intended.
page_tables <- lapply(1:2, function(page_no) getdata(page_no, dangdang))
final <- Reduce(rbind, page_tables, data.frame())
# Save the combined table as comma-separated text without row names.
write.table(final, file = "dangdang.csv", sep = ",", row.names = FALSE)

python

#一个简单例子---中国图书网
# 爬虫
# 正则表达式爬虫
import requests
import re

# Download the bookschina.com search results for "python" and save one
# comma-separated record per book to books.csv.
url = 'http://www.bookschina.com/book_find2/?stp=python'
content = requests.get(url).text

# Each pattern captures one field from the raw HTML, anchored on the Chinese
# labels that surround it on the page.
title = re.findall('jpg" title="(.*?)"', content)
author = re.findall('作者：.*?;sbook=(.*?)">.*?出版社：', content)
pubhouses = re.findall('出版社：.*?">(.*?)</a> 出版时间：', content)
pubtime = re.findall('</a> 出版时间：(.*?) ISBN：', content)
ISBN = re.findall(' ISBN：(.*?) 原价：', content)
raw_price = re.findall('ISBN：.*? 原价：￥(.*?) 现价：', content)
now_price = re.findall('现价：￥(.*?)    您节省：', content)
# A lazy group with nothing after it always matches the empty string, so the
# saved amount is captured as an explicit run of digits and dots instead.
jiesheng = re.findall(r'您节省：￥([\d.]+)', content)
discount = re.findall('您节省：.*?    （(.*?)折） ', content)

columns = (title, author, pubhouses, pubtime, ISBN,
           raw_price, now_price, jiesheng, discount)
# zip() below stops at the shortest list; report a mismatch instead of
# failing with an IndexError halfway through the file.
if len({len(column) for column in columns}) > 1:
    print('warning: field lists differ in length; '
          'rows are truncated to the shortest list')

# The context manager closes the file even if a write fails. The explicit
# encoding keeps the Chinese text writable regardless of the platform default.
with open('books.csv', 'w', encoding='utf-8') as file:
    for row in zip(*columns):
        # One record per line.
        file.write(','.join(row) + '\n')
        # print(*row)  # uncomment to echo each record while debugging

查看全文

相关阅读:
什么叫套接字
 浅谈labviEW定时器
 C#线程篇---Task（任务）和线程池不得不说的秘密
 async与await详解
 异步编程与多线程的联系与区别
 什么是Task
MVC模式的介绍(C#)
Git指令
 Redis安装部署、Jedis的使用
 Oracle——序列、索引、同义词

原文地址：https://www.cnblogs.com/heisaijuzhen/p/6656439.html