由于版权原因,具体网站不再明述。
爬取思路一:接口
ppt接口为:https://wenku.baidu.com/browse/getbcsurl?doc_id=(文章id)&pn=0&rn=99999&type=ppt
经过测验发现只能下载vip免费文档,部分vip专属文档是不能下载的
如vip专属文档:文章https://wenku.baidu.com/view/cdc02bcc842458fb770bf78a6529647d272834c6.html
对应接口为
https://wenku.baidu.com/browse/getbcsurl?doc_id=cdc02bcc842458fb770bf78a6529647d272834c6&pn=0&rn=99999&type=ppt
里面得到的图片Url打开啥都没有滴,但是vip免费文档是可以用的,
如:https://wenku.baidu.com/view/9712a85429160b4e767f5acfa1c7aa00b52a9d37.html
接口为:https://wenku.baidu.com/browse/getbcsurl?doc_id=9712a85429160b4e767f5acfa1c7aa00b52a9d37&pn=0&rn=99999&type=ppt
将所有数据复制打开json.cn格式化发现链接都是正常的,
但是为了爬虫的稳定性,只能果断放弃这种方法了哈哈哈~~
爬取思路二:使用自家的请求头和selenium进行获取文档
说一下爬取思路及遇到的问题:
-
爬取付费文档(大部分含文字)实际使用百度的请求头对自家进行爬取,可以爬取成功,请求头可以在robots.txt里找到 参考链接
-
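xpath抓取的值有空格换行符等问题:使用normalize-space()函数,如contents = html.xpath('//div[normalize-space(@class="bd doc-reader")]/text()')(注意:这种写法把整个比较表达式传给了normalize-space(),谓词恒为真,实际会匹配所有div;若要按class精确匹配,应写成 //div[normalize-space(@class)="bd doc-reader"])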
-
python-pptx 实践 :添加图片 参考链接如下:https://www.cnblogs.com/shanger/p/13098799.html
-
python-pdf 和 python-docx 实践参考
-
selenium click无效问题 之前写过解决办法,链接为https://blog.csdn.net/a12355556/article/details/108346202
-
python 中 if x or y in z: 看起来总是成立,这是因为它会被解析成 if x or (y in z):,而非空字符串 x 恒为真;正确写法是 if x in z or y in z:。不过代码里我用的就是这种错误写法哈哈哈,因为后面有 if 判断,用 else 巧妙地避过去了~~
注意:
- pptx和pdf我是用图片转换的,没有图片的文档可转换不了哦
- 运行时大家记得改一下chromedriver的路径哦
- vip专属文档也成功哦~~
爬取结果:
代码
import requests,docx,os
from lxml import etree
from PIL import Image
from urllib import request
from selenium import webdriver
# Location of the ChromeDriver executable handed to Selenium; change this to
# match the local installation before running.
driver_path = r'D:/chromedriver/chromedriver.exe'

# Headers used for the plain HTTP request of the document page.
header = {
    'User-agent': 'Baiduspider',
}
def download_wenku(url, typ):
    """Fetch a document page and save it in every requested format.

    Args:
        url: Address of the document page.
        typ: Requested output format(s). Either a single name understood by
            ``save_type``, several names separated by ASCII (``,``) or
            full-width (``,``) commas, or ``all`` for docx, txt, pdf and pptx.
    """
    r = requests.get(url, headers=header)
    html = etree.HTML(r.text)
    # Drop the last five characters of the <title> text
    # (presumably a site-name suffix - TODO confirm).
    title = html.xpath('//title/text()')[0][:-5]
    # NOTE(review): normalize-space() wraps the whole comparison, so the
    # predicate is the non-empty string "true"/"false" and holds for every
    # <div>. The usual form is normalize-space(@class)="bd doc-reader"; the
    # expression is left untouched because the scraper currently relies on it.
    contents = html.xpath('//div[normalize-space(@class="bd doc-reader")]/text()')

    # Use the last text node longer than 50 characters as the document body.
    # Default to an empty string so image-only documents can still be saved
    # as pdf/pptx instead of failing on an unbound variable.
    content = ''
    for c in contents:
        if len(c) > 50:
            content = c

    # Normalise full-width commas, then split; blank entries are ignored.
    requested = [
        name.strip()
        for name in typ.replace(',', ',').split(',')
        if name.strip()
    ]
    if 'all' in requested:
        requested = ['docx', 'txt', 'pdf', 'pptx']

    for fmt in requested:
        save_type(fmt, title, content, url)
def save_type(typ, title, content, url):
    """Save the document in one output format.

    Args:
        typ: ``docx``, ``txt``, ``pdf`` or ``pptx``. ``doc`` is accepted as an
            alias for ``docx``. Any other value is ignored.
        title: Output file name without extension.
        content: Text written to the docx / txt output.
        url: Document page address, forwarded to the pdf / pptx builders.
    """
    # Treat the short name as docx so it does not silently save nothing.
    if typ == 'doc':
        typ = 'docx'

    if typ == 'docx':
        print("*"*30+"docx正在下载中"+"*"*30)
        docu = docx.Document()
        docu.add_paragraph(content)
        docu.save(title + '.docx')
        print("*"*30+"docx类型下载完成"+"*"*30)
    elif typ == 'txt':
        print("*"*30+"txt正在下载中"+"*"*30)
        # Context manager guarantees the file is flushed and closed.
        with open(title + '.txt', 'w', encoding='utf-8') as f:
            f.write(content)
        print("*"*30+"txt类型下载完成"+"*"*30)
    elif typ == 'pdf':
        print("*"*30+"pdf正在下载中"+"*"*30)
        save_pdf(title, url)
        print("*"*30+"pdf类型下载完成"+"*"*30)
    elif typ == 'pptx':
        print("*"*30+"pptx正在下载中"+"*"*30)
        save_pptx(title, url)
        print("*"*30+"pptx类型下载完成"+"*"*30)
def save_pptx(title, url):
    """Build ``<title>.pptx`` with one slide per downloaded page image.

    Page images are fetched by ``download_png`` into the ``img`` folder under
    the current working directory; nothing is written when that step reports
    failure or when no numbered PNG files are present.

    Args:
        title: Output file name without extension.
        url: Document page address passed to ``download_png``.
    """
    if download_png(url):
        return

    # Imported here, as in the original, so python-pptx is only required
    # when a pptx is actually produced.
    from pptx import Presentation

    # os.path.join supplies the separator that plain string concatenation
    # of the working directory and 'img' was missing.
    img_dir = os.path.join(os.getcwd(), 'img')
    if not os.path.isdir(img_dir):
        print("该文档无图片,不适合转换成pptx和pdf")
        return

    # Only "<number>.png" files count as pages, ordered numerically.
    page_files = sorted(
        (
            name for name in os.listdir(img_dir)
            if os.path.splitext(name)[1] == '.png'
            and os.path.splitext(name)[0].isdecimal()
        ),
        key=lambda name: int(os.path.splitext(name)[0]),
    )

    prs = Presentation()
    # Layout index 6 is the layout the original code calls "blank".
    blank_layout = prs.slide_layouts[6]
    for name in page_files:
        slide = prs.slides.add_slide(blank_layout)
        slide.shapes.add_picture(
            image_file=os.path.join(img_dir, name), left=1, top=1,
        )

    prs.save(title + '.pptx')
def download_png(url):
    """Download every page image of a document into the local ``img`` folder.

    The page is opened in headless Chrome, the "read-all" element is clicked,
    the page is scrolled, and the image addresses found in the page source are
    saved as ``img/1.png``, ``img/2.png``, ...

    Args:
        url: Document page address.

    Returns:
        ``True`` when no images could be obtained (any error, or a page
        without image URLs); ``None`` on success.
    """
    # Bound up front so the cleanup below is safe even when Chrome itself
    # fails to start.
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-extensions')
        options.add_argument('--disable-gpu')
        # The browser does not need to render images; only their URLs are
        # read from the page source and fetched separately below.
        options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(executable_path=driver_path, options=options)
        driver.get("https://wenku.baidu.com")
        driver.get(url)
        element = driver.find_element_by_xpath('//span[@class="read-all"]')
        # Click through JavaScript rather than element.click().
        driver.execute_script("arguments[0].click();", element)
        js = "var q=document.documentElement.scrollTop=20000"
        driver.execute_script(js)
        html = etree.HTML(driver.page_source)

        # Collect image URLs, falling back to the data-src attribute when
        # src is absent.
        image_urls = []
        for div in html.xpath('//div[@class="mod flow-ppt-mod"]/div/div'):
            src = div.xpath('div/img/@src') or div.xpath('div/img/@data-src')
            if src:
                image_urls.append(src[0])

        # A page without images cannot be converted; report it the same way
        # as any other failure instead of leaving callers with no files.
        if not image_urls:
            print("该文档无图片,不适合转换成pptx和pdf")
            return True

        # NOTE(review): files left in img/ by an earlier run are not removed.
        if not os.path.exists('img'):
            os.mkdir('img')
        for index, image_url in enumerate(image_urls, start=1):
            request.urlretrieve(image_url, 'img/' + str(index) + '.png')
    except Exception:
        # Deliberately best-effort: any failure is reported as "no images".
        print("该文档无图片,不适合转换成pptx和pdf")
        return True
    finally:
        if driver is not None:
            driver.quit()
    return None
def save_pdf(title, url):
    """Build ``./<title>.pdf`` with one page per downloaded page image.

    Page images are fetched by ``download_png`` into the ``img`` folder under
    the current working directory; nothing is written when that step reports
    failure or when no numbered PNG files are present.

    Args:
        title: Output file name without extension.
        url: Document page address passed to ``download_png``.
    """
    if download_png(url):
        return

    # os.path.join supplies the separator that plain string concatenation
    # of the working directory and 'img' was missing.
    folder = os.path.join(os.getcwd(), 'img')
    names = os.listdir(folder) if os.path.isdir(folder) else []

    # Only "<number>.png" files count as pages, ordered numerically; other
    # files in the folder are skipped rather than crashing int().
    page_files = sorted(
        (
            name for name in names
            if os.path.splitext(name)[1] == '.png'
            and os.path.splitext(name)[0].isdecimal()
        ),
        key=lambda name: int(os.path.splitext(name)[0]),
    )
    if not page_files:
        print("该文档无图片,不适合转换成pptx和pdf")
        return

    # Convert every page, including the first, to RGB so all pages share a
    # mode the PDF writer accepts and colours are not palette-quantised.
    pages = [
        Image.open(os.path.join(folder, name)).convert("RGB")
        for name in page_files
    ]
    pages[0].save(
        f"./{title}.pdf", "PDF", save_all=True, append_images=pages[1:],
    )
def main():
    """Ask for a document URL and output type(s), then start the download."""
    # Strip stray whitespace that tends to come along with a pasted link.
    url = input("请输入要下载的文章链接:").strip()
    print("*"*10+"文档都是图片建议存为pptx,pdf,均为文字建议存为docx,txt"+"*"*10)
    # The prompt lists "docx", which is the name the saving code recognises.
    typ = input("请输入要保存的类型(可供选择的类型为docx,txt,pdf,pptx,下载多种格式请用逗号隔开,全部下载可使用all):").strip()
    download_wenku(url, typ)
# Start the interactive downloader only when the file is run as a script.
if __name__ == '__main__':
    main()