简介
因为写英文应用文与写作需要参考新闻信息,但我想不到比报纸更好的信息整合平台,遂打算下载《人民日报》(PDF 版)。
参考链接
https://www.liaoxuefeng.com/wiki/1016959663602400/1017628290184064
https://blog.csdn.net/qq_38161040/article/details/88366427
https://blog.csdn.net/baidu_28479651/article/details/76158051?utm_source=blogxgwz7
code 第一版
70% 手动、30% 自动:需要频繁地创建文件夹和更改下载次数。
# -*- coding: utf-8 -*-
# 爬取自己编写的html链接中的PDF文档,网址:file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
# open the url and read
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: A URL string (or Request object) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request fails.
    """
    # The context manager closes the response even if read() raises;
    # an explicit close() after read() would be skipped on that path.
    with urllib.request.urlopen(url) as page:
        return page.read()
# compile the regular expressions and find
# all stuff we need
def getUrl(html):
    """Extract identifiers such as ``G176200001`` from a page body.

    Args:
        html: Page content as UTF-8 encoded ``bytes``.

    Returns:
        A list of every match, in document order. Each match is one
        uppercase ASCII letter followed by one or more digits.

    Raises:
        UnicodeDecodeError: If *html* is not valid UTF-8.
    """
    # The pattern used to read '[A-Z]d+', which matches a literal 'd'
    # and can never match the documented example G176200001; the
    # backslash of '\d' had been lost.
    url_re = re.compile(r'([A-Z]\d+)')
    return url_re.findall(html.decode('UTF-8'))
def getFile(url):
    """Download *url* into the current working directory.

    The local file name is the text after the last ``/`` in *url*. The
    body is streamed in 8 KiB chunks so the whole document is never held
    in memory at once.

    Args:
        url: URL string of the document to download.

    Raises:
        OSError: If the request fails (``urllib.error.URLError`` is an
            ``OSError`` subclass) or the local file cannot be written.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    # Open the connection first so a failed request does not leave an
    # empty file behind; both handles are closed even if a read or write
    # raises part-way through.
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("Sucessful to download" + " " + file_name)
if __name__ == '__main__':
    # Issue to download. Edit these two values to fetch another day.
    month = "2020-03"
    day = "07"
    date_compact = month.replace("-", "") + day  # e.g. "20200307"

    # Page URLs look like
    #   http://paper.people.com.cn/rmrb/page/2020-03/26/02/rmrb2020032602.pdf
    # i.e. the two-digit page number appears both as a path segment and
    # as the file-name suffix.
    for page_no in range(1, 21):
        page = "{:02d}".format(page_no)  # zero-pad: 1 -> "01"
        getFile(
            "http://paper.people.com.cn/rmrb/page/" + month + "/" + day
            + "/" + page + "/rmrb" + date_compact + page + ".pdf"
        )
code 第二版 自动创建文件夹版本
下载速度较慢,需要耐心等待。
# -*- coding: utf-8 -*-
# 爬取自己编写的html链接中的PDF文档,网址:file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
import shutil
# open the url and read
def getHtml(url):
    """Return the raw bytes served at *url*.

    Args:
        url: A URL string (or Request object) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request fails.
    """
    # Using the response as a context manager guarantees it is closed
    # even when read() raises.
    with urllib.request.urlopen(url) as page:
        html = page.read()
    return html
# compile the regular expressions and find
# all stuff we need
def getUrl(html):
    """Find all identifiers of the form ``G176200001`` in a page body.

    Args:
        html: Page content as UTF-8 encoded ``bytes``.

    Returns:
        A list of every match, in document order. Each match is one
        uppercase ASCII letter followed by one or more digits.

    Raises:
        UnicodeDecodeError: If *html* is not valid UTF-8.
    """
    # '\d+' (digits) replaces the former 'd+' (literal letter d), which
    # could not match the example identifier G176200001.
    reg = r'([A-Z]\d+)'
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))
    return url_lst
def getFile(url):
    """Download *url* into the current working directory.

    The local file name is the text after the last ``/`` in *url*. The
    body is streamed in 8 KiB chunks.

    Args:
        url: URL string of the document to download.

    Returns:
        The name of the file that was written (relative to the current
        working directory).

    Raises:
        OSError: If the request fails (``urllib.error.URLError`` is an
            ``OSError`` subclass) or the local file cannot be written.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    # Connection first, file second: a failed request creates no file,
    # and both handles are released even if the transfer breaks midway.
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("Sucessful to download" + " " + file_name)
    return file_name
if __name__ == '__main__':
    # Download every issue of February 2020 (days 01..29), one folder
    # per day named "02DD", trying pages 01..20 of each issue.
    for day_no in range(1, 30):
        data = "{:02d}".format(day_no)  # zero-padded day, e.g. "07"
        folderName = "02" + data
        # exist_ok lets the script be re-run after an interruption;
        # a plain mkdir would raise FileExistsError on the second run.
        os.makedirs(folderName, exist_ok=True)
        for page_no in range(1, 21):
            page = "{:02d}".format(page_no)
            fileName = (
                "http://paper.people.com.cn/rmrb/page/2020-02/" + data
                + "/" + page + "/rmrb202002" + data + page + ".pdf"
            )
            try:
                tmp = getFile(fileName)
                shutil.move(tmp, folderName)
            except OSError as err:
                # Any download or move failure is reported and skipped,
                # so the loop keeps going to the next page instead of
                # aborting or hiding the failure.
                print("Skipped " + fileName + ": " + str(err))
code 多进程下载
超级爽
# -*- coding: utf-8 -*-
# 爬取自己编写的html链接中的PDF文档,网址:file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
import shutil
from multiprocessing import Pool
import time
# open the url and read
def getHtml(url):
    """Open *url*, read the whole response, and return it.

    Args:
        url: A URL string (or Request object) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request fails.
    """
    # ``with`` closes the response on every exit path, including an
    # exception raised by read().
    with urllib.request.urlopen(url) as page:
        return page.read()
# compile the regular expressions and find
# all stuff we need
def getUrl(html):
    """Collect identifiers like ``G176200001`` from a UTF-8 page body.

    Args:
        html: Page content as UTF-8 encoded ``bytes``.

    Returns:
        A list of every match, in document order. Each match is one
        uppercase ASCII letter followed by one or more digits.

    Raises:
        UnicodeDecodeError: If *html* is not valid UTF-8.
    """
    text = html.decode('UTF-8')
    # Restored the backslash: '[A-Z]d+' matched an uppercase letter
    # followed by the letter 'd', never the documented G176200001.
    return re.compile(r'([A-Z]\d+)').findall(text)
def getFile(url):
    """Download *url* into the current working directory.

    The local file name is the text after the last ``/`` in *url*. The
    body is streamed in 8 KiB chunks.

    Args:
        url: URL string of the document to download.

    Returns:
        The name of the file that was written (relative to the current
        working directory).

    Raises:
        OSError: If the request fails (``urllib.error.URLError`` is an
            ``OSError`` subclass) or the local file cannot be written.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    # Both the HTTP response and the output file are closed on every
    # exit path; the response is opened first so that a failed request
    # creates no file.
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("Sucessful to download" + " " + file_name)
    return file_name
def download(i):
    """Download pages 01..20 of the January 2020 issue for one day.

    The PDFs are fetched into the current working directory by
    ``getFile`` and then moved into a folder named ``"01DD"``, where
    ``DD`` is the zero-padded day of the month.

    Args:
        i: Zero-based day index; ``i + 1`` is the day of the month.
    """
    data = "{:02d}".format(i + 1)  # zero-padded day, e.g. "07"
    folderName = "01" + data
    # exist_ok makes re-runs safe. A plain mkdir raises FileExistsError,
    # which would end this call before any page is downloaded.
    os.makedirs(folderName, exist_ok=True)
    for page_no in range(1, 21):
        page = "{:02d}".format(page_no)
        fileName = (
            "http://paper.people.com.cn/rmrb/page/2020-01/" + data
            + "/" + page + "/rmrb202001" + data + page + ".pdf"
        )
        try:
            tmp = getFile(fileName)
            shutil.move(tmp, folderName)
        except OSError as err:
            # Any download or move failure is reported and skipped, so
            # the loop continues with the next page instead of hiding
            # the failure.
            print("Skipped " + fileName + ": " + str(err))
if __name__ == '__main__':
    # One worker process per day of January so all days download
    # concurrently.
    p = Pool(31)
    # Keep each AsyncResult: an exception raised inside a worker is only
    # re-raised when get() is called on its result, so discarding the
    # results would hide every worker failure.
    pending = [(i, p.apply_async(download, args=(i,))) for i in range(31)]
    p.close()
    for i, result in pending:
        try:
            result.get()
        except Exception as err:
            # Top-level boundary: report the failed day and keep
            # collecting the remaining results.
            print("Day index " + str(i) + " failed: " + repr(err))
    p.join()
    print('All subprocesses done.')