zoukankan      html  css  js  c++  java
  • python爬虫下载小视频和小说(基础)

    下载视频:

import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
     5 
     6 
     7 def callbackfunc(blocknum, blocksize, totalsize):
     8     '''回调函数
     9     @blocknum: 已经下载的数据块
    10     @blocksize: 数据块的大小
    11     @totalsize: 远程文件的大小
    12     '''
    13     percent = 100.0 * blocknum * blocksize / totalsize
    14     if percent > 100:
    15         percent = 100
    16     print ("%.2f%%"% percent)
    17 
    18 
    19 
    20 ur = 'http://www.budejie.com/video/'
    21 
    22 def get_htmls(url):
    23     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    24     html = requests.get(url,headers=headers)
    25     #print(type(html.text))
    26     soup =BeautifulSoup(html.text,'html.parser')
    27     result1 = soup.find(attrs={'class':'j-video-c','data-title':True})
    28     result2 = soup.find(attrs={'class': 'j-video', 'data-mp4': True})
    29     nam = result1.get('data-title')
    30     url = result2.get('data-mp4')
    31     local = 'e:\'+str(nam)+'.mp4'
    32     urllib.request.urlretrieve(url, local, callbackfunc)
    33 
if __name__ == '__main__':
    # Entry point: scrape the listing page and download the first video.
    get_htmls(ur)

    下载小说:

     1 from bs4 import BeautifulSoup
     2 import requests
     3 import re
     4 from openpyxl import load_workbook
     5 from openpyxl.utils import get_column_letter
     6 
# This disabled section stored the chapter links into an Excel workbook.
'''
# 设置文件 mingc
addr = "1.xlsx"
# 打开文件
wb = load_workbook(addr)
# 创建一张新表
ws = wb.create_sheet()
# 第一行输入
#ws.append(['TIME', 'TITLE', 'A-Z'])
ws['A1'] = '章节'
ws['B1'] = '链接
'''
# Chapter hrefs collected by get_parsing() and consumed by get_htmls().
links = []
# Base URL of the novel's table-of-contents page.
ur = 'https://www.qb5200.tw/xiaoshuo/2/2155/'
    22 def get_one_page(url,ok):
    23     headers = {
    24 
    25    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'}
    26 
    27     html = requests.get(url,headers=headers)
    28     html.encoding = 'gbk'
    29     #print(html.content)
    30     if ok == True:
    31         get_parsing(html)
    32     else :
    33         return html
    34 
    35 def get_parsing(html):
    36     soup = BeautifulSoup(html.content,'html.parser')
    37     dd = soup.findAll(['dt',['dd']])
    38     result = False
    39 
    40     #counts =2
    41     co = 0
    42     for one in dd:
    43         #print(type(one))
    44         if one.string=='《龙王传说》正文卷':
    45             #ws.title = one.string
    46             result = True
    47         if result == True and one.name == 'dd':
    48             link = one.a.get('href')
    49             links.append(link)   #注意这里是传入元组、列表、字典
    50 '''
    51             st = one.a.string
    52             data = [++co,ur+link]
    53             ws.append(data)
    54 '''
    55 
    56 def get_htmls():
    57     i=1000
    58     results = links[1000:]
    59     for link in results:
    60         i+=1
    61         url = ur+link
    62         path='龙3.txt'
    63         html = get_one_page(url,False)
    64         soup = BeautifulSoup(html.content, 'html.parser')
    65         name = soup.find(attrs={'class':'content','id':False})
    66         names = name.h1.string
    67         div = soup.find('div',attrs={'class':'showtxt'})
    68         with open(path,'a',encoding='utf8') as f:
    69             f.write(names + '
    ')
    70             for string in div.stripped_strings:
    71                 f.write(string+'
    ')
    72 
    73         if i%10 == 0:
    74             print(i)
    75             if i==1300:
    76                 break
    77 
    78 
    79 
if __name__ == '__main__':
    # First pass: parse the table of contents and fill `links`.
    get_one_page('https://www.qb5200.tw/xiaoshuo/2/2155/',True)
    #wb.save(addr)
    # Second pass: download chapters from index 1000 onward into the file.
    get_htmls()
  • 相关阅读:
    about java
    nginx+tomcat
    sed
    百度推送
    线程及更新ui线程的简单理解
    测试异常检测的Bugly使用
    轮播图带加点,带时间自动轮播加手动轮播
    自定义listView与scrollView使用
    tabLayout加viewPager的实现
    网络获取图片列表
  • 原文地址:https://www.cnblogs.com/kongbursi-2292702937/p/11953169.html
Copyright © 2011-2022 走看看