zoukankan      html  css  js  c++  java
  • python爬虫下载小视频和小说(基础)

    下载视频:

import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
     5 
     6 
     7 def callbackfunc(blocknum, blocksize, totalsize):
     8     '''回调函数
     9     @blocknum: 已经下载的数据块
    10     @blocksize: 数据块的大小
    11     @totalsize: 远程文件的大小
    12     '''
    13     percent = 100.0 * blocknum * blocksize / totalsize
    14     if percent > 100:
    15         percent = 100
    16     print ("%.2f%%"% percent)
    17 
    18 
    19 
    20 ur = 'http://www.budejie.com/video/'
    21 
    22 def get_htmls(url):
    23     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    24     html = requests.get(url,headers=headers)
    25     #print(type(html.text))
    26     soup =BeautifulSoup(html.text,'html.parser')
    27     result1 = soup.find(attrs={'class':'j-video-c','data-title':True})
    28     result2 = soup.find(attrs={'class': 'j-video', 'data-mp4': True})
    29     nam = result1.get('data-title')
    30     url = result2.get('data-mp4')
    31     local = 'e:\'+str(nam)+'.mp4'
    32     urllib.request.urlretrieve(url, local, callbackfunc)
    33 
if __name__ == '__main__':
    # Entry point: scrape the listing page and download the first video.
    get_htmls(ur)

    下载小说:

     1 from bs4 import BeautifulSoup
     2 import requests
     3 import re
     4 from openpyxl import load_workbook
     5 from openpyxl.utils import get_column_letter
     6 
# This disabled section stored the chapter links into an Excel workbook.
'''
# 设置文件 mingc
addr = "1.xlsx"
# 打开文件
wb = load_workbook(addr)
# 创建一张新表
ws = wb.create_sheet()
# 第一行输入
#ws.append(['TIME', 'TITLE', 'A-Z'])
ws['A1'] = '章节'
ws['B1'] = '链接
'''
# Chapter hrefs collected by get_parsing() and consumed by get_htmls().
links = []
# Base URL of the novel's table-of-contents page.
ur = 'https://www.qb5200.tw/xiaoshuo/2/2155/'
    22 def get_one_page(url,ok):
    23     headers = {
    24 
    25    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'}
    26 
    27     html = requests.get(url,headers=headers)
    28     html.encoding = 'gbk'
    29     #print(html.content)
    30     if ok == True:
    31         get_parsing(html)
    32     else :
    33         return html
    34 
    35 def get_parsing(html):
    36     soup = BeautifulSoup(html.content,'html.parser')
    37     dd = soup.findAll(['dt',['dd']])
    38     result = False
    39 
    40     #counts =2
    41     co = 0
    42     for one in dd:
    43         #print(type(one))
    44         if one.string=='《龙王传说》正文卷':
    45             #ws.title = one.string
    46             result = True
    47         if result == True and one.name == 'dd':
    48             link = one.a.get('href')
    49             links.append(link)   #注意这里是传入元组、列表、字典
    50 '''
    51             st = one.a.string
    52             data = [++co,ur+link]
    53             ws.append(data)
    54 '''
    55 
    56 def get_htmls():
    57     i=1000
    58     results = links[1000:]
    59     for link in results:
    60         i+=1
    61         url = ur+link
    62         path='龙3.txt'
    63         html = get_one_page(url,False)
    64         soup = BeautifulSoup(html.content, 'html.parser')
    65         name = soup.find(attrs={'class':'content','id':False})
    66         names = name.h1.string
    67         div = soup.find('div',attrs={'class':'showtxt'})
    68         with open(path,'a',encoding='utf8') as f:
    69             f.write(names + '
    ')
    70             for string in div.stripped_strings:
    71                 f.write(string+'
    ')
    72 
    73         if i%10 == 0:
    74             print(i)
    75             if i==1300:
    76                 break
    77 
    78 
    79 
if __name__ == '__main__':
    # First pass: parse the table of contents and fill `links`.
    get_one_page('https://www.qb5200.tw/xiaoshuo/2/2155/',True)
    #wb.save(addr)
    # Second pass: download chapters from index 1000 onward into the file.
    get_htmls()
  • 相关阅读:
    about java
    nginx+tomcat
    sed
    百度推送
    线程及更新ui线程的简单理解
    测试异常检测的Bugly使用
    轮播图带加点,带时间自动轮播加手动轮播
    自定义listView与scrollView使用
    tabLayout加viewPager的实现
    网络获取图片列表
  • 原文地址:https://www.cnblogs.com/kongbursi-2292702937/p/11953169.html
Copyright © 2011-2022 走看看