zoukankan      html  css  js  c++  java
  • 第七篇:爬虫实战— 4、爬取校花网视频示例(点开往下拉)

    1、爬取校花网视频示例一(串行版):

     1 import requests #pip3 install requests
     2 import re
     3 import hashlib
     4 import time
     5 
     # Directory that downloaded videos are written to.  The separator is spelled
     # out explicitly: 'C:mp4' (no backslash) is a drive-relative path that Windows
     # resolves against the current directory of drive C:, not the folder C:\mp4.
     movie_path=r'C:\mp4'
     7 
     def get_page(url, timeout=10):
         """Fetch *url* and return the response body as text.

         The fetch is best-effort: ``None`` is returned when the request fails
         or the server answers with anything other than HTTP 200, so callers
         must check the result before parsing it.

         Args:
             url: Address of the page to download.
             timeout: Seconds to wait for the server before giving up.  Without
                 a timeout ``requests`` can block forever on a stalled
                 connection.

         Returns:
             The decoded page text, or ``None`` on any failure.
         """
         try:
             response = requests.get(url, timeout=timeout)
         except requests.RequestException as exc:
             # Report the failure instead of hiding it, but keep crawling.
             print('%s request failed: %s' % (url, exc))
             return None
         if response.status_code == 200:
             return response.text
         return None
    15 
    def parse_index(index_page):
        """Yield the absolute detail-page URL of every item on a list page.

        Args:
            index_page: HTML text of a list page, or ``None``/empty when the
                download failed.

        Yields:
            One absolute URL per ``class="items"`` entry, in page order.
            Nothing is yielded for a missing page.
        """
        # Function-scope import so this block does not depend on the listing's
        # import lines being edited.
        from urllib.parse import urljoin

        if not index_page:
            # A failed download arrives here as None; re.findall would raise
            # TypeError on it.
            return
        for href in re.findall(r'class="items".*?href="(.*?)"', index_page, re.S):
            # urljoin leaves absolute links untouched and resolves every kind of
            # relative link; plain concatenation produced "...comp-1.html" for
            # hrefs that lack a leading slash.
            yield urljoin('http://www.xiaohuar.com', href)
    22 
    def parse_detail(detail_page):
        """Yield the video URL embedded in a detail page, if it is an mp4.

        Args:
            detail_page: HTML text of a detail page, or ``None``/empty when the
                download failed.

        Yields:
            The ``src`` of the first ``id="media"`` element when it ends with
            ``mp4``; otherwise nothing.
        """
        if not detail_page:
            # A failed download arrives here as None; the regex call would
            # raise TypeError on it.
            return
        match = re.search(r'id="media".*?src="(.*?)"', detail_page, re.S)
        if match and match.group(1).endswith('mp4'):
            yield match.group(1)
    29 
    def get_movie(url, timeout=30):
        """Download the video at *url* into the ``movie_path`` directory.

        The file is named after the MD5 of the current time plus the URL, with
        an ``.mp4`` suffix.  The download is best-effort: failures are reported
        on stdout and never raised.

        Args:
            url: Address of the video file.
            timeout: Seconds to wait on the connection before giving up.
        """
        import os  # function-scope import: only this function needs it

        digest = hashlib.md5()
        digest.update(str(time.time()).encode('utf-8'))
        digest.update(url.encode('utf-8'))
        # os.path.join replaces the original '%s\%s.mp4' template, whose '\%'
        # is an invalid escape sequence and hard-codes the Windows separator.
        filepath = os.path.join(movie_path, digest.hexdigest() + '.mp4')
        partial_path = filepath + '.part'
        try:
            os.makedirs(movie_path, exist_ok=True)
            # stream=True writes the video chunk by chunk instead of holding
            # the whole file in memory.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print('%s download failed: HTTP %s' % (url, response.status_code))
                    return
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            # Rename only after the last byte is written, so an interrupted
            # download never leaves a truncated .mp4 behind.
            os.replace(partial_path, filepath)
            print('%s 下载成功' %url)
        except (requests.RequestException, OSError) as exc:
            print('%s download failed: %s' % (url, exc))
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written yet
    43 
    def main():
        """Crawl list pages 0 through 4 and download every mp4 they lead to.

        Pages are processed one at a time: list page -> detail pages -> video.
        A page whose download failed is skipped instead of being handed to the
        parsers, which cannot process ``None``.
        """
        base_url='http://www.xiaohuar.com/list-3-{page_num}.html'
        for i in range(5):
            index_page = get_page(base_url.format(page_num=i))
            if index_page is None:
                continue  # list page could not be fetched
            for detail_url in parse_index(index_page):
                detail_page = get_page(detail_url)
                if detail_page is None:
                    continue  # detail page could not be fetched
                for movie_url in parse_detail(detail_page):
                    get_movie(movie_url)

    if __name__ == '__main__':
        main()
    View Code

    2、爬取校花网视频示例二(加了并发的)

     import hashlib
     import os
     import re
     import threading
     import time
     from concurrent.futures import ThreadPoolExecutor
     from urllib.parse import urljoin

     import requests  # pip3 install requests
     6 
     pool=ThreadPoolExecutor(50)
     # The separator is spelled out explicitly: 'C:mp4' (no backslash) is a
     # drive-relative path that Windows resolves against the current directory of
     # drive C:, not the folder C:\mp4.
     movie_path=r'C:\mp4'

     # Count of tasks that were submitted and whose callback has not finished yet.
     # main() waits for it to reach zero.  Without that wait main() returns while
     # downloads are still in flight, and once the interpreter starts shutting
     # down every further pool.submit() made from a callback raises RuntimeError,
     # so no detail page or video would ever be fetched.
     _pending_lock = threading.Lock()
     _pending = 0
     _idle = threading.Event()
     _idle.set()  # nothing submitted yet


     def _task_finished():
         """Mark one tracked task as done and signal ``_idle`` when none remain."""
         global _pending
         with _pending_lock:
             _pending -= 1
             if _pending == 0:
                 _idle.set()


     def _submit(fn, arg, callback=None):
         """Run ``fn(arg)`` on the pool, tracked until *callback* has returned.

         Returns the future created by ``pool.submit``.
         """
         global _pending
         with _pending_lock:
             _pending += 1
             _idle.clear()

         def _finish(future):
             # The callback runs *before* the task is counted as finished, so
             # any follow-up work it submits is registered while the count is
             # still above zero.
             try:
                 if callback is not None:
                     callback(future)
             finally:
                 _task_finished()

         try:
             future = pool.submit(fn, arg)
         except Exception:
             _task_finished()  # the task never started; undo the increment
             raise
         future.add_done_callback(_finish)
         return future


     def get_page(url, timeout=10):
         """Fetch *url* and return the response body as text.

         Best-effort: returns ``None`` when the request fails or the status is
         not HTTP 200.

         Args:
             url: Address of the page to download.
             timeout: Seconds to wait for the server; without it a stalled
                 connection would occupy a pool worker forever.
         """
         try:
             response = requests.get(url, timeout=timeout)
         except requests.RequestException as exc:
             print('%s request failed: %s' % (url, exc))
             return None
         if response.status_code == 200:
             return response.text
         return None


     def parse_index(index_page):
         """Done-callback: queue a fetch for every detail page on a list page.

         Args:
             index_page: Finished future whose result is the list page's HTML,
                 or ``None`` when that download failed.
         """
         index_page=index_page.result()
         if not index_page:
             return  # failed download: nothing to parse
         for href in re.findall(r'class="items".*?href="(.*?)"', index_page, re.S):
             # urljoin leaves absolute links untouched and resolves every kind
             # of relative link; plain concatenation broke hrefs that lack a
             # leading slash.
             detail_url = urljoin('http://www.xiaohuar.com', href)
             _submit(get_page, detail_url, parse_detail)


     def parse_detail(detail_page):
         """Done-callback: queue the download of a detail page's mp4, if any.

         Args:
             detail_page: Finished future whose result is the detail page's
                 HTML, or ``None`` when that download failed.
         """
         detail_page=detail_page.result()
         if not detail_page:
             return  # failed download: nothing to parse
         match = re.search(r'id="media".*?src="(.*?)"', detail_page, re.S)
         if match and match.group(1).endswith('mp4'):
             _submit(get_movie, match.group(1))


     def get_movie(url, timeout=30):
         """Download the video at *url* into the ``movie_path`` directory.

         The file is named after the MD5 of the current time plus the URL, with
         an ``.mp4`` suffix.  Failures are reported on stdout and never raised.

         Args:
             url: Address of the video file.
             timeout: Seconds to wait on the connection before giving up.
         """
         digest = hashlib.md5()
         digest.update(str(time.time()).encode('utf-8'))
         digest.update(url.encode('utf-8'))
         # os.path.join replaces the original '%s\%s.mp4' template, whose '\%'
         # is an invalid escape sequence and hard-codes the Windows separator.
         filepath = os.path.join(movie_path, digest.hexdigest() + '.mp4')
         partial_path = filepath + '.part'
         try:
             os.makedirs(movie_path, exist_ok=True)
             # stream=True writes the video chunk by chunk; with 50 workers,
             # buffering whole files in memory adds up quickly.
             with requests.get(url, stream=True, timeout=timeout) as response:
                 if response.status_code != 200:
                     print('%s download failed: HTTP %s' % (url, response.status_code))
                     return
                 with open(partial_path, 'wb') as f:
                     for chunk in response.iter_content(chunk_size=1024 * 1024):
                         f.write(chunk)
             # Rename only after the last byte is written, so an interrupted
             # download never leaves a truncated .mp4 behind.
             os.replace(partial_path, filepath)
             print('%s 下载成功' %url)
         except (requests.RequestException, OSError) as exc:
             print('%s download failed: %s' % (url, exc))
             try:
                 os.remove(partial_path)
             except OSError:
                 pass  # nothing was written yet


     def main():
         """Crawl list pages 0 through 4 concurrently and wait for completion."""
         base_url='http://www.xiaohuar.com/list-3-{page_num}.html'
         for i in range(5):
             _submit(get_page, base_url.format(page_num=i), parse_index)
         # Block until every queued fetch, parse callback and download is done.
         _idle.wait()


     if __name__ == '__main__':
         main()
    View Code
  • 相关阅读:
    SSM中 web.xml配置文件
    实现网站的登陆,注册,查看商品详细信息,加入购物车,注销登陆等简单功能。
    操作步骤
    mysql 查询 练习题及答案
    水仙花数!
    Spark SQL(4)-Unresolved Plan到Analyzed Plan
    Spark SQL(3) Parser到Unresolved LogicPlan
    Spark SQL(2)-InternalRow和TreeNode
    Spark SQL(1)-简述
    logstash output到kafka记录与总结( No entry found for connection 2)
  • 原文地址:https://www.cnblogs.com/mqhpy/p/11370851.html
Copyright © 2011-2022 走看看