1、爬取校花网视频示例一(单线程顺序下载):
# Sequential crawler: walks the list pages of xiaohuar.com, follows every
# detail link found on them, and saves each mp4 it discovers into
# ``movie_path``. Everything runs in the calling thread, one request at a time.
import hashlib
import logging
import os
import re
import time
from typing import Iterator, Optional
from urllib.parse import urljoin

import requests  # pip3 install requests

logger = logging.getLogger(__name__)

# Directory the downloaded videos are written to. The raw string keeps the
# backslash literal; without it the path would be relative to the current
# directory of drive C: instead of the drive root.
movie_path = r'C:\mp4'

# Site root used to resolve relative links found on list pages.
BASE_SITE = 'http://www.xiaohuar.com'

# Seconds to wait for the server before giving up; requests has no default
# timeout, so without this a stalled connection blocks the crawl forever.
REQUEST_TIMEOUT = 10

# Compiled once; re.S lets ``.*?`` span the line breaks inside the markup.
_DETAIL_LINK_RE = re.compile(r'class="items".*?href="(.*?)"', re.S)
_MEDIA_SRC_RE = re.compile(r'id="media".*?src="(.*?)"', re.S)


def get_page(url: str) -> Optional[str]:
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request fails or the status code is not 200,
    so callers must be prepared for a missing page.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning('failed to fetch %s: %s', url, exc)
        return None
    if response.status_code != 200:
        logger.warning('unexpected status %s for %s', response.status_code, url)
        return None
    return response.text


def parse_index(index_page: Optional[str]) -> Iterator[str]:
    """Yield the absolute detail-page URLs found in a list page.

    An empty or missing page (``None``) yields nothing.
    """
    if not index_page:
        return
    for url in _DETAIL_LINK_RE.findall(index_page):
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the site root.
        yield urljoin(BASE_SITE, url)


def parse_detail(detail_page: Optional[str]) -> Iterator[str]:
    """Yield the mp4 URL of a detail page, if it has one.

    Only the first ``id="media"`` source is considered, and only when it
    ends with ``mp4``. An empty or missing page yields nothing.
    """
    if not detail_page:
        return
    sources = _MEDIA_SRC_RE.findall(detail_page)
    if sources and sources[0].endswith('mp4'):
        yield sources[0]


def get_movie(url: str) -> None:
    """Download the video at *url* into ``movie_path``.

    The file name is the MD5 of the current timestamp plus the URL, so each
    download gets its own file. Failures are logged, not raised, so one bad
    video does not stop the crawl.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning('failed to download %s: %s', url, exc)
        return
    if response.status_code != 200:
        logger.warning('unexpected status %s for %s', response.status_code, url)
        return

    digest = hashlib.md5()
    digest.update(str(time.time()).encode('utf-8'))
    digest.update(url.encode('utf-8'))
    filepath = os.path.join(movie_path, '%s.mp4' % digest.hexdigest())

    try:
        # Create the target directory on first use instead of failing.
        os.makedirs(movie_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except OSError as exc:
        logger.warning('failed to save %s to %s: %s', url, filepath, exc)
        return
    print('%s 下载成功' % url)


def main() -> None:
    """Crawl list pages 0-4 and download every video they lead to."""
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        index_page = get_page(base_url.format(page_num=i))
        if index_page is None:
            # The list page could not be fetched; move on to the next one.
            continue
        for detail_url in parse_index(index_page):
            detail_page = get_page(detail_url)
            for movie_url in parse_detail(detail_page):
                get_movie(movie_url)


if __name__ == '__main__':
    main()
2、爬取校花网视频示例二(使用线程池并发下载):
# Concurrent crawler: same pipeline as a sequential crawl (list page ->
# detail pages -> mp4 download), but every fetch is handed to a thread pool
# and the next stage is chained on with a done-callback.
import hashlib
import logging
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from urllib.parse import urljoin

import requests  # pip3 install requests

logger = logging.getLogger(__name__)

pool = ThreadPoolExecutor(50)

# Directory the downloaded videos are written to. The raw string keeps the
# backslash literal; without it the path would be relative to the current
# directory of drive C: instead of the drive root.
movie_path = r'C:\mp4'

# Site root used to resolve relative links found on list pages.
BASE_SITE = 'http://www.xiaohuar.com'

# Seconds to wait for the server before giving up; requests has no default
# timeout, so without this a stalled connection ties up a worker forever.
REQUEST_TIMEOUT = 10

# Compiled once; re.S lets ``.*?`` span the line breaks inside the markup.
_DETAIL_LINK_RE = re.compile(r'class="items".*?href="(.*?)"', re.S)
_MEDIA_SRC_RE = re.compile(r'id="media".*?src="(.*?)"', re.S)

# Number of tasks submitted but not yet fully processed (callback included).
# main() waits on the condition until it drops back to zero.
_pending = 0
_pending_cond = threading.Condition()


def _task_done() -> None:
    """Mark one submitted task as finished and wake main() when none remain."""
    global _pending
    with _pending_cond:
        _pending -= 1
        if _pending == 0:
            _pending_cond.notify_all()


def _submit(func, arg, callback=None):
    """Submit ``func(arg)`` to the pool and track it as pending.

    *callback*, if given, receives the finished future. It runs before the
    task is counted as done, so any follow-up work it submits is registered
    before the pending counter can reach zero.
    """
    global _pending
    with _pending_cond:
        _pending += 1
    try:
        future = pool.submit(func, arg)
    except RuntimeError:
        # The pool refused the task; undo the bookkeeping so main() cannot
        # wait for something that will never run.
        _task_done()
        raise

    def _on_done(done_future):
        try:
            if callback is not None:
                callback(done_future)
        finally:
            _task_done()

    future.add_done_callback(_on_done)
    return future


def get_page(url: str) -> Optional[str]:
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning('failed to fetch %s: %s', url, exc)
        return None
    if response.status_code != 200:
        logger.warning('unexpected status %s for %s', response.status_code, url)
        return None
    return response.text


def parse_index(index_page) -> None:
    """Done-callback for a list-page fetch.

    *index_page* is the future returned for ``get_page``. Every detail link
    found in its result is fetched on the pool, with ``parse_detail`` as the
    follow-up callback. A failed fetch (result ``None``) is skipped.
    """
    index_page = index_page.result()
    if not index_page:
        return
    for detail_url in _DETAIL_LINK_RE.findall(index_page):
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the site root.
        detail_url = urljoin(BASE_SITE, detail_url)
        _submit(get_page, detail_url, parse_detail)


def parse_detail(detail_page) -> None:
    """Done-callback for a detail-page fetch.

    *detail_page* is the future returned for ``get_page``. If its result
    contains an ``id="media"`` source ending in ``mp4``, the download is
    scheduled on the pool. A failed fetch (result ``None``) is skipped.
    """
    detail_page = detail_page.result()
    if not detail_page:
        return
    sources = _MEDIA_SRC_RE.findall(detail_page)
    if sources and sources[0].endswith('mp4'):
        _submit(get_movie, sources[0])


def get_movie(url: str) -> None:
    """Download the video at *url* into ``movie_path``.

    The file name is the MD5 of the current timestamp plus the URL, so each
    download gets its own file. Failures are logged, not raised, so one bad
    video does not affect the other downloads.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning('failed to download %s: %s', url, exc)
        return
    if response.status_code != 200:
        logger.warning('unexpected status %s for %s', response.status_code, url)
        return

    digest = hashlib.md5()
    digest.update(str(time.time()).encode('utf-8'))
    digest.update(url.encode('utf-8'))
    filepath = os.path.join(movie_path, '%s.mp4' % digest.hexdigest())

    try:
        # Create the target directory on first use instead of failing.
        os.makedirs(movie_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except OSError as exc:
        logger.warning('failed to save %s to %s: %s', url, filepath, exc)
        return
    print('%s 下载成功' % url)


def main() -> None:
    """Crawl list pages 0-4 concurrently and wait for all downloads.

    The wait matters: the later pipeline stages are submitted from
    done-callbacks, and the executor rejects new submissions once the
    interpreter has started shutting down. Returning early would therefore
    drop every detail page and movie download.
    """
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        url = base_url.format(page_num=i)
        _submit(get_page, url, parse_index)

    # Block until every task, including those spawned by callbacks, is done.
    with _pending_cond:
        _pending_cond.wait_for(lambda: _pending == 0)


if __name__ == '__main__':
    main()