一 利用生成器来完成爬取校花网视频
# Part 1: crawl xiaohuar.com videos sequentially; generators stream URLs
# between the two parse stages.
import requests
import re
import os
import hashlib
import time

# NOTE(review): this raw string looks like it lost a backslash — probably
# meant r'D:\DOWNLOAD'. Confirm the intended directory.
DOWLOAD_PATH = r'D:DOWNLOAD'


def get_page(url):
    """Fetch *url* and return its text, or None on any failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except Exception:
        pass  # best-effort crawl: a failed page is skipped, not fatal


def parse_index(index_contents):
    """Yield absolute detail-page URLs scraped from one index page."""
    detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url


def parse_detail(detail_contents):
    """Yield the detail page's video URL, but only if it is an mp4."""
    movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
    if movie_urls:
        movie_url = movie_urls[0]
        if movie_url.endswith('mp4'):
            yield movie_url


def download(movie_url):
    """Download one video into DOWLOAD_PATH under an md5-derived name."""
    print(movie_url)
    try:
        response = requests.get(movie_url)
        if response.status_code == 200:
            data = response.content
            # md5 of (timestamp + url) gives a unique, filesystem-safe name
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(movie_url.encode('utf-8'))
            os.makedirs(DOWLOAD_PATH, exist_ok=True)  # fix: dir may not exist yet
            filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(data)
                f.flush()
            print('下载成功', movie_url)
    except Exception:
        pass  # best-effort: a failed download is skipped


def main():
    """Walk the first five index pages, downloading every mp4 found."""
    raw_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        # fetch the index page, extract the detail-page links
        index_url = raw_url.format(page_num=i)
        index_contents = get_page(index_url)
        if index_contents is None:
            continue  # fix: get_page returns None on failure; parse_index(None) would raise
        for detail_url in parse_index(index_contents):
            detail_contents = get_page(detail_url)
            if detail_contents is None:
                continue  # fix: same None guard for the detail fetch
            for movie_url in parse_detail(detail_contents):
                download(movie_url)


if __name__ == '__main__':
    t1 = time.time()
    main()
    print(time.time() - t1)
二 利用多线程优化上述代码
# Part 2: the same crawler parallelized with a thread pool; the pipeline
# stages are chained through Future.add_done_callback.
import requests  # pip install requests
import re
import os
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)
# NOTE(review): this raw string looks like it lost a backslash — probably
# meant r'D:\DOWNLOAD'. Confirm the intended directory.
DOWLOAD_PATH = r'D:DOWNLOAD'


def get_page(url):
    """Fetch *url* and return its text, or None on any failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except Exception:
        pass  # best-effort: a failed page is skipped


def parse_index(index_contents):
    """Done-callback for an index-page Future: submit one fetch per detail link."""
    index_contents = index_contents.result()
    if index_contents is None:
        # fix: get_page failed; without this guard re.findall(None) raises
        # inside the callback and the error is silently lost
        return
    detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)


def parse_detail(detail_contents):
    """Done-callback for a detail-page Future: submit a download for its mp4."""
    detail_contents = detail_contents.result()
    if detail_contents is None:
        return  # fix: same None guard for a failed detail fetch
    movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
    if movie_urls:
        movie_url = movie_urls[0]
        if movie_url.endswith('mp4'):
            pool.submit(download, movie_url)


def download(movie_url):
    """Download one video into DOWLOAD_PATH under an md5-derived name."""
    try:
        response = requests.get(movie_url)
        if response.status_code == 200:
            data = response.content
            # md5 of (timestamp + url) gives a unique, filesystem-safe name
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(movie_url.encode('utf-8'))
            os.makedirs(DOWLOAD_PATH, exist_ok=True)  # fix: target dir may not exist
            filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(data)
                f.flush()
            print('下载成功', movie_url)
    except Exception:
        pass  # best-effort: a failed download is skipped


def main():
    """Submit the five index pages; the rest of the pipeline runs in callbacks."""
    raw_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        index_url = raw_url.format(page_num=i)
        pool.submit(get_page, index_url).add_done_callback(parse_index)


if __name__ == '__main__':
    t1 = time.time()
    main()
    # NOTE(review): this prints as soon as the index pages are *submitted*,
    # not when the downloads finish, so the measured time is misleading.
    # Calling pool.shutdown() here would break the callbacks that still
    # submit work; a proper fix needs to track the outstanding futures.
    print(time.time() - t1)
牛逼的代码
三 自己根据egon讲的grep命令,按照类似的道理,写的爬取校花网图片的代码
# Part 3: a coroutine (generator.send) pipeline that crawls xiaohuar.com
# images, modelled on the grep-style pipeline shown in class:
# search -> handle -> download.
import requests, re, os


def init(f):
    """Decorator: prime a generator-coroutine by advancing it to the first yield."""
    def inner(*args, **kwargs):
        g = f(*args, **kwargs)
        next(g)
        return g
    return inner


def get(url):
    """Fetch *url* now; return a closure that decodes and yields its text."""
    r = requests.get(url)

    def inner():
        r.encoding = 'gbk'  # site is GBK-encoded; decode accordingly
        return r.text
    return inner


xiaohua = get('http://www.xiaohuar.com/2014.html')
xiaohua_contend = xiaohua()


def search(target):
    """Scan the page for (name, src) pairs and push each one downstream."""
    g = re.finditer(
        '<a href=.*? target=.*?><img width=.*? alt="(?P<name>.*?)" src="(?P<src>.*?)" /></a>',
        xiaohua_contend, re.S)
    for i in g:
        target.send((i.group('name'), i.group('src')))


@init
def handle(target):
    """Normalize relative src URLs to absolute ones, then forward."""
    while True:
        name, src = yield
        if not src.startswith('http'):
            src = 'http://www.xiaohuar.com' + src
        target.send((name, src))


@init
def download():
    """Sink coroutine: fetch each image and write it to disk."""
    while True:
        name, src = yield
        r = requests.get(src)
        # fix: the original built the path with a bare '\' — an unterminated
        # string literal (syntax error). os.path.join is the portable form.
        # NOTE(review): r'D:校花网' probably lost a backslash (D:\校花网);
        # confirm the intended directory.
        filepath = os.path.join(r'D:校花网', name + '.jpg')
        with open(filepath, 'wb') as f:
            f.write(r.content)


search(handle(download()))
总结:
egon授课。
生成器与协程有紧密的联系。
生成器可以通过yield接收参数,通过send传值。
生成器与多线程也有关系吗?没有吧。
普通函数爬取视频也是可以用到多线程的。
优化的余地:可以加上进度条,利用类实现。下面是复制来的示例代码,大概就是这个样式。
def download_file(url, path):
    """Stream *url* to *path* in 10 KB chunks, printing progress per chunk.

    Expects `closing` (contextlib), `requests`, and `ProgressData` to be
    in scope where this snippet is used.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        # total byte count, taken from the response headers
        content_size = int(r.headers['content-length'])
        print('下载开始')  # fix: Python 2 print statement -> Python 3 call
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()
class ProgressData(object):
    """Track and print download progress.

    *block* and *size* are given in bytes and stored divided by 1000
    (i.e. in the caller-supplied *unit*, typically 'Kb').
    """

    def __init__(self, block, size, unit, file_name=''):
        self.file_name = file_name
        self.block = block / 1000.0  # chunk size in `unit`
        self.size = size / 1000.0    # total size in `unit`
        self.unit = unit
        self.count = 0               # chunks completed so far
        self.start = time.time()

    def output(self):
        """Record one completed chunk and print a progress line."""
        self.end = time.time()
        self.count += 1
        # guard against a zero-length interval between two chunks
        speed = self.block / (self.end - self.start) if (self.end - self.start) > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            # fix: Python 2 `print u'...'` statement -> Python 3 call
            print('%s下载完成 ' % self.file_name)
        else:
            # fix: Python 2 print statements -> Python 3 calls; also the
            # percentage (progress) was printed under the 下载速度 label —
            # keep each value next to its own label.
            print('{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit, self.size, self.unit,
                progress, speed, self.unit))
            print('%50s' % ('/' * int((1 - progress) * 50)))