"""Crawl xiaohuar.com listing pages and download the linked ``.mp4`` videos.

The file contains two versions of the same crawler, executed one after the
other when it is run as a script:

1. a sequential version built from ``get_page`` / ``parse_data`` /
   ``parse_detail`` / ``download_movie``;
2. a thread-pool version that rebinds ``get_page`` and ``parse_detail`` and
   chains the stages together through ``Future`` done-callbacks
   (``parse`` -> ``parse_detail`` -> ``download_movie``).
"""
import logging
import re
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor

import requests

logger = logging.getLogger(__name__)

SITE_ROOT = "http://www.xiaohuar.com"

# Seconds to wait for a connection / for each read before giving up.  Without
# a timeout a stalled server would block a crawler thread forever.
REQUEST_TIMEOUT = 10

# NOTE(review): reproduced byte-for-byte from the original.  It has a single
# path separator, so on Windows it is relative to the current directory of
# drive D: -- confirm this is the intended target directory.
MOVIE_PATH_TEMPLATE = r"D:spider1movies\%s.mp4"

# First href following each <div class="items"> on a listing page.
_ITEM_LINK_RE = re.compile('<div class="items">.*?href="(.*?)"', re.S)
# src attribute of the <video id="media"> element on a detail page.
_VIDEO_SRC_RE = re.compile('<video id="media".*?src="(.*?)"', re.S)


def _fetch_text(url):
    """Return the body text of *url*, or ``None`` if it cannot be fetched.

    ``None`` is returned both when the request raises and when the response
    status is anything other than 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning("GET %s failed: %s", url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None


def _first_mp4_source(text):
    """Return the first ``<video id="media">`` source if it ends in ``.mp4``.

    Only the first match is considered; ``None`` is returned when there is no
    match, when the first match is not an ``.mp4`` URL, or when *text* is not
    a string (for example ``None`` from a failed fetch).
    """
    try:
        sources = _VIDEO_SRC_RE.findall(text)
    except TypeError:
        return None
    if sources and sources[0].endswith(".mp4"):
        return sources[0]
    return None


# --------------------------------------------------------------------------
# Part 1: sequential crawler
# --------------------------------------------------------------------------

# NOTE(review): exploratory requests that run at import time.  Nothing in this
# file reads ``result`` afterwards; kept so module-level behaviour is unchanged.
response = requests.get("http://www.xiaohuar.com/v/")
url_s = _ITEM_LINK_RE.findall(response.text)
for url in url_s:
    res = requests.get(url)
    result = _VIDEO_SRC_RE.findall(res.text)


def get_page(url):
    """Fetch *url* and return its text, or ``None`` on any failure."""
    return _fetch_text(url)


def parse_data(text):
    """Yield each non-empty detail-page link found in listing page *text*.

    Yields nothing when *text* is empty or ``None`` (a failed ``get_page``).
    """
    if not text:
        return
    for link in _ITEM_LINK_RE.findall(text):
        if link:
            yield link


def parse_detail(text):
    """Return the ``.mp4`` URL found in detail page *text*, or ``None``."""
    return _first_mp4_source(text)


def download_movie(movie_url):
    """Download *movie_url* into a new file named with a random UUID.

    Does nothing when *movie_url* is empty.  Network, HTTP-status and file
    errors are logged and suppressed so one bad video does not stop the crawl.
    """
    if not movie_url:
        return
    try:
        response = requests.get(movie_url, timeout=REQUEST_TIMEOUT)
        # Do not save an HTML error page under an .mp4 name.
        response.raise_for_status()
        with open(MOVIE_PATH_TEMPLATE % uuid.uuid4(), "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        logger.warning("download of %s failed: %s", movie_url, exc)


if __name__ == '__main__':
    base_url = "http://www.xiaohuar.com/list-3-{}.html"
    for line in range(1):
        url = base_url.format(line)
        # 1. Send the request.
        index_text = get_page(url)
        # 2. Parse the data.
        urls = parse_data(index_text)
        for url in urls:
            # Visit the detail page and get its text.
            detail_text = get_page(url)
            movie_url = parse_detail(detail_text)
            # Save the video.
            download_movie(movie_url)


# --------------------------------------------------------------------------
# Part 2: thread-pool crawler
# --------------------------------------------------------------------------

pool = ThreadPoolExecutor(50)

# Bookkeeping that lets the script wait until every task *and* the follow-up
# work scheduled by its callback has finished.
_pending_lock = threading.Lock()
_pending_tasks = 0
_pool_idle = threading.Event()
_pool_idle.set()


def _task_done():
    """Mark one tracked task as finished; signal idle when none remain."""
    global _pending_tasks
    with _pending_lock:
        _pending_tasks -= 1
        if _pending_tasks == 0:
            _pool_idle.set()


def _submit(func, arg, callback=None):
    """Run ``func(arg)`` on the pool, then ``callback(future)``, tracked.

    The task stays counted as pending until *callback* has returned, so any
    tasks the callback submits are registered before this one is released.
    """
    global _pending_tasks
    with _pending_lock:
        _pending_tasks += 1
        _pool_idle.clear()

    def _finish(future):
        # A single combined callback guarantees the user callback completes
        # before the pending counter is decremented.
        try:
            if callback is not None:
                callback(future)
        finally:
            _task_done()

    try:
        future = pool.submit(func, arg)
    except RuntimeError:
        # The pool refused the task (it has been shut down); undo the count.
        _task_done()
        raise
    future.add_done_callback(_finish)
    return future


# NOTE(review): second copy of the exploratory import-time requests; kept so
# module-level behaviour is unchanged.
response = requests.get("http://www.xiaohuar.com/v/")
url_s = _ITEM_LINK_RE.findall(response.text)
for url in url_s:
    res = requests.get(url)
    result = _VIDEO_SRC_RE.findall(res.text)


def get_page(url):
    """Print *url*, fetch it and return its text, or ``None`` on failure."""
    print(url)
    return _fetch_text(url)


def parse(res):
    """Done-callback for a listing-page fetch.

    *res* is the ``Future`` whose result is the listing page text.  Every
    non-empty link found is made absolute if it starts with ``/`` and then
    fetched on the pool, with ``parse_detail`` as the follow-up callback.
    """
    text = res.result()
    if not text:
        return
    for url in _ITEM_LINK_RE.findall(text):
        if not url:
            continue
        if url.startswith("/"):
            url = SITE_ROOT + url
        _submit(get_page, url, parse_detail)


def parse_detail(res):
    """Done-callback for a detail-page fetch.

    *res* is the ``Future`` whose result is the detail page text.  When the
    page exposes an ``.mp4`` source, its download is scheduled on the pool.
    """
    text = res.result()
    if not text:
        return
    movie_url = _first_mp4_source(text)
    if movie_url:
        _submit(download_movie, movie_url)


if __name__ == '__main__':
    base_url = "http://www.xiaohuar.com/list-3-{}.html"
    for line in range(2):
        url = base_url.format(line)
        # 1. Send the request.
        _submit(get_page, url, parse)
    # Keep the main thread alive until the whole callback chain has drained;
    # otherwise interpreter shutdown begins while callbacks are still trying
    # to submit follow-up tasks, and the executor rejects them.
    _pool_idle.wait()
    pool.shutdown()