# Thread-pool demo (线程池演示)
from concurrent.futures import ThreadPoolExecutor
import time
# The pool runs at most 100 worker threads concurrently.
pool = ThreadPoolExecutor(100)
def task(line):
    """Demo job: echo its job number, then block for ten seconds.

    The long sleep keeps threads busy so the 100-worker cap is visible.
    """
    print(line)
    time.sleep(10)
if __name__ == '__main__':
    # Queue 1000 jobs; at most 100 run at once, the rest wait in line.
    for job_number in range(1000):
        pool.submit(task, job_number)
# Crawl a site's short videos with concurrent (synchronous) requests — 通过并发(同步)爬虫某个网站的小视频
import requests
import re
# import os
# import uuid
#
#
# # 1.发送请求,获取响应数据
# def get_page(url):
# response = requests.get(url)
# if response.status_code == 200:
# return response
#
#
# # 2.解析并提取主页id号
# def parse_page(response):
# '''
# https://www.pearvideo.com/video_1630253
# https://www.pearvideo.com/video_1630042
# '''
# # 将所有电影的详情页id号,匹配获取,并放到列表中
# id_list = re.findall('href="video_(.*?)"', response.text, re.S)
# # print(len(id_list))
# id_list = list(set(id_list))
# # print(len(id_list))
# return id_list
#
#
# def parse_detail(response):
# '''
# srcUrl="https://video.pearvideo.com/mp4/adshort/20191206/cont-1630253-14671892_adpkg-ad_hd.mp4"
# srcUrl="(.*?)"
# '''
# mp4_url = re.findall('srcUrl="(.*?)"', response.text, re.S)
# # print(mp4_url, 111111)
# if mp4_url:
# return mp4_url[0]
#
#
# # 3.保存数据
# def save_movie(movie_url):
# response = get_page(movie_url)
#
# movie_dir = r'D:项目路径python13期day30梨视频'
# movie_path = os.path.join(
# movie_dir, str(uuid.uuid4()) + '.mp4'
# )
# # print(movie_path)
# with open(movie_path, 'wb') as f:
# for line in response.iter_content():
# f.write(line)
#
#
# if __name__ == '__main__':
# response = get_page('https://www.pearvideo.com/')
#
# # 解析提取所有电影详情页id号
# id_list = parse_page(response)
# # print(id_list)
#
# # 循环拼接详情页链接
# for id_num in id_list:
# url = f'https://www.pearvideo.com/video_{id_num}'
# # print(url)
#
# # 往详情页发送请求,
# detail_response = get_page(url)
# # print(detail_response.text)
#
# # # 解析电影详情页,并提取视频的存放的地址
# mp4_url = parse_detail(detail_response)
# print(mp4_url)
#
# # # 发送请求获取视频真实数据
# # movie_response = get_page(mp4_url)
#
# # response.content
# save_movie(mp4_url)
# 异步爬取梨视频
import requests
import re
import os
import uuid
from concurrent.futures import ThreadPoolExecutor
# Shared pool: at most 100 crawler threads run at a time.
pool = ThreadPoolExecutor(max_workers=100)
# 1. Send the request and fetch the response data.
def get_page(url):
    """GET *url*; return the Response on HTTP 200, otherwise None."""
    print(f'发送get请求: {url}')
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return response
# 2. Parse the home page and extract the detail-page id numbers.
def parse_page(response):
    """Return the unique video ids linked from the home-page HTML.

    Detail pages look like:
        https://www.pearvideo.com/video_1630253
        https://www.pearvideo.com/video_1630042
    """
    ids = re.findall('href="video_(.*?)"', response.text, re.S)
    # The home page links each video more than once — deduplicate.
    return list(set(ids))
# Parse a finished detail-page future and queue the video download.
def parse_detail(res):
    """Callback for a get_page future.

    *res* is a concurrent.futures.Future; res.result() is get_page's
    return value (a Response on HTTP 200, or None otherwise).  Extracts
    the real video address from the page, e.g.:
        srcUrl="https://video.pearvideo.com/mp4/adshort/20191206/cont-1630253-14671892_adpkg-ad_hd.mp4"
    and submits save_movie for it on the shared pool.
    """
    response = res.result()
    # get_page returns None for non-200 responses; without this guard the
    # callback would raise on None.text and the Future would swallow the
    # exception silently, losing the video without any visible error.
    if response is None:
        return
    print(response)
    mp4_urls = re.findall('srcUrl="(.*?)"', response.text, re.S)
    print(mp4_urls)
    if mp4_urls:
        # The download itself is also IO-bound — run it asynchronously.
        pool.submit(save_movie, mp4_urls[0])
# 3. Save the video data to disk.
def save_movie(movie_url):
    """Download the mp4 at *movie_url* and write it into movie_dir
    under a random (uuid4) file name.
    """
    # Fetching the response body is the IO-bound part — this function is
    # meant to run inside pool worker threads.
    response = requests.get(movie_url)

    # NOTE(review): the path separators appear to have been lost from this
    # literal (likely meant D:\项目路径\python13期\day30\梨视频) — confirm.
    movie_dir = r'D:项目路径python13期day30梨视频'
    # open() fails if the directory is missing — create it up front.
    os.makedirs(movie_dir, exist_ok=True)
    movie_path = os.path.join(
        movie_dir, str(uuid.uuid4()) + '.mp4'
    )
    with open(movie_path, 'wb') as f:
        # iter_content() with no argument yields ONE byte per iteration;
        # stream in larger chunks for reasonable throughput.
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
if __name__ == '__main__':
    # Fetch the home page synchronously, then fan the detail pages
    # out to the thread pool.
    home = get_page('https://www.pearvideo.com/')
    for video_id in parse_page(home):
        # Build each video's detail-page URL...
        detail_url = f'https://www.pearvideo.com/video_{video_id}'
        # ...and fetch it asynchronously.  add_done_callback(parse_detail)
        # hands the finished get_page future to parse_detail; the future's
        # result() is get_page's return value.
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

    import datetime
    print(datetime.datetime.now())
    # 21:54 ---> 18:45