本次目标地址为梨视频:https://www.pearvideo.com/category_59
在实现数据抓取的时候需要注意以下几点:
大多数网站的视频资源地址都是加密的,或者隐藏在js文件或其他某个文件中,很少能够在页面中直接解析到源地址。
在已知视频路径的情况下,避免使用有界面浏览器或者无界面浏览器,非常浪费资源,可以直接用requests发送请求
在使用多进程(multiprocessing)时要注意:必须在含有 if __name__ == '__main__': 入口的脚本文件中调用,否则会直接报错。
本次小案例的主要实现代码:
from lxml import etree
import requests,re
from multiprocessing.pool import Pool #进程池
from tool import getVideo,saveVideo
def star():
    """Download every video listed on the Pear Video category page.

    Fetches the category listing, follows each listed link to its detail
    page, extracts the real video address from the page source with a
    regular expression, then downloads and stores all videos through a pool
    of five worker processes (``getVideo`` / ``saveVideo`` from ``tool``).

    Detail pages that do not contain a ``srcUrl`` entry are skipped instead
    of aborting the whole run with an ``IndexError``.
    """
    # Local import so the block does not depend on the file-level imports.
    from urllib.parse import urljoin

    url = 'https://www.pearvideo.com/category_59'
    user_agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"}
    mov_html = requests.get(url, headers=user_agent).text
    tree = etree.HTML(mov_html)
    mov_url_list = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href')

    video_list = []
    for mov_url in mov_url_list:
        # Address of the video detail page, built from the listing link.
        # urljoin copes with hrefs both with and without a leading slash.
        movie_url = urljoin('https://www.pearvideo.com/', mov_url)
        detail_page = requests.get(url=movie_url, headers=user_agent).text
        # The original video path is hidden inside a JavaScript block, so it
        # can only be recovered with a regular expression.
        matches = re.findall('srcUrl="(.*?)",vdoUrl', detail_page, re.S)
        if not matches:
            # No video address on this page; skip it rather than crash.
            continue
        video_list.append(matches[0])

    # Process pool (not a thread pool). The with-block releases the worker
    # processes once both blocking map() calls have returned.
    with Pool(5) as pool:
        movie_all = pool.map(getVideo, video_list)
        pool.map(saveVideo, movie_all)
if __name__ == '__main__':
    # Script entry point: star() starts a multiprocessing pool, so it is only
    # invoked when this file is run directly.
    star()
多进程处理数据的代码(tool.py):
import requests,uuid
# Request headers sent with every download; getVideo passes them to requests.
user_agent = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
}
def getVideo(url):
    """Download the video stream at ``url`` and return the response body."""
    response = requests.get(url=url, headers=user_agent)
    return response.content
def saveVideo(data):
    """Write ``data`` to a new, randomly named ``.mp4`` file under ``./movies/``.

    The file name is a fresh UUID4, so repeated calls never overwrite each
    other. The ``./movies`` directory is created on demand; previously a
    missing directory made ``open`` fail with ``FileNotFoundError``.

    Args:
        data: Raw video bytes to store.
    """
    import os  # local import: this module's import line only has requests/uuid

    filename = str(uuid.uuid4()) + ".mp4"
    # exist_ok=True keeps this safe when several pool workers race to create it.
    os.makedirs('./movies', exist_ok=True)
    with open('./movies/' + filename, 'wb') as f:
        f.write(data)
---------------------------------------------------------------------------------------
短视频爬取:https://www.gaoxiaovod.com/
from lxml import etree
import requests,re
from multiprocessing.pool import Pool
from manger import Manage
# Shared Manage instance; its bound methods are handed to the process pool.
m = Manage()
def link():
    """Download every video linked from the second ``piclist2`` block of the home page.

    Fetches the site home page, collects the links inside the second
    ``div.piclist2`` element, opens each linked page, extracts the real
    ``video/mp4`` resource address with a regular expression, then downloads
    and stores all videos through the module-level process pool ``pool``
    using the shared ``Manage`` instance ``m``.

    Relies on the global name ``pool`` being bound before the call (the
    ``__main__`` guard of this script does that).

    Raises:
        IndexError: If the home page has fewer than two ``div.piclist2``
            elements.
    """
    # Local import so the block does not depend on the file-level imports.
    from urllib.parse import urljoin

    url = 'https://www.gaoxiaovod.com/'  # site home page
    user_agent = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }
    page_text = requests.get(url, headers=user_agent).text
    tree = etree.HTML(page_text)  # parse the home page
    div = tree.xpath('//div[@class="piclist2"]')[1]  # the second matching div
    div2_url = div.xpath('./ul/li/a/@href')  # every link inside it

    video_list = []
    for img in div2_url:
        # `url` already ends with "/", so plain concatenation would produce
        # "//" for hrefs that start with a slash; urljoin handles both forms.
        img_url = urljoin(url, img)
        mov_html = requests.get(url=img_url, headers=user_agent).text
        # The leading "[" must be escaped: unescaped it opens a character
        # class that is never closed and re raises "unterminated character set".
        matches = re.findall(r"\['(.*?)', 'video/mp4', '", mov_html, re.S)
        if not matches:
            # Page without a video source entry; skip it rather than crash.
            continue
        video_list.append(matches[0])

    video_data_list = pool.map(m.getVideo, video_list)  # download in parallel
    pool.map(m.saveVideo, video_data_list)  # store in parallel
if __name__ == '__main__':
    # link() reads the module-level name `pool`, so it is bound here before
    # the call. The with-block shuts the five worker processes down once
    # link() has returned (or raised) instead of leaving them running.
    with Pool(5) as pool:
        link()
多进程业务逻辑代码(manger.py):
import requests
import uuid
class Manage():
    """Download and storage helpers whose methods are run by pool workers."""

    # Request headers sent with every download.
    user_agent = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }

    def getVideo(self, url):
        """Download the resource at ``url`` and return the response body."""
        return requests.get(url, headers=self.user_agent).content

    def saveVideo(self, data):
        """Write ``data`` to a new, randomly named ``.mp4`` file under ``./movies/``.

        The file name is a fresh UUID4 and is printed before writing. The
        ``./movies`` directory is created on demand; previously a missing
        directory made ``open`` fail with ``FileNotFoundError``.

        Args:
            data: Raw video bytes to store.
        """
        import os  # local import: this module only imports requests and uuid

        filename = str(uuid.uuid4()) + '.mp4'
        print('正在准备保存文件' + filename)
        # exist_ok=True keeps this safe when several workers race to create it.
        os.makedirs('./movies', exist_ok=True)
        with open('./movies/' + filename, 'wb') as f:
            f.write(data)