对加密数据的爬取
# Scrape jandan.net's image board, where real image URLs are base64-encoded
# in the page as an anti-scraping measure, and download each image locally.
import requests
from lxml import etree
import base64
import os
from urllib import request

url = 'http://jandan.net/ooxx/page-46'
headers = {
    # Fixed: the original UA string read 'ozilla/5.0' (missing leading 'M'),
    # which some servers reject as a malformed browser identity.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Each <span class="img-hash"> holds a base64-encoded image URL.
code_list = tree.xpath('//div[@class="text"]/p/span[@class="img-hash"]/text()')
if not os.path.exists('dandan'):
    os.mkdir('dandan')
for code in code_list:
    # Decoded value is a protocol-relative URL (starts with '//'), so we
    # prepend the scheme before downloading.
    src = base64.b64decode(code).decode()
    img_url = 'https:' + src
    # Use the last URL path segment as the local file name.
    filepath = 'dandan/' + src.split('/')[-1]
    request.urlretrieve(url=img_url, filename=filepath)
    print(filepath + '下载成功')
二.验证码处理及模拟登陆
1.相关的门户网站在进行登录的时候,如果用户连续登录的次数超过3次或者5次的时候,就会在登录页中动态生成验证码。通过验证码达到分流和反爬的效果。
2.处理流程:
- 1.对携带验证码的页面数据进行抓取
- 2.可以将页面数据中验证码进行解析,验证码图片下载到本地
- 3.可以将验证码图片提交给三方平台进行识别,返回验证码图片上的数据值
- 云打码平台:
- 1.在官网中进行注册(普通用户和开发者用户)
- 2.登录开发者用户:
- 1.实例代码的下载(开发文档 → 调用实例及最新的DLL → PythonHTTP实例下载)
- 2.创建一个软件:我的软件 → 添加新的软件
- 3.使用示例代码中的源码文件中的代码进行修改,让其识别验证码图片中的数据值
3.账号:1355144989@qq.com 密码1355144989
模拟登陆爬取人人网
# Simulated login to renren.com: download the login captcha, have it solved
# by the third-party coding platform, POST the login form inside a Session
# (so cookies persist), then fetch and save a profile page.
import requests
from lxml import etree
from urllib import request

url = 'http://www.renren.com/SysHome.do'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Captcha image URL from the login page.
src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
print(src)
img = request.urlretrieve(url=src, filename='./renren.jpg')

# NOTE(review): get_code is not defined in this file — it is expected to be
# provided by the captcha platform's sample code (codetype 2004 = 4-char
# alphanumeric); confirm it is importable before running.
code = get_code(2004, './renren.jpg')

# A Session keeps the login cookies so later requests are authenticated.
session = requests.Session()
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201903160368'
data = {
    "email": "15516092050",
    "icode": code,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": '1',
    "captcha_type": "web_login",
    "password": "5e088a2ee22d34dd081aac25578e67bd3a2d851cdfbcf1f0c9ab7056bd1bad62",
    "rkey": "3f4696f6fa1b89e9061868300bf11484",
    "f": "http%3A%2F%2Fwww.renren.com%2F969395731",
}
login_page = session.post(url=login_url, headers=headers, data=data)
print(login_page.headers)

detail_url = 'http://www.renren.com/969395731'
# Fixed: the original requested `url` (the login home page) here instead of
# `detail_url`, so the saved file never contained the profile page.
detail_content = session.get(url=detail_url, headers=headers).text
with open('./renren.html', 'w', encoding='utf-8') as f:
    f.write(detail_content)
print('下载成功')
三.并发爬取视频
# Concurrently download videos from pearvideo.com using a thread pool
# (multiprocessing.dummy.Pool is thread-backed, suitable for I/O-bound work).
from multiprocessing.dummy import Pool  # thread pool for concurrent downloads
from lxml import etree
import requests
import re

pool = Pool(5)  # 5 worker threads
url = 'https://www.pearvideo.com/category_3'
headers = {
    # Fixed: the original UA string read 'ozilla/5.0' (missing leading 'M').
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Relative hrefs of each video's detail page.
video_url_list = tree.xpath('//*[@id="listvideoListUl"]/li/div/a/@href')
print(video_url_list)

after_url_list = []
film_name_list = []
for video_url in video_url_list:
    detail_url = 'https://www.pearvideo.com/' + video_url
    video_detail_page = requests.get(url=detail_url, headers=headers).text
    # The real .mp4 URL is embedded in inline JavaScript, not the DOM.
    after_url = re.findall('ldUrl="",srcUrl="(.*?)"', video_detail_page)[0]
    trees = etree.HTML(video_detail_page)
    film_name = trees.xpath('//div[@class="video-main"]/div/img[@class="img"]/@alt')[0]
    film_name_list.append(film_name)
    after_url_list.append(after_url)
print(after_url_list)

# Fetch all video payloads concurrently; order of results matches input order.
get_video_data = lambda after_url: requests.get(url=after_url, headers=headers).content
video_data_list = pool.map(get_video_data, after_url_list)

def get_video_name(indexed_data):
    """Write one downloaded video to ./<index>.mp4.

    Fixed: the original looked the index up with video_data_list.index(data),
    which returns the FIRST matching position — two identical payloads would
    overwrite the same file. The index is now passed in explicitly.
    """
    index, video_data = indexed_data
    name = str(index)
    with open(f'./{name}.mp4', 'wb') as f:
        f.write(video_data)
    print(name + '下载成功')

pool.map(get_video_name, list(enumerate(video_data_list)))
# Release worker threads once all writes are done.
pool.close()
pool.join()