一、模拟登录的意义
当需要爬取的数据是登录之后才能看到的个人信息时,就需要使用模拟登录。
二、使用打码平台处理验证码
云打码
打码兔
三、注册:
普通用户注册
充值题分(1块钱)
开发者用户注册
添加软件,下载调用示例
填写相关信息
用户名
密码
软件名称
软件密钥
四、调用实例
import http.client
import json
import mimetypes
import time
import urllib

import requests

from get_img_code import YDMHttp


def get_code(types, filename):
    """Recognize a captcha image through the YunDaMa service.

    Args:
        types: Captcha type code forwarded to the service. For example,
            1004 means four alphanumeric characters; pricing differs per
            type (see http://www.yundama.com/price.html).
        filename: Path of the captcha image file to recognize.

    Returns:
        The recognition result returned by ``YDMHttp.decode``, or ``None``
        when the account name is still the ``'username'`` placeholder.
    """
    # Regular (non-developer) account name and password.
    # NOTE(review): credentials are hard-coded in the source.
    account = 'lxh66685'
    secret = 'nihao123!'
    # Software ID and key, required for developer revenue share; both come
    # from "My Software" in the developer console.
    software_id = 7971
    software_key = 'b6fef487706d29041c20e6f9da220669'
    # Recognition timeout, in seconds.
    wait_seconds = 30

    # Refuse to run while the placeholder account name is still in place.
    if account == 'username':
        print('请设置好相关参数再测试')
        return None

    client = YDMHttp(account, secret, software_id, software_key)

    # Log in to the service.
    user_id = client.login()
    print('uid: %s' % user_id)

    # Query the remaining balance.
    remaining = client.balance()
    print('balance: %s' % remaining)

    # Submit the image: path, captcha type, timeout -> (captcha id, text).
    captcha_id, text = client.decode(filename, types, wait_seconds)
    print('cid: %s, result: %s' % (captcha_id, text))
    return text
五、模拟登录古诗文网
# Simulated login to the gushiwen site (www.gushiwen.com).
import requests
from lxml import etree
from urllib import request

# Use one Session for every request so cookies set by the site are sent back
# automatically. It gets its own name instead of rebinding `requests`, which
# would hide the module for the rest of the script.
session = requests.Session()
url = "https://www.gushiwen.com/main/login.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# 1. Request the login page and save the captcha image locally.
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
captcha_srcs = tree.xpath('//div[@class="lg_content"]/ul/li[3]/img/@src')
if not captcha_srcs:
    # Fail with a readable message instead of a bare IndexError when the
    # page layout no longer matches the XPath.
    raise RuntimeError("captcha <img> not found on the login page: " + url)
img_url = "https://www.gushiwen.com/" + captcha_srcs[0]
# Download through the session (rather than urllib's urlretrieve) so the
# image request carries the same headers and cookies as the page request.
page_content = session.get(url=img_url, headers=headers).content
with open('./code.jpg', 'wb') as f:
    f.write(page_content)

# 2. Recognize the captcha image.
# get_code must already be defined or imported before this script runs.
code = get_code(1004, './code.jpg')
print(code)
data = {
    "user": "lxh661314",
    "pass": "nihao123!",
    "imgvc": code
}

# 3. Submit the login form, then fetch u.html (presumably the logged-in user
# page) with the same session and save it for inspection.
response = session.post(url=url, headers=headers, data=data)
login_page_text = session.get(url="https://www.gushiwen.com/u.html", headers=headers).text
with open('./login.html', 'w', encoding="utf-8") as f:
    f.write(login_page_text)
六、登录古诗文网
# Log in to the gushiwen site (so.gushiwen.org).
import requests
from lxml import etree


def _first_match(tree, expression):
    """Return the first result of an XPath query, or raise a readable error."""
    matches = tree.xpath(expression)
    if not matches:
        raise RuntimeError("no match for XPath %r on the login page" % expression)
    return matches[0]


# Use one Session for every request so cookies are carried automatically.
# It gets its own name instead of rebinding `requests`, which would hide the
# module for the rest of the script.
session = requests.Session()
home_url = "https://so.gushiwen.org/user/login.aspx"
login_url = "https://so.gushiwen.org/user/login.aspx"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# Some login parameters are dynamic: read them from the login page source.
page_text = session.get(url=home_url, headers=headers).text
tree = etree.HTML(page_text)
view_state = _first_match(tree, '//input[@id="__VIEWSTATE"]/@value')
view_state_generator = _first_match(tree, '//input[@name="__VIEWSTATEGENERATOR"]/@value')
img_code_url = "https://so.gushiwen.org" + _first_match(tree, '//img[@id="imgCode"]/@src')
print(view_state)

# Download the captcha through the session so the request carries the
# User-Agent header and the session cookies.
page_content = session.get(url=img_code_url, headers=headers).content
with open('./code.png', 'wb') as f:
    f.write(page_content)

# get_code must already be defined or imported before this script runs.
code = get_code(1004, './code.png')
data = {
    "__VIEWSTATE": view_state,
    "__VIEWSTATEGENERATOR": view_state_generator,
    "from": "",
    "email": "lxh661314@163.com",
    "pwd": "nihao123!",
    "code": code,
    "denglu": "登录",
}

# Submit the login form and save the response page for inspection.
page_text = session.post(url=login_url, headers=headers, data=data).text
with open('./gushiwen.html', 'w', encoding='utf-8') as f:
    f.write(page_text)

# Notes:
# 1. Some login request parameters are dynamic and must be parsed from the
#    login page source.
# 2. Prefer downloading the image with the requests library, which sends the
#    User-Agent header.
# 3. Use Session() so cookies are carried automatically between requests.