zoukankan      html  css  js  c++  java
  • 模拟登录爬取数据、线程池的使用

    云打码平台

    • 注册:普通用户和开发者用户
    • 登录:
    • 登录开发者用户
    • 创建一个软件:我的软件 -> 创建软件
    • 下载示例代码:开发者中心 -> 下载最新的DLL -> pythonHttp示例代码下载

    1、模拟人人网登录

    代码示例

      1 import http.client, mimetypes, urllib, json, time, requests
      2 
      3 ######################################################################
      4 
      class YDMHttp:
          """Small HTTP client for the Yundama captcha-recognition API.

          Every call posts the account credentials plus a ``method`` name to
          ``apiurl`` and decodes the JSON reply.  Methods return the requested
          field on success, the (negative) ``ret`` code reported by the server
          on failure, or ``-9001`` when the decoded reply is empty.
          """

          apiurl = 'http://api.yundama.com/api.php'
          username = ''
          password = ''
          appid = ''
          appkey = ''

          def __init__(self, username, password, appid, appkey):
              """Store the credentials; ``appid`` is always kept as a string."""
              self.username = username
              self.password = password
              self.appid = str(appid)
              self.appkey = appkey

          def _fields(self, method, **extra):
              """Build the form fields shared by every API call, plus *extra*."""
              fields = {
                  'method': method,
                  'username': self.username,
                  'password': self.password,
                  'appid': self.appid,
                  'appkey': self.appkey,
              }
              fields.update(extra)
              return fields

          @staticmethod
          def _field_or_error(response, key):
              """Return ``response[key]``, the negative ``ret`` code, or -9001."""
              if not response:
                  return -9001
              # A reply without 'ret' is treated like a successful one.
              ret = response.get('ret')
              if ret and ret < 0:
                  return ret
              return response[key]

          def request(self, fields, files=None):
              """POST *fields* (and optional *files*) and return the decoded JSON.

              ``files`` maps a form field name to the path of a file to upload.
              """
              response = self.post_url(self.apiurl, fields, files)
              return json.loads(response)

          def balance(self):
              """Return the account balance, or a negative error code."""
              return self._field_or_error(self.request(self._fields('balance')), 'balance')

          def login(self):
              """Return the user id, or a negative error code."""
              return self._field_or_error(self.request(self._fields('login')), 'uid')

          def upload(self, filename, codetype, timeout):
              """Upload the captcha image at *filename*; return its ``cid`` or an error code."""
              data = self._fields('upload', codetype=str(codetype), timeout=str(timeout))
              response = self.request(data, {'file': filename})
              return self._field_or_error(response, 'cid')

          def result(self, cid):
              """Return the recognised text for *cid*, or '' while none is available."""
              response = self.request(self._fields('result', cid=str(cid)))
              if not response:
                  return ''
              # Error replies may not carry a 'text' field at all.
              return response.get('text') or ''

          def decode(self, filename, codetype, timeout):
              """Upload an image and poll for its text.

              Polls ``result`` up to *timeout* times, sleeping one second after
              each empty answer.  Returns ``(cid, text)`` on success,
              ``(-3003, '')`` when no text arrived, or ``(error_code, '')`` when
              the upload itself failed.
              """
              cid = self.upload(filename, codetype, timeout)
              if cid <= 0:
                  return cid, ''
              for _ in range(timeout):
                  text = self.result(cid)
                  if text != '':
                      return cid, text
                  time.sleep(1)
              return -3003, ''

          def report(self, cid):
              """Send a report for *cid* with ``flag`` '0'; return ``ret`` or -9001."""
              response = self.request(self._fields('report', cid=str(cid), flag='0'))
              if response:
                  return response['ret']
              return -9001

          def post_url(self, url, fields, files=None):
              """POST *fields* and the files named in *files* to *url*; return the body text.

              The caller's ``files`` mapping is left untouched and every file
              opened here is closed again, even if the request raises.
              """
              handles = {}
              try:
                  for key, path in (files or {}).items():
                      handles[key] = open(path, 'rb')
                  res = requests.post(url, files=handles, data=fields)
                  return res.text
              finally:
                  for handle in handles.values():
                      handle.close()
     90 ######################################################################
     91 
     # Account credentials (regular user account).
     # NOTE(review): credentials are hard-coded in the sample; replace them
     # with your own before running.
     username = 'bobo328410948'
     password = 'bobo328410948'

     # Software id and key; both are listed under "My software" in the
     # developer console and are required for the developer revenue share.
     appid = 6003
     appkey = '1f4b564483ae5c907a1d34f8e2f2776c'

     # Captcha image to recognise.
     filename = 'getimage.jpg'

     # Captcha type id, e.g. 1004 = four alphanumeric characters.  Pricing
     # differs per type; see http://www.yundama.com/price.html for the list.
     codetype = 1004

     # Recognition timeout in seconds.
     timeout = 10

     # Only run once the placeholder user name has been replaced.
     if username != 'username':
         # Create the API client.
         yundama = YDMHttp(username, password, appid, appkey)

         # Log in to Yundama.
         uid = yundama.login()
         print('uid: %s' % uid)

         # Query the remaining balance.
         balance = yundama.balance()
         print('balance: %s' % balance)

         # Recognise the image: path, captcha type id, timeout (seconds).
         cid, result = yundama.decode(filename, codetype, timeout)
         print('cid: %s, result: %s' % (cid, result))
     else:
         print('请设置好相关参数再测试')
    131 
    132 ######################################################################
    View Code

    解析验证码

     def getCodeDate(userName, pwd, codePath, codeType, timeout=2,
                     appid=6003, appkey='1f4b564483ae5c907a1d34f8e2f2776c'):
         """Recognise the captcha image at *codePath* through Yundama.

         Args:
             userName: Yundama user name (regular user account).
             pwd: Password for that account.
             codePath: Path of the captcha image file.
             codeType: Captcha type id, e.g. 1004 = four alphanumeric
                 characters; see http://www.yundama.com/price.html.
             timeout: Recognition timeout in seconds.
             appid: Software id from the developer console.
             appkey: Software key from the developer console.

         Returns:
             The recognised text ('' when recognition failed or timed out),
             or None when *userName* is still the placeholder 'username'.
         """
         # The sample ships with a placeholder user name; refuse to run with it.
         if userName == 'username':
             print('请设置好相关参数再测试')
             return None

         yundama = YDMHttp(userName, pwd, appid, appkey)

         # The return values are not used; both calls are kept so the request
         # sequence stays the same as in the vendor sample.
         yundama.login()
         yundama.balance()

         # decode() returns (cid, text); only the text is of interest here.
         _, result = yundama.decode(codePath, codeType, timeout)
         return result
    View Code

    利用抓包工具获取请求的url和参数,这里发送的是post请求

    模拟人人网登录

     1 import requests
     2 import urllib
     3 from lxml import etree
     # One Session is used for every request so that the cookies set while
     # loading the page and the captcha are the same ones sent with the login.
     session = requests.Session()
     headers = {
         'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
     }
     url = 'http://www.renren.com/'
     page_text = session.get(url=url,headers=headers).text

     # Download the captcha image, if the page shows one.
     tree = etree.HTML(page_text)
     code_img_srcs = tree.xpath('//*[@id="verifyPic_login"]/@src')
     code_img_url = code_img_srcs[0] if code_img_srcs else None
     code_data = ''
     if code_img_url:
         # Fetch the image through the session (not urllib) so the captcha
         # belongs to the same cookie jar as the login request below.
         with open('code.jpg','wb') as fp:
             fp.write(session.get(url=code_img_url,headers=headers).content)

         # Recognise the text shown in the captcha image.
         code_data = getCodeDate('bobo328410948','bobo328410948','./code.jpg',2004)

     # Simulated login; URL and form fields were captured with a packet sniffer.
     login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201914927558'
     data = {
         "email":"www.zhangbowudi@qq.com",
         "icode":code_data,
         "origURL":"http://www.renren.com/home",
         "domain":"renren.com",
         "key_id":"1",
         "captcha_type":"web_login",
         "password":"4f0350f09aeffeef86307747218b214b0960bdf35e30811c0d611fe39db96ec1",
         "rkey":"9e75e8dc3457b14c55a74627fa64fb43",
         "f":"http%3A%2F%2Fwww.renren.com%2F289676607",
     }
     # Cookies produced by this request are stored in the session automatically.
     session.post(url=login_url,data=data,headers=headers)

     # Placeholder: set this to a page that is only reachable after login.
     # The request below fails while the URL is still empty.
     url = ""
     page_text = session.get(url=url,headers=headers).text

     with open('renren.html','w',encoding='utf-8') as fp:
         fp.write(page_text)
    View Code

    二 利用线程池爬取数据

    import requests
    import re
    from lxml import etree
    from multiprocessing.dummy import Pool
    import random
     # Thread pool used to download and save the videos concurrently.
     # getVideoData and saveVideo must already be defined when this code runs.
     pool = Pool(5)
     url = 'https://www.pearvideo.com/category_1'
     headers = {
         'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
     }
     page_text = requests.get(url=url,headers=headers).text
     tree = etree.HTML(page_text)
     li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

     video_url_list = []
     for li in li_list:
         hrefs = li.xpath('./div/a/@href')
         if not hrefs:
             # List item without a detail link: nothing to download.
             continue
         detail_url = 'https://www.pearvideo.com/'+hrefs[0]
         detail_page = requests.get(url=detail_url,headers=headers).text
         # The video address lives in inline JavaScript, not in a tag, so it
         # is pulled out with a regular expression.
         matches = re.findall(r'srcUrl="(.*?)",vdoUrl',detail_page,re.S)
         if not matches:
             # Page layout differs; skip it instead of aborting the crawl.
             continue
         video_url = matches[0]
         video_url_list.append(video_url)

     try:
         video_data_list = pool.map(getVideoData,video_url_list)

         pool.map(saveVideo,video_data_list)
     finally:
         # Release the worker threads once all work has been submitted.
         pool.close()
         pool.join()

    由于我们要获取的视频链接不在标签里,而是在js代码中,因此只能通过正则表达式来获取

    通过传给线程池 pool.map 的函数来下载和保存列表里的视频(注意:这两个函数必须定义在上面调用 pool.map 的代码之前)

    def getVideoData(url):
        """Fetch *url* with the module-level ``headers`` and return the raw body bytes."""
        response = requests.get(url=url, headers=headers)
        return response.content
    
    
    def saveVideo(data):
        """Write the video bytes *data* to a new, uniquely named ``.mp4`` file.

        The file is created in the current working directory.  A UUID-based
        name is used because a small random integer range produces duplicate
        names, which would make one video silently overwrite another.
        """
        import uuid

        fileName = uuid.uuid4().hex + '.mp4'
        with open(fileName, 'wb') as fp:
            fp.write(data)
  • 相关阅读:
    合并果子
    在线最小值问题
    沙盒机制(sandBox)
    简单地址簿?
    浅拷贝、深拷贝
    NSFileManager、NSFileHandle
    NSDate、NSCalendar、NSDateFormatter
    归档
    类目、延展、协议
    动态类型
  • 原文地址:https://www.cnblogs.com/liaopeng123/p/10452827.html
Copyright © 2011-2022 走看看