zoukankan      html  css  js  c++  java
  • python爬虫:使用账号、密码和验证码登录知乎网页

    先上代码,后分析出现的问题:

     1 #coding:utf-8
     2 import re
     3 from bs4 import BeautifulSoup
     4 import gzip
     5 import urllib.request
     6 import urllib.parse
     7 import http.cookiejar
     8 import ssl
     9 import time
    10 
    def get_opener(heads):
        """Build a urllib opener that keeps cookies and sends fixed headers.

        Args:
            heads: Mapping of HTTP header name to header value. Every pair is
                installed as a default header on the returned opener.

        Returns:
            An opener built with ``urllib.request.build_opener`` around an
            ``HTTPCookieProcessor`` that uses a fresh in-memory ``CookieJar``.
        """
        cookie_jar = http.cookiejar.CookieJar()
        cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
        opener = urllib.request.build_opener(cookie_handler)
        # addheaders is a list of (name, value) tuples; assigning it replaces
        # the opener's default header list.
        opener.addheaders = list(heads.items())
        return opener
    20 
    def ungzip(data):
        """Return *data* gunzipped, or unchanged when it is not valid gzip.

        Args:
            data: Raw response body as bytes.

        Returns:
            The decompressed bytes when *data* is a gzip stream; otherwise the
            original *data* object, untouched.
        """
        # Imported locally because the file's import header does not pull in zlib.
        import zlib

        print("正在解压....")
        try:
            unpacked = gzip.decompress(data)
        except (OSError, EOFError, zlib.error):
            # OSError (incl. gzip.BadGzipFile): payload is not gzip at all.
            # EOFError / zlib.error: truncated or corrupt gzip stream.
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            print("无需解压")
            return data
        print("解压完成")
        return unpacked
    29 
    if __name__ == "__main__":
        # Script entry point: load the Zhihu home page to obtain the _xsrf
        # token, download a captcha image for manual solving, then POST the
        # phone-number login form and report the result.
        import json  # local import: only this block parses the JSON reply

        # NOTE(review): this turns off HTTPS certificate verification for the
        # whole process (it swaps ssl's default context factory). It works
        # around CERTIFICATE_VERIFY_FAILED but removes protection against
        # man-in-the-middle attacks; do not use outside of experiments.
        ssl._create_default_https_context = ssl._create_unverified_context

        heads = {
            "Accept": "text/html, application/xhtml+xml, */*",
            "Accept-Language": "zh-CN",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.zhihu.com",
            "DNT": "1",
            "Connection": "Keep-Alive",
        }
        # One opener (and therefore one cookie jar) is reused for every
        # request below: home page, captcha image and login POST.
        opener = get_opener(heads)

        url = "https://www.zhihu.com/"
        with opener.open(url) as home_resp:
            data1 = ungzip(home_resp.read()).decode('utf-8')

        # The first hidden <input> on the page is taken to be the _xsrf field.
        soup = BeautifulSoup(data1, "html.parser")
        xsrf_input = soup.find("input", {'type': 'hidden'})
        if xsrf_input is None:
            # Previously this surfaced as an AttributeError on None.
            raise SystemExit("no hidden <input> found on the home page; cannot read _xsrf")
        _xsrf = xsrf_input.get("value")

        # NOTE(review): credentials are hard-coded in source; move them to a
        # prompt or environment variables before sharing this script.
        password = "hzc19911005"
        phone_num = "13267243809"

        # The "r" query parameter is the current time in milliseconds.
        captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
        with opener.open(captcha_url) as captcha_resp:
            captchadata = captcha_resp.read()
        # Save the image so a human can open it and type the captcha in.
        with open("1.gif", 'wb') as captcha_file:
            captcha_file.write(captchadata)
        yanzhengma = input("captcha:")

        # NOTE: a "captcha_type" field must not be included in this form
        # (per the original author's note).
        postdata = {
            "_xsrf": _xsrf,
            "password": password,
            "phone_num": phone_num,
            "captcha": yanzhengma,
        }
        postdata = urllib.parse.urlencode(postdata).encode()

        login_url = "https://www.zhihu.com/login/phone_num"
        with opener.open(login_url, postdata) as login_resp:
            login_data = login_resp.read()
        data = ungzip(login_data).decode("utf-8")
        print(data)

        # The reply is JSON; parse it as JSON instead of eval()-ing text that
        # came from the network (eval is unsafe and also fails on JSON
        # literals such as true/false/null).
        result = json.loads(data)
        if result["r"] == 0:
            print("登录成功")
    81     

    1、出现“SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)”:

    Python 2.7.9 之后版本引入了一个新特性

    当你用 urllib.urlopen 打开一个 https 链接时,会验证一次 SSL 证书;

    当目标使用的是自签名的证书时,就会抛出一个

    urllib.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)> 的错误消息,

    处理方法(注意:这会在整个进程内全局关闭 SSL 证书校验,存在中间人攻击风险,仅适合测试时使用):

    import ssl 
    ssl._create_default_https_context = ssl._create_unverified_context  

    2、出现验证码错误(验证码会话无效或已过期),服务器返回:{ "r": 1, "errcode": 1991829, "data": {"captcha":"验证码回话无效 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"}, "msg": "验证码回话无效 :(" },可能的原因及解决办法:

    1. 发给服务器的 POST 数据中没有带验证码字段 "captcha"。解决办法:在 postdata 中加入该字段:postdata={
              "_xsrf":_xsrf,
              "password":password,
              #"captcha_type":captcha_type,#不能带有这个字段
              "phone_num":phone_num,
              "captcha":yanzhengma
              }
    2. 验证码过期,解决办法:先用同一个 opener(保证验证码请求与登录请求共用同一份 cookie/会话)从url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000)下载图片保存在本地,然后人工识别,手动输入验证码:
    1 captcha_url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000)
    2 captchadata=opener.open(captcha_url).read()
    3 with open("1.gif",'wb') as file:
    4       file.write(captchadata)
    5 yanzhengma=input("captcha:")

     

  • 相关阅读:
    MyBatis总结六:resultMap详解(包含多表查询)
    MyBatis总结五:#{}和${}的用法和区别
    MyBatis总结四:配置文件xml详解
    MyBatis使用动态代理报 invalid bound statement (not found) 错
    MyBatis总结三:使用动态代理实现dao接口
    MyBatis总结二:增删改查
    session详解&和cookie的区别
    cookie详解
    C#属性器Get和Set
    ORM实例介绍
  • 原文地址:https://www.cnblogs.com/yizhenfeng168/p/6972876.html
Copyright © 2011-2022 走看看