zoukankan      html  css  js  c++  java
  • python_day05(去爬登录的豆瓣)

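    # Scraping Douban behind the login requires keeping cookies across requests.
    # Watch out for hidden form parameters, i.e. the default values carried by
    # the <input> elements of the login form.
    # You need to register your own account (username and password) first.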
    import http.cookiejar
    import urllib.parse
    import urllib.request

    from lxml import etree

    import spiderimage
    # Request headers used for every HTTP call made by this script.
    head = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) '
            'like Gecko'
        ),
    }

    # Login endpoint: fetched once to read the form, then posted to.
    url = 'https://accounts.douban.com/login'
    # Build an opener that stores cookies and re-sends them on later requests.
    def makeMyOpener(head):
        """Build a urllib opener that keeps cookies and sends fixed headers.

        Args:
            head: Mapping of HTTP header names to header values.

        Returns:
            An opener created by ``urllib.request.build_opener`` with an
            ``HTTPCookieProcessor`` bound to a fresh in-memory ``CookieJar``,
            so cookies received through this opener are reused by later
            requests made through it.  Its ``addheaders`` list is replaced
            by the ``(name, value)`` pairs of ``head``.
        """
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj)
        )
        # addheaders is a list of (name, value) tuples; assigning it replaces
        # the opener's default header list.
        opener.addheaders = list(head.items())
        return opener
    # Fetch the login page through the cookie-aware opener so that any cookies
    # it sets are sent back with the login POST further down.
    oper = makeMyOpener(head)
    with oper.open(url, timeout=1000) as uop:
        data = uop.read()
    html = data.decode()

    # Parse the page with lxml and look for the captcha image.
    selector = etree.HTML(html)
    links = selector.xpath('//img[@id="captcha_image"]/@src')
    for link in links:
        print(link)
        # NOTE(review): spiderimage is a project-local helper not shown here;
        # presumably it downloads the image so the user can read the captcha.
        spiderimage.get_image(link, head, 'captcs')

    # Form fields for the login POST.  Replace the two placeholder values
    # with your own user name and password.
    datas = {
        'source': 'index_nav',
        'redir': 'https://www.douban.com/',
        'form_email': '此处填写自己的用户名',
        'form_password': '此处填写自己的密码',
    }

    # The hidden "captcha-id" input exists only when the page shows a captcha.
    # Prompt for the solution and add the captcha fields only in that case;
    # previously a page without a captcha crashed with NameError because
    # captcha_id was never assigned.
    captcha_ids = selector.xpath('//input[@name="captcha-id"]/@value')
    if captcha_ids:
        # Use the last match, as the original loop did.
        captcha_id = captcha_ids[-1]
        captcha = input("请输入验证码:")
        print(captcha)
        datas['captcha-solution'] = captcha
        datas['captcha-id'] = captcha_id

    # Passing a data argument makes urllib send a POST request.
    data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
    with oper.open(url, data_encoded) as response:
        content = response.read()
    html = content.decode()

    # Save the page returned after login for inspection.
    spath = 'douban.html'
    with open(spath, "w", encoding='utf-8') as f:
        f.write(html)
  • 相关阅读:
    如何利用 iTunes 把 m4a/wav 文件转成 MP3 格式
    The best way to learn a programming language
    琼瑶哀悼丈夫去世
    与“芯片”相关的专业有哪些?
    君子使物,不为物使
    SRID (空间引用识别号, 坐标系)【转】
    编码
    test
    剪贴板神器:Ditto
    写Markdown博客时遇到的一些问题
  • 原文地址:https://www.cnblogs.com/qieyu/p/7828417.html
Copyright © 2011-2022 走看看