zoukankan      html  css  js  c++  java
  • 爬虫实现模拟登陆豆瓣

    一:先获取登录页面并解析出验证码,然后手动输入验证码来模拟登录(相当于手动模拟登录)

    # -*- coding: utf-8 -*-
    import requests
    from HTMLParser import HTMLParser
    
    
    class DoubanClient(object):
        """Minimal Douban client that shares one ``requests`` session across calls.

        Every request made through the session carries a browser-like
        ``User-Agent`` and an ``origin`` header.
        """

        def __init__(self):
            object.__init__(self)
            self.session = requests.session()
            self.session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36',
                'origin': 'http://www.douban.com',
            })

        def login(self, username, password,
                  source='index_nav',
                  redir='http://www.douban.com/',
                  login='登录'):
            """Submit the Douban login form, prompting for a captcha if one is shown.

            The login page is fetched first and scanned for a captcha. When a
            captcha id is found, the captcha URL is shown on the console and the
            solution typed by the user is sent along with the form.

            :param username: value sent as the ``form_email`` field.
            :param password: value sent as the ``form_password`` field.
            :param source: value sent as the ``source`` field.
            :param redir: value sent as the ``redir`` field.
            :param login: value sent as the ``login`` field.

            Prints the session's cookie items after the form has been posted.
            """
            login_url = 'https://www.douban.com/accounts/login'

            # Load the login page so any embedded captcha can be detected.
            page = self.session.get(login_url)
            captcha_id, captcha_url = _get_captcha(page.content)

            form = {
                'form_email': username,
                'form_password': password,
                'source': source,
                'redir': redir,
                'login': login,
            }
            if captcha_id:
                # The user has to open the captcha URL and type what it shows.
                answer = raw_input('please input solution for captcha [%s]:' % captcha_url)
                form['captcha-id'] = captcha_id
                form['captcha-solution'] = answer

            # NOTE(review): the 'host' header names accounts.douban.com although
            # the POST goes to www.douban.com -- confirm this is intended.
            extra_headers = {
                'referer': 'http://www.douban.com/accounts/login?source=main',
                'host': 'accounts.douban.com',
            }
            self.session.post(login_url, data=form, headers=extra_headers)
            print(self.session.cookies.items())

        def edit_signature(self, username, signature):
            """Post a new signature for ``username`` and print the raw reply.

            The profile page is fetched first to read the hidden ``ck`` form
            value, which is then sent together with the new signature.

            :param username: name used to build the profile and edit URLs.
            :param signature: value sent as the ``signature`` field.
            """
            profile = self.session.get('https://www.douban.com/people/%s/' % username)
            form = {'ck': _get_ck(profile.content), 'signature': signature}

            endpoint = 'https://www.douban.com/j/people/%s/edit_signature' % username
            ajax_headers = {
                'referer': endpoint,
                'host': 'www.douban.com',
                'x-requested-with': 'XMLHttpRequest',
            }
            reply = self.session.post(endpoint, data=form, headers=ajax_headers)
            print(reply.content)
    
    
    def _attr(attrs, attrname):
        for attr in attrs:
            if attr[0] == attrname:
                return attr[1]
        return None
    
    
    def _get_captcha(content):
        """Scan login-page HTML for the captcha id and captcha image URL.

        Returns a ``(captcha_id, captcha_url)`` tuple. An element is ``None``
        when no matching tag was seen; if several tags match, the last one wins.
        """

        class _CaptchaScanner(HTMLParser):
            """Records the captcha image ``src`` and the hidden ``captcha-id`` value."""

            def __init__(self):
                HTMLParser.__init__(self)
                self.captcha_url = None
                self.captcha_id = None

            def handle_starttag(self, tag, attrs):
                if tag == 'img':
                    # The captcha image carries 'captcha_image' as both id and class.
                    if _attr(attrs, 'id') == 'captcha_image' and _attr(attrs, 'class') == 'captcha_image':
                        self.captcha_url = _attr(attrs, 'src')
                elif tag == 'input':
                    # The captcha id travels in a hidden form field.
                    if _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'captcha-id':
                        self.captcha_id = _attr(attrs, 'value')

        scanner = _CaptchaScanner()
        scanner.feed(content)
        return scanner.captcha_id, scanner.captcha_url
    
    
    def _get_ck(content):
        """Return the value of the hidden ``ck`` input found in ``content``.

        ``None`` is returned when no such input was seen; if several inputs
        match, the value of the last one is returned.
        """

        class _CkScanner(HTMLParser):
            """Records the value of ``<input type="hidden" name="ck">``."""

            def __init__(self):
                HTMLParser.__init__(self)
                self.ck = None

            def handle_starttag(self, tag, attrs):
                if tag != 'input':
                    return
                if _attr(attrs, 'type') != 'hidden':
                    return
                if _attr(attrs, 'name') == 'ck':
                    self.ck = _attr(attrs, 'value')

        scanner = _CkScanner()
        scanner.feed(content)
        return scanner.ck
    
    
    if __name__ == '__main__':
        # Demo run with placeholder credentials: log in, then set a signature.
        client = DoubanClient()
        client.login('username@douban.com', 'password@douban.com')
        client.edit_signature('username', 'python 爬虫基础')

    二:需要先在浏览器中登录一次,获得登录后的 cookie,然后粘贴到代码中(cookie 会过期,只能维持一小段时间)

    #coding=gbk
    
    import urllib2
    
    # Replace the placeholder value with the cookie string of your own
    # www.douban.com session.
    HEADERS = {"cookie": '写你的cookie'}
    url = 'http://www.douban.com/'
    req = urllib2.Request(url, headers=HEADERS)

    # Close the response explicitly so the connection is released even when
    # read() raises.
    resp = urllib2.urlopen(req)
    try:
        text = resp.read()
    finally:
        resp.close()

    # Print the fetched page; per the original author's note, this output is
    # what shows that the cookie-based login worked.
    print(text)
  • 相关阅读:
    技术专题:ROS通过TTL值来防止二层路由的最简单办法
    唉,一大早起床遇到脑残的,实在无语!QQ:124316912
    简单描述FTTH方案中EPON、GPON设置的优势、原理及城中村的解决方案
    9.9成新WAYOS、HZZ、ROS软件路由WAN扩展交换机大量到货只需450
    辅助工具:免输入命令,WAYOS通过交换机一键扩展WAN口工具
    配置文档:3COM 4200 3C17300A配置文件,可与WAYOS、ROS、海蜘蛛多WAN对接
    网站页面跳转代码大全,网站网页跳转代码
    popupWin 属性及用法介绍 ASP.NET控件,仿QQ,msn右下角弹出窗口
    IIS打开ASP文件出现Server Application Error提示的解决方法,本人亲历,成功
    教你学会提高无线网下载速度的方法
  • 原文地址:https://www.cnblogs.com/tangbinghaochi/p/6140582.html
Copyright © 2011-2022 走看看