zoukankan      html  css  js  c++  java
  • 知乎爬虫(待完善)

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    '''
    Required
    - requests (required)
    - beautifulsoup4 (required)
    - lxml (required)
    - pillow (optional)
    Info
    - author : "xchaoinfo"
    - email  : "xchaoinfo@qq.com"
    - date   : "2016.2.4"
    Update
    - name   : "wangmengcn"
    - email  : "eclipse_sv@163.com"
    - date   : "2016.4.21"
    '''
    import requests
    try:
        import cookielib
    except:
        import http.cookiejar as cookielib
    import re
    import time
    import os.path
    try:
        from PIL import Image
    except:
        pass
    from bs4 import BeautifulSoup as sb
    import lxml
    
    
    # Shared request headers, including a mobile Chrome user agent string.
    agent = (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Mobile Safari/537.36'
    )
    headers = dict([
        ("Host", "www.zhihu.com"),
        ("Referer", "https://www.zhihu.com/"),
        ("User-Agent", agent),
    ])
    
    # Create the shared session and try to reuse cookies saved by a previous
    # login in the local file 'cookies'.
    session = requests.session()
    session.cookies = cookielib.LWPCookieJar(filename='cookies')
    try:
        session.cookies.load(ignore_discard=True)
    except (IOError, OSError):
        # Missing or unreadable cookie file (LoadError is an I/O error
        # subclass): continue without saved cookies instead of aborting.
        print("Cookie 未能加载")
    else:
        print('已加载cookie...')
    
    
    def get_xsrf():
        '''Fetch the zhihu index page and return its current _xsrf token.

        The token is a dynamically changing parameter, so it is read from the
        page's ``<input name="_xsrf">`` element on every call.

        Returns:
            The ``value`` attribute of that input as a string, or None when
            the page contains no such input (a message is printed then).
        '''
        index_url = 'https://www.zhihu.com'
        index_page = session.get(index_url, headers=headers)
        soup = sb(index_page.text, 'lxml')
        # find() returns None rather than raising when nothing matches, so
        # test the result instead of wrapping the call in try/except.
        token_input = soup.find('input', attrs={'name': '_xsrf'})
        if token_input is None:
            print('找不到_xsrf...')
            return None
        # Return the token text itself; the Tag object is not usable as a
        # header or form value.
        return token_input.get('value')
    
    
    # Fetch the captcha
    def get_captcha():
        '''Download the login captcha image and ask the user to type it in.

        The image is written to ``captcha.jpg`` in the current directory. If
        pillow is available it is displayed; otherwise the file's absolute
        path is printed so the user can open it manually.

        Returns:
            The text the user entered at the prompt.
        '''
        # Millisecond timestamp passed as the ``r`` query parameter.
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
        r = session.get(captcha_url, headers=headers)
        # The with-block closes the file; no explicit close() is needed.
        with open('captcha.jpg', 'wb') as f:
            f.write(r.content)
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except Exception:
            # Covers pillow not being installed (Image undefined) as well as
            # failures to open or display the image.
            print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
        captcha = input("please input the captcha\n>")
        return captcha
    
    
    def isLogin():
        '''Return True when the profile settings page answers 200 without redirects.'''
        # The personal settings page is requested with redirects disabled and
        # only its status code is inspected.
        profile_url = "https://www.zhihu.com/settings/profile"
        response = session.get(profile_url, headers=headers, allow_redirects=False)
        return response.status_code == 200
    
    def get_data():
        '''Print the text of every ``a.question_link`` element on one user's asks page.'''
        asks_url = "https://www.zhihu.com/people/yang-an-yang-57/asks"
        page = session.get(asks_url, headers=headers)
        parsed = sb(page.text, "lxml")

        for link in parsed.find_all('a', class_="question_link"):
            print(link.text)
    
    
    def login(secret, account):
        '''Log in to zhihu with a phone number or an e-mail address.

        The first attempt is posted without a captcha. If the JSON reply has
        ``r == 1`` the attempt is repeated with a captcha typed in by the
        user, and the reply's ``msg`` is printed. The session cookies are then
        saved to the cookie file so later runs can skip the login.

        Args:
            secret: The account password.
            account: An 11-digit phone number starting with 1, or an e-mail
                address (anything containing "@").

        Returns:
            0 when ``account`` is neither a phone number nor an e-mail
            address; otherwise None.
        '''
        _xsrf = get_xsrf()
        headers["X-Xsrftoken"] = _xsrf
        headers["X-Requested-With"] = "XMLHttpRequest"
        # Decide from the account's shape whether it is a phone number.
        if re.match(r"^1\d{10}$", account):
            print("手机号登录 \n")
            post_url = 'https://www.zhihu.com/login/phone_num'
            account_field = 'phone_num'
        elif "@" in account:
            print("邮箱登录 \n")
            post_url = 'https://www.zhihu.com/login/email'
            account_field = 'email'
        else:
            print("你的账号输入有问题,请重新登录")
            return 0
        postdata = {
            '_xsrf': _xsrf,
            'password': secret,
            account_field: account
        }
        # First try to log in without a captcha.
        login_page = session.post(post_url, data=postdata, headers=headers)
        login_code = login_page.json()
        if login_code['r'] == 1:
            # The captcha-free attempt failed: retry with a captcha.
            postdata["captcha"] = get_captcha()
            login_page = session.post(post_url, data=postdata, headers=headers)
            login_code = login_page.json()
            print(login_code['msg'])
        # Persist the cookies so the next run can log in from the cookie file
        # without asking for the account and password.
        session.cookies.save()
    
    # Python 2 compatibility: rebind input to raw_input where it exists.
    try:
        input = raw_input
    except NameError:
        # raw_input is not defined (Python 3): keep the built-in input.
        pass
    
    
    if __name__ == '__main__':
        # Log in interactively only when the current session is not already
        # logged in, then print the scraped data.
        if isLogin():
            print('您已经登录')
        else:
            account = input('请输入你的用户名\n>  ')
            secret = input("请输入你的密码\n>  ")
            login(secret, account)
        get_data()
  • 相关阅读:
    Splunk Fundamentals 2 – Lab Exercises
    Splunk Fundamentals 1 Lab Exercises
    python交换机自动化巡检
    nginx web服务器
    linux tips
    web服务器统计情况
    HTTP请求报文和响应报文
    python 之路 day 14 HTML CSS
    python 之路 13 ORM SQLAlchemy
    python 之路12 RabbitMQ Python 操作mysql
  • 原文地址:https://www.cnblogs.com/peter1994/p/7444510.html
Copyright © 2011-2022 走看看