zoukankan      html  css  js  c++  java
  • 一个可以获取知乎timeline的爬虫

    # -*- coding: utf-8 -*-
    import requests
    import lxml
    import os,time
    from bs4 import BeautifulSoup as sb
    try:
        import cookielib
    
    except:
        import http.cookiejar as cookielib
    import json
    
    # Default HTTP headers sent with every request to the Zhihu API.
    #
    # The bearer token may be supplied through the ZHIHU_AUTHORIZATION
    # environment variable so that a fresh credential does not require editing
    # this file; the historical hard-coded value is kept only as a
    # backward-compatible fallback.
    # NOTE(review): a real token committed to source is a credential leak --
    # rotate it and drop the fallback once callers set the environment variable.
    headers = {
        "Host": "www.zhihu.com",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "accept": "application/json, text/plain, */*",
        "Referer": "https://www.zhihu.com/",
        "Connection": "keep-alive",
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36',
        "authorization": os.environ.get(
            "ZHIHU_AUTHORIZATION",
            "Bearer Mi4xUXJGd0FBQUFBQUFBa0VKNTBfbnVDeGNBQUFCaEFsVk5OQmZMV1FCVnQ3aEhfeUVsUElGN1Zrd3RSSWpMdHI0ZG5B|1503889972|a235d0e24d646c5df6b1f667abc005381c273870",
        ),
    }
    
    def get_session():
        """Create a requests session backed by an on-disk LWP cookie jar.

        Cookies are loaded from the file named "cookies" (relative to the
        current working directory). Failing to load them is not fatal: a
        message is printed and the session is returned with whatever the jar
        holds at that point.

        Returns:
            The ``requests`` session with its cookie jar replaced by the
            ``LWPCookieJar``.
        """
        session = requests.Session()
        session.cookies = cookielib.LWPCookieJar(filename="cookies")
        try:
            session.cookies.load()
        except (IOError, OSError, cookielib.LoadError):
            # Only a missing, unreadable or malformed cookie file is expected
            # here; a bare except would also hide KeyboardInterrupt and bugs.
            print("cookie 无法加载...")
        else:
            print("cookie 加载成功!")
        return session
    
    # Module-level HTTP session, created once when the script is loaded and
    # used by get_data() for its requests.
    session = get_session()
    
    # Request parameters for the feed endpoint. "after_id" is reassigned for
    # each page by the driver loop further down.
    # NOTE(review): this literal used to contain "action" twice ("True", then
    # "down"); only the last value ever took effect, so the dead entry was
    # removed. If a separate flag was intended it needs its own key -- confirm
    # against the API.
    data = {
        "action": "down",
        "limit": "10",
        "session_token": "c9c3581148b6d633275ba5d4412d3bd8",
        "after_id": "0",
        "desktop": "true",
    }
    
    def get_data():
        """Fetch one page of the top-story feed and print every entry.

        Uses the module-level ``session``, ``headers`` and ``data`` objects.
        ``data`` is sent as the URL query string of the GET request. For each
        item in the response's ``data`` list, the question title and then the
        answer content are printed; when either cannot be found, a fallback
        message followed by the raw item is printed instead. The module-level
        ``count`` is incremented once per item.

        No error handling is applied to the request itself: a response that is
        not JSON, or that lacks a top-level ``data`` key, propagates an
        exception to the caller.
        """
        global count
        # `params=` puts the paging fields in the query string; `data=` would
        # place them in the request body of the GET instead.
        res = session.get("https://www.zhihu.com/api/v3/feed/topstory", params=data, headers=headers)
        # Named `payload` so the imported `json` module is not shadowed.
        payload = res.json()
        for item in payload['data']:
            # Look up first, print afterwards, so only a missing or mistyped
            # field triggers the fallback message.
            try:
                title = item['target']['question']['title']
            except (LookupError, TypeError):
                print('没有问题了'+str(item))
            else:
                print(title)
            try:
                content = item['target']['content']
            except (LookupError, TypeError):
                print('找不到答案了'+str(item))
            else:
                print(content)
            count += 1
            print()
    # Running total of feed items printed; get_data() bumps it via `global`.
    count = 0

    # Request five pages, advancing "after_id" in steps of ten and pausing
    # three seconds after each request.
    for n in range(5):
        data.update(after_id=n * 10)
        get_data()
        time.sleep(3)

    # Report how many items were processed overall.
    print(count)
  • 相关阅读:
    ural(Timus) 1019 Line Painting
    ACMICPC Live Archive 2031 Dance Dance Revolution
    poj 3321 Apple Tree
    其他OJ 树型DP 选课
    poj 3548 Restoring the digits
    ACMICPC Live Archive 3031 Cable TV Network
    递归循环获取指定节点下面的所有子节点
    手动触发asp.net页面验证控件事件
    子级Repeater获取父级Repeater绑定项的值
    没有列名的数据绑定
  • 原文地址:https://www.cnblogs.com/peter1994/p/7449751.html
Copyright © 2011-2022 走看看