  • Scheduled scraping of Xueqiu data

    import requests
    from lxml import etree
    import json
    import pymongo
    

    # connect to MongoDB; results go into the xueqiu database/collection
    client = pymongo.MongoClient('127.0.0.1', port=27017)
    db = client.xueqiu
    collection = db.xueqiu

    url = 'https://xueqiu.com/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    session = requests.Session()
    session.get(url=url, headers=headers)  # warm up the session so it picks up the site cookies

    def get_page_list():
        url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
        params = {
            "since_id": "-1",
            "max_id": "-1",
            "count": "10",
            "category": "-1"
        }
        response = session.get(url=url, headers=headers, params=params)
        page_text = response.json()
        content = page_text["list"]

        info_dict = {}
        for x in content:
            per_info = x["data"]  # a JSON-encoded string
            per_info = json.loads(per_info)
            # print(per_info)
            id = per_info["id"]
            title = per_info["title"]
            description = per_info["description"]
            target = per_info["target"]
            detail_url = "https://xueqiu.com" + target
            info_dict['id'] = id
            info_dict['title'] = title
            info_dict['detail_url'] = detail_url
            parse_detail(detail_url)
            # break

    def parse_detail(url):
        response = session.get(url=url, headers=headers)
        page_text = response.text
        tree = etree.HTML(page_text)
        title = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
        print(title)
        print("==" * 20)
        data_dict = {}
        data_dict["title"] = title
        p_list = tree.xpath('//div[@class="article__bd__detail"]/p')
        content_list = []
        for p in p_list:
            content = p.xpath('./text()|./b/text()')
            content = "".join(content).strip()
            # print(content)
            if len(content) > 0:
                content_list.append(content)
        content_str = "".join(content_list)
        data_dict["content"] = content_str
        # print(data_dict)
        collection.insert_one(data_dict)

    def main():
        get_page_list()

    if __name__ == '__main__':
        main()
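    To sanity-check what the crawler wrote, the stored documents can be inspected directly. Below is a minimal sketch (my addition, not part of the original post) that assumes the same local MongoDB instance and the xueqiu database/collection created above:

    import pymongo

    client = pymongo.MongoClient('127.0.0.1', port=27017)
    collection = client.xueqiu.xueqiu

    # count the stored articles and peek at a few titles
    print(collection.count_documents({}))
    for doc in collection.find({}, {"title": 1, "_id": 0}).limit(5):
        print(doc)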

    Optimized into Redis-based incremental fetching (only URLs not seen before are crawled):

    import requests
    from lxml import etree
    import json
    from redis import Redis
    import pymongo
    import time
    import datetime
    
    client = pymongo.MongoClient('127.0.0.1', port=27017)
    db = client.xueqiu
    collection = db.xueqiu
    
    conn = Redis(host='127.0.0.1',port=6379)
    
    
    url = 'https://xueqiu.com/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    session = requests.Session()
    session.get(url=url,headers=headers)
    
    def get_page_list():
        url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
        params = {
            "since_id": "-1",
            "max_id": "-1",
            "count": "10",  # fetch 10 items per request
            "category": "-1"
        }
        response = session.get(url=url,headers=headers,params=params)
        page_text = response.json()
        content = page_text["list"]
    
        info_dict = {}
        for x in content:
            per_info = x["data"]  # a JSON-encoded string
            per_info = json.loads(per_info)
            # print(per_info)
            id = per_info["id"]
            title = per_info["title"]
            description = per_info["description"]
            target = per_info["target"]
            detail_url = "https://xueqiu.com"+target
    
            # URL dedup: sadd returns 0 when the URL is already in the Redis set
            ex = conn.sadd('news_urls', detail_url)
            if ex == 0:
                print('No new data to crawl yet......')
            else:
                print('New data found......')
                info_dict['id'] = id
                info_dict['title'] = title
                info_dict['detail_url'] = detail_url
                parse_detail(detail_url)
            # break

    def parse_detail(url):
        response = session.get(url=url, headers=headers)
        page_text = response.text
        tree = etree.HTML(page_text)
        title = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
        print(title)
        print("==" * 20)
        data_dict = {}
        data_dict["title"] = title
        p_list = tree.xpath('//div[@class="article__bd__detail"]/p')
        content_list = []
        for p in p_list:
            content = p.xpath('./text()|./b/text()')
            content = "".join(content).strip()
            # print(content)
            if len(content) > 0:
                content_list.append(content)
        content_str = "".join(content_list)
        data_dict["content"] = content_str
        # print(data_dict)
        collection.insert_one(data_dict)

    def main():
        flag = 0
        now = datetime.datetime.now()
        # the first crawl is scheduled 6 seconds after startup
        sched_time = datetime.datetime(now.year, now.month, now.day, now.hour,
                                       now.minute, now.second) + datetime.timedelta(seconds=6)
        while True:
            now = datetime.datetime.now()
            # controls how often to crawl; once sched_time has passed,
            # the loop effectively sleeps 300 seconds between crawls
            if sched_time < now:
                time.sleep(300)
                print(now)
                get_page_list()
                flag = 1
            else:
                if flag == 1:
                    sched_time = sched_time + datetime.timedelta(minutes=1)
                    flag = 0

    if __name__ == '__main__':
        main()
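    The incremental behavior hinges on Redis set semantics: sadd returns the number of members actually added, so it yields 1 the first time a URL is seen and 0 on every later attempt, which is what lets the crawler skip pages it has already stored. A standalone sketch of the idea (my addition; it assumes only a local Redis instance and reuses the news_urls key from above):

    from redis import Redis

    conn = Redis(host='127.0.0.1', port=6379)

    def is_new(url):
        # sadd returns 1 when the member was added, 0 when it already existed
        return conn.sadd('news_urls', url) == 1

    print(is_new('https://xueqiu.com/demo/1'))  # True on first sight
    print(is_new('https://xueqiu.com/demo/1'))  # False from then on

    Note that the hand-rolled loop in main() crawls roughly every 300 seconds once the initial sched_time has passed; a job scheduler library could replace it, but the plain loop keeps the example dependency-free.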
  • Original post: https://www.cnblogs.com/kenD/p/11123726.html