zoukankan      html  css  js  c++  java
  • Ajax数据爬取

        Ajax 即“Asynchronous JavaScript And XML”(异步 JavaScript 和 XML),是一种用于创建交互式、快速动态网页应用的网页开发技术,能够在无需重新加载整个网页的情况下更新部分网页内容。

        通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新。这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新。

    1. 爬取微博页面Ajax数据

    import requests
    from urllib.parse import urlencode
    from pyquery import PyQuery as pq
    import json, pymongo
    
    def get_ajax_page(page, timeout=10):
        """Fetch one page of the Weibo comment Ajax endpoint and decode its JSON.

        Args:
            page: Page number, sent as the ``page`` query parameter.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded JSON payload, or ``None`` when the request fails, the
            status code is not 200, or the body is not valid JSON.
        """
        # NOTE(review): the Cookie below is a hard-coded session value; it
        # presumably expires -- confirm and consider loading it from config.
        headers = {
            'Host': 'weibo.com',
            'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567'
        }
        url = 'https://weibo.com/aj/v6/comment/big'
        params = {
            'ajwvr': '6',
            'id': '4483557667874538',
            'root_comment_max_id_type': '0',
            'page': page,
        }
        try:
            # Without a timeout, requests can block forever on a stalled server.
            response = requests.get(url=url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            # RequestException covers ConnectionError as well as the timeouts
            # made possible by the timeout argument above.
            print('error', e.args)
            return None

        if response.status_code != 200:
            return None

        try:
            return response.json()
        except ValueError as e:
            # A 200 response may still carry a non-JSON body.
            print('error', e.args)
            return None
    
    
    def parse_page(js):
        """Yield one dict per comment found in an Ajax response payload.

        Args:
            js: Decoded JSON payload (a dict) whose ``data`` -> ``html`` entry
                holds the comment markup, or ``None`` when the fetch failed.

        Yields:
            dict with the keys ``name``, ``content`` and ``datetime``.
        """
        # Tolerate a failed fetch (None) or a payload without data/html
        # instead of raising AttributeError.
        html = ((js or {}).get('data') or {}).get('html')
        if not html:
            return

        doc = pq(html)
        for item in doc('div.list_con').items():
            text = item('.WB_text').text()
            # The original called split(''), which always raises
            # "ValueError: empty separator". Presumably the separator was the
            # full-width colon between author and comment text -- TODO confirm
            # against the live markup. partition() also keeps any later colons
            # inside the content and yields '' when no separator is present.
            name, _, content = text.partition(':')
            yield {
                'name': name,
                'content': content,
                'datetime': item('div.WB_from.S_txt2').text(),
            }
    
    def collection_mongo(host='localhost', port=27017):
        """Build and return a ``pymongo.MongoClient`` for ``host``:``port``."""
        return pymongo.MongoClient(host=host, port=port)
    
    
    def save_mongo(client, data):
        """Insert one document into the ``weibo`` collection of the ``weibo`` database.

        Args:
            client: Client object exposing the ``weibo`` database as an attribute.
            data: The document (dict) to store.
        """
        collection = client.weibo.weibo

        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        if collection.insert_one(data).acknowledged:
            print('Save to mongo')
    
    def search_mongo(client):
        """Return the result of an unfiltered ``find()`` on ``client.weibo.weibo``."""
        return client.weibo.weibo.find()
    
    def main():
        """Fetch comment pages 1-10, parse them and store every comment in MongoDB."""
        # One client for the whole run: the original opened a new MongoClient
        # for every single comment and never closed any of them.
        client = collection_mongo('10.0.0.100')
        try:
            for i in range(1, 11):
                js = get_ajax_page(str(i))
                if js is None:
                    # get_ajax_page returns None when the request fails or the
                    # status is not 200; skip that page instead of crashing.
                    continue
                for result in parse_page(js):
                    save_mongo(client, result)
        finally:
            client.close()
    
    if __name__ == '__main__':
        # main()  # uncomment to crawl and store the comments first
        # Dump every stored document to stdout.
        for item in search_mongo(collection_mongo('10.0.0.100')):
            print(item)
    View Code

    2. Ajax爬取头条街拍图片

    import requests
    from urllib.parse import urlencode
    from pyquery import PyQuery as pq
    import json, pymongo
    
    def get_ajax_page(page, timeout=10):
        """Fetch one page of the Weibo comment Ajax endpoint and decode its JSON.

        Args:
            page: Page number, sent as the ``page`` query parameter.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded JSON payload, or ``None`` when the request fails, the
            status code is not 200, or the body is not valid JSON.
        """
        # NOTE(review): the Cookie below is a hard-coded session value; it
        # presumably expires -- confirm and consider loading it from config.
        headers = {
            'Host': 'weibo.com',
            'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; _s_tentry=zz.253.com; Apache=6823476113810.396.1584424118910; ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; login_sid_t=8687f896b60dd07aaa80e41d83159a89; cross_origin_proto=SSL; Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; wb_view_log=1366*7681; UOR=zz.253.com,widget.weibo.com,www.baidu.com; YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567'
        }
        url = 'https://weibo.com/aj/v6/comment/big'
        params = {
            'ajwvr': '6',
            'id': '4483557667874538',
            'root_comment_max_id_type': '0',
            'page': page,
        }
        try:
            # Without a timeout, requests can block forever on a stalled server.
            response = requests.get(url=url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            # RequestException covers ConnectionError as well as the timeouts
            # made possible by the timeout argument above.
            print('error', e.args)
            return None

        if response.status_code != 200:
            return None

        try:
            return response.json()
        except ValueError as e:
            # A 200 response may still carry a non-JSON body.
            print('error', e.args)
            return None
    
    
    def parse_page(js):
        """Yield one dict per comment found in an Ajax response payload.

        Args:
            js: Decoded JSON payload (a dict) whose ``data`` -> ``html`` entry
                holds the comment markup, or ``None`` when the fetch failed.

        Yields:
            dict with the keys ``name``, ``content`` and ``datetime``.
        """
        # Tolerate a failed fetch (None) or a payload without data/html
        # instead of raising AttributeError.
        html = ((js or {}).get('data') or {}).get('html')
        if not html:
            return

        doc = pq(html)
        for item in doc('div.list_con').items():
            text = item('.WB_text').text()
            # The original called split(''), which always raises
            # "ValueError: empty separator". Presumably the separator was the
            # full-width colon between author and comment text -- TODO confirm
            # against the live markup. partition() also keeps any later colons
            # inside the content and yields '' when no separator is present.
            name, _, content = text.partition(':')
            yield {
                'name': name,
                'content': content,
                'datetime': item('div.WB_from.S_txt2').text(),
            }
    
    def collection_mongo(host='localhost', port=27017):
        """Build and return a ``pymongo.MongoClient`` for ``host``:``port``."""
        return pymongo.MongoClient(host=host, port=port)
    
    
    def save_mongo(client, data):
        """Insert one document into the ``weibo`` collection of the ``weibo`` database.

        Args:
            client: Client object exposing the ``weibo`` database as an attribute.
            data: The document (dict) to store.
        """
        collection = client.weibo.weibo

        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        if collection.insert_one(data).acknowledged:
            print('Save to mongo')
    
    def search_mongo(client):
        """Return the result of an unfiltered ``find()`` on ``client.weibo.weibo``."""
        return client.weibo.weibo.find()
    
    def main():
        """Fetch comment pages 1-10, parse them and store every comment in MongoDB."""
        # One client for the whole run: the original opened a new MongoClient
        # for every single comment and never closed any of them.
        client = collection_mongo('10.0.0.100')
        try:
            for i in range(1, 11):
                js = get_ajax_page(str(i))
                if js is None:
                    # get_ajax_page returns None when the request fails or the
                    # status is not 200; skip that page instead of crashing.
                    continue
                for result in parse_page(js):
                    save_mongo(client, result)
        finally:
            client.close()
    
    if __name__ == '__main__':
        # main()  # uncomment to crawl and store the comments first
        # Dump every stored document to stdout.
        for item in search_mongo(collection_mongo('10.0.0.100')):
            print(item)
    View Code
  • 相关阅读:
    关于window7下的apache+php+mysql的配置
    pysam
    pysam读取bam files[转载]
    曼哈顿图[转载]
    关于在shell中直接修改文件名
    Linux怎样将文本行倒序排列
    用Annovar注释非人类基因组,如小鼠mm9
    shell中的##*,%%*问题
    BEAMing技术
    Annovar注释说明【转载自http://blog.csdn.net/u013816205/article/details/51262289】
  • 原文地址:https://www.cnblogs.com/Caiyundo/p/12554341.html
Copyright © 2011-2022 走看看