zoukankan      html  css  js  c++  java
  • 去哪儿

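    '''
    Crawl free-travel (self-guided tour) data for every city on Qunar.
    Crawling too fast makes the server start returning errors.
    Suggested counter-measures (not implemented in this script): use a pool of
    cookies and a pool of IP proxies, and lengthen the sleep between requests.
    '''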
    import requests
    import time
    from urllib.parse import quote
    from multiprocessing import Pool
    import pymongo
    import random
    
    def begin():
        """
        Yield the departure cities published by Qunar.

        Requests the ``depCities.qunar`` endpoint, decodes the JSON body and
        walks every group found under its ``data`` entry, yielding each
        element of each group in turn.

        :return: generator of departure cities
        """
        depurl = 'https://touch.dujia.qunar.com/depCities.qunar'
        payload = requests.get(depurl).json()
        groups = payload['data']
        for group_key in groups:
            # Each group holds a collection of departure cities.
            yield from groups[group_key]
    
    def main(dep):
        """
        Collect the destinations recommended for one departure city and crawl them.

        The recommendation endpoint is queried for ``dep``; every ``query``
        value found under ``data -> subModules -> items`` is gathered, with
        duplicates dropped while keeping first-seen order.  The resulting
        list is handed to ``get`` together with ``dep``.

        :param dep: departure city
        :return: None (results are processed by ``get``)
        """
        # Non-ASCII (Chinese) city names have to be percent-encoded for the URL.
        encoded_dep = quote(dep)
        desurl = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(
            encoded_dep)
        # Random 2-4 second pause before each request to slow the crawl down.
        time.sleep(random.randint(2, 4))
        payload = requests.get(desurl).json()

        recommended = (
            entry['query']
            for module in payload['data']
            for sub_module in module['subModules']
            for entry in sub_module['items']
        )

        destinations = []
        for name in recommended:
            # The same destination can appear in several modules; keep it once.
            if name in destinations:
                continue
            destinations.append(name)

        get(destinations, dep)
    
    def get(array,dep):
        """
        Fetch the free-travel search results from ``dep`` to each destination
        in ``array``, print them and store them via ``savetomongo``.

        For every destination a first request reads the total route count;
        the result list is then requested page by page (28 routes per page)
        and the first entry of each page is saved.

        If anything goes wrong while paging (missing JSON keys, empty result
        list, request or storage error), the error is printed and the function
        returns, skipping the remaining destinations as the original code did.

        :param array: list of destination cities
        :param dep: departure city
        :return: None
        """
        for item in array:
            # Browser-like headers (cookie, user agent, referer) to get past anti-crawling checks.
            headers = {
                'cookie':'QN99=8770; QN1=eIQjmVtYQgbBDaEiPevvAg==; csrfToken=zKMVroGqYK6fdBphXg8rqQ3MpcaiZ7TZ; QN269=AA9586A58FEC11E88A24FA163E233FC1; QN601=3f55b4673bbd18ac3206bfea7c5996d3; QunarGlobal=10.86.213.148_6291bf49_164d0ba9dbf_-1a4d|1532510727219; _i=RBTKSaIAM3KBlurx6OwRjfuQ8pEx; QN300=auto_4e0d874a; QN163=0; QN6=auto_4e0d874a; QN48=tc_427b9f2555dccb4c_164d9787381_d960; _RSG=Ue4lzWGVuXAKnGpozKI.OB; _RDG=28c738c8ddc979203b2642a9f86b2ac273; _RGUID=a8787d08-3dbc-4a1e-b63e-494f72cd0c54; QN205=auto_4e0d874a; QN234=home_free_t; _vi=Xan8_FldA2NGBwqzRSKDNIYHisxd4ARxiomsg1mowQsC4OV3wCXnooJECkbZWsL9_3XGq9mmj5lTyMlGPRfgZD0jC_eS-Vas8fJyOdtOVO02USpBUqqwRZ1LfhiofVGvkPVi9NW0omogB1BkpWCaX2atkxba7uWItHjFuSd5R2NK; QN162=%E6%B7%B1%E5%9C%B3; QN233=FreetripTouchin; DJ12=eyJxIjoi5p2t5bee6Ieq55Sx6KGMIiwic3UiOiI4MDU5MjU4OTIiLCJkIjoi5rex5ZyzIiwiZSI6IkEiLCJsIjoiMCwyOCIsInRzIjoiZGQxNDZmZWYtMWY2NC00N2U5LWIyNjAtMTY0ODE2ZTlmYmQ0In0; _RF1=113.110.176.137; _pk_ref.1.8600=%5B%22%22%2C%22%22%2C1533395038%2C%22http%3A%2F%2Ftouch.qunar.com%2F%22%5D; _pk_ses.1.8600=*; _pk_id.1.8600=92302397325aca81.1533353790.5.1533395068.1533392908.; QN243=168',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Referer': 'https://touch.dujia.qunar.com/p/list?cfrom=zyx&dep={}&query={}&it=FreetripTouchin&et=home_free_t'.format(quote(dep),quote(item))
            }
            # First request (limit=0,28) is only used to learn the total route count.
            resulturl = 'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}&dappDealTrace=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&it=FreetripTouchin&date=&configDepNew=&needNoResult=true&originalquery={}&limit=0,28&includeAD=true&qsact=search'.format(
                quote(dep), quote(item), quote(item))
            time.sleep(random.randint(2,4))
            response = requests.get(resulturl, headers=headers).json()

            # Fault tolerance: the JSON may lack some of the expected entries.
            try:
                routecount = int(response['data']['limit']['routeCount'])  # total number of routes
                for limit in range(0, routecount, 28):
                    # The pieces are parenthesised so the adjacent literals form
                    # one string; without that the continuation lines are a
                    # syntax error.
                    resulturl = (
                        'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}'
                        '&dappDealTrace=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&'
                        'it=FreetripTouchin&date=&configDepNew=&needNoResult=true&originalquery={}&limit={},28&'
                        'includeAD=true&qsact=search'
                    ).format(quote(dep), quote(item), quote(item), limit)
                    time.sleep(random.randint(2,4))
                    response = requests.get(resulturl, headers=headers)
                    # NOTE(review): only the first result of each page is stored;
                    # confirm whether every entry of 'results' should be saved.
                    items=response.json()['data']['list']['results'][0]
                    result = {
                        '时间': time.strftime('%Y-%m-%d'),
                        '出发地': dep,
                        '目的地': item,
                        '价格':items['price'],
                        '天数': items['accomInclude'],
                        '亮点': items['brightspots'],
                        '出行工具':items['backtraffic'],
                        '类别':items['ttsRouteType']
                    }
                    print(result)
                    savetomongo(result)
                    time.sleep(1)
            except Exception as exc:
                # Best-effort crawl: report the failure instead of hiding it, then
                # stop working on this departure city.  Unlike a bare ``except:``
                # this lets KeyboardInterrupt / SystemExit propagate.
                print('get({!r} -> {!r}) aborted: {!r}'.format(dep, item, exc))
                return
    
    # Module-level MongoDB handles used by savetomongo().
    client = pymongo.MongoClient('127.0.0.1', 27017)  # local MongoDB server
    db = client['qunar']  # database name
    collection = 'travel'  # collection name
    
    
    def savetomongo(result):
        """
        Store one search result in MongoDB.

        The document is written to the collection named by the module-level
        ``collection`` inside the module-level ``db``.

        :param result: dict describing one departure-to-destination free-travel result
        :return: None
        """
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
        # insert_one() is the supported call for a single document.
        db[collection].insert_one(result)
    
    
    if __name__ == '__main__':
        deps = begin()
        try:
            # Pool() starts worker *processes* (not threads); each one runs
            # main() for one departure city.  The ``with`` block releases the
            # workers once map() has returned.
            with Pool() as pool:
                pool.map(main, list(deps))
        finally:
            # Close the MongoDB connection even if the crawl fails.
            client.close()
  • 相关阅读:
    合并hive/hdfs小文件
    NoSql图形数据库
    mysql分表
    查看带宽使用
    mysql中间件kingshard
    centos7安装docker
    nginx的proxy_pass路径转发规则最后带/问题
    centos关闭邮件提醒
    JOSN转列格式(csv文件)
    Android 常见对话框
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10877171.html
Copyright © 2011-2022 走看看