"""Crawl free-travel ("自由行") package listings from Qunar for every city.

Requesting too quickly makes the server answer with errors.  The original
author's notes suggest a cookie pool, an IP-proxy pool and longer sleeps as
counter-measures; this module itself only implements a randomized sleep
before each request and sends one hard-coded cookie.
"""
import logging
import random
import time
from multiprocessing import Pool
from urllib.parse import quote

import pymongo
import requests

logger = logging.getLogger(__name__)

# Number of results requested per search page (the "limit=<offset>,28" part).
PAGE_SIZE = 28
# Seconds to wait for any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30

DEP_URL = 'https://touch.dujia.qunar.com/depCities.qunar'
DEST_URL = ('https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
            '?dep={}&exclude=&extensionImg=255,175')
RESULT_URL = (
    'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}'
    '&dappDealTrace=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&'
    'it=FreetripTouchin&date=&configDepNew=&needNoResult=true&originalquery={}&limit={},{}&'
    'includeAD=true&qsact=search'
)
REFERER_URL = ('https://touch.dujia.qunar.com/p/list?cfrom=zyx&dep={}&query={}'
               '&it=FreetripTouchin&et=home_free_t')

# Browser session cookie replayed with every search request.
_COOKIE = (
    'QN99=8770; '
    'QN1=eIQjmVtYQgbBDaEiPevvAg==; '
    'csrfToken=zKMVroGqYK6fdBphXg8rqQ3MpcaiZ7TZ; '
    'QN269=AA9586A58FEC11E88A24FA163E233FC1; '
    'QN601=3f55b4673bbd18ac3206bfea7c5996d3; '
    'QunarGlobal=10.86.213.148_6291bf49_164d0ba9dbf_-1a4d|1532510727219; '
    '_i=RBTKSaIAM3KBlurx6OwRjfuQ8pEx; '
    'QN300=auto_4e0d874a; '
    'QN163=0; '
    'QN6=auto_4e0d874a; '
    'QN48=tc_427b9f2555dccb4c_164d9787381_d960; '
    '_RSG=Ue4lzWGVuXAKnGpozKI.OB; '
    '_RDG=28c738c8ddc979203b2642a9f86b2ac273; '
    '_RGUID=a8787d08-3dbc-4a1e-b63e-494f72cd0c54; '
    'QN205=auto_4e0d874a; '
    'QN234=home_free_t; '
    '_vi=Xan8_FldA2NGBwqzRSKDNIYHisxd4ARxiomsg1mowQsC4OV3wCXnooJECkbZWsL9_3XGq9mmj5lTyMlGPRfgZD0jC_eS-Vas8fJyOdtOVO02USpBUqqwRZ1LfhiofVGvkPVi9NW0omogB1BkpWCaX2atkxba7uWItHjFuSd5R2NK; '
    'QN162=%E6%B7%B1%E5%9C%B3; '
    'QN233=FreetripTouchin; '
    'DJ12=eyJxIjoi5p2t5bee6Ieq55Sx6KGMIiwic3UiOiI4MDU5MjU4OTIiLCJkIjoi5rex5ZyzIiwiZSI6IkEiLCJsIjoiMCwyOCIsInRzIjoiZGQxNDZmZWYtMWY2NC00N2U5LWIyNjAtMTY0ODE2ZTlmYmQ0In0; '
    '_RF1=113.110.176.137; '
    '_pk_ref.1.8600=%5B%22%22%2C%22%22%2C1533395038%2C%22http%3A%2F%2Ftouch.qunar.com%2F%22%5D; '
    '_pk_ses.1.8600=*; '
    '_pk_id.1.8600=92302397325aca81.1533353790.5.1533395068.1533392908.; '
    'QN243=168'
)
_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

# Failures that mean "this response was unusable": transport/HTTP errors, a
# body that is not JSON (ValueError), or JSON without the expected fields.
_CRAWL_ERRORS = (requests.RequestException, KeyError, IndexError,
                 TypeError, ValueError)

# MongoDB connection.  connect=False postpones connecting until the first
# operation, which happens inside the worker processes rather than in the
# parent before the pool is created.
client = pymongo.MongoClient(host='127.0.0.1', port=27017, connect=False)
db = client['qunar']  # MongoDB database
collection = 'travel'  # MongoDB collection name


def _polite_sleep():
    """Pause a random 2-4 seconds so requests are not sent back to back."""
    time.sleep(random.randint(2, 4))


def _fetch_json(url, headers=None):
    """GET *url* and return the decoded JSON body.

    Raises ``requests.RequestException`` on transport errors, timeouts and
    HTTP error statuses, and ``ValueError`` when the body is not JSON.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _build_headers(dep, item):
    """Return the request headers (cookie, user agent, referer) for a search."""
    return {
        'cookie': _COOKIE,
        'user-agent': _USER_AGENT,
        'Referer': REFERER_URL.format(quote(dep), quote(item)),
    }


def _result_url(dep, item, offset):
    """Return the search URL for the result page starting at *offset*."""
    # Non-ASCII city names must be percent-encoded before going into the URL.
    return RESULT_URL.format(quote(dep), quote(item), quote(item),
                             offset, PAGE_SIZE)


def _unique_queries(des):
    """Return the destination queries found in *des*, de-duplicated in order."""
    # The same destination appears in several sub-modules; a dict keeps the
    # first-seen order while making the membership test O(1).
    ordered = {}
    for module in des['data']:
        for sub_module in module['subModules']:
            for entry in sub_module['items']:
                ordered.setdefault(entry['query'], None)
    return list(ordered)


def _build_record(dep, item, entry):
    """Map one search result *entry* onto the document stored in MongoDB."""
    return {
        '时间': time.strftime('%Y-%m-%d'),
        '出发地': dep,
        '目的地': item,
        '价格': entry['price'],
        '天数': entry['accomInclude'],
        '亮点': entry['brightspots'],
        '出行工具': entry['backtraffic'],
        '类别': entry['ttsRouteType'],
    }


def _crawl_destination(dep, item, headers):
    """Fetch every result page for *dep* -> *item* and store each result."""
    _polite_sleep()
    page = _fetch_json(_result_url(dep, item, 0), headers=headers)
    route_count = int(page['data']['limit']['routeCount'])
    for offset in range(0, route_count, PAGE_SIZE):
        if offset:
            # Offset 0 was already downloaded above to learn routeCount, so
            # only the later pages need another request.
            _polite_sleep()
            page = _fetch_json(_result_url(dep, item, offset), headers=headers)
        for entry in page['data']['list']['results']:
            try:
                record = _build_record(dep, item, entry)
            except (KeyError, TypeError) as exc:
                # A single malformed result should not discard its siblings.
                logger.warning('skipping malformed result for %s -> %s: %r',
                               dep, item, exc)
                continue
            print(record)
            savetomongo(record)
        time.sleep(1)


def begin():
    """Yield every departure city listed by Qunar.

    The HTTP request is issued lazily, on the first iteration.

    :return: generator of departure city names
    """
    deps = _fetch_json(DEP_URL)
    for group in deps['data'].values():
        yield from group


def main(dep):
    """Crawl all destinations reachable from one departure city.

    Fetches the recommended destination list for *dep*, removes duplicates
    and hands the list to :func:`get`.  A failure to obtain the destination
    list is logged and ends the work for this city only, so one bad city
    does not abort the whole pool.

    :param dep: departure city name
    :return: None
    """
    _polite_sleep()
    try:
        destinations = _unique_queries(_fetch_json(DEST_URL.format(quote(dep))))
    except _CRAWL_ERRORS as exc:
        logger.warning('could not list destinations for %s: %r', dep, exc)
        return
    get(destinations, dep)


def get(array, dep):
    """Crawl and store the free-travel search results for each destination.

    Every result on every page is stored.  A destination whose responses are
    unusable is logged and skipped; the remaining destinations are still
    processed.  A MongoDB failure ends the work for this departure city,
    because crawling further without storage is pointless.

    :param array: list of destination city names
    :param dep: departure city name
    :return: None
    """
    for item in array:
        headers = _build_headers(dep, item)
        try:
            _crawl_destination(dep, item, headers)
        except _CRAWL_ERRORS as exc:
            logger.warning('skipping %s -> %s: %r', dep, item, exc)
        except pymongo.errors.PyMongoError:
            logger.exception('storing results for %s -> %s failed', dep, item)
            return


def savetomongo(result):
    """Insert one search-result document into the MongoDB collection.

    :param result: free-travel search result for one departure/destination
    :return: None
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() replaces it.
    db[collection].insert_one(result)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    departures = list(begin())
    try:
        # One worker process (not thread) per CPU; each handles whole cities.
        with Pool() as pool:
            pool.map(main, departures)
    finally:
        client.close()