zoukankan      html  css  js  c++  java
  • 爬取携程7天内的全国热门城市航班

      1 #!/usr/bin/env python
      2 # coding: utf-8
      6 
      7 import requests
      8 import pandas as pd
      9 import json,random,time,datetime
     10 
     11 # userAgent
     12 userAgent = [
     13     "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
     14     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
     15     "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
     16     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
     17     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
     18     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
     19     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
     20     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17"
     21     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
     22     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
     23     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
     24     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
     25     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
     26 ]
     27 
     28 # get city
     29 def getCityMsg():
     30     headers = {
     31         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
     32         "Referer": "https://flights.ctrip.com/itinerary",
     33         "Content-Type": "application/json"
     34     }
     35     url = 'https://flights.ctrip.com/itinerary/api/poi/get'
     36     r = requests.get(url=url,headers=headers).text
     37 #     print(len(r))
     38     # get city msg
     39     city = {}
     40     city_load = json.loads(r).get('data')
     41     for data in city_load.keys():
     42         ## 所有航班
     43         # if data != '热门':   
     44         #     tmpdata = city_load.get(data)
     45     #         for i in tmpdata:
     46     # #             print(i)  # A 
     47     #             for k in tmpdata.get(i):   
     48     #                 name = k.get('data').split('|')
     49     #                 cityNumId = name[2]
     50     #                 cityId = name[3]
     51     #                 cityName = name[1].split('(')[0]
     52     #                 city[cityName] = [cityId, cityNumId]
     53         if data == '热门':     # 仅限热门城市
     54             tmpdata = city_load.get(data)
     55             for i in tmpdata:   # tmpdata is list , i is dict
     56                 name = i.get('data').split('|')
     57                 cityNumId = name[2]
     58                 cityId = name[3]
     59                 cityName = name[1].split('(')[0]
     60                 city[cityName] = [cityId, cityNumId]
     61 
     62             
     63     return city
     64 
     65 # 生成自今日至往后7天日期
     66 def get_date():
     67     dateList = []  # 存放时间list
     68     formatDate = datetime.datetime.now()  # 生成今日的格式化时间
     69     strDate = formatDate.strftime('%Y-%m-%d')  # 生成字符串日期
     70     stpDate = datetime.datetime.strptime(strDate,'%Y-%m-%d')  # 将字符串转为日期格式的日期
     71     for i in range(7):
     72         stpDate += datetime.timedelta(days=+1)   # 日期叠加1
     73         dateList.append(datetime.datetime.strftime(stpDate,'%Y-%m-%d'))  # 放入字典
     74     return dateList
     75 
     76 # get page text:routeList
     77 def  get_routeList(headers, load_json, cnt):
     78     try:
     79         response = requests.post(url = "https://flights.ctrip.com/itinerary/api/12808/products",data=json.dumps(load_json), headers = headers).text
     80         result = json.loads(response)["data"].get('routeList')
     81         return json.loads(response)["data"].get('routeList')
     82     except Exception as e:
     83         print('Get 【{} --> {}】 Page is failed !'.format(load_json.get('airportParams')[0].get('dcityname'), load_json.get('airportParams')[0].get('acityname')))
     84         print('休息10m后再来……')
     85         time.sleep(600)
     86         cnt += 1
     87         if cnt <= 10:
     88             get_routeList(headers, load_json, cnt)
     89         else:
     90             return None
     91 # get Data
     92 def get_data(index, df, routeList):
     93     if routeList is not None:
     94         for i, route in enumerate(routeList):
     95             if route.get('routeType') == 'Flight':  # 只要航班
     96                 index += 1
     97                 # route is dict
     98                 # we need route inside legs, legs is list, but its lengths is 1
     99                 # so we should legs[0], legs[0] is dict
    100 
    101                 # flight
    102                 flight = route.get('legs')[0].get('flight')  # dict
    108 
    109                 #### about flight
    110                 if flight is not None:
    111                     # common attr
    112                     df.loc[index,'airlineCode'] = flight.get('airlineCode')
    113                     df.loc[index,'AirlineName'] = flight.get('airlineName')
    114                     df.loc[index,'durationDays'] = flight.get('durationDays')
    115                     df.loc[index,'flightNumber'] = flight.get('flightNumber')
    116                     df.loc[index,'mealFlag'] = flight.get('mealFlag')
    117                     df.loc[index,'mealType'] = flight.get('mealType')
    118                     df.loc[index,'comfort'] = flight.get('comfort')
    119                     df.loc[index,'craftKind'] = flight.get('craftKind')
    120                     df.loc[index,'craftTypeCode'] = flight.get('craftTypeCode')
    121                     df.loc[index,'craftTypeKindDisplayName'] = flight.get('craftTypeKindDisplayName')
    122                     df.loc[index,'craftTypeName'] = flight.get('craftTypeName')
    123                     df.loc[index,'delayedTime'] = flight.get('delayedTime')
    124                     df.loc[index,'oilFee'] = flight.get('oilFee')
    125                     df.loc[index,'punctualityRate'] = flight.get('punctualityRate')
    126                     df.loc[index,'sharedFlightName']  = flight.get('sharedFlightName')
    127                     df.loc[index,'sharedFlightNumber'] = flight.get('sharedFlightNumber')
    128                     df.loc[index,'specialCraft'] = flight.get('specialCraft')
    129                     df.loc[index,'stopInfo'] = flight.get('stopInfo')
    130                     df.loc[index,'stopTimes'] = flight.get('stopTimes')
    131                     df.loc[index,'tax'] = flight.get('tax')
    132                     # arrival
    133                     df.loc[index,'arrivalairportName'] = flight.get('arrivalAirportInfo').get('airportName')
    134                     df.loc[index,'arrivalairportTlc'] = flight.get('arrivalAirportInfo').get('airportTlc')
    135                     df.loc[index,'arrivalcityName'] = flight.get('arrivalAirportInfo').get('cityName')
    136                     df.loc[index,'arrivalcityTlc'] = flight.get('arrivalAirportInfo').get('cityTlc')
    137                     df.loc[index,'arrivalTerminalName'] = flight.get('arrivalAirportInfo').get('terminal').get('name')
    138                     df.loc[index,'arrivalDate'] = flight.get('arrivalDate')
    139                     # departure 
    140                     df.loc[index,'departureairportName'] = flight.get('departureAirportInfo').get('airportName')
    141                     df.loc[index,'departureairportTlc'] = flight.get('departureAirportInfo').get('airportTlc')
    142                     df.loc[index,'departureCityName'] = flight.get('departureAirportInfo').get('cityName')
    143                     df.loc[index,'departureCityTlc'] = flight.get('departureAirportInfo').get('cityTlc')
    144                     df.loc[index,'departureTerminalName'] = flight.get('departureAirportInfo').get('terminal').get('name')
    145                     df.loc[index,'departureDate'] = flight.get('departureDate')
    146 
    147                 #### characteristic : charactor
    148                 # characteristic:charactor
    149                 charactor = route.get('legs')[0].get('characteristic')  # dict
    150                 if charactor is not None:
    151                     df.loc[index, 'businessAircraft'] = charactor.get('businessAircraft')
    152                     df.loc[index, 'discountAmount'] = charactor.get('discountAmount')
    153                     df.loc[index, 'discountShowType'] = charactor.get('discountShowType')
    154                     df.loc[index, 'flyMan'] = charactor.get('flyMan')
    155                     df.loc[index, 'groupTicketPrice'] = charactor.get('groupTicketPrice')
    156                     df.loc[index, 'hotFlight'] = charactor.get('hotFlight')
    157                     df.loc[index, 'hx'] = charactor.get('hx')
    158                     df.loc[index, 'infantSoldOut'] = charactor.get('infantSoldOut')
    159                     df.loc[index, 'lowPriceDiscount'] = charactor.get('lowPriceDiscount')
    160                     df.loc[index, 'lowestBabyCfPrice'] = charactor.get('lowestBabyCfPrice')
    161                     df.loc[index, 'lowestBabyPrice'] = charactor.get('lowestBabyPrice')
    162                     df.loc[index, 'lowestCfPrice'] = charactor.get('lowestCfPrice')
    163                     df.loc[index, 'lowestChildAdultCfPrice'] = charactor.get('lowestChildAdultCfPrice')
    164                     df.loc[index, 'lowestChildAdultPrice'] = charactor.get('lowestChildAdultPrice')
    165                     df.loc[index, 'lowestChildCfPrice'] = charactor.get('lowestChildCfPrice')
    166                     df.loc[index, 'lowestChildPrice'] = charactor.get('lowestChildPrice')
    167                     df.loc[index, 'lowestPrice'] = charactor.get('lowestPrice')
    168                     df.loc[index, 'promotion'] = charactor.get('promotion')
    169                     df.loc[index, 'providerHx'] = charactor.get('providerHx')
    170                     df.loc[index, 'roundTripDiscounts'] = charactor.get('roundTripDiscounts')
    171                     tmp_charactor = charactor.get('standardPrices')
    172                     if tmp_charactor is not None:
    173                         for i, stdPrice in enumerate(tmp_charactor):
    174                             diffCabinCla = stdPrice.get('cabinClass')
    175                             df.loc[index, 'price' + diffCabinCla] = stdPrice.get('price')
    176                     df.loc[index, 'superFlyMan'] = charactor.get('superFlyMan')
    177                     df.loc[index, 'weight'] = charactor.get('weight')
    178 
    209     return (index, df)
    210         
    211 
    212 # main function    
    213 def main(city):
    214     # 初始化 时间
    215     flightDates = get_date()
    216     for flightDate in flightDates:   # 起飞日期
    217         df = pd.DataFrame()
    218         index = 0
    219         print(flightDate, end= '	')
    220         
    221         
    222         # 当出现错误时,在后续的过程中加入加入该段代码     
    223 #         for (fromCityName, fromCityId) in city[city.index('厦门'):].items():
    224         # 当第一次运行时,执行下面这个for
    225         for (fromCityName, fromCityId) in city.items():  # 起飞城市
    226             print(fromCityName,end='')
    227             for (toCityName, toCityId) in city.items():  # 降落城市    
    228                 # 容错次数
    229                 cnt = 1
    230                 if  fromCityName != toCityName:  
    231                     print(toCityName,end='	')
    232                     # headers
    233                     headers = {
    234                         "User-Agent": random.choice(userAgent),
    235                         "origin": "https://flights.ctrip.com",
    236                         "content-type": "application/json"
    237                     }
    238                     # 加载不同 load_json
    239                     load_json = {
    240                         "airportParams":[
    241                             {"dcity":fromCityId[0],"dcityname":fromCityName,"acity":toCityId[0],"acityname":toCityName,"date":flightDate,"dcityid":fromCityId[1],"acityid":toCityId[1]}
    242                         ],
    243                         "classType": "ALL",
    244                         "date": flightDate,
    245                         "flightWay": "Oneway",
    246                         "hasBaby": False,
    247                         "hasChild": False,
    248                         "searchIndex": 1,
    249                         "token": "a4d91efc14f95ad7e1abaf914da140f3"
    250                     }
    251                     # routeList
    252                     routeList = get_routeList(headers, load_json, cnt)
    253                     # get_data
    254                     if routeList is not None:  # 没有航班则跳过
    255                         (index, df) = get_data(index, df, routeList)
    256 #                         print(index,df.shape, end='	')
    257                 time.sleep(random.choice(range(2)))
    258             print('
    ' + '--'*50)
    259             time.sleep(random.choice(range(3)))
    260                #######################################
    261             print('【{}】起飞,抓完!'.format(fromCityName))
    262         time.sleep(random.choice(range(60,90)))
    263         print(df.shape)
    264         csv_path = '【{}】起飞航班.csv'.format(flightDate)
    265         print('起飞日期:{},抓完,写入文件!'.format(fromCityName))
    266         print(csv_path)
    267         df.to_csv(csv_path,index=False, encoding='utf-8')
    268     return (index, df)
    269 
    270 
    271 
    272 if __name__ == "__main__":
    273      # getCityMsg
    274     city = getCityMsg()
    275     (index, df) = main(city)

    说明:降低爬取速度(即增大各次请求之间的 sleep 间隔)后,该脚本可用。

  • 相关阅读:
    POJ 3259 Wormholes【BellmanFord】
    POJ 2960 SNim【SG函数的应用】
    ZOJ 3578 Matrixdp水题
    HDU 2897 邂逅明下【bash博弈】
    BellmanFord 算法及其优化【转】
    【转】几个Java的网络爬虫
    thinkphp 反字符 去标签 自动加点 去换行 截取字符串 冰糖
    php 二维数组转 json文本 (jquery datagrid 数据格式) 冰糖
    PHP 汉字转拼音(首拼音,所有拼音) 冰糖
    设为首页与加入收藏 兼容firefox 冰糖
  • 原文地址:https://www.cnblogs.com/Alexisbusyblog/p/12580891.html
Copyright © 2011-2022 走看看