zoukankan      html  css  js  c++  java
  • 爬取携程7天内的全国热门城市航班

      1 #!/usr/bin/env python
      2 # coding: utf-8
      6 
      7 import requests
      8 import pandas as pd
      9 import json,random,time,datetime
     10 
     11 # userAgent
     12 userAgent = [
     13     "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
     14     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
     15     "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
     16     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
     17     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
     18     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
     19     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
     20     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17"
     21     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
     22     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
     23     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
     24     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
     25     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
     26 ]
     27 
     28 # get city
     29 def getCityMsg():
     30     headers = {
     31         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
     32         "Referer": "https://flights.ctrip.com/itinerary",
     33         "Content-Type": "application/json"
     34     }
     35     url = 'https://flights.ctrip.com/itinerary/api/poi/get'
     36     r = requests.get(url=url,headers=headers).text
     37 #     print(len(r))
     38     # get city msg
     39     city = {}
     40     city_load = json.loads(r).get('data')
     41     for data in city_load.keys():
     42         ## 所有航班
     43         # if data != '热门':   
     44         #     tmpdata = city_load.get(data)
     45     #         for i in tmpdata:
     46     # #             print(i)  # A 
     47     #             for k in tmpdata.get(i):   
     48     #                 name = k.get('data').split('|')
     49     #                 cityNumId = name[2]
     50     #                 cityId = name[3]
     51     #                 cityName = name[1].split('(')[0]
     52     #                 city[cityName] = [cityId, cityNumId]
     53         if data == '热门':     # 仅限热门城市
     54             tmpdata = city_load.get(data)
     55             for i in tmpdata:   # tmpdata is list , i is dict
     56                 name = i.get('data').split('|')
     57                 cityNumId = name[2]
     58                 cityId = name[3]
     59                 cityName = name[1].split('(')[0]
     60                 city[cityName] = [cityId, cityNumId]
     61 
     62             
     63     return city
     64 
# Build the list of the next 7 dates, starting from tomorrow (today itself is not included)
     66 def get_date():
     67     dateList = []  # 存放时间list
     68     formatDate = datetime.datetime.now()  # 生成今日的格式化时间
     69     strDate = formatDate.strftime('%Y-%m-%d')  # 生成字符串日期
     70     stpDate = datetime.datetime.strptime(strDate,'%Y-%m-%d')  # 将字符串转为日期格式的日期
     71     for i in range(7):
     72         stpDate += datetime.timedelta(days=+1)   # 日期叠加1
     73         dateList.append(datetime.datetime.strftime(stpDate,'%Y-%m-%d'))  # 放入字典
     74     return dateList
     75 
     76 # get page text:routeList
     77 def  get_routeList(headers, load_json, cnt):
     78     try:
     79         response = requests.post(url = "https://flights.ctrip.com/itinerary/api/12808/products",data=json.dumps(load_json), headers = headers).text
     80         result = json.loads(response)["data"].get('routeList')
     81         return json.loads(response)["data"].get('routeList')
     82     except Exception as e:
     83         print('Get 【{} --> {}】 Page is failed !'.format(load_json.get('airportParams')[0].get('dcityname'), load_json.get('airportParams')[0].get('acityname')))
     84         print('休息10m后再来……')
     85         time.sleep(600)
     86         cnt += 1
     87         if cnt <= 10:
     88             get_routeList(headers, load_json, cnt)
     89         else:
     90             return None
     91 # get Data
     92 def get_data(index, df, routeList):
     93     if routeList is not None:
     94         for i, route in enumerate(routeList):
     95             if route.get('routeType') == 'Flight':  # 只要航班
     96                 index += 1
     97                 # route is dict
     98                 # we need route inside legs, legs is list, but its lengths is 1
     99                 # so we should legs[0], legs[0] is dict
    100 
    101                 # flight
    102                 flight = route.get('legs')[0].get('flight')  # dict
    108 
    109                 #### about flight
    110                 if flight is not None:
    111                     # common attr
    112                     df.loc[index,'airlineCode'] = flight.get('airlineCode')
    113                     df.loc[index,'AirlineName'] = flight.get('airlineName')
    114                     df.loc[index,'durationDays'] = flight.get('durationDays')
    115                     df.loc[index,'flightNumber'] = flight.get('flightNumber')
    116                     df.loc[index,'mealFlag'] = flight.get('mealFlag')
    117                     df.loc[index,'mealType'] = flight.get('mealType')
    118                     df.loc[index,'comfort'] = flight.get('comfort')
    119                     df.loc[index,'craftKind'] = flight.get('craftKind')
    120                     df.loc[index,'craftTypeCode'] = flight.get('craftTypeCode')
    121                     df.loc[index,'craftTypeKindDisplayName'] = flight.get('craftTypeKindDisplayName')
    122                     df.loc[index,'craftTypeName'] = flight.get('craftTypeName')
    123                     df.loc[index,'delayedTime'] = flight.get('delayedTime')
    124                     df.loc[index,'oilFee'] = flight.get('oilFee')
    125                     df.loc[index,'punctualityRate'] = flight.get('punctualityRate')
    126                     df.loc[index,'sharedFlightName']  = flight.get('sharedFlightName')
    127                     df.loc[index,'sharedFlightNumber'] = flight.get('sharedFlightNumber')
    128                     df.loc[index,'specialCraft'] = flight.get('specialCraft')
    129                     df.loc[index,'stopInfo'] = flight.get('stopInfo')
    130                     df.loc[index,'stopTimes'] = flight.get('stopTimes')
    131                     df.loc[index,'tax'] = flight.get('tax')
    132                     # arrival
    133                     df.loc[index,'arrivalairportName'] = flight.get('arrivalAirportInfo').get('airportName')
    134                     df.loc[index,'arrivalairportTlc'] = flight.get('arrivalAirportInfo').get('airportTlc')
    135                     df.loc[index,'arrivalcityName'] = flight.get('arrivalAirportInfo').get('cityName')
    136                     df.loc[index,'arrivalcityTlc'] = flight.get('arrivalAirportInfo').get('cityTlc')
    137                     df.loc[index,'arrivalTerminalName'] = flight.get('arrivalAirportInfo').get('terminal').get('name')
    138                     df.loc[index,'arrivalDate'] = flight.get('arrivalDate')
    139                     # departure 
    140                     df.loc[index,'departureairportName'] = flight.get('departureAirportInfo').get('airportName')
    141                     df.loc[index,'departureairportTlc'] = flight.get('departureAirportInfo').get('airportTlc')
    142                     df.loc[index,'departureCityName'] = flight.get('departureAirportInfo').get('cityName')
    143                     df.loc[index,'departureCityTlc'] = flight.get('departureAirportInfo').get('cityTlc')
    144                     df.loc[index,'departureTerminalName'] = flight.get('departureAirportInfo').get('terminal').get('name')
    145                     df.loc[index,'departureDate'] = flight.get('departureDate')
    146 
    147                 #### characteristic : charactor
    148                 # characteristic:charactor
    149                 charactor = route.get('legs')[0].get('characteristic')  # dict
    150                 if charactor is not None:
    151                     df.loc[index, 'businessAircraft'] = charactor.get('businessAircraft')
    152                     df.loc[index, 'discountAmount'] = charactor.get('discountAmount')
    153                     df.loc[index, 'discountShowType'] = charactor.get('discountShowType')
    154                     df.loc[index, 'flyMan'] = charactor.get('flyMan')
    155                     df.loc[index, 'groupTicketPrice'] = charactor.get('groupTicketPrice')
    156                     df.loc[index, 'hotFlight'] = charactor.get('hotFlight')
    157                     df.loc[index, 'hx'] = charactor.get('hx')
    158                     df.loc[index, 'infantSoldOut'] = charactor.get('infantSoldOut')
    159                     df.loc[index, 'lowPriceDiscount'] = charactor.get('lowPriceDiscount')
    160                     df.loc[index, 'lowestBabyCfPrice'] = charactor.get('lowestBabyCfPrice')
    161                     df.loc[index, 'lowestBabyPrice'] = charactor.get('lowestBabyPrice')
    162                     df.loc[index, 'lowestCfPrice'] = charactor.get('lowestCfPrice')
    163                     df.loc[index, 'lowestChildAdultCfPrice'] = charactor.get('lowestChildAdultCfPrice')
    164                     df.loc[index, 'lowestChildAdultPrice'] = charactor.get('lowestChildAdultPrice')
    165                     df.loc[index, 'lowestChildCfPrice'] = charactor.get('lowestChildCfPrice')
    166                     df.loc[index, 'lowestChildPrice'] = charactor.get('lowestChildPrice')
    167                     df.loc[index, 'lowestPrice'] = charactor.get('lowestPrice')
    168                     df.loc[index, 'promotion'] = charactor.get('promotion')
    169                     df.loc[index, 'providerHx'] = charactor.get('providerHx')
    170                     df.loc[index, 'roundTripDiscounts'] = charactor.get('roundTripDiscounts')
    171                     tmp_charactor = charactor.get('standardPrices')
    172                     if tmp_charactor is not None:
    173                         for i, stdPrice in enumerate(tmp_charactor):
    174                             diffCabinCla = stdPrice.get('cabinClass')
    175                             df.loc[index, 'price' + diffCabinCla] = stdPrice.get('price')
    176                     df.loc[index, 'superFlyMan'] = charactor.get('superFlyMan')
    177                     df.loc[index, 'weight'] = charactor.get('weight')
    178 
    209     return (index, df)
    210         
    211 
    212 # main function    
    213 def main(city):
    214     # 初始化 时间
    215     flightDates = get_date()
    216     for flightDate in flightDates:   # 起飞日期
    217         df = pd.DataFrame()
    218         index = 0
    219         print(flightDate, end= '	')
    220         
    221         
    222         # 当出现错误时,在后续的过程中加入加入该段代码     
    223 #         for (fromCityName, fromCityId) in city[city.index('厦门'):].items():
    224         # 当第一次运行时,执行下面这个for
    225         for (fromCityName, fromCityId) in city.items():  # 起飞城市
    226             print(fromCityName,end='')
    227             for (toCityName, toCityId) in city.items():  # 降落城市    
    228                 # 容错次数
    229                 cnt = 1
    230                 if  fromCityName != toCityName:  
    231                     print(toCityName,end='	')
    232                     # headers
    233                     headers = {
    234                         "User-Agent": random.choice(userAgent),
    235                         "origin": "https://flights.ctrip.com",
    236                         "content-type": "application/json"
    237                     }
    238                     # 加载不同 load_json
    239                     load_json = {
    240                         "airportParams":[
    241                             {"dcity":fromCityId[0],"dcityname":fromCityName,"acity":toCityId[0],"acityname":toCityName,"date":flightDate,"dcityid":fromCityId[1],"acityid":toCityId[1]}
    242                         ],
    243                         "classType": "ALL",
    244                         "date": flightDate,
    245                         "flightWay": "Oneway",
    246                         "hasBaby": False,
    247                         "hasChild": False,
    248                         "searchIndex": 1,
    249                         "token": "a4d91efc14f95ad7e1abaf914da140f3"
    250                     }
    251                     # routeList
    252                     routeList = get_routeList(headers, load_json, cnt)
    253                     # get_data
    254                     if routeList is not None:  # 没有航班则跳过
    255                         (index, df) = get_data(index, df, routeList)
    256 #                         print(index,df.shape, end='	')
    257                 time.sleep(random.choice(range(2)))
    258             print('
    ' + '--'*50)
    259             time.sleep(random.choice(range(3)))
    260                #######################################
    261             print('【{}】起飞,抓完!'.format(fromCityName))
    262         time.sleep(random.choice(range(60,90)))
    263         print(df.shape)
    264         csv_path = '【{}】起飞航班.csv'.format(flightDate)
    265         print('起飞日期:{},抓完,写入文件!'.format(fromCityName))
    266         print(csv_path)
    267         df.to_csv(csv_path,index=False, encoding='utf-8')
    268     return (index, df)
    269 
    270 
    271 
    272 if __name__ == "__main__":
    273      # getCityMsg
    274     city = getCityMsg()
    275     (index, df) = main(city)

    注:降低爬取速度后,脚本可正常使用。

  • 相关阅读:
    【Intellij】Intellij Idea 2017创建web项目及tomcat部署实战
    【IntelliJ 】IntelliJ IDEA 15 创建maven项目
    【IntelliJ】IntelliJ IDEA的安装破解及使用
    maven坐标查询
    【PowerDesigner】PowerDesigner之CDM、PDM、SQL之间转换
    【c3p0】 C3P0的三种配置方式以及基本配置项详解
    LinkedHashMap和HashMap的比较使用
    【全局变量】mysql查看全局变量以及设置全局变量的值
    Redis总结
    Java高概率面试题目—finally
  • 原文地址:https://www.cnblogs.com/Alexisbusyblog/p/12580891.html
Copyright © 2011-2022 走看看