zoukankan      html  css  js  c++  java
  • 去哪儿网门票数据

    import datetime
    import json
    import logging
    import random 
    import re
    import threading
    import time
    from threading import Thread
    from urllib import parse

    import requests
    from lxml import etree
    from odps import ODPS

    from models import *

    def write_txt(html_data, path="a.txt", encoding=None):
        """Append *html_data* followed by a single space to a text file.

        Args:
            html_data: Text to append.
            path: File to append to; defaults to ``a.txt`` in the current
                working directory.
            encoding: Text encoding passed to ``open``; ``None`` keeps the
                platform default.
        """
        # The context manager closes the handle even when a write raises,
        # which the manual open()/close() pair did not guarantee.
        with open(path, 'a+', encoding=encoding) as f:
            f.write(html_data)
            f.write(" ")

    def get_nodes_json(retry_delay=1.0, timeout=30):
        """Pop one ticket URL and crawl every result page for the city it names.

        The URL is taken from the ``qunar.com:ticket_url`` list through
        ``r.lpop``. The city name embedded in it is substituted into the
        module-level ``vacation_url`` template, the total number of results is
        fetched, and ``process_response_data`` is called once per page of 60
        results. Nothing is done when the list is empty.

        Args:
            retry_delay: Seconds to wait before repeating a failed request.
            timeout: Per-request timeout in seconds handed to ``requests``.
        """
        log = logging.getLogger(__name__)

        url = r.lpop('qunar.com:ticket_url')
        if not url:
            return
        if isinstance(url, bytes):
            # NOTE(review): presumably `r` is a Redis client; those return
            # bytes unless response decoding is enabled - confirm in models.
            url = url.decode('utf-8')

        # "%E5%8D%97%E9%80%9A" is the URL-encoded form of "南通"; the city name
        # follows it in the queued URL.
        city_match = re.search(r"%E5%8D%97%E9%80%9A_(.*)_all?", url)
        if city_match is None:
            log.warning("no city name found in %s; skipping it", url)
            return
        city_name = city_match.group(1)

        # Build the city URL in a local variable. Assigning to `vacation_url`
        # here would make that name local to the function and raise
        # UnboundLocalError when the template is read.
        # "%E4%B8%8A%E6%B5%B7" is the URL-encoded form of "上海".
        city_url = vacation_url.replace("%E4%B8%8A%E6%B5%B7", city_name)
        payload_data = vacation_payload

        # Copy the shared headers so concurrent workers do not overwrite each
        # other's referer.
        headers_data = dict(vacation_headers)
        headers_data['referer'] = url

        # First request: learn how many results exist for this city.
        vacation_number = 0
        while True:
            try:
                response = requests.request(
                    "GET", city_url, headers=headers_data,
                    data=payload_data, timeout=timeout)
            except requests.RequestException as exc:
                log.warning("request for %s failed (%s); retrying", city_url, exc)
                time.sleep(retry_delay)
                continue

            if response.status_code == 200:
                # SECURITY: the body comes from a remote server, so it is
                # parsed with json.loads instead of being passed to eval().
                try:
                    json_data = json.loads(response.text)
                except ValueError:
                    json_data = None
                if isinstance(json_data, dict) and json_data.get('status') == 0:
                    vacation_number = int(json_data['data']['list']['numFound'])
                    break
            # Pause before retrying so a failing endpoint is not hammered.
            time.sleep(retry_delay)

        # The template carries "&lm=<offset>%2C60"; rewrite the offset for each
        # page of 60 results.
        page_marker = re.compile(r"&lm=\d+%2C60")
        for start_num in range(0, vacation_number, 60):
            page_url = page_marker.sub("&lm=" + str(start_num) + "%2C60", city_url)
            process_response_data(headers_data, payload_data, page_url)

    def process_response_data(headers_data, payload_data, vacation_url,
                              retry_delay=1.0, timeout=30):
        """Fetch one result page and store every product it lists.

        The request is repeated until the server answers with HTTP 200 and a
        JSON body whose ``status`` is 0. Each entry of
        ``data.list.results`` is then saved as a ``Vacation_Product`` row and
        the supplier URL is pushed onto the ``qunar.com:store_url`` list. A
        record that cannot be built or saved is logged and skipped.

        Args:
            headers_data: HTTP headers for the request.
            payload_data: Request body passed as ``data``.
            vacation_url: URL of the result page to fetch.
            retry_delay: Seconds to wait before repeating a failed request.
            timeout: Per-request timeout in seconds handed to ``requests``.
        """
        log = logging.getLogger(__name__)

        while True:
            try:
                response = requests.request(
                    "GET", vacation_url, headers=headers_data,
                    data=payload_data, timeout=timeout)
            except requests.RequestException as exc:
                log.warning("request for %s failed (%s); retrying", vacation_url, exc)
                time.sleep(retry_delay)
                continue

            json_data = None
            if response.status_code == 200:
                try:
                    json_data = json.loads(response.text)
                except ValueError:
                    log.warning("invalid JSON from %s; retrying", vacation_url)
            if isinstance(json_data, dict) and json_data.get('status') == 0:
                break
            # Pause before retrying so a failing endpoint is not hammered.
            time.sleep(retry_delay)

        try:
            product_city = json_data['data']['qdata']['realQuery']  # destination city
        except (KeyError, TypeError):
            product_city = "NULL"

        for data in json_data['data']['list']['results']:
            try:
                supplier = data['summary']['supplier']
                vacation_data = Vacation_Product()
                # Round-trip through UTF-16 with 'surrogatepass' so surrogate
                # pairs in the title are merged into real code points.
                vacation_data.product_title = data['title'].encode('utf-16', 'surrogatepass').decode('utf-16')  # product name
                vacation_data.product_city = product_city
                vacation_data.product_price = data['accuratePrice']  # price
                vacation_data.product_score = data['productScore']  # rating
                vacation_data.product_reviews = data['reviews']  # review count
                vacation_data.product_soldCount = data['soldCount']  # units sold
                vacation_data.tripTime = data['details']['tripTime']  # trip length
                vacation_data.hotel_night = data['details']['hotelNight']  # hotel nights
                vacation_data.traffic_tool = data['details']['traffic']  # means of transport
                vacation_data.supplier_name = supplier['name']  # supplier name
                # Only the part of the supplier URL before the first dot is stored.
                vacation_data.supplier_url = supplier['url'].split('.')[0]
                vacation_data.create_time = datetime.datetime.now()  # crawl time
                # Queue the full supplier URL under 'qunar.com:store_url'.
                r.lpush('qunar.com:store_url', "https:" + supplier['url'])
                vacation_data.save(force_insert=True)
            except Exception:
                # Best effort: one bad record must not stop the page, but the
                # failure is logged instead of being silently dropped.
                log.exception("skipping a product record from %s", vacation_url)

    class parse_qunar_url_Thread(Thread):
        """Worker thread that keeps calling ``get_nodes_json`` forever."""

        def run(self):
            """Thread body: process queued URLs in an endless loop."""
            # NOTE: the loop has no exit condition and no pause between calls,
            # so the thread keeps polling for as long as the process runs.
            while True:
                # Each call handles at most one queued URL and stores the
                # resulting data.
                get_nodes_json()


    if __name__ == "__main__":
        # Start ten crawler threads; each one runs until the process is killed.
        worker_count = 10
        for _ in range(worker_count):
            worker = parse_qunar_url_Thread()
            worker.start()
        
    两年大概看此博客blog.codingnow.com/aee/
  • 相关阅读:
    VC(VISUAL_C++)虚拟键VK值列表
    关于新一轮QQ Tencent://Message 在线联系
    (记录) sql exists 应用及 order by注意点
    (记录)IE8 ..样式错乱解决
    jquery 关于ajax 中文字符长度过长后不执行
    DataList 嵌套绑定CheckBoxList [记录, 以免忘记哈.]
    (记录)MSSQL 的一些应用 查询数据统计适用 添加月份日号作为行记录
    数据结构回顾算法
    Modeling Our World笔记
    数据结构2数组
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13536698.html
Copyright © 2011-2022 走看看