zoukankan      html  css  js  c++  java
  • 去哪儿网度假代码更新

    import re
    import json
    from odps import ODPS
    from threading import Thread
    import threading
    from urllib import parse
    import datetime
    from lxml import etree

    import random 
    import requests
    import time

    from models import *

    def write_txt(html_data):
        """Append one error record to ``vacation_error_url.txt``.

        Args:
            html_data: Text to log (typically a URL followed by a short
                description of what went wrong).

        The record is followed by a single space, which acts as the separator
        between consecutive records in the log file.
        """
        # `with` guarantees the handle is closed even if a write fails, and the
        # explicit UTF-8 encoding keeps the Chinese log messages writable on
        # systems whose default locale encoding cannot represent them.
        with open("vacation_error_url.txt", 'a+', encoding="utf-8") as f:
            f.write(html_data)
            f.write(" ")

    def get_comment_url(url):
        """Fetch the good / average / bad review counts for one product page.

        Args:
            url: Product detail URL containing ``/user/`` and an ``id=`` query
                value, e.g. ``https://<host>/user/detail.jsp?id=<product id>``.

        Returns:
            A 3-tuple ``(ratingExcellent, ratingAverage, ratingAwful)`` read
            from the ``data`` object of the supplier's ``queryComments.json``
            response. When that response cannot be decoded or lacks the
            expected keys, the failure is logged via ``write_txt`` and
            ``(0, 0, 0)`` is returned.
        """
        # Load the detail page to discover the package ("taocan") sub-product ids.
        response = requests.request("post", url)
        html = etree.HTML(response.text)
        package_links = html.xpath('//a[contains(@class,"pack js-taocan-item")]')
        # Links without a data-subid attribute are skipped instead of raising.
        num_list = [
            sub_id
            for sub_id in (link.get('data-subid') for link in package_links)
            if sub_id is not None
        ]

        url_id = url.split("id=")[1]
        url_temp = url.split("/user/")[0] + '/user/comment/product/queryComments.json'

        # If the page's own id is one of the listed packages, query the reviews
        # of all packages at once (ids separated by a URL-encoded comma);
        # otherwise query only the page's own id.
        if url_id not in num_list:
            product_ids = str(url_id)
        else:
            product_ids = '%2C'.join(str(sub_id) for sub_id in num_list)
        payload = (
            "type=all&pageNo=1&pageSize=10&productIds="
            + product_ids
            + "&rateStatus=ALL"
        )

        headers = {
            'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'QN1=00001480319827120b981f99; QN300=hotel.qunar.com'
        }
        response_id = requests.request("POST", url_temp, headers=headers, data = payload)
        try:
            ratings = json.loads(response_id.text)['data']
            return ratings['ratingExcellent'],ratings['ratingAverage'],ratings['ratingAwful']
        except (ValueError, KeyError, TypeError):
            # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
            write_txt(url_temp + "评论数错误")
            return 0,0,0


    def get_nodes_json():
        """Pop one seed entry from the queue and crawl all of its result pages.

        Reads one item from the ``test.com:vacation_url`` list through ``r``
        (presumably a Redis client exported by ``models`` -- verify). If the
        list is empty the function returns immediately. Otherwise it:

        1. extracts the destination city from the seed entry and substitutes it
           into the listing-API URL template;
        2. requests the listing once to learn the total number of products
           (``numFound``);
        3. calls ``process_response_data`` for every page of 60 products.

        Returns:
            None.
        """
        url =  r.lpop('test.com:vacation_url')
        if not url:
            return

        vacation_url = "https://dujia.qunar.com/golfz/routeList/adaptors/pcTop?isTouch=0&t=all&o=pop-desc&lm=60%2C60&fhLimit=0%2C60&q=%E4%B8%8A%E6%B5%B7&d=%E5%8D%97%E9%80%9A&s=all&qs_ts=1597025404801&tf=Ihot_02&ti=3&tm=l02&sourcepage=list&userResident=%E5%8D%97%E9%80%9A&random=749376&aroundWeight=1&qssrc=eyJ0cyI6IjE1OTcwMjU2OTU3NTIiLCJzcmMiOiJhbGwuZW52YyIsImFjdCI6InNjcm9sbCIsInJhbmRvbSI6IjM5NTUyIn0%3D&m=l%2CbookingInfo%2CbrowsingInfo%2Clm&displayStatus=pc&ddf=true&userId=00001b802e1827051e5021c6&hlFields=title&gpscity=%E5%8D%97%E9%80%9A&lines6To10=0"
        # Extract the (URL-encoded) city name from the seed entry.
        city_name = re.search(r"%E5%8D%97%E9%80%9A_(.*)_all?",url).group(1)
        # Swap the template's placeholder query city for the real one.
        vacation_url = vacation_url.replace("%E4%B8%8A%E6%B5%B7",city_name)
        # The seed entry carries the governing city after a space.
        product_city = url.split(" ")[1]

        # Ask the listing API how many products exist for this city.
        # NOTE(review): as in the original, this retries without limit until
        # the API answers with HTTP 200 and status 0.
        while True:
            response = requests.request("GET", vacation_url)
            if response.status_code != 200:
                continue
            try:
                # The body is parsed as JSON rather than eval()'d: evaluating
                # text received from the network would execute arbitrary code.
                json_data = json.loads(response.text)
            except ValueError:
                continue
            if json_data['status'] == 0:
                vacation_number = json_data['data']['list']['numFound']
                break

        # Walk the result set 60 items at a time by rewriting the offset in
        # the "lm=<offset>,60" query parameter ("%2C" is the encoded comma).
        page_pattern = re.compile(r"&lm=\d+%2C60")
        start_num = 0
        while vacation_number > 0:
            page_url = page_pattern.sub("&lm=" + str(start_num) + "%2C60", vacation_url)
            process_response_data(page_url,product_city)
            start_num = start_num + 60
            vacation_number = vacation_number - 60

    def _fetch_vacation_page(vacation_url, max_attempts=6):
        """Return the decoded listing JSON for ``vacation_url``, or None.

        The request is attempted up to ``max_attempts`` times. An attempt
        counts as failed when the HTTP status is not 200, the body is not
        valid JSON, or the JSON ``status`` field is not 0.
        """
        for _ in range(max_attempts):
            response = requests.request("GET", vacation_url)
            if response.status_code != 200:
                continue
            try:
                json_data = json.loads(response.text)
            except ValueError:
                continue
            if isinstance(json_data, dict) and json_data.get('status') == 0:
                return json_data
        return None

    def process_response_data(vacation_url,product_city):
        """Download one listing page and build a record for each product on it.

        Args:
            vacation_url: Listing-API URL of the page to fetch.
            product_city: Governing city stored on every record of this page.

        For each product in ``data.list.results`` the review counts are
        fetched with ``get_comment_url``, a ``qunar_Vacation_Product`` object
        is populated, and the supplier's shop URL is pushed onto the
        ``test.com:store_url`` list via ``r``. Missing optional fields
        (``reviews``, ``soldCount``, ``details.traffic``) get a default value
        and are logged through ``write_txt``.

        The page is abandoned silently when it cannot be fetched after a
        bounded number of attempts or when the response carries no
        ``data.qdata.realQuery``.

        Returns:
            None.
        """
        # All failure modes are retried a bounded number of times; previously a
        # 200 response with a non-zero JSON status was retried forever.
        json_data = _fetch_vacation_page(vacation_url)
        if json_data is None:
            return

        try:
            product_district = json_data['data']['qdata']['realQuery'] # destination city
        except (KeyError, TypeError):
            return

        for data in json_data['data']['list']['results']:
            supplier = data['summary']['supplier']
            # Detail-page URL used to look up the review counts.
            temp_url = 'https:' + supplier['url'] + '/user/detail.jsp?id=' + str(data['ttsEnid'])
            product_positiveCount,product_neutralCount,product_negativeCount = get_comment_url(temp_url)

            vacation_data = qunar_Vacation_Product()
            # Round-trip through UTF-16 with 'surrogatepass' so surrogate pairs
            # in the title are merged into real characters.
            title = data['title'].encode('utf-16','surrogatepass').decode('utf-16')
            # Strip the search-hit highlighting markup.
            vacation_data.product_title = str(title.replace("<em>","").replace("</em>","")) # product name
            vacation_data.product_city = product_city # governing city
            vacation_data.product_district = product_district # destination city
            vacation_data.product_price = data['accuratePrice'] # price
            vacation_data.product_score = data['productScore'] # score
            try:
                vacation_data.product_commentCount = data['reviews'] # review count
            except KeyError:
                vacation_data.product_commentCount = 0
                write_txt(vacation_url+ "缺少reviews")
            vacation_data.product_positiveCount = product_positiveCount # good reviews
            vacation_data.product_neutralCount = product_neutralCount # average reviews
            vacation_data.product_negativeCount = product_negativeCount # bad reviews
            try:
                vacation_data.product_soldCount = data['soldCount'] # units sold
            except KeyError:
                vacation_data.product_soldCount = 0
                write_txt(vacation_url + "缺少soldCount")
            vacation_data.tripTime = data['details']['tripTime']  # trip length
            vacation_data.hotel_night = data['details']['hotelNight'] # hotel nights
            try:
                vacation_data.traffic_tool = data['details']['traffic'] # means of transport
            except KeyError:
                vacation_data.traffic_tool = "无"
                write_txt(vacation_url + "缺少['details']['traffic']")
            vacation_data.supplier_name = supplier['name'] # supplier name
            # Only the first dot-separated label of the supplier URL is stored.
            vacation_data.supplier_url = supplier['url'].split('.')[0]
            vacation_data.create_time = datetime.datetime.now() # crawl time
            # Queue the supplier's shop URL on the store_url list.
            r.lpush('test.com:store_url',"https:" + supplier['url'])
            # NOTE: persisting the record is disabled, as in the original:
            #vacation_data.save(force_insert=True)

    class parse_qunar_url_Thread(Thread):
        """Worker thread that calls ``get_nodes_json`` in an endless loop."""

        def run(self):
            """Thread body: invoke ``get_nodes_json`` repeatedly, never returning."""
            while True:
                # save the final data
                get_nodes_json()


    if __name__ == "__main__":
        # Start 150 crawler threads; each one runs until the process is killed.
        for _ in range(150):
            parse_qunar_url_Thread().start()
        
  • 相关阅读:
    Spring Bean前置后置处理器的使用
    js用正则表达式查找中文
    wpf文字模糊
    Monster Audio 使用教程 (五) 添加区域效果器
    关于数据库锁的一些注意事项
    Monster Audio 使用教程(四)Wifi 远程遥控
    Monster Audio 使用教程(三)多音轨录音、播放
    Monster Audio 使用教程(二)效果参数的保存
    Monster Audio 使用教程(一)入门教程 + 常见问题
    wpf中实现快捷键
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13615466.html
Copyright © 2011-2022 走看看