import datetime
import json
import logging
import random
import re
import threading
import time
from threading import Thread
from urllib import parse

import requests
from lxml import etree
from odps import ODPS

from models import *
def write_txt(html_data):
    """Append ``html_data`` followed by a newline to ``a.txt``.

    The file is opened relative to the current working directory and is
    created if it does not exist yet.

    Args:
        html_data: Text to append.
    """
    # The context manager closes the handle even when a write raises.
    # UTF-8 is set explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open("a.txt", "a", encoding="utf-8") as f:
        f.write(html_data)
        f.write("\n")
def _fetch_vacation_count(url, headers_data, payload_data):
    """Request ``url`` until it succeeds and return the reported ``numFound``.

    A request counts as successful when the HTTP status is 200 and the JSON
    body has ``status == 0``. Anything else is retried after a short pause.

    Raises:
        ValueError: If a 200 response body is not valid JSON.
        KeyError: If the body lacks ``status`` or ``data.list.numFound``.
    """
    while True:
        response = requests.request("GET", url, headers=headers_data, data=payload_data)
        # Check the status code first: error pages are usually not JSON.
        if response.status_code == 200:
            # Parse with json.loads instead of eval(): the body comes from
            # the network and must never be executed as Python code.
            json_data = json.loads(response.text)
            if json_data['status'] == 0:
                return json_data['data']['list']['numFound']
        # Pause before retrying so a failing endpoint is not hammered.
        time.sleep(1)


def get_nodes_json():
    """Pop one ticket URL from Redis and crawl every result page for its city.

    Takes the next entry from the ``qunar.com:ticket_url`` list, extracts the
    city name from it, substitutes that city into the ``vacation_url``
    template and asks the endpoint how many products exist. It then calls
    ``process_response_data`` once per page of 60 results.

    ``r``, ``vacation_url``, ``vacation_payload`` and ``vacation_headers`` are
    not defined in this file; presumably they come from
    ``from models import *`` -- TODO confirm.

    Returns:
        None. Returns immediately when the queue is empty or the popped URL
        does not contain a city name.

    Raises:
        ValueError: If the URL template has no ``&lm=<n>%2C60`` paging
            parameter, or a 200 response body is not valid JSON.
    """
    # NOTE(review): assumes the Redis client returns str, not bytes
    # (decode_responses=True) -- TODO confirm; re.search below needs str.
    url = r.lpop('qunar.com:ticket_url')
    if not url:
        return

    city_match = re.search(r"%E5%8D%97%E9%80%9A_(.*)_all?", url)
    if city_match is None:
        logging.getLogger(__name__).warning(
            "no city name found in ticket url %r; skipping", url)
        return
    city_name = city_match.group(1)  # city name as it appears in the URL

    # Use a new local name: assigning to ``vacation_url`` here would make it
    # a local variable and raise UnboundLocalError when the template is read.
    city_url = vacation_url.replace("%E4%B8%8A%E6%B5%B7", city_name)
    payload_data = vacation_payload
    # Copy the shared headers so concurrent workers do not overwrite each
    # other's ``referer``.
    headers_data = dict(vacation_headers)
    headers_data['referer'] = url

    vacation_number = _fetch_vacation_count(city_url, headers_data, payload_data)

    page_pattern = re.compile(r"&lm=\d+%2C60")
    # One request per page of 60 results: offsets 0, 60, 120, ...
    for start_num in range(0, vacation_number, 60):
        page_url, replaced = page_pattern.subn(
            "&lm=" + str(start_num) + "%2C60", city_url)
        if not replaced:
            raise ValueError(
                "vacation_url has no '&lm=<n>%2C60' paging parameter")
        process_response_data(headers_data, payload_data, page_url)
def _store_vacation_product(data, product_city):
    """Save one result entry as a ``Vacation_Product`` row and queue its supplier URL."""
    vacation_data = Vacation_Product()
    # Round-trip through UTF-16 with 'surrogatepass' so surrogate pairs in
    # the title are merged into real code points.
    vacation_data.product_title = data['title'].encode('utf-16', 'surrogatepass').decode('utf-16')  # product name
    vacation_data.product_city = product_city
    vacation_data.product_price = data['accuratePrice']  # product price
    vacation_data.product_score = data['productScore']  # product rating
    vacation_data.product_reviews = data['reviews']  # number of reviews
    vacation_data.product_soldCount = data['soldCount']  # units sold
    vacation_data.tripTime = data['details']['tripTime']  # trip duration
    vacation_data.hotel_night = data['details']['hotelNight']  # hotel nights
    vacation_data.traffic_tool = data['details']['traffic']  # means of transport
    vacation_data.supplier_name = data['summary']['supplier']['name']  # supplier name
    # Only the part of the supplier URL before the first '.' is stored.
    vacation_data.supplier_url = data['summary']['supplier']['url'].split('.')[0]
    vacation_data.create_time = datetime.datetime.now()  # crawl time
    # Queue the full supplier URL in the 'qunar.com:store_url' list.
    r.lpush('qunar.com:store_url', "https:" + data['summary']['supplier']['url'])
    vacation_data.save(force_insert=True)


def process_response_data(headers_data, payload_data, vacation_url):
    """Fetch one result page and store every product it lists.

    The page is requested until the HTTP status is 200 and the JSON body has
    ``status == 0``, pausing briefly between attempts. Each entry in
    ``data.list.results`` is then stored. An entry that cannot be stored is
    logged and skipped, so one bad record does not lose the rest of the page.

    Args:
        headers_data: HTTP headers sent with the request.
        payload_data: Passed as ``data=`` to the GET request.
        vacation_url: URL of the result page to fetch.

    Raises:
        ValueError: If a 200 response body is not valid JSON.
        KeyError: If the body lacks ``status`` or ``data.list.results``.
    """
    while True:
        response = requests.request("GET", vacation_url, headers=headers_data, data=payload_data)
        if response.status_code == 200:
            json_data = json.loads(response.text)
            if json_data['status'] == 0:
                break
        # Pause before retrying so a failing endpoint is not hammered.
        time.sleep(1)

    try:
        product_city = json_data['data']['qdata']['realQuery']  # destination city
    except (KeyError, TypeError):
        # Keep the placeholder used so far when the field is missing.
        product_city = "NULL"

    for data in json_data['data']['list']['results']:
        try:
            _store_vacation_product(data, product_city)
        except Exception:
            # Best effort per item: record the failure, then carry on.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            logging.getLogger(__name__).exception(
                "skipping vacation product that could not be stored")
class parse_qunar_url_Thread(Thread):
    """Worker thread that calls ``get_nodes_json`` over and over."""

    def run(self):
        """Invoke ``get_nodes_json`` repeatedly; the loop has no exit condition."""
        while True:
            get_nodes_json()
# Script entry point: start ten crawler worker threads.
if __name__ == "__main__":
    for _ in range(10):
        worker = parse_qunar_url_Thread()
        worker.start()