zoukankan      html  css  js  c++  java
  • 去哪儿网门票数据爬虫更新

    import re
    import json
    from odps import ODPS
    from threading import Thread
    import threading
    from urllib import parse
    import datetime
    from lxml import etree

    import random 
    import requests
    import time

    from models import *


    # def write_txt(html_data):
    #     f = open("a.txt", 'a+')
    #     f.write(html_data)
    #     f.write(" ")
    #     f.close()

    def get_ticket_id(url):
        """Return the numeric sight id of a ticket page as a string.

        Fetches ``url`` with GET, re-requesting every 2 seconds for as long as
        the server answers 404, then looks up the page's
        ``<link rel="canonical">`` element and returns the first run of digits
        found in the string form of that element's attributes.

        Args:
            url: Address of a single sight's ticket page.

        Returns:
            The digit string, or ``'0'`` when the page has no canonical link
            or its attributes contain no digits (the URL and the last HTTP
            status are printed in that case).
        """
        response = requests.request("GET", url)
        # NOTE: the retry has no upper bound; a page that keeps answering 404
        # keeps this loop running.
        while response.status_code == 404:
            time.sleep(2)
            response = requests.request("GET", url)

        canonical_links = etree.HTML(response.text).xpath('//link[@rel="canonical"]')
        try:
            # The pattern must be "\d+": without the backslash it matches the
            # literal letter "d" and never yields the id.
            return re.search(r'\d+', str(canonical_links[0].attrib)).group(0)
        except (IndexError, AttributeError):
            # IndexError: no canonical link; AttributeError: no digits found.
            print(url)
            print(response.status_code)
            return '0'

    def get_nodes_json():
        """Pop one listing URL from the queue and crawl each of its result pages.

        The URL is taken from the ``'test.com:ticket_url'`` list via ``r.lpop``
        (``r`` is not defined in this block; presumably a Redis client exported
        by ``models`` -- confirm). The first page is requested to read the total
        number of results from the ``pager-container`` element, then
        ``process_response_data`` is called once per page, from the last page
        down to page 1, by rewriting ``page=1`` in the URL.

        Returns:
            None. When the queue is empty the function sleeps for one second
            before returning so that callers polling in a loop do not spin.
        """
        results_per_page = 15

        url = r.lpop('test.com:ticket_url')
        if not url:
            time.sleep(1)
            return
        if isinstance(url, bytes):
            # Some queue clients hand back bytes; the string handling below
            # needs text. (Assumes UTF-8 -- confirm against the producer.)
            url = url.decode("utf-8")

        response = requests.request("POST", url)
        while response.status_code == 404:
            # Pause between retries, as get_ticket_id does, instead of
            # re-requesting in a tight loop.
            time.sleep(2)
            response = requests.request("POST", url)

        html = etree.HTML(response.text)
        pagers = html.xpath('//div[contains(@id,"pager-container")]')
        # First run of digits in the pager's attributes is used as the total
        # result count. The pattern must be "\d+"; "d+" matches the letter "d".
        count_match = re.search(r'\d+', str(pagers[0].attrib)) if pagers else None
        if count_match is None:
            # No usable pager: still crawl the URL as given rather than raise.
            process_response_data(url)
            return

        ticket_num = int(count_match.group(0))
        # Ceiling division: an exact multiple of the page size must not
        # produce an extra, empty trailing page.
        page_num = (ticket_num + results_per_page - 1) // results_per_page
        for page in range(page_num, 0, -1):
            process_response_data(url.replace("page=1", "page=" + str(page)))

    def _fetch_comment_tags(sight_id):
        """Return the ``tagList`` entries of a sight's comment summary, or ``[]``."""
        json_url = ('https://piao.qunar.com/ticket/detailLight/sightCommentList.json'
                    '?sightId={}&index=1&page=1').format(sight_id)
        response_json = requests.request("GET", json_url)
        # Bodies shorter than 48 characters are treated as "no data" and the
        # request is repeated (threshold kept as-is; retries are unbounded).
        while len(response_json.text) < 48:
            time.sleep(1)
            response_json = requests.request("GET", json_url)

        try:
            payload = json.loads(response_json.text)
        except ValueError:
            print(json_url, "did not return JSON")
            return []
        data = payload.get('data') if isinstance(payload, dict) else None
        tags = data.get('tagList') if isinstance(data, dict) else None
        return tags or []

    def process_response_data(url):
        """Scrape one listing page and save a ``qunar_Tickets`` row per sight.

        The district name is the URL-decoded text between ``list_`` and
        ``.html`` in ``url``. The page is requested with POST (retrying while
        the status is 404). For every ``sight_item`` element the function
        fills in name, picture, comment counts, level, price, city, district,
        address, sales and creation time, then inserts the row.

        Args:
            url: Listing page URL containing ``list_<district>.html``.

        Returns:
            None. Sights whose id cannot be fetched are skipped; sights whose
            city/address/sales attributes are missing or whose insert fails
            are reported with ``print`` and not saved.
        """
        district = parse.unquote(re.search(r"list_(.*)\.html", url).group(1))

        response = requests.request("POST", url)
        while response.status_code == 404:
            # Pause between retries instead of re-requesting in a tight loop.
            time.sleep(2)
            response = requests.request("POST", url)
        html = etree.HTML(response.text)
        sights = html.xpath('//div[@class="result_list"]/div[contains(@class,"sight_item")]')

        # tagType value in the comment JSON -> ticket attribute that stores it.
        count_field_by_tag = {
            0: 'ticket_commentCount',
            1: 'ticket_positiveCount',
            2: 'ticket_neutralCount',
            3: 'ticket_negativeCount',
        }

        for sight in sights:
            attrs = sight.attrib
            ticket = qunar_Tickets()
            ticket.ticket_name = attrs['data-sight-name']
            ticket.ticket_pic = attrs['data-sight-img-u-r-l']

            # Build the sight's own page URL from the caption link.
            hrefs = sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_about"]/h3[@class="sight_item_caption"]//@href')
            url_ticket = 'https://piao.qunar.com' + "".join(hrefs)
            try:
                num = get_ticket_id(url_ticket)
            except Exception:
                continue

            for tag in _fetch_comment_tags(num):
                field = count_field_by_tag.get(tag.get('tagType'))
                if field is not None:
                    setattr(ticket, field, tag.get('tagNum'))

            # Scenic-area level; empty string when the element is absent.
            ticket.ticket_level = "".join(sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_about"]/div[@class="sight_item_info"]/div[@class="clrfix"]/span[@class="level"]/text()'))

            # Listed price; 0 when absent or not a number.
            price_text = "".join(sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_pop"]/table/tr/td/span[@class="sight_item_price"]/em/text()'))
            try:
                ticket.ticket_price = float(price_text)
            except ValueError:
                ticket.ticket_price = 0

            try:
                city_parts = attrs['data-districts'].split("·")
                ticket.ticket_city = city_parts[0] + " " + city_parts[1]
                ticket.ticket_district = district
                ticket.ticket_location = attrs['data-address']
                ticket.ticket_sales = attrs['data-sale-count']
                ticket.create_time = datetime.datetime.now()
                ticket.save(force_insert=True)
            except Exception as exc:
                # Best effort: report the sight that could not be stored and
                # move on to the next one.
                print("ticket not saved:", ticket.ticket_name, repr(exc))

    class parse_qunar_url_Thread(Thread):
        """Worker thread that calls ``get_nodes_json`` in an endless loop."""

        def run(self):
            """Process queue entries forever, surviving failures of single entries."""
            import traceback

            while True:
                try:
                    get_nodes_json()
                except Exception:
                    # One malformed page must not terminate the worker: show
                    # the traceback, pause briefly, and take the next entry.
                    traceback.print_exc()
                    time.sleep(1)


    if __name__ == "__main__":
        # Start the crawler workers; each one pulls listing URLs on its own.
        worker_total = 30
        started = 0
        while started < worker_total:
            parse_qunar_url_Thread().start()
            started += 1
        
  • 相关阅读:
    解决无线打印机休眠后掉线无法进行局域网打印的问题
    快速为某个目录的verilog文件生成filelist
    使用Visual Studio的Spy++查找弹窗广告进程
    【转载】verilog语法之generate语句的基本认识
    补码(为什么按位取反再加一):告诉你一个其实很简单的问题
    【转载】EDID的简介和解析
    win32diskimager 谨慎使用
    UXE的一些使用归纳
    如何在win8或win10系统里添加inf驱动程序
    STM32 USB HID
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13615459.html
Copyright © 2011-2022 走看看