zoukankan      html  css  js  c++  java
  • 去哪儿网门票数据爬虫更新

    import re
    import json
    from odps import ODPS
    from threading import Thread
    import threading
    from urllib import parse
    import datetime
    from lxml import etree

    import random 
    import requests
    import time

    from models import *

    # def write_txt(html_data):
    #     f = open("a.txt", 'a+')
    #     f.write(html_data)
    #     f.write(" ")
    #     f.close()

    def get_nodes_json():
        """Pop one URL from the 'qunar.com:ticket_url' list and process it.

        The popped value is printed as-is. Nothing else happens when the pop
        yields a falsy value (for example, nothing was queued).
        """
        # NOTE(review): `r` is not defined in this block; presumably a redis
        # client exposed through `from models import *` - confirm.
        next_url = r.lpop('qunar.com:ticket_url')
        print(next_url)
        if not next_url:
            return
        process_response_data(next_url)

    def _extract_ticket_count(html):
        """Return the first integer found in the pager container's attributes, or None."""
        pagers = html.xpath('//div[contains(@id,"pager-container")]')
        if not pagers:
            return None
        # The pattern must be r'\d+' (digits); a bare 'd+' only matches the letter "d".
        digits = re.search(r'\d+', str(pagers[0].attrib))
        return int(digits.group(0)) if digits else None

    def process_response_data(url):
        """Fetch one ticket listing page and save every listed sight as a ``Tickets`` row.

        The district name is taken from the ``list_<district>.html`` part of the
        URL and URL-decoded. Example URL:
        https://piao.qunar.com/ticket/list_%E6%96%97%E9%97%A8%E5%8C%BA.html?from=mps_search_suggest_c&keyword=%E6%96%97%E9%97%A8%E5%8C%BA&page=1

        Args:
            url: Listing page URL, as ``str`` or UTF-8 encoded ``bytes``.

        Raises:
            ValueError: If the URL does not contain ``list_<district>.html``.
            KeyError: If a sight element lacks one of the expected ``data-*``
                attributes.
        """
        # Queue clients may hand back bytes; the regex below needs str.
        if isinstance(url, bytes):
            url = url.decode("utf-8")

        district_match = re.search(r"list_(.*)\.html", url)  # district name segment
        if district_match is None:
            raise ValueError("cannot extract district from url: %r" % (url,))
        district = parse.unquote(district_match.group(1))  # percent-decoding

        # NOTE(review): the page is requested with POST, as in the original code;
        # confirm whether GET is what the site expects.
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.request("POST", url, timeout=30)
        html = etree.HTML(response.text)
        sights = html.xpath('//div[@class="result_list"]/div[contains(@class,"sight_item")]')

        # Only printed for information; None when the pager or its number is absent.
        ticket_num = _extract_ticket_count(html)
        print(ticket_num)

        for sight in sights:
            attrs = sight.attrib
            ticket = Tickets()
            ticket.ticket_name = attrs['data-sight-name']
            ticket.ticket_pic = attrs['data-sight-img-u-r-l']

            # Scenic spot level; an absent node yields an empty string.
            level_texts = sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_about"]/div[@class="sight_item_info"]/div[@class="clrfix"]/span[@class="level"]/text()')
            ticket.ticket_level = "".join(level_texts)

            # Ticket face price; fall back to 0 when missing or not numeric.
            price_texts = sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_pop"]/table/tr/td/span[@class="sight_item_price"]/em/text()')
            try:
                ticket.ticket_price = float("".join(price_texts))
            except ValueError:
                ticket.ticket_price = 0

            # Keep the first two "·"-separated parts; a value with a single
            # part is stored unchanged instead of raising IndexError.
            city_parts = attrs['data-districts'].split("·")
            ticket.ticket_city = " ".join(city_parts[:2])
            ticket.ticket_district = district
            ticket.ticket_location = attrs['data-address']
            ticket.ticket_sales = attrs['data-sale-count']
            ticket.create_time = datetime.datetime.now()

            ticket.save(force_insert=True)

    class parse_qunar_url_Thread(Thread):
        """Worker thread that calls ``get_nodes_json`` over and over."""

        def run(self):
            """Invoke ``get_nodes_json`` in a loop that has no exit condition."""
            while True:
                get_nodes_json()
                # save the final data


    if __name__ == "__main__":
        # Script entry point: make a single get_nodes_json() call, then exit.
        get_nodes_json()
        # Disabled multi-threaded variant: start 10 parse_qunar_url_Thread
        # workers, each of which calls get_nodes_json() in a loop.
        # for i in range(10):
        #     parse_qunar_url_thread = parse_qunar_url_Thread()
        #     parse_qunar_url_thread.start()
        
  • 相关阅读:
    CNN结构:SPP-Net为CNNs添加空间尺度卷积-神经元层
    VR: AR和VR演进哲学
    OpenCV:使用 随机森林与GBDT
    VTK初始化New返回Null问题
    VTK嵌入MFC同步显示
    VTK:VTK嵌入MFC成功
    深度学习的技术困难
    编译Caffe-Win错误集锦
    Reducing the Dimensionality of Data with Neural Networks:神经网络用于降维
    VC维与DNN的Boundary
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13542429.html
Copyright © 2011-2022 走看看