zoukankan      html  css  js  c++  java
  • 去哪儿网门票数据爬虫更新

    import re
    import json
    from odps import ODPS
    from threading import Thread
    import threading
    from urllib import parse
    import datetime
    from lxml import etree

    import random 
    import requests
    import time

    from models import *

    # def write_txt(html_data):
    #     f = open("a.txt", 'a+')
    #     f.write(html_data)
    #     f.write(" ")
    #     f.close()

    def get_nodes_json():
        """Pop one URL from the ``qunar.com:ticket_url`` list and process it.

        ``r`` is not defined in this block (presumably a Redis-style client
        exposed by ``models`` -- verify).  The popped value is printed even
        when it is empty; nothing else happens in that case.
        """
        next_url = r.lpop('qunar.com:ticket_url')
        print(next_url)
        if not next_url:
            # Nothing was popped, so there is nothing to crawl.
            return
        process_response_data(next_url)

    def process_response_data(url):
        """Fetch one ticket-list page and insert a row for every sight on it.

        The district name is read from the ``list_<name>.html`` part of the
        URL and percent-decoded.  The page is requested with POST, parsed
        with lxml, and each ``sight_item`` node inside ``result_list`` is
        copied into a ``Tickets`` object (defined outside this block) which
        is then saved with ``force_insert=True``.

        Args:
            url: Address of the list page.  ``bytes`` is accepted and decoded
                as UTF-8 -- NOTE(review): depending on how the queue client
                is configured it may return bytes; confirm.

        Raises:
            ValueError: If ``url`` contains no ``list_<name>.html`` component.
        """
        if isinstance(url, bytes):
            url = url.decode("utf-8")

        # The dot is escaped so only a literal ".html" suffix is accepted.
        district_match = re.search(r"list_(.*)\.html", url)
        if district_match is None:
            raise ValueError(f"not a ticket list url: {url!r}")
        district = parse.unquote(district_match.group(1))

        response = requests.request("POST", url)
        html = etree.HTML(response.text)
        sights = html.xpath('//div[@class="result_list"]/div[contains(@class,"sight_item")]')

        # Print the first run of digits found in the pager container's
        # attributes.  The value is informational only, so a page without a
        # pager (or without digits) must not abort the crawl.
        pagers = html.xpath('//div[contains(@id,"pager-container")]')
        if pagers:
            count_match = re.search(r"\d+", str(pagers[0].attrib))
            if count_match is not None:
                print(int(count_match[0]))

        for sight in sights:
            attrs = sight.attrib
            ticket = Tickets()
            ticket.ticket_name = attrs['data-sight-name']
            ticket.ticket_pic = attrs['data-sight-img-u-r-l']

            # Scenic-area level; a sight without a level span yields "".
            level_texts = sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_about"]/div[@class="sight_item_info"]/div[@class="clrfix"]/span[@class="level"]/text()')
            ticket.ticket_level = "".join(level_texts)

            # Listed price; missing or non-numeric text falls back to 0.
            price_texts = sight.xpath('./div[@class="sight_item_detail clrfix"]/div[@class="sight_item_pop"]/table/tr/td/span[@class="sight_item_price"]/em/text()')
            try:
                ticket.ticket_price = float("".join(price_texts))
            except ValueError:
                ticket.ticket_price = 0

            # Keep the first two "·"-separated parts; slicing avoids an
            # IndexError when the attribute holds a single part.
            city_parts = attrs['data-districts'].split("·")
            ticket.ticket_city = " ".join(city_parts[:2])

            # The district comes from the URL, not from 'data-districts'.
            ticket.ticket_district = district
            ticket.ticket_location = attrs['data-address']
            ticket.ticket_sales = attrs['data-sale-count']
            ticket.create_time = datetime.datetime.now()

            ticket.save(force_insert=True)

    class parse_qunar_url_Thread(Thread):
        """Worker thread that calls ``get_nodes_json`` in an endless loop."""

        def run(self):
            """Process queued URLs one after another; the loop never exits."""
            # Each call saves the final data for one URL.
            while True:
                get_nodes_json()


    if __name__ == "__main__":
        # Script entry point: handle a single queued URL in the current thread.
        get_nodes_json()
        # Disabled multi-threaded variant: start ten worker threads instead.
        # for i in range(10):
        #     parse_qunar_url_thread = parse_qunar_url_Thread()
        #     parse_qunar_url_thread.start()
        
  • 相关阅读:
    java.util.regex.PatternSyntaxException: Dangling meta character '*' near index 0 *&* 解决方法
    一个罕见的MSSQL注入漏洞案例
    工具推荐:ATSCAN,功能强大的Perl脚本扫描器
    突破XSS字符限制执行任意JS代码
    用Nginx分流绕开Github反爬机制
    浅析XSS与XSSI异同
    IE安全系列之——RES Protocol
    跨站请求伪造(CSRF)攻击原理解析:比你所想的更危险
    SQL注入攻击和防御
    SQL 注入,永不过时的黑客技术
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13542429.html
Copyright © 2011-2022 走看看