zoukankan      html  css  js  c++  java
  • 去哪儿网北京当日酒店信息爬取

    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

    声明:仅学习参考

    版本:version_0

    说明:主要是通过selenium拿到网页源码,然后通过lxml进行解析,大部分时间也花在解析网页源码提取数据上面和写逻辑上面了

    技术:selenium,lxml,json

       在xpath中如果要提取子节点的所有文本信息,可以用 "li.xpath('string(xpath_path)')"

    效果图:

    源码:

    from selenium import webdriver
    import time
    import re
    import json
    from lxml import etree
    from urllib import parse
    from pprint import pprint
    
    
    class QuNaErSpider():
        """Scrape today's Beijing hotel listings from Qunar (qunar.com).

        A selenium-driven Chrome browser loads each result page, lxml parses
        the page source into a list of per-hotel dicts, and every page's list
        is appended to a local text file.
        """

        # XPath matching the "next page" control; it is looked up after every
        # page and pagination stops as soon as it can no longer be found.
        _NEXT_PAGE_XPATH = '//p[@class="next fl_right cur able"]'

        def __init__(self):
            self.driver = webdriver.Chrome()

        def save_info(self, content):
            """Append *content* to ``qunaer_hotel_today_info.json``.

            Each call writes one pretty-printed JSON document followed by a
            comma, so after several calls the file holds a comma-separated
            sequence of documents rather than a single JSON value.

            Args:
                content: Any JSON-serialisable object (here: a list of dicts).
            """
            serialized = json.dumps(obj=content, ensure_ascii=False, indent=4)
            with open("qunaer_hotel_today_info.json", 'a+', encoding='utf-8') as f:
                f.write(serialized + ",")
                print("写入完成")

        def parse_html(self, html_str, source_url):
            """Extract one dict per hotel from a result page.

            Args:
                html_str: HTML source of the result page.
                source_url: URL the page was loaded from; used to resolve the
                    hotel links into absolute URLs.

            Returns:
                A list of dicts with the keys ``hotel_name``, ``totel_href``,
                ``total_type``, ``hotel_price``, ``hotel_address``,
                ``hotel_comment`` and ``hotel_subject``. A field that is
                missing or empty on the page is stored as ``None``.
            """
            html_etree = etree.HTML(text=html_str)
            current_page_info_list = []
            for li in html_etree.xpath('//ul[contains(@id,"hotel_lst_body")]/li'):
                hotel_name = li.xpath('.//div[@class="cont"]/p[@class="name"]/a/@title')
                hotel_href = li.xpath('.//div[@class="cont"]/p[@class="name"]/a/@href')
                # The path must start with './/' so it is evaluated relative to
                # this <li>; a leading '//' searches the whole document and
                # would give every hotel the first type found on the page.
                hotel_type = li.xpath('.//div[@class="cont"]/p[@class="name"]/span[last()]/text()')
                # string(...) concatenates all descendant text into one string.
                hotel_price = li.xpath('string(.//p[@class="price_new"])')
                hotel_address = li.xpath('.//div[@class="cont"]/p[@class="adress"]/text()')
                hotel_comment = li.xpath('string(.//div[@class="cont"]/p[@class="comm"])')
                hotel_subject = li.xpath('string(.//div[@class="cont"]/div[@class="subj rmb"])')
                # NOTE: 'totel_href' / 'total_type' are misspelled but kept
                # unchanged so previously written output stays compatible.
                current_page_info_list.append({
                    "hotel_name": hotel_name[0] if hotel_name else None,
                    'totel_href': (
                        parse.urljoin(base=source_url, url=hotel_href[0])
                        if hotel_href else None
                    ),
                    'total_type': hotel_type[0] if hotel_type else None,
                    'hotel_price': hotel_price or None,
                    "hotel_address": hotel_address[0] if hotel_address else None,
                    'hotel_comment': hotel_comment or None,
                    'hotel_subject': hotel_subject or None,
                })
            return current_page_info_list

        def into_first_page(self, driver, url=None):
            """Open *url* and navigate to the first page of search results.

            Clicks the third entry of the header navigation (presumably the
            hotel tab, going by the original variable name) and then the search
            button, leaving the form at whatever defaults the site provides.

            Args:
                driver: The selenium WebDriver to drive.
                url: Address of the site's landing page.

            Returns:
                The same *driver*, now showing the result page.
            """
            driver.get(url)
            hotel_element = driver.find_element_by_xpath('//div[contains(@class,"q_header_mnav")]/ul/li[3]')
            hotel_element.click()
            search_button = driver.find_element_by_xpath('//div[@class="G_searchIndex fl_left"]//div[@class="btn clearfix"]')
            search_button.click()
            # Fixed pause after the click, presumably to let the results render.
            time.sleep(1)
            return driver

        def _find_next_button(self, driver):
            """Return the "next page" element, or ``None`` when there is none."""
            # Imported locally so the module's import block stays untouched.
            from selenium.common.exceptions import NoSuchElementException

            try:
                return driver.find_element_by_xpath(self._NEXT_PAGE_XPATH)
            except NoSuchElementException:
                return None

        def run(self):
            """Scrape every result page and save it, then close the browser.

            The browser is closed even when scraping fails part-way, so no
            Chrome process is left behind.
            """
            root_url = "https://www.qunar.com/"
            try:
                driver = self.into_first_page(driver=self.driver, url=root_url)
                while True:
                    current_page_info_list = self.parse_html(
                        html_str=driver.page_source,
                        source_url=driver.current_url,
                    )
                    self.save_info(current_page_info_list)
                    nextpage_button = self._find_next_button(driver)
                    if nextpage_button is None:
                        # No further page (also covers a single-page result).
                        break
                    nextpage_button.click()
                    # Fixed pause, presumably to let the next page load.
                    time.sleep(1)
            finally:
                self.driver.quit()
    
    
    # Script entry point: build the spider (this launches Chrome) and crawl.
    if __name__ == "__main__":
        spider = QuNaErSpider()
        spider.run()

    <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

  • 相关阅读:
    Qt 查询字符串数据
    #include <stdint.h>
    滤波器设计-巴特沃尔斯低通滤波设计 转
    小波学习之二(单层一维离散小波变换DWT的Mallat算法C++实现优化)--转载
    机器学习之Bagging与随机森林笔记
    机器学习之决策树笔记
    机器学习之softmax回归笔记
    机器学习之逻辑回归(Logistic)笔记
    机器学习之模型拟合效果的判断笔记
    机器学习最小二乘法笔记
  • 原文地址:https://www.cnblogs.com/nuochengze/p/13126607.html
Copyright © 2011-2022 走看看