zoukankan html css js c++ java

去哪儿网北京当日酒店信息爬取

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

声明：仅学习参考

版本：version_0

说明：主要是通过selenium拿到网页源码，然后通过lxml进行解析，大部分时间也花在解析网页源码提取数据上面和写逻辑上面了

技术：selenium,lxml,json

　　　在xpath中如果要提取子节点的所有文本信息，可以用 "li.xpath('string(xpath_path)')"

效果图：

源码：

from selenium import webdriver
import time
import re
import json
from lxml import etree
from urllib import parse
from pprint import pprint


class QuNaErSpider():
    """Scrape today's Beijing hotel listings from qunar.com.

    A Selenium-driven Chrome instance renders each result page, lxml
    extracts the hotel fields from the page source, and every page of
    results is appended to a local file.
    """

    # File that save_info() appends each page of results to.
    OUTPUT_PATH = "qunaer_hotel_today_info.json"
    # XPath of the "next page" control while it is still clickable.
    NEXT_PAGE_XPATH = '//p[@class="next fl_right cur able"]'

    def __init__(self):
        self.driver = webdriver.Chrome()

    def save_info(self, content):
        """Append *content* to the output file as indented JSON.

        Each call writes one JSON document followed by a comma, so the
        file as a whole is a comma-separated sequence of documents rather
        than a single valid JSON value. That on-disk format is kept
        unchanged for compatibility with existing output.
        """
        with open(self.OUTPUT_PATH if self is not None else QuNaErSpider.OUTPUT_PATH,
                  'a', encoding='utf-8') as f:
            f.write(json.dumps(obj=content, ensure_ascii=False, indent=4))
            f.write(",")
            print("写入完成")

    def parse_html(self, html_str, source_url):
        """Extract one dict per hotel from a result page.

        Args:
            html_str: HTML source of the result page.
            source_url: URL the page was loaded from; relative hotel links
                are resolved against it.

        Returns:
            A list of dicts, one per ``<li>`` under the hotel list. A field
            is ``None`` when its node is missing or its text is empty.
        """
        html_etree = etree.HTML(text=html_str)
        li_list = html_etree.xpath('//ul[contains(@id,"hotel_lst_body")]/li')

        def first(nodes):
            # XPath node queries return a list; take the first hit, if any.
            return nodes[0] if nodes else None

        current_page_info_list = []
        for li in li_list:
            name_xpath = './/div[@class="cont"]/p[@class="name"]'
            href = first(li.xpath(name_xpath + '/a/@href'))
            item = {
                "hotel_name": first(li.xpath(name_xpath + '/a/@title')),
                # NOTE: the keys 'totel_href' and 'total_type' are misspelled
                # in the original output format; they are kept as-is so that
                # previously written data and its consumers stay compatible.
                "totel_href": (
                    parse.urljoin(base=source_url, url=href)
                    if href is not None else None
                ),
                # Must be relative ('.//'): an absolute '//' path searches
                # the whole document and would give every hotel the type of
                # the first hotel on the page.
                "total_type": first(li.xpath(name_xpath + '/span[last()]/text()')),
                # string(...) concatenates all descendant text; '' -> None.
                "hotel_price": li.xpath('string(.//p[@class="price_new"])') or None,
                "hotel_address": first(
                    li.xpath('.//div[@class="cont"]/p[@class="adress"]/text()')
                ),
                "hotel_comment": li.xpath(
                    'string(.//div[@class="cont"]/p[@class="comm"])'
                ) or None,
                "hotel_subject": li.xpath(
                    'string(.//div[@class="cont"]/div[@class="subj rmb"])'
                ) or None,
            }
            current_page_info_list.append(item)
        return current_page_info_list

    def into_first_page(self, driver, url=None):
        """Open *url*, switch to the hotel tab and submit the default search.

        Args:
            driver: Selenium WebDriver to drive.
            url: Landing page to load.

        Returns:
            The same *driver*, after the search button has been clicked.
        """
        driver.get(url)
        # find_element("xpath", ...) is used instead of the
        # find_element_by_xpath shortcut, which newer Selenium releases
        # no longer provide.
        hotel_element = driver.find_element(
            "xpath", '//div[contains(@class,"q_header_mnav")]/ul/li[3]'
        )
        hotel_element.click()
        search_button = driver.find_element(
            "xpath",
            '//div[@class="G_searchIndex fl_left"]//div[@class="btn clearfix"]',
        )
        search_button.click()
        # Fixed pause to give the result list time to render.
        time.sleep(1)
        return driver

    def _find_next_button(self, driver):
        """Return the clickable "next page" element, or None on the last page."""
        # Imported here so the file's top-level import block is untouched.
        from selenium.common.exceptions import NoSuchElementException

        try:
            return driver.find_element("xpath", self.NEXT_PAGE_XPATH)
        except NoSuchElementException:
            return None

    def run(self):
        """Crawl every result page, saving each one, then close the browser."""
        root_url = "https://www.qunar.com/"
        driver = self.driver
        try:
            self.into_first_page(driver=driver, url=root_url)
            while True:
                current_page_info_list = self.parse_html(
                    html_str=driver.page_source,
                    source_url=driver.current_url,
                )
                self.save_info(current_page_info_list)

                # A missing button (including on the very first page) simply
                # ends the crawl instead of raising.
                nextpage_button = self._find_next_button(driver)
                if nextpage_button is None:
                    break
                nextpage_button.click()
                time.sleep(1)
        finally:
            # Always release the browser, even if a page fails to parse.
            driver.quit()


if __name__ == "__main__":
    # Script entry point: build the spider (this launches Chrome) and crawl.
    QuNaErSpider().run()

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

查看全文

相关阅读:
大数据概述
 c语言文法
 实验一词法分析程序实验
 语法分析程序
 第一篇博客~关于编译原理的理解
 《DenseNet Models for Tiny ImageNet Classification》课程设计论文
 物体检测的尺度效应实验
 16路PWM输出的pca9685模块
 Rocketlab公司火箭Electron介绍
 网址图书收藏

原文地址：https://www.cnblogs.com/nuochengze/p/13126607.html