zoukankan      html  css  js  c++  java
  • Python 之lxml解析库

     一、XPath常用规则

     二、解析html文件

    from lxml import etree
    
    
    # Parse an HTML file from disk
    def parse_html_file(path="./test.html"):
        """Parse an HTML file with lxml's HTML parser and print the serialized tree.

        Args:
            path: Location of the HTML file to read. Defaults to
                ``./test.html``, the path that used to be hard-coded.

        Sample output recorded for the author's ``test.html`` (the ``&#13;``
        entities are presumably carriage returns from CRLF line endings in
        that file -- TODO confirm)::

            <!DOCTYPE html>
            <html lang="en">&#13;
            <head>&#13;
                <meta charset="UTF-8"/>&#13;
                <title>Title</title>&#13;
            </head>&#13;
            <body>&#13;
                <h1>yangs</h1>&#13;
            </body>&#13;
            </html>
        """
        tree = etree.parse(path, parser=etree.HTMLParser())
        # tostring() returns bytes here, so decode before printing.
        print(etree.tostring(tree).decode("utf-8"))
    
    
    # Select text nodes by position
    def get_text_node(text):
        """Print the text of the second <li> under each <ul>, selected two ways.

        The explicit ``position()=2`` predicate and the ``[2]`` shorthand are
        both evaluated against the tree parsed from ``text``.
        """
        root = etree.HTML(text, parser=etree.HTMLParser())
        queries = ("//ul/li[position()=2]/text()", "//ul/li[2]/text()")
        for query in queries:
            print(root.xpath(query))  # ['你好!!!'] for the sample markup
    
    
    # Select every element
    def get_all_node(text):
        """Print the list of all elements in the tree parsed from ``text``."""
        html_parser = etree.HTMLParser()
        document = etree.HTML(text, parser=html_parser)
        # "//*" matches every element node anywhere in the document.
        every_element = document.xpath("//*")
        print(every_element)
    
    
    # Select child nodes
    def get_children_node(text):
        """Print the <a> elements reached through the div/ul/li child chain."""
        tree_root = etree.HTML(text, parser=etree.HTMLParser())
        # Each "/" step descends exactly one level: div -> ul -> li -> a.
        anchors = tree_root.xpath("//div/ul/li/a")
        print(anchors)
    
    
    # Select parent nodes
    def get_parent_node(text):
        """Print the parent element of every <a> in the tree parsed from ``text``."""
        dom = etree.HTML(text, parser=etree.HTMLParser())
        # ".." steps from each matched <a> up to its parent element.
        parents_of_links = dom.xpath("//a/..")
        print(parents_of_links)
    
    
    # Match on an attribute value
    def math_attr(text):
        """Print the text of each <a> whose href attribute equals '2.html'."""
        html_parser = etree.HTMLParser()
        root = etree.HTML(text, parser=html_parser)
        link_text = root.xpath("//a[@href='2.html']/text()")
        print(link_text)  # ['hello world'] for the sample markup
    
    
    # Read attribute values
    def get_attr(text):
        """Print the href attribute value of every <a> element in ``text``."""
        page = etree.HTML(text, parser=etree.HTMLParser())
        # "@href" selects the attribute itself rather than the element.
        hrefs = page.xpath("//a/@href")
        print(hrefs)  # ['1.html', '2.html'] for the sample markup
    
    
    # Match an attribute that holds several values
    def match_more_attr(text):
        """Print link text inside each <li> whose class attribute contains 'aaa'.

        ``contains()`` is a substring test, so it also matches a multi-valued
        class attribute such as ``"aaa last-li"``.
        """
        html_parser = etree.HTMLParser()
        root = etree.HTML(text, parser=html_parser)
        matched_text = root.xpath("//li[contains(@class, 'aaa')]/a/text()")
        print(matched_text)  # ['yangs'] for the sample markup
    
    
    if __name__ == '__main__':
        # Sample markup shared by all of the text-based demos.
        text = '''
            <div>
                <ul>
                    <li class="aaa last-li"><a href="1.html">yangs</a></li>
                    <li>你好!!!</li>
                    <li class="last-li"><a href="2.html">hello world</a></li>
                </ul>
            </div>
        '''
        # Run each text-based demo; previously `text` was defined but never
        # used, so running the script printed nothing. parse_html_file() is
        # left out because it needs a test.html file on disk.
        for demo in (get_text_node, get_all_node, get_children_node,
                     get_parent_node, math_attr, get_attr, match_more_attr):
            demo(text)

     三、去哪儿网html抓取案例

    import requests
    from lxml import etree
    
    
    def go_where(keyword, timeout=10):
        """Search Qunar's ticket listing for ``keyword`` and return the sights found.

        Args:
            keyword: Search term sent as the ``keyword`` query parameter.
            timeout: Seconds to wait for the HTTP response before giving up.

        Returns:
            A list with one dict per ``<div class='sight_item'>`` element,
            holding the keys ``name``, ``districts``, ``point``, ``address``
            and ``img_url``. A value is ``None`` when the element lacks the
            corresponding ``data-*`` attribute. Returns ``None`` (after
            printing the error) when the request or the HTML parsing fails.
        """
        url = "https://piao.qunar.com/ticket/list.htm"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        }
        try:
            # Let requests build the query string so the keyword is URL-encoded
            # even when it contains characters such as "&", "#" or spaces.
            response = requests.get(
                url,
                params={"keyword": keyword},
                headers=headers,
                timeout=timeout,
            )
            html = response.content.decode("utf-8")
        except (requests.RequestException, UnicodeDecodeError) as e:
            # requests reports failures as RequestException, not RuntimeError,
            # so the old handler never fired. Stop here: there is no page to parse.
            print(e)
            return None
        try:
            html_object = etree.HTML(html, parser=etree.HTMLParser())
        except etree.LxmlError as e:
            # LxmlError is the base class of lxml's parser errors.
            print(e)
            return None
        if html_object is None:
            # Defensive guard: nothing parsable came back, so there is no tree.
            return None
        # Query the document once and read each attribute from its own element.
        # The old code re-ran five whole-document queries per item and paired
        # results by index, which misaligns if any item lacks an attribute.
        return [
            {
                "name": item.get("data-sight-name"),
                "districts": item.get("data-districts"),
                "point": item.get("data-point"),
                "address": item.get("data-address"),
                "img_url": item.get("data-sight-img-u-r-l"),
            }
            for item in html_object.xpath("//div[@class='sight_item']")
        ]
    
    
    if __name__ == '__main__':
        data = go_where("温州")
        # Sample output recorded by the original author for this keyword:
        # [{'name': '雁荡山', 'districts': '浙江·温州·乐清市', 'point': '121.095868,28.352028', 'address': '浙江省温州乐清市雁荡镇雁山路88号', 'img_url': 'https://imgs.qunarzz.com/sight/p0/1604/70/7094d064511234be90.img.jpg_280x200_03cb9d77.jpg'},
        #  {'name': '江心屿', 'districts': '浙江·温州·鹿城区', 'point': '120.645422,28.032889', 'address': '浙江省温州市鹿城区望江东路119号', 'img_url': 'https://imgs.qunarzz.com/sight/p0/201402/24/0b725e8cd5bb14af0a634e7dc7057e15.jpg_280x200_6042c3f2.jpg'},
        #  {'name': '楠溪江', 'districts': '浙江·温州·永嘉县', 'point': '120.696651,28.063045', 'address': '浙江省温州市永嘉县楠溪江风景区', 'img_url': 'https://imgs.qunarzz.com/sight/p0/1603/20/20e0961e888e8db790.water.jpg_280x200_373cc32f.jpg'},
        #  {'name': '石桅岩', 'districts': '浙江·温州·楠溪江', 'point': '120.906672,28.38873', 'address': '浙江省温州市永嘉县鹤盛乡', 'img_url': 'https://imgs.qunarzz.com/sight/p0/201301/16/18efacf1a049d44793835fbb.jpg_280x200_ded84edf.jpg'},
        #  {'name': '龙湾潭国家森林公园', 'districts': '浙江·温州·楠溪江', 'point': '120.881758,28.343969', 'address': '浙江省温州市永嘉县鹤盛乡季家岙', 'img_url': 'https://imgs.qunarzz.com/sight/p0/201301/15/3a5b3d27b59a888393835fbb.jpg_280x200_fb391fc7.jpg'},
        #  {'name': '大龙湫', 'districts': '浙江·温州·雁荡山', 'point': '121.060234,28.354889', 'address': '浙江省温州乐清市雁荡山雁山路88号', 'img_url': 'https://imgs.qunarzz.com/sight/p0/201405/20/2d7f19b34f7a6064e9bb8dc0d531e4b1.jpg_280x200_cc049369.jpg'},
        #  {'name': '仙叠岩', 'districts': '浙江·温州·洞头县', 'point': '121.171743,27.82429', 'address': '浙江省温州市洞头县', 'img_url': 'https://imgs.qunarzz.com/sight/p73/201211/03/b3a8633322999c0d93835fbb.jpg_280x200_8d32214b.jpg'},
        #  {'name': '灵峰', 'districts': '浙江·温州·雁荡山', 'point': '121.122449,28.38293', 'address': '浙江省温州乐清市中雁荡山的东大门', 'img_url': 'https://imgs.qunarzz.com/sight/p0/1410/14/72a3cf0e134514459208762c339ea137.jpg_280x200_a9c29a7f.jpg'},
        #  {'name': '雁荡山净名谷', 'districts': '浙江·温州·雁荡山', 'point': '121.106647,28.37454', 'address': '浙江省温州乐清市雁荡镇响岭头村净名路16-1号', 'img_url': 'https://imgs.qunarzz.com/sight/p0/201301/16/32144da037b4f5bd93835fbb.jpg_280x200_9d664b50.jpg'},
        #  {'name': '小龙湫', 'districts': '浙江·温州·灵岩', 'point': '121.09865,28.365099', 'address': '浙江省温州市乐清市雁荡山白芙线旁', 'img_url': 'https://imgs.qunarzz.com/sight/p47/201211/02/bf0df4ce367cf77893835fbb.jpg_280x200_7db1df9d.jpg'},
        #  {'name': '雁荡山飞拉达攀岩景区', 'districts': '浙江·温州·雁荡山', 'point': '121.059208,28.399697', 'address': '浙江省温州市乐清市仙溪镇龙西乡庄屋村', 'img_url': 'https://imgs.qunarzz.com/sight/p0/1802/48/488f3680d455fc9da3.img.jpg_280x200_62d7e7f7.jpg'}]
        print(data)

     如果有人对我的案例代码有优化建议,欢迎发给我。

  • 相关阅读:
    coffee.js
    domOperation.js
    ImmediateFunc.js
    callback.js
    array.js
    asynchronous.js
    addEventListener.js
    meta的日常设置
    11.11 双十一 前端教你一键领取天猫千张优惠券 (领前先想想有没有钱花这些优惠券)
    前端的最后是逻辑和数学
  • 原文地址:https://www.cnblogs.com/yang-2018/p/10930340.html
Copyright © 2011-2022 走看看