zoukankan      html  css  js  c++  java
  • 6、通过xpath获取网页数据

    1、xpath解析网页源文件

    from urllib import request
    from lxml import etree
    # URL of the page to scrape
    url = "http://www.dfenqi.cn/Product/Index"
    # Request headers: a browser-style User-Agent string is sent with the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
    }
    # Build the request object
    req = request.Request(url, headers=headers)
    # Build an opener around a plain HTTP handler
    httpHandler = request.HTTPHandler()
    opener = request.build_opener(httpHandler)
    # Send the request and read the page source. The with-block closes the
    # response (and its underlying connection) as soon as the body is read,
    # instead of leaving it open for the rest of the script.
    with opener.open(req) as response:
        html = response.read().decode('utf-8')
    # XPath selecting the text of every <p> under div.liebiao/ul/li
    xpath = "//div[@class='liebiao']/ul/li/p/text()"
    # Alternative: select the list of "class" attribute values instead
    # xpath = "//div[@class='liebiao']/ul/li/p/@class"
    # Parse the HTML text into an element tree that can be queried
    selector = etree.HTML(html)
    # Run the XPath query; the result is a list of matches
    goodsList = selector.xpath(xpath)
    # Print each product title
    for goods in goodsList:
        print(goods)
    

    2、xpath解析源文件,并下载图片至本地

    from urllib import request
    from lxml import etree
    import os
    
    class Spilder():
        '''
        Minimal scraper: fetch a page, extract values with an XPath query,
        download images and store them in an "image" sub-folder of the
        current working directory.
        '''

        def __init__(self,pageUrl):
            '''
            Store the target URL and prepare the HTTP opener (no request is sent here).
            :param pageUrl: URL of the page to scrape
            '''
            # URL of the page to scrape
            self.pageUrl = pageUrl
            # Request headers sent with every request made by this object
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
            }
            # HTTP handler used to build the opener
            self.httpHandler = request.HTTPHandler()
            # Opener through which all requests are sent
            self.opener = request.build_opener(self.httpHandler)

        def loadPage(self):
            '''
            Request the page at self.pageUrl.
            :return: the raw page source as bytes
            '''
            req = request.Request(self.pageUrl, headers=self.headers)
            # The with-block closes the response once the body has been read
            with self.opener.open(req) as response:
                return response.read()

        def getImageUrls(self,html,xpath):
            '''
            Parse the page source and evaluate an XPath expression on it.
            :param html: page source
            :param xpath: XPath expression string
            :return: list of XPath matches
            '''
            selector = etree.HTML(html)
            return selector.xpath(xpath)

        def loadImage(self,url):
            '''
            Download one image.
            :param url: image URL; a relative or protocol-relative value (as often
                        found in an <img src>) is resolved against self.pageUrl,
                        an absolute URL is used as given
            :return: the image data as bytes
            '''
            # Local import: the file only imports urllib.request at module level
            from urllib.parse import urljoin

            # Request() rejects URLs without a scheme, so make the URL absolute first
            absoluteUrl = urljoin(self.pageUrl, url)
            req = request.Request(absoluteUrl, headers=self.headers)
            # The with-block closes the response once the body has been read
            with self.opener.open(req) as response:
                return response.read()

        def writeImage(self,img,imgName):
            '''
            Create an "image" sub-folder in the current working directory
            (if it does not exist yet) and write the image into it.
            :param img: image data (bytes)
            :param imgName: file name to store the image under
            :return:
            '''
            folderName = os.path.join(os.path.abspath(os.curdir), "image")
            # exist_ok avoids the check-then-create race of isdir() + mkdir()
            os.makedirs(folderName, exist_ok=True)
            # Write to the same folder that was just ensured, not a separately
            # spelled relative path
            with open(os.path.join(folderName, imgName), 'wb') as f:
                f.write(img)
    
    if __name__ == "__main__":
        # Page to scrape
        url = "http://www.dfenqi.cn/Product/Index"
        spilder = Spilder(url)
        html = spilder.loadPage()
        # XPath selecting the src attribute of each <img> under div.liebiao/ul/li/div/a
        xpath = "//div[@class='liebiao']/ul/li/div/a/img/@src"
        imgUrls = spilder.getImageUrls(html,xpath)
        # Files are numbered from 1: img1.jpg, img2.jpg, ...
        # The loop variable is named imgUrl so it does not overwrite the page url above.
        for index, imgUrl in enumerate(imgUrls, start=1):
            img = spilder.loadImage(imgUrl)
            spilder.writeImage(img,'img%s.jpg' % index)
    
  • 相关阅读:
    moment JS 时间操作指南
    react 项目使用 echarts-wordcloud(文字云)
    moment实现计算两个时间的差值
    JS实现回到页面顶部的五种写法(从实现到增强)
    关于谷歌浏览器携带cookie失效解决方案
    Axios发送请求下载文件(重写二进制流文件)
    修改 input / textarea placeholder 属性的颜色和字体大小
    js实现数组浅拷贝和深拷贝
    JS中的可枚举属性与不可枚举属性
    物流管理
  • 原文地址:https://www.cnblogs.com/toloy/p/8618007.html
Copyright © 2011-2022 走看看