zoukankan      html  css  js  c++  java
  • 爬虫之lxml

    # 解析原理:
    # - 获取页面源码数据
    # - 实例化一个etree对象,并且将页面源码数据加载到该对象中
    # - 调用该对象的xpath方法进行指定标签定位
    # - xpath函数必须结合着xpath表达式进行标签定位和内容捕获
    # xpath表达式:
    # - 属性定位: //div[@class="song"] 找到class属性值为song的div 返回一个列表
    # - 索引层级定位: //div[@class="tang"]/ul/li[2]/a
    # - 逻辑运算: //a[@href="" and @class="du"] 并且
    # - 模糊匹配: //div[contains(@class, 'ng')] class包含 ng 的div
    #            //div[starts-with(@class, 'ta')] class以 ta 开头的div
    # - 取文本: //div[@class="song"]/p[1]/text() div下的文本内容
    #          //div[@class="tang"]//text() div下以及子标签下的文本内容 返回列表
    # - 取属性: //div[@class="tang"]//a[1]/@href

    下面上几个小案例:

    import requests
    from lxml import etree

    # Scrape the 58.com second-hand housing listing page and dump
    # "title:price" lines into 58.csv.
    url = 'https://bj.58.com/ershoufang/?utm_source=sem-sales-baidu-pc&spm=85077276202.21974091622&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg'

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers).text

    tree = etree.HTML(page_text)
    # xpath() returns a list of Element objects, one per listing <li>.
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

    # `with` guarantees the file is closed even if parsing a listing raises.
    with open('58.csv', 'w', encoding='utf8') as fp:
        for li in li_list:
            # The leading '.' makes the expression relative to this <li>
            # instead of searching the whole document again.
            title_nodes = li.xpath('./div[2]/h2/a/text()')
            if not title_nodes:
                # Entry without the expected title markup: skip it rather
                # than crash on [0].
                continue
            title = title_nodes[0]
            # The price is spread over several text nodes; join them.
            price = ''.join(li.xpath('./div[3]//text()'))
            fp.write(title + ":" + price + '\n')
    print('over')
    爬取 58二手房信息
    xpath 解析图片资源
    
    import os

    import requests
    from lxml import etree

    # Download every image listed on the pic.netbian.com gallery page
    # into ./img/, named after the caption text of each entry.
    url = "http://pic.netbian.com/4kmeinv/"

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text

    tree = etree.HTML(page_text)
    # NOTE: etree.parse(...) is the recommended entry point when parsing a
    # local file; etree.HTML(...) is used here because the markup is a string.
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    # open(..., 'wb') does not create missing directories, so make sure the
    # output folder exists before the first write.
    os.makedirs('./img', exist_ok=True)

    for li in li_list:
        image_name = li.xpath('./a/b/text()')[0]
        # Re-interpret the caption bytes as GBK: the text was decoded as
        # ISO-8859-1, which garbles the Chinese characters.
        image_name = image_name.encode('iso-8859-1').decode('gbk')
        # The src attribute is site-relative, so prefix the host.
        image_url = 'http://pic.netbian.com'+li.xpath('./a/img/@src')[0]
        image_path = './img/'+image_name+'.jpg'
        img = requests.get(image_url).content
        with open(image_path, 'wb') as f:
            f.write(img)
        print(image_path+'下载成功')
    图片怎么爬取呢?
    import os

    import requests
    import base64
    from lxml import etree

    # Download the images on jandan.net/top into ./jd_img/. The page carries
    # each image address as base64 text inside <span class="img-hash">.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    url = 'http://jandan.net/top'
    response = requests.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    code_list = tree.xpath('//span[@class="img-hash"]/text()')

    # open(..., 'wb') does not create missing directories, so make sure the
    # output folder exists before the first write.
    os.makedirs('./jd_img', exist_ok=True)

    for img_code in code_list:
        # The decoded value is prefixed with the scheme here to form a full URL.
        img_url = 'http:'+base64.b64decode(img_code).decode()
        # Use the last path segment of the URL as the local file name.
        img_name = img_url.split('/')[-1]
        img_path = f'./jd_img/{img_name}'
        print(img_url)
        content = requests.get(img_url).content
        with open(img_path, 'wb') as f:
            f.write(content)
        print(img_name+'成功')
    print('over')
    有的时候我找不到我要的图片链接呀

    上面的例子中,煎蛋网通过 js 对图片链接地址做了 base64 编码(严格来说是编码而非加密),所以需要先用 base64 解码才能得到真实的图片地址

    # Resume-template scraper (deals with the IP-ban problem).
    # Workarounds:
    #       use an IP proxy,
    #       add "Connection: close" to the request headers
    import os

    import requests
    import random
    from lxml import etree

    url = 'http://sc.chinaz.com/jianli/free.html'

    headers = {
        # Drop the connection right after each request. Per the original
        # author, the change may not take effect immediately and a
        # connection-pool error can still show up; rerun in that case.
        'Connection': 'close',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@id="container"]/div/a[1]')

    # open(..., 'wb') does not create missing directories, so make sure the
    # output folder exists before the first write.
    os.makedirs('./简历模板', exist_ok=True)

    for a in a_list:
        # Re-interpret the alt text as UTF-8: it was decoded as ISO-8859-1,
        # which garbles the Chinese characters.
        title = a.xpath('./img/@alt')[0].encode('iso-8859-1').decode('utf-8')
        detail_url = a.xpath('./@href')[0]
        detail_text = requests.get(url=detail_url, headers=headers).text
        d_tree = etree.HTML(detail_text)
        down_url_list = d_tree.xpath('//div[@class="down_wrap"]//li/a/@href')
        if not down_url_list:
            # random.choice raises IndexError on an empty list; a detail page
            # without download links is skipped instead of aborting the run.
            continue
        # The detail page lists several download links; pick one at random.
        down_url = random.choice(down_url_list)
        data = requests.get(down_url,headers=headers).content
        with open(f'./简历模板/{title}.rar', 'wb') as f:
            f.write(data)
        print(title+'完成')
    print('over')
    站长之家模板资源爬取下载
  • 相关阅读:
    2016/11/17 周四 <javascript的封装简单示例>
    JavaScript资源大全中文版(Awesome最新版转载自张果老师博客)
    <web Font的使用>
    博客园首页飘彩色雪花代码
    C#多线程
    SQL Server数据库优化措施:索引优化(转)
    HOWTO: InstallShield中如何实现MSI包的权限提升(转)
    C# 获取操作系统版本信息
    installshield msi程序安装问题
    bat和VBS
  • 原文地址:https://www.cnblogs.com/Treasuremy/p/10444490.html
Copyright © 2011-2022 走看看