zoukankan      html  css  js  c++  java
  • xpath练习(链家二手房案例,百度贴吧图片抓取案例)

    链家二手房案例(xpath)

    实现步骤

      1.确定是否为静态

        打开二手房页面 -> 查看网页源码 -> 搜索关键字

      2.xpath表达式

    1、基准xpath表达式(匹配每个房源信息节点列表)
      //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]
    
    2、依次遍历后每个房源信息xpath表达式
       * 名称: './/a[@data-el="region"]/text()'
       
       # 户型+面积+方位+是否精装
       info_list = li.xpath('.//div[@class="houseInfo"]/text()')[0].strip().split('|')
       * 户型(model): info_list[1]
       * 面积(area): info_list[2]
       * 方位(direction): info_list[3]
       * 精装(perfect): info_list[4]
       
    
       * 楼层(floor): './/div[@class="positionInfo"]/text()'
       * 区域(address): './/div[@class="positionInfo"]/a/text()'
       * 总价(total_price): './/div[@class="totalPrice"]/span/text()'
       * 单价(unit_price): './/div[@class="unitPrice"]/span/text()'

      3.实现代码

    import requests
    from lxml import etree
    import time
    import random
    
    class LianjiaSpider(object):
        """Crawl pages 1-10 of the bj.lianjia.com second-hand housing list.

        Every listing found on a page is reduced to a flat dict (name, layout,
        area, direction, decoration, floor, address, total price, unit price)
        and printed to stdout.
        """

        def __init__(self):
            # '{}' is filled with the page number by main().
            self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
            self.headers = {'User-Agent' : 'Mozilla/5.0'}

        def get_page(self,url,retries=3):
            """Download one listing page and hand its HTML to parse_page.

            Args:
                url: Fully formatted page URL.
                retries: Maximum number of request attempts before giving up.

            Returns:
                True when the page was downloaded and parsed, False when every
                request attempt failed (or retries is smaller than 1).
            """
            for attempt in range(1, retries + 1):
                try:
                    # The timeout turns a stalled connection into an exception
                    # so that the attempt can be retried.
                    res = requests.get(url,headers=self.headers,timeout=5)
                except requests.RequestException as e:
                    print('Request failed (%d/%d) for %s: %s' % (attempt, retries, url, e))
                    if attempt < retries:
                        # Short random pause before asking again.
                        time.sleep(random.randint(1,3))
                    continue
                res.encoding = 'utf-8'
                # Parsing stays outside the try block: a parsing problem must
                # not be mistaken for a network error and trigger re-downloads.
                self.parse_page(res.text)
                return True
            return False

        @staticmethod
        def _first(values):
            """Return the first XPath result, or None when nothing matched."""
            return values[0] if values else None

        @staticmethod
        def _parse_house_info(raw):
            """Split the '|'-separated houseInfo text into named fields.

            Fields 1-4 are mapped to layout, area, direction and decoration;
            field 0 is ignored. Fields that are absent are reported as None
            instead of raising IndexError.

            Args:
                raw: Text of the houseInfo node, or None when it is missing.

            Returns:
                Dict with the keys 'house_model', 'area', 'direction' and
                'hardcover'.
            """
            keys = ('house_model', 'area', 'direction', 'hardcover')
            fields = [part.strip() for part in raw.split('|')] if raw else []
            return {
                key: (fields[idx] if idx < len(fields) else None)
                for idx, key in enumerate(keys, start=1)
            }

        def parse_page(self,html):
            """Extract every listing from one page of HTML and print it.

            Args:
                html: Page source as text; empty or None input is ignored.
            """
            if not html:
                return
            parse_html = etree.HTML(html)
            if parse_html is None:
                return
            # Base XPath: one <li> node per listing.
            li_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
            for li in li_list:
                # A fresh dict per listing, so a field missing from this
                # listing can never show a value left over from the previous one.
                house_dict = {}

                # Listing name
                house_dict['house_name'] = self._first(li.xpath('.//a[@data-el="region"]/text()'))

                # Layout + area + direction + decoration
                info_text = self._first(li.xpath('.//div[@class="houseInfo"]/text()'))
                house_dict.update(self._parse_house_info(info_text))

                # Floor: the last two characters of the stripped text are dropped.
                floor_text = self._first(li.xpath('.//div[@class="positionInfo"]/text()'))
                house_dict['floor'] = floor_text.strip()[:-2] if floor_text is not None else None

                # Address, total price and unit price share the same handling:
                # first match, stripped, or None when the node is missing.
                simple_fields = (
                    ('address', './/div[@class="positionInfo"]/a/text()'),
                    ('total_price', './/div[@class="totalPrice"]/span/text()'),
                    ('unit_price', './/div[@class="unitPrice"]/span/text()'),
                )
                for key, expr in simple_fields:
                    text = self._first(li.xpath(expr))
                    house_dict[key] = text.strip() if text is not None else None

                print(house_dict)

        def main(self):
            """Crawl pages 1 to 10, pausing 1-3 seconds between pages."""
            for pg in range(1,11):
                url = self.url.format(str(pg))
                # Report success only when the page was really fetched.
                if self.get_page(url):
                    print('第%d页爬取成功' % pg)
                else:
                    print('第%d页爬取失败' % pg)
                time.sleep(random.randint(1,3))
    
    if __name__ == '__main__':
      # Run the whole crawl and report the wall-clock time it took.
      began = time.time()
      LianjiaSpider().main()
      elapsed = time.time() - began
      print('执行时间:%.2f' % elapsed)
    代码实现

    百度贴吧图片抓取

    目标:抓取指定贴吧所有图片

    思路:

    1、获取贴吧主页URL,下一页,找到不同页的URL规律
    2、获取1页中所有帖子URL地址: [帖子链接1,帖子链接2,...]
    3、对每个帖子链接发请求,获取图片URL
    4、向图片的URL发请求,以wb方式写入本地文件

    实现步骤:

      1.贴吧url规律

    http://tieba.baidu.com/f?kw=贴吧名&pn=(页码-1)*50

      2.xpath表达式

    1、帖子链接xpath
       //div[@class="t_con cleafix"]/div/div/div/a/@href
        
    2、图片链接xpath
       //div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src
        
    3、视频链接xpath
       //div[@class="video_src_wrapper"]/embed/@data-video
       # 注意: 前端对响应内容做了处理,视频链接需要在网页源代码中查看(可复制HTML代码到在线工具格式化后再查找)

      3.代码实现

    import requests
    from urllib import parse
    from lxml import etree
    import time 
    import random
    
    class BaiduImgSpider(object):
        """Download the images posted in the threads of a Baidu Tieba forum.

        The forum name and the page range are read from stdin by main(). Every
        thread linked from a forum page is fetched, and each image found in it
        is written to the current working directory.
        """

        def __init__(self):
            # '{}' is filled with the url-encoded query string (kw, pn).
            self.url = 'http://tieba.baidu.com/f?{}'
            self.headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

        def get_html(self,url,retries=3,timeout=5):
            """Fetch a page and return its text.

            Args:
                url: Address to request.
                retries: Maximum number of request attempts.
                timeout: Seconds to wait for the server on each attempt.

            Returns:
                The response body decoded as UTF-8, or None when every attempt
                failed.
            """
            for attempt in range(1, retries + 1):
                try:
                    res = requests.get(url=url,headers=self.headers,timeout=timeout)
                except requests.RequestException as e:
                    print('Request failed (%d/%d) for %s: %s' % (attempt, retries, url, e))
                    continue
                res.encoding = 'utf-8'
                # Returning from inside the loop hands the text of a successful
                # retry back to the caller.
                return res.text
            return None

        def xpath_func(self,xpath_bds,html):
            """Evaluate an XPath expression against HTML text.

            Args:
                xpath_bds: XPath expression.
                html: Page source; None or an empty string yields no matches.

            Returns:
                List of XPath results, empty when there is nothing to parse.
            """
            if not html:
                return []
            parse_html = etree.HTML(html)
            if parse_html is None:
                return []
            return parse_html.xpath(xpath_bds)

        def get_tlink(self,url):
            """Process one forum page: visit each thread and save its images.

            Args:
                url: Address of the forum listing page.
            """
            html = self.get_html(url)
            xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
            # tlink_list looks like ['/p/23234','/p/9032323']
            tlink_list = self.xpath_func(xpath_bds,html)
            if not tlink_list:
                print('No Data')
                return
            for tlink in tlink_list:
                t_url = 'http://tieba.baidu.com' + tlink
                # Extract the image links of this thread and save the images.
                self.get_image(t_url)
                time.sleep(random.randint(1,3))

        def get_image(self,t_url):
            """Download every image found in one thread.

            Args:
                t_url: Absolute address of the thread page.
            """
            html = self.get_html(t_url)
            xpath_bds = '//*[@class="d_post_content j_d_post_content  clearfix"]/img/@src'
            imglink_list = self.xpath_func(xpath_bds,html)

            for imglink in imglink_list:
                try:
                    html_content = requests.get(imglink,headers=self.headers,timeout=10).content
                except requests.RequestException as e:
                    # One broken image must not abort the rest of the crawl.
                    print('Image request failed for %s: %s' % (imglink, e))
                    continue
                # NOTE: the file name is the last 10 characters of the link, so
                # links sharing that suffix overwrite one another.
                filename = imglink[-10:]
                with open(filename,'wb') as f:
                    f.write(html_content)
                print('%s下载成功' % filename)

        def main(self):
            """Ask for the forum name and page range, then crawl those pages."""
            name = input('请输入贴吧名:')
            begin = int(input('请输入起始页:'))
            end = int(input('请输入终止页:'))
            for page in range(begin,end+1):
                # 'pn' is built as (page - 1) * 50.
                params = parse.urlencode({
                    'kw' : name,
                    'pn' : str( (page-1)*50 ),
                })
                url = self.url.format(params)

                # Download the images of every thread on this page.
                self.get_tlink(url)
    
    if __name__ == '__main__':
      # Script entry point: build the spider and start the interactive crawl.
      BaiduImgSpider().main()
    代码实现
  • 相关阅读:
    并发技术
    体系结构基础
    TCP协议总结
    HTTP协议总结
    SQL Cookbook:使用字符串
    模运算的基本性质
    682. Baseball Game
    编程之法:面试和算法心得(字符串的全排列)
    编程之法:面试和算法心得(最长回文子串)
    编程之法:面试和算法心得(回文判断)
  • 原文地址:https://www.cnblogs.com/maplethefox/p/11338195.html
Copyright © 2011-2022 走看看