zoukankan      html  css  js  c++  java
  • python3 xpath数据获取案例

    import requests
    from retrying import retry
    from lxml import etree
    import json


    class DaCheng(object):
        """Scrape the professionals listing of www.dachengnet.com with XPath.

        The workflow (see ``run``) is: build the list of page URLs, download
        and parse each page, extract one dict per table row, and append the
        dicts to ``DaCheng.json``.
        """

        def __init__(self):
            # URL template; ``{}`` is replaced by the page number.
            self.temp_url = "http://www.dachengnet.com/cn/professionals?currentPageNo={}&"
            # Browser-like User-Agent sent with every request.
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36"}

        def get_url_list(self, first_page=1, last_page=77):
            """Return the listing URLs for pages ``first_page``..``last_page``.

            Both bounds are inclusive. The defaults reproduce the originally
            hard-coded range of pages 1 through 77.
            """
            return [self.temp_url.format(page_no)
                    for page_no in range(first_page, last_page + 1)]

        @retry(stop_max_attempt_number=3)
        def _parse_url(self, url):
            """Download ``url`` and return it parsed by ``lxml.etree.HTML``.

            Any exception (timeout, connection error, non-200 status) makes
            the ``retry`` decorator try again, up to three attempts in total.

            Raises:
                requests.exceptions.HTTPError: if the status code is not 200.
            """
            r = requests.get(url, headers=self.headers, timeout=3)
            # Explicit check instead of ``assert``: asserts are removed when
            # Python runs with -O, which would silently accept error pages.
            if r.status_code != 200:
                raise requests.exceptions.HTTPError(
                    "unexpected status {} for {}".format(r.status_code, url),
                    response=r)
            return etree.HTML(r.content)  # element tree that supports .xpath()

        def parse_url(self, url):
            """Best-effort wrapper around ``_parse_url``.

            Returns the parsed tree, or ``None`` if the page could not be
            fetched or parsed.
            """
            try:
                return self._parse_url(url)
            except Exception as exc:
                # ``Exception`` rather than a bare ``except`` so Ctrl-C and
                # SystemExit still stop the crawl; report the skipped page.
                print('Failed to fetch {}: {!r}'.format(url, exc))
                return None

        @staticmethod
        def _first_text(node, path):
            """Return the first result of ``node.xpath(path)``, or ``None``."""
            matches = node.xpath(path)
            return matches[0] if matches else None

        @staticmethod
        def _compact(text):
            """Return ``text`` with all whitespace removed (``None`` stays ``None``).

            ``str.split()`` with no argument splits on any Unicode whitespace,
            so non-breaking and full-width spaces are removed as well.
            """
            if text is None:
                return None
            return ''.join(text.split())

        def get_content_list(self, html):
            """Extract one dict per ``//tbody/tr`` row of a parsed listing page.

            Each dict has the keys ``Name``, ``Email``, ``Position`` and
            ``Location``; a value is ``None`` when the cell has no text.
            Returns an empty list when ``html`` is ``None`` (failed download).
            """
            if html is None:
                return []

            content_list = []
            for tr in html.xpath("//tbody/tr"):
                item = {
                    # Name (link text in the first column)
                    'Name': self._first_text(tr, './td[1]/a/text()'),
                    # E-mail address
                    'Email': self._first_text(tr, './td[2]/text()'),
                    # Position / title, whitespace removed
                    'Position': self._compact(self._first_text(tr, './td[3]/text()')),
                    # Office location, whitespace removed
                    'Location': self._compact(self._first_text(tr, './td[4]/text()')),
                }
                content_list.append(item)
            # Return only after ALL rows were collected (the return used to
            # sit inside the loop, yielding just the first row of each page).
            return content_list

        def save_content_list(self, content_list):
            """Append every item of ``content_list`` to ``DaCheng.json``.

            Each item is written as an indented JSON object followed by
            ``', '``. NOTE: the file as a whole is therefore a sequence of
            objects, not a single valid JSON document; the format is kept
            unchanged for compatibility with existing output.
            """
            # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII text, and
            # the platform default encoding may not be able to represent it.
            with open('DaCheng.json', 'a', encoding='utf-8') as f:
                for content in content_list:
                    json.dump(content, f, ensure_ascii=False, indent=2)
                    f.write(', ')
                    print('保存成功')

        def run(self):
            """Crawl every listing page and save the extracted rows."""
            # 1. Build the list of page URLs.
            url_list = self.get_url_list()
            # 2. Request each page in turn.
            for url in url_list:
                html = self.parse_url(url)
                if html is None:
                    # Download failed after retries; skip this page.
                    continue
                # 3. Extract the data.
                content_list = self.get_content_list(html)
                # 4. Save it.
                self.save_content_list(content_list)

    if __name__ == '__main__':
        # Script entry point: run the full crawl when executed directly.
        # The guarded statements must be indented under the ``if``.
        dacheng = DaCheng()
        dacheng.run()

  • 相关阅读:
    服务器时间不准导致 com.sun.facelets.impl.DefaultFacelet refresh
    推荐10款来自极客标签的超棒前端特效[第五期] java程序员
    IE10的市场占有率扩充了一倍 java程序员
    固定背景实现的背景滚动特效 java程序员
    支持触摸设备的响应式HTML5音频播放器 AudioPlayer.js java程序员
    WebRTC与Ace在线代码编辑器合作,实现实时协作编程 java程序员
    最流行的JavaScript库,jQuery不再支持IE旧版本 java程序员
    Jquery实现鼠标移上弹出提示框,移出消失 java程序员
    xxx.c: Error: C3065E: type of input file 'xxxx' unknown java程序员
    35+多用途WordPress主题 java程序员
  • 原文地址:https://www.cnblogs.com/x-pyue/p/7798819.html
Copyright © 2011-2022 走看看