zoukankan      html  css  js  c++  java
  • 爬取猫眼电影TOP100榜单-以命令行输出

    一、使用正则表达式匹配

    from urllib import request
    import re
    import time
    import random
    from useragents import ua_list
    
    class MaoyanSpider(object):
      """Fetch Maoyan board pages, extract films with a regex and print them.

      Each film is printed as a dict with the keys ``name``, ``star`` and
      ``time``; ``self.num`` counts how many films were printed.
      """

      # Step between consecutive ``offset`` values of the board URL
      # (presumably the number of films shown per page -- confirm on the site).
      PAGE_SIZE = 10

      # Built once for the class instead of on every page. Captures, in order:
      # the title attribute, the "star" paragraph and the "releasetime" paragraph.
      # re.S lets ``.`` cross the line breaks between those tags.
      FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
        re.S
      )

      def __init__(self):
        # URL template; ``{}`` is filled with the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Number of films printed so far.
        self.num = 0

      def get_html(self,url):
        """Download ``url`` with a random User-Agent and pass the page to parse_html.

        Errors raised by ``urlopen`` or by decoding are not caught here.
        """
        headers = {
          'User-Agent' : random.choice(ua_list)
        }
        req = request.Request(url=url,headers=headers)
        # The context manager closes the HTTP response even if read/decode fails.
        with request.urlopen(req) as res:
          html = res.read().decode('utf-8')
        # Hand the page straight to the parser.
        self.parse_html(html)

      def parse_html(self,html):
        """Extract ``(name, star, time)`` tuples from ``html`` and pass them to write_html."""
        # film_list looks like [(title, star paragraph, release paragraph), ...]
        film_list = self.FILM_PATTERN.findall(html)
        # Hand the tuples straight to the output step.
        self.write_html(film_list)

      def write_html(self,film_list):
        """Print one dict per film in ``film_list`` and count it in ``self.num``."""
        for film in film_list:
          # A fresh dict per film, so an earlier record is never overwritten
          # by a later one if the dicts are ever collected.
          film_dict = {
            'name': film[0].strip(),
            'star': film[1].strip(),
            # Characters 5..14 of the release text; presumably this drops a
            # 5-character label and keeps a YYYY-MM-DD date -- confirm against
            # the live markup.
            'time': film[2].strip()[5:15],
          }
          print(film_dict)

          self.num += 1

      def main(self,pages=4):
        """Crawl ``pages`` consecutive board pages, then print the film count.

        ``pages`` defaults to 4 (offsets 0, 10, 20, 30), which is what the
        original hard-coded loop fetched. The article is titled TOP100, so
        ``pages=10`` presumably covers the whole board.
        """
        for offset in range(0,pages * self.PAGE_SIZE,self.PAGE_SIZE):
          url = self.url.format(offset)
          self.get_html(url)
          # Random 1-2 second pause between requests.
          time.sleep(random.randint(1,2))
        print('共抓取数据:',self.num)
    
    if __name__ == '__main__':
      # Run the crawl and report the elapsed wall-clock time in seconds.
      began = time.time()
      MaoyanSpider().main()
      elapsed = time.time() - began
      print('执行时间:%.2f' % elapsed)

    二、使用xpath匹配

      一、xpath表达式

      1、基准xpath: 匹配所有电影信息的节点对象列表
      //dl[@class="board-wrapper"]/dd

      2、遍历对象列表,依次获取每个电影信息
      for dd in dd_list:
        电影名称 :.//p[@class="name"]/a/@title
        电影主演 :.//p[@class="star"]/text()
        上映时间 :.//p[@class="releasetime"]/text()

      二、代码实现

    import requests
    from lxml import etree
    import time
    import random
    from useragents import ua_list
    
    class MaoyanSpider(object):
      """Fetch Maoyan board pages, extract films with XPath and print them.

      Each film is printed as a dict with the keys ``name``, ``star`` and
      ``time``; ``self.num`` counts how many films were printed.
      """

      # Step between consecutive ``offset`` values of the board URL
      # (presumably the number of films shown per page -- confirm on the site).
      PAGE_SIZE = 10

      # Highest value of ``self.blag`` for which get_html still tries a page.
      MAX_TRIES = 3

      def __init__(self):
        # URL template; ``{}`` is filled with the page offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Number of films printed so far.
        self.num = 0
        # Attempt counter for the page being fetched; main() resets it to 1.
        self.blag = 1

      def get_html(self,url):
        """Download ``url`` and parse it, retrying while ``self.blag <= MAX_TRIES``.

        Any exception raised while fetching or parsing counts as a failed
        attempt: it is printed and the page is requested again with a newly
        chosen User-Agent. When the attempts are used up a message is printed
        and the method returns without raising.
        """
        while self.blag <= self.MAX_TRIES:
          headers = {
            'User-Agent' : random.choice(ua_list)
          }
          try:
            res = requests.get(url=url,headers=headers,timeout=3)
            res.encoding = 'utf-8'
            # Hand the page straight to the parser.
            self.parse_html(res.text)
            return
          except Exception as e:
            # Deliberately broad: the crawl is best-effort and one bad page
            # must not abort it. Show the cause instead of hiding it.
            print('Retry',e)
            self.blag += 1
        print('Give up:',url)

      @staticmethod
      def _first_stripped(values):
        """Return the first element of ``values`` stripped, or None if it is empty."""
        return values[0].strip() if values else None

      def _extract_item(self,dd):
        """Build the ``name``/``star``/``time`` dict for one film node ``dd``."""
        return {
          # Film title
          'name': self._first_stripped(dd.xpath('.//p[@class="name"]/a/@title')),
          # Leading actors
          'star': self._first_stripped(dd.xpath('.//p[@class="star"]/text()')),
          # Release time
          'time': self._first_stripped(dd.xpath('.//p[@class="releasetime"]/text()')),
        }

      def parse_html(self,html):
        """Print one dict per film found in ``html`` and add them to ``self.num``.

        Prints 'No dd_list' when the base XPath matches nothing.
        """
        # Base XPath first, then one relative lookup per field on each node.
        root = etree.HTML(html)
        dd_list = root.xpath('//dl[@class="board-wrapper"]/dd')
        if not dd_list:
          print('No dd_list')
          return
        # Build every item before printing, so an error part-way through
        # cannot leave a page half-printed and then printed again on retry.
        items = [self._extract_item(dd) for dd in dd_list]
        for item in items:
          print(item)
        # The original never updated the counter, so the total was always 0.
        self.num += len(items)

      def main(self,pages=4):
        """Crawl ``pages`` consecutive board pages, then print the film count.

        ``pages`` defaults to 4 (offsets 0, 10, 20, 30), which is what the
        original hard-coded loop fetched. The article is titled TOP100, so
        ``pages=10`` presumably covers the whole board.
        """
        for offset in range(0,pages * self.PAGE_SIZE,self.PAGE_SIZE):
          url = self.url.format(offset)
          self.get_html(url)
          # Random 1-2 second pause between requests.
          time.sleep(random.randint(1,2))
          # Give the next page a full set of attempts.
          self.blag = 1
        print('共抓取数据:',self.num)
    
    if __name__ == '__main__':
      # Run the crawl and report the elapsed wall-clock time in seconds.
      t0 = time.time()
      crawler = MaoyanSpider()
      crawler.main()
      print('执行时间:%.2f' % (time.time() - t0))
  • 相关阅读:
    总结在ssm整合中,Mybatis出现Mapped Statements collection already contains value for xxxxx的解决方案
    一般二叉树的创建,前序,中序,后序遍历
    无向图的广度优先遍历和深度优先遍历(简易实现)
    为什么局部内部类中访问同一方法中的变量,该变量一定要是final修饰的
    uml统一建模语言学习笔记(一)
    Font Awesome 字体使用方法, 兼容ie7+
    Java的三种代理模式&完整源码分析
    xxl-job源码分析
    MySQl看这一篇就够了
    第二部分:Spring中配置mongodb
  • 原文地址:https://www.cnblogs.com/hooo-1102/p/12155257.html
Copyright © 2011-2022 走看看