zoukankan      html  css  js  c++  java
  • python3爬取豆瓣排名前250电影信息

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @File  : doubanmovie.py
    # @Author: Anthony.waa
    # @Date  : 2019/3/2 0028
    # @Desc  : PyCharm
    
    
    import requests
    from lxml import html
    
    # HTTP request headers passed to requests.get() by moviesInfo(); the only
    # entry is a desktop Chrome User-Agent string.
    # NOTE(review): presumably set because the site rejects the default
    # python-requests User-Agent -- confirm.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    
    # 爬取页面内容
    def moviesInfo(url):
        """Fetch one movie listing page and append its entries to 2019movies.txt.

        Each ``div.info`` element on the page is treated as one movie. For every
        movie one line is appended to ``2019movies.txt`` (UTF-8) holding, in
        order: title, country, release year, one-line tagline and the
        rating-count text, separated by three spaces.

        Args:
            url: Address of the listing page to download.

        Returns:
            None. The result is the text appended to ``2019movies.txt``.

        Raises:
            requests.RequestException: If the page cannot be downloaded
                (including the 10 second timeout).
        """
        response = requests.get(url=url, headers=headers, timeout=10).content
        tree = html.fromstring(response)

        rows = []
        for item in tree.xpath('//div[@class ="info"]'):
            try:
                # Movie title: first <span class="title"> inside the header link.
                movieName = item.xpath('div[@class="hd"]/a/span[@class="title"]/text()')[0]
                # Text nodes of the first <p>; the second node is the
                # "year / country / genre" line.
                movieInfo = item.xpath('div[@class="bd"]/p[1]/text()')
                fields = movieInfo[1].replace(" ", "").replace("\n", "").split("/")
                # Release year comes first, country second.
                movieDate = fields[0]
                moviePeople = fields[1]
            except IndexError:
                # Entry does not have the expected layout; skip only this one
                # instead of abandoning the rest of the page.
                continue

            # Query relative to this movie (leading "."): an absolute "//" path
            # would search the whole document and pair taglines with the wrong
            # movie whenever one entry has no tagline.
            describes = item.xpath('.//span[@class="inq"]/text()')
            movieDescribe = describes[0] if describes else ""
            # Fourth <span> of the rating block holds the rating-count text.
            nums = item.xpath('.//div[@class="star"]/span[4]/text()')
            movieNum = nums[0] if nums else ""

            rows.append("%s   %s   %s   %s   %s\n"
                        % (movieName, moviePeople, movieDate, movieDescribe, movieNum))

        # Open the output file once per page rather than once per movie, and
        # only when there is something to write.
        if rows:
            with open('2019movies.txt', 'a+', encoding="utf-8") as file:
                file.writelines(rows)
    
    
    
    if __name__ == '__main__':
        # Ten listing pages; the "start" query offset advances by 25 per page
        # (0, 25, ..., 225).
        for offset in range(0, 250, 25):
            moviesInfo('https://movie.douban.com/top250?start=%d&filter=' % offset)
    

      

  • 相关阅读:
    容器云技术:容器化微服务,Istio占C位出道
    如何用istio实现请求超时管理
    技术进阶:Kubernetes高级架构与应用状态部署
    如何基于 K8S 多租能力构建 Serverless Container
    面试题目<转载>
    PHP面试出场率较高的题目<转载>
    命名规范
    字符串大小写转换(三种方法)
    php反转输出字符串(两种方法)
    获取文件名后缀的方法
  • 原文地址:https://www.cnblogs.com/ipyanthony/p/10461440.html
Copyright © 2011-2022 走看看