zoukankan      html  css  js  c++  java
  • python 爬虫豆瓣top250

    网页api:https://movie.douban.com/top250?start=0&filter=
    用到的模块:urllib,re,csv 

    捣鼓一上午终于好了,有些小问题

    (第218名有bug)具体问题:第218名(《初恋这件小事》)的条目没有"主演"信息,正则表达式匹配时会取出过多的值;有"主演"信息的条目则取值正常(原文此处配有两张截图作对比)

    所以取前200名,具体python代码实现如下,望大佬指导

    #! /usr/bin/python3
    # -*- coding:UTF-8 -*-
    from urllib import request
    import re,csv
    
    class MovieTopForDouBan(object):
        """Scrape the Douban movie Top 250 list pages and export the results.

        Pages are downloaded 25 entries at a time, parsed with one regular
        expression and accumulated in ``movie_list``; the rows can then be
        written to a text file or a CSV file under ``file_path``.

        Known limitation (from the author's notes): the entry ranked 218 has
        no "主演" (cast) field, which makes the pattern capture too much, so
        only the first 200 ranks are scraped.
        """

        # Step by which the ``start`` query parameter advances per page.
        PAGE_SIZE = 25
        # Scraping stops once ``start`` reaches this rank (see class docstring).
        MAX_RANK = 200
        # Separator that precedes the alternate-title captures in the page.
        TITLE_SEPARATOR = '&nbsp;/&nbsp;'
        # One match per movie; the 12 groups line up with ``self.head``.
        MOVIE_PATTERN = re.compile('<div.*?class="item">.*?<em class="">(.*?)</em>'
                                   '.*?<span.*?class="title">(.*?)</span>'
                                   '.*?<span.*?class="title">(.*?)</span>'
                                   '.*?<span.*?class="other">(.*?)</span>'
                                   '.*?<div.*?class="bd">.*?<p.*?class="">'
                                   '.*?导演:(.*?)&nbsp;.*?主演: (.*?)<br>'
                                   '(.*?)&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)</p>.*?<div.*?class="star">'
                                   '.*?<span.*?class="rating_num".*?property="v:average">(.*?)</span>'
                                   '.*?<span>(.*?)人评价</span>.*?</div>'
                                   '.*?<span.*?class="inq">(.*?)</span>.*?</p>', re.S)
        # Line labels used by write_text(), in the same order as the groups.
        TEXT_LABELS = ('电影排名:', '电影名称:', '外文名称:', '电影别名:',
                       '导演姓名:', '主演姓名:', '上映年份:', '制作国家/地区:',
                       '电影类别:', '电影评分:', '参评人数:', '简短影评:')

        def __init__(self, file_path='D:\\'):
            """Set up paging state, request headers and the output location.

            Args:
                file_path: Prefix the output file names are appended to. It is
                    concatenated as-is, so it must end with a path separator.
                    Defaults to the root of drive D:, as in the original script.
            """
            self.start = 0
            # Kept for compatibility; it is not appended to the request URL.
            self.param = '&filter='
            self.headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                       '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
            self.file_path = file_path
            self.head = ['排名','名称','别名','其他名称','导演','主演','年份','地区','类型','平均分','人数','短评']
            self.movie_list=[]

        @staticmethod
        def _strip_prefix(text, prefix):
            """Return ``text`` without a leading ``prefix``, if it has one."""
            # str.lstrip(chars) strips a *set of characters*, not a prefix, and
            # would also eat leading title letters such as 's' or 'p'.
            if text.startswith(prefix):
                return text[len(prefix):]
            return text

        def get_page(self):
            """Download the list page at the current offset.

            On success the offset ``self.start`` is advanced by one page.

            Returns:
                The page HTML decoded as UTF-8, or ``None`` if the request
                raised ``URLError`` (the offset is left unchanged then).
            """
            url = 'https://movie.douban.com/top250?start=' + str(self.start)
            req = request.Request(url, headers=self.headers)
            try:
                # The context manager closes the HTTP response even on errors.
                with request.urlopen(req) as response:
                    page = response.read().decode('utf-8')
            except request.URLError as e:
                if hasattr(e, 'reason'):
                    print('抓取失败,失败原因:', e.reason)
                return None
            page_num = (self.start + self.PAGE_SIZE) // self.PAGE_SIZE
            print('正在抓取第' + str(page_num) + '页数据...')
            self.start += self.PAGE_SIZE
            return page

        def _parse_page(self, page):
            """Extract one 12-field row per movie from a page's HTML.

            Args:
                page: HTML text of one list page.

            Returns:
                A list of rows (lists of strings) ordered like ``self.head``.
            """
            rows = []
            for movie in self.MOVIE_PATTERN.findall(page):
                data = list(movie)
                data[2] = self._strip_prefix(data[2], self.TITLE_SEPARATOR)
                data[3] = self._strip_prefix(data[3], self.TITLE_SEPARATOR)
                # The year and genre captures carry surrounding whitespace.
                data[6] = data[6].lstrip()
                data[8] = data[8].rstrip()
                rows.append(data)
            return rows

        def get_movie_info(self):
            """Fetch pages until ``MAX_RANK`` is reached and fill ``movie_list``.

            Stops early when a page cannot be downloaded; rows collected so far
            are kept.
            """
            while self.start < self.MAX_RANK:
                page = self.get_page()
                if page is None:
                    # get_page() already reported the error and did not advance
                    # the offset, so continuing would retry the same page forever.
                    break
                self.movie_list.extend(self._parse_page(page))

        def write_text(self):
            """Write ``movie_list`` to ``movie_info.txt`` as labelled lines."""
            print('开始向文件写入数据....')
            # newline='' disables newline translation, so the explicit '\r\n'
            # below is not turned into '\r\r\n' on Windows.
            with open(self.file_path+'movie_info.txt','w',encoding='utf-8',newline='') as file_TopText:
                try:
                    for movie in self.movie_list:
                        for index, label in enumerate(self.TEXT_LABELS):
                            file_TopText.write(label + movie[index] + '\r\n')
                        # Blank line between two movies.
                        file_TopText.write('\r\n')
                    print('抓取结果写入文件成功...')
                except Exception as e:
                     print(e)
            print('数据写入完毕....')

        def write_csv_file(self):
            """Write the header and ``movie_list`` to ``movie_info.csv``.

            Any error is reported on stdout together with the number of rows
            written before it occurred.
            """
            path = self.file_path + 'movie_info.csv'
            rows_written = 0
            try:
                with open(path, 'w', newline='',encoding='utf-8') as csv_file:
                    writer = csv.writer(csv_file, dialect='excel')
                    if self.head is not None:
                        writer.writerow(self.head)
                    for row in self.movie_list:
                        writer.writerow(row)
                        rows_written += 1
                    print("将CSV文件写入路径%s成功。" % path)
            except Exception as e:
                print("将CSV文件写入路径: %s, 信息: %s" % (path, e))
                print(rows_written)

        def main(self):
            """Run the scrape and write the text report."""
            print('开始从豆瓣电影抓取数据........')
            self.get_movie_info()
            self.write_text()
            # CSV export is available but disabled: call self.write_csv_file().
            print('数据抓取完毕...')
    
    # Script entry point: build the scraper and run the full scrape-and-export flow.
    if __name__ == '__main__':
        scraper = MovieTopForDouBan()
        scraper.main()

    运行后会在 D 盘根目录生成一个 movie_info.txt 文件

  • 相关阅读:
    如何在一个项目中同时包含mvc建站、webapi接口
    解决api、WebService跨域问题
    mvc接口、webapi、webservice 对比
    云服务器 远程mysql 无法连接
    c#快速写本地日志
    使用筛选器特性标记方法解决webapi 跨域问题
    流量控制(滑动窗口协议)
    解释Windows7“上帝模式”的原理
    Linux网络协议栈(二)——套接字缓存(socket buffer)
    理解MySQL——架构与概念
  • 原文地址:https://www.cnblogs.com/wananonline/p/9172579.html
Copyright © 2011-2022 走看看