zoukankan      html  css  js  c++  java
  • python 爬取豆瓣电影top250,保存到Excel

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time : 2021/2/7 16:23
    # @Author :
    
    from bs4 import BeautifulSoup
    import re
    import xlwt
    import urllib.request, urllib.response, urllib.error
    
    
    def default():
        """Scrape the Douban movie ranking pages and store the rows in an .xls file.

        The rows are collected by getData from the paginated listing URL and
        handed unchanged to saveDataExcel.
        """
        outputFile = '豆瓣Top250.xls'
        movies = getData('https://movie.douban.com/top250?start=')
        saveDataExcel(outputFile, movies)
    
    
    
    # Patterns applied to the HTML text of one <div class="item"> movie entry.
    # Compiled once at import time instead of on every getData call.
    _FIND_URL = re.compile(r'<a href="(.*?)">')
    _FIND_IMG = re.compile(r'<img.*src="(.*?)"')
    _FIND_TITLE = re.compile(r'<span class="title">(.*)</span>')
    _FIND_RATING = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    # \d (digit) is required here: a bare "d" only matches the letter d, so the
    # vote count would never be found.
    _FIND_JUDGE = re.compile(r'<span>(\d*)人评价</span>')
    _FIND_INQ = re.compile(r'<span class="inq">(.*)</span>')
    _FIND_BD = re.compile(r'<p class="">(.*?)</p>', re.S)  # re.S lets "." cross newlines
    # A <br/> tag (optional whitespace before "/>") plus any whitespace after it.
    _BR_TAG = re.compile(r'<br(\s+)?/>(\s+)?')


    def _firstMatch(pattern, text, default=''):
        """Return the first capture group of pattern in text, or default if absent."""
        found = pattern.search(text)
        return found.group(1) if found else default


    def _parseItem(itemHtml):
        """Extract the eight spreadsheet columns from one movie entry's HTML text."""
        titles = _FIND_TITLE.findall(itemHtml)
        cTitle = titles[0] if titles else ' '
        # Only when exactly two title spans are present is the second one used
        # as the foreign title; any "/" characters in it are removed.
        oTitle = titles[1].replace('/', '') if len(titles) == 2 else ' '

        bd = _firstMatch(_FIND_BD, itemHtml)
        bd = _BR_TAG.sub(' ', bd)   # drop <br/> tags
        bd = bd.replace('/', ' ')   # drop "/" separators

        return [
            _firstMatch(_FIND_URL, itemHtml),       # link
            _firstMatch(_FIND_IMG, itemHtml),       # poster image
            cTitle.strip(),                         # Chinese title
            oTitle.strip(),                         # foreign title
            _firstMatch(_FIND_RATING, itemHtml),    # rating
            _firstMatch(_FIND_JUDGE, itemHtml),     # number of votes
            _firstMatch(_FIND_INQ, itemHtml, ' '),  # one-line quote, ' ' when missing
            bd.strip(),                             # related info
        ]


    def getData(baseUrl, pageCount=10, pageSize=25):
        """Fetch the listing pages and return one row of fields per movie.

        Args:
            baseUrl: URL prefix; the start offset (page index * pageSize) is
                appended to it for each page.
            pageCount: number of pages to request (default 10).
            pageSize: offset step between consecutive pages (default 25).

        Returns:
            A list of 8-element lists: link, image, Chinese title, foreign
            title, rating, vote count, quote, related info. A field that cannot
            be found in the HTML is filled with '' (or ' ' for the titles and
            the quote) instead of raising IndexError.
        """
        dataList = []
        for page in range(pageCount):
            html = getOneUrl(baseUrl + str(page * pageSize))
            if not html:
                # Nothing was downloaded for this page; move on to the next one.
                continue
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('div', class_="item"):
                dataList.append(_parseItem(str(item)))
        return dataList
    
    def getOneUrl(url):
        """Download one URL and return its body decoded as UTF-8.

        Args:
            url: address to request; a browser-like User-Agent header is sent.

        Returns:
            The decoded response text, or '' when the request fails with a
            urllib.error.URLError (which includes HTTPError). The failure is
            printed rather than raised.
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
        }
        request = urllib.request.Request(url=url, headers=header)
        try:
            # The context manager closes the response even if read/decode fails.
            with urllib.request.urlopen(request) as resp:
                return resp.read().decode('utf-8')
        except urllib.error.URLError as e:
            # HTTPError carries both code and reason; a plain URLError only reason.
            if hasattr(e, 'code'):
                print('code', e.code)
            if hasattr(e, 'reason'):
                print('reason', e.reason)
            print('error', e)

        # Reached only after a failure: return an empty page instead of
        # referencing an unassigned variable.
        return ''
    
    
    def saveDataExcel(path, dataList):
        """Write the scraped rows to an .xls workbook.

        Args:
            path: destination file passed to the workbook's save().
            dataList: sequence of rows; each row supplies up to eight values in
                the order of the header columns below.

        Every row in dataList is written, however many there are, so a partial
        scrape no longer fails with IndexError. Progress is printed per row.
        """
        print('save....')
        workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)  # create the workbook
        worksheet = workbook.add_sheet('shheet1', cell_overwrite_ok=True)  # create the sheet
        col = ("链接", "图片", "中文名", "外国名", "评分", "评价数", "概况", "相关信息")

        # Header row.
        for colIndex, title in enumerate(col):
            worksheet.write(0, colIndex, title)

        # Data rows start at sheet row 1, directly below the header.
        for rowIndex, data in enumerate(dataList, start=1):
            print("第%d条" % rowIndex)
            for colIndex, value in enumerate(data[:len(col)]):
                worksheet.write(rowIndex, colIndex, value)

        workbook.save(path)
        print('over')
    
    
    # Run the scrape-and-save workflow only when executed as a script, not on import.
    if __name__ == '__main__':
        default()
  • 相关阅读:
    google的几道面试题
    轮盘赌算法
    基于packstack的openstack单节点安装
    攻克python3-字典(第四篇)
    攻克python3-列表与元组(第三篇)
    攻克python3-字符串(第二篇)
    攻克python3(第一篇)
    二维数组
    小白对c语言指针的基础总结
    小白对c语言数组的基础总结
  • 原文地址:https://www.cnblogs.com/l-zl/p/14463813.html
Copyright © 2011-2022 走看看