zoukankan      html  css  js  c++  java
  • (二)python爬虫实例:猫眼电影TOP100榜并将电影信息写入到Excel(Excel列宽自适应)

    # -*- coding:utf-8 -*-
    import requests
    from bs4 import BeautifulSoup
    import xlrd,xlwt
    
    # The ten pages of the Maoyan TOP100 board, addressed by offset 0, 10, ..., 90.
    urls = list(
        map("https://maoyan.com/board/4?offset={}".format, range(0, 100, 10))
    )
    
    # HTTP headers sent with every page request (a desktop Chrome user agent).
    header = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
            "KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        ),
    }

    # Column index -> widest value recorded for that column; filled in by
    # WriteExcel and printed when the script finishes.
    length = {}
    
    
    
    def len_byte(value):
        """Estimate how many character cells *value* occupies when displayed.

        Every character counts once, plus half of the extra bytes its UTF-8
        encoding needs beyond one byte per character (rounded down over the
        whole string).  A 3-byte CJK character therefore counts as 2 and an
        ASCII character as 1.
        """
        char_count = len(value)
        extra_bytes = len(value.encode('utf-8')) - char_count
        return char_count + extra_bytes // 2
    
    # Fields collected per film: title, starring, release date, country, score
    def _text_after_label(text):
        """Return the part of *text* that follows its leading "label:" prefix."""
        text = text.strip()
        # The original author's note says the page uses the Chinese (full-width)
        # colon; the ASCII colon is accepted as a fallback.
        for colon in ("：", ":"):
            _label, found, rest = text.partition(colon)
            if found:
                return rest.strip()
        # No label separator at all: keep the text rather than failing.
        return text


    def FilmInformation(url):
        """Download one board page and extract the films listed on it.

        Args:
            url: address of a Maoyan board page.

        Returns:
            A list with one ``[name, staring, releasetime, country, score]``
            entry (all strings) per ``.board-item-main`` element found in the
            page.  ``country`` is ``"(暂无)"`` when the release line carries no
            parenthesised region.

        Raises:
            requests.RequestException: if the request fails or times out.
            IndexError: if a film entry lacks one of the expected elements.
        """
        content = []
        # A timeout keeps a stalled connection from hanging the whole run.
        r = requests.get(url, headers=header, timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")
        for film in soup.select(".board-item-main"):
            # select() returns a list, hence the [0]; ".name a" would work too.
            name = film.select("[title]")[0].text
            staring = _text_after_label(film.select(".star")[0].text)

            # The release line looks like "<label>：<date>(<country>)".  Split on
            # the parenthesis instead of assuming a ten-character date, so
            # shorter dates (year only, year-month) are handled as well.
            released = _text_after_label(film.select(".releasetime")[0].text)
            releasetime, _, region = released.partition("(")
            releasetime = releasetime.strip()
            country = region.replace("(", "").replace(")", "").strip()
            if not country:
                country = "(暂无)"

            # The score is rendered as two elements: integer part and fraction.
            integer = film.select(".integer")[0].text
            fraction = film.select(".fraction")[0].text
            score = integer + fraction
            content.append([name, staring, releasetime, country, score])
        return content
    
    def WriteExcel(data):
        """Write the scraped films to ``maoyan.xls`` with auto-sized columns.

        Args:
            data: an iterable of pages, each page an iterable of film rows, each
                row a sequence of strings in the order of the header row
                (film, starring, date, country, score).

        Side effects:
            Updates the module-level ``length`` dict (column index -> widest
            display width seen, header included) and saves the workbook as
            ``maoyan.xls`` in the current working directory.
        """
        title = ["电影","主演","时间","国家","评分"]
        workbook = xlwt.Workbook(encoding = "utf-8")
        sheet = workbook.add_sheet("猫眼前100")

        def remember_width(col, text):
            # Measure with len_byte so wide (CJK) characters count double while
            # ASCII characters count once.
            width = len_byte(text)
            if width > length.get(col, 0):
                length[col] = width

        # Header row; its cells take part in the column sizing as well.
        for col, heading in enumerate(title):
            sheet.write(0, col, heading)
            remember_width(col, heading)

        row = 1
        for page in data:
            for film in page:
                for col, value in enumerate(film):
                    sheet.write(row, col, value)
                    remember_width(col, value)
                row += 1

        for col, width in length.items():
            # xlwt column widths are expressed in 1/256 of a character; add two
            # characters of padding and stay within the 16-bit limit of the
            # xls format.
            sheet.col(col).width = min(256 * (width + 2), 65535)

        workbook.save("maoyan.xls")
    
    
    
    def main():
        """Scrape every page listed in ``urls`` and export the films to Excel."""
        pages = [FilmInformation(page_url) for page_url in urls]
        WriteExcel(pages)
    
    # Script entry point: run the scrape, then print the recorded column widths.
    if __name__ == "__main__":
        main()
        print(length)
  • 相关阅读:
    小程序工程化探索:大规模场景下的问题和解决方案----------------引用
    对Taro Next小程序跨框架开发的探索与实践-----------------引用
    对Node.js 中的依赖管理------------引用
    对redux的研究--------引用
    JavaScript 中的 for 循环---------------引用
    对JavaScript 模块化的深入-----------------引用
    对Webpack 应用的研究-----------------引用
    webpack5持久化缓存
    设置x 轴斜体(每次我都百度,这次单独为它发一个)
    字典元组列表常用方法
  • 原文地址:https://www.cnblogs.com/python-kp/p/12519311.html
Copyright © 2011-2022 走看看