zoukankan      html  css  js  c++  java
  • python 爬虫&爬取豆瓣电影top250

    爬取豆瓣电影top250
    from urllib.request import * #导入所有的request,urllib相当于一个文件夹,用到它里面的方法request
    from lxml import etree #调用包
    import pickle #
    import time
    # Accumulator for scraped results: one list of movie tuples per page.
    arr = []
    # Base address of the Douban Top-250 listing; the start offset is appended.
    url = "https://movie.douban.com/top250?start="
    # Ten pages of 25 movies each -> offsets 0, 25, ..., 225.
    urls = [url + str(page * 25) for page in range(10)]
    def _first_or_empty(node, path):
        """Return the first result of XPath *path* on *node*, or "" if none."""
        found = node.xpath(path)
        return found[0] if found else ""


    def aa(link):
        """Download one listing page and append its movies to ``arr``.

        Every field is read relative to the movie's own ``info`` node, so the
        five values in a tuple always belong to the same movie. Running five
        document-wide queries and zipping them by position does not guarantee
        that: ``p/text()`` also matches text inside ``p[@class='quote']`` and
        can return several text nodes per movie, and ``zip`` stops at the
        shortest list, so one movie without a quote shifts or drops rows.

        Args:
            link: URL of the listing page to fetch.

        Side effects:
            Sleeps one second, prints a progress line, and appends one list of
            ``(title, news, grade, comment, link)`` tuples to the module-level
            ``arr``. A field that is missing for a movie is stored as "".
        """
        time.sleep(1)  # throttle requests: wait one second before each fetch
        print("正在爬取:%s" % link)  # progress message
        # NOTE(review): the request is sent with urllib's default headers; if
        # the server rejects it, a custom User-Agent may be needed - confirm.
        with urlopen(link) as html:
            text = html.read().decode("utf-8")
        doc = etree.HTML(text)

        movies = []
        info_nodes = doc.xpath(
            "//ol[@class='grid_view']/li/div[@class='item']/div[@class='info']"
        )
        for info in info_nodes:
            title = _first_or_empty(info, "div[@class='hd']/a/span[1]/text()")
            # Detail text may be split over several text nodes; exclude the
            # quote paragraph, then join the pieces and collapse whitespace.
            news_parts = info.xpath(
                "div[@class='bd']/p[not(@class='quote')]/text()"
            )
            news = " ".join(" ".join(news_parts).split())
            grade = _first_or_empty(
                info,
                "div[@class='bd']/div[@class='star']"
                "/span[@class='rating_num']/text()",
            )
            # Not every movie has a quote; fall back to "" instead of
            # shifting later rows.
            comment = _first_or_empty(
                info,
                "div[@class='bd']/p[@class='quote']/span[@class='inq']/text()",
            )
            href = _first_or_empty(info, "div[@class='hd']/a/@href")
            movies.append((title, news, grade, comment, href))

        arr.append(movies)
    # Crawl every listing page; aa() appends each page's movies to arr.
    for page_url in urls:
        aa(page_url)

    # Persist the collected pages as a binary pickle.
    with open("豆瓣电影.txt", 'wb') as dump_file:
        pickle.dump(arr, dump_file)

    # Read the pickle back and print each page's list of movie tuples.
    with open("豆瓣电影.txt", 'rb') as dump_file:
        saved_pages = pickle.load(dump_file)
    for page in saved_pages:
        print(page)
    import xlwt#(写入)
    # Build the workbook with a single sheet for the movie table.
    wb = xlwt.Workbook()
    ws = wb.add_sheet("豆瓣电影")

    # Reload the pickled pages written by the crawl step.
    with open("豆瓣电影.txt", 'rb') as pickle_file:
        arr = pickle.load(pickle_file)

    # One spreadsheet row per movie: column 0 holds a 1-based sequence number,
    # columns 1..5 hold title, details, rating, quote and link.
    index = 0
    for page in arr:
        for title, news, grade, comment, links in page:
            ws.write(index, 0, index + 1)
            fields = (title, news, grade, comment, links)
            for column, value in enumerate(fields, start=1):
                ws.write(index, column, value)
            index += 1

    wb.save("豆瓣电影.xls")
  • 相关阅读:
    LeetCode 485. Max Consecutive Ones
    LeetCode 367. Valid Perfect Square
    LeetCode 375. Guess Number Higher or Lower II
    LeetCode 374. Guess Number Higher or Lower
    LeetCode Word Pattern II
    LeetCode Arranging Coins
    LeetCode 422. Valid Word Square
    Session 共享
    java NIO
    非阻塞IO
  • 原文地址:https://www.cnblogs.com/aloneindefeat/p/10654858.html
Copyright © 2011-2022 走看看