  • Scraping the Maoyan top 100 movies

    # =============================================================================
    # Scrape the top 100 movies from the Maoyan board, collecting:
    #
    # 1. Movie title
    # 2. Leading actors
    # 3. Release date
    # 4. Score
    # 5. Poster image URL
    # 6. Movie homepage URL
    # =============================================================================
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    
    url = 'https://maoyan.com/board/4'
    headers = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/55.0.2883.87 Safari/537.36')}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'

    r.text  # inspect the raw HTML in the console
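
    # A small robustness check (an addition, a hedged sketch): Maoyan may answer
    # bot-like traffic with an error page, so fail fast on a bad status code.
    r.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx response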
    
    bs = BeautifulSoup(r.text, 'lxml')

    bs  # inspect the parsed tree
    # Pull each field we need out of the parsed HTML
    bs.find_all('p', 'name')[0].text
        
    # First, extract the movie titles

    # =============================================================================
    # name = []
    # for i in bs.find_all('p','name'):
    #     name.append(i.text)
    # name
    # =============================================================================
    # The same thing as a list comprehension
    [i.text for i in bs.find_all('p', 'name')]
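
    # An equivalent CSS-selector form (a sketch; it assumes each title sits in an
    # <a> tag inside <p class="name">, which is how the board page is laid out):
    [a.text for a in bs.select('p.name a')]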
    
    
    # Extract the leading actors

    # =============================================================================
    # bs.find_all('p','star')[0].text.strip()  # strip() removes the surrounding whitespace
    #
    # user = []
    # for i in bs.find_all('p','star'):
    #     user.append(i.text.strip())
    # user
    # =============================================================================

    # As a list comprehension

    [i.text.strip() for i in bs.find_all('p', 'star')]
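
    # The scraped text keeps a "主演:" ("starring:") label in front of the names;
    # a hedged sketch that strips it, assuming the label appears on every row:
    [i.text.strip().replace('主演:', '') for i in bs.find_all('p', 'star')]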
        
    
    # Extract the release dates

    bs.find_all('p', 'releasetime')[0].text  # peek at one raw value

    # =============================================================================
    # time = []
    # for i in bs.find_all('p','releasetime'):
    #     time.append(i.text.replace('上映时间:',''))
    # time
    # =============================================================================

    # Drop the '上映时间:' ('release date:') label from each value
    [i.text.replace('上映时间:', '') for i in bs.find_all('p', 'releasetime')]
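
    # Some rows append a region, e.g. '1994-09-10(美国)'; a hedged sketch that
    # keeps only the leading date, assuming every value starts with one:
    import re
    [re.match(r'[\d-]+', i.text.replace('上映时间:', '')).group()
     for i in bs.find_all('p', 'releasetime')]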
        
    # The movie scores
    
    bs.find_all('p','score')[0].text
    
    [i.text for i in bs.find_all('p','score')]
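
    # On the board page the score is split across two <i> tags (integer and
    # fraction), but .text already joins them, so converting to float is enough
    # (a sketch; it assumes every movie on the page carries a numeric score):
    [float(i.text) for i in bs.find_all('p', 'score')]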
    
    
    
    # Poster image URLs
    
    bs.find_all('img','board-img')[0]['data-src']
    
    [i['data-src'] for i in bs.find_all('img','board-img')]
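
    # 'data-src' is a lazy-loading attribute; a defensive sketch that falls back
    # to the plain 'src' attribute whenever 'data-src' is missing:
    [i.get('data-src') or i.get('src') for i in bs.find_all('img', 'board-img')]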
    
    
    # Movie homepage URLs
    
    bs.find_all('a','image-link')[0]['href']
    
    [i['href'] for i in bs.find_all('a','image-link')]
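
    # The hrefs are site-relative (e.g. '/films/1203'); a sketch using urljoin
    # from the standard library to turn them into absolute URLs:
    from urllib.parse import urljoin
    [urljoin('https://maoyan.com', i['href']) for i in bs.find_all('a', 'image-link')]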
    
    # In[] =======================================================================================
    # Put it all together: one function that scrapes a single page of movies
    def fun1(url):
        headers = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/55.0.2883.87 Safari/537.36')}
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        bs = BeautifulSoup(r.text, 'lxml')
        # Extract each field from the parsed page
        name = [i.text for i in bs.find_all('p', 'name')]
        score = [i.text for i in bs.find_all('p', 'score')]
        star = [i.text.strip() for i in bs.find_all('p', 'star')]
        # release_time avoids shadowing the time module imported below
        release_time = [i.text.replace('上映时间:', '') for i in bs.find_all('p', 'releasetime')]
        img = [i['data-src'] for i in bs.find_all('img', 'board-img')]
        href = [i['href'] for i in bs.find_all('a', 'image-link')]
        df = pd.DataFrame()
        df['电影名'] = name             # movie title
        df['评分'] = score              # score
        df['主演'] = star               # leading actors
        df['上映时间'] = release_time   # release date
        df['电影画面网址'] = img        # poster image URL
        df['电影主页网址'] = href       # homepage URL (site-relative)
        return df

    # =================================================== Call it on one page
    url = 'https://maoyan.com/board/4'
    data = fun1(url)
    data['完整主页网址'] = 'https://maoyan.com' + data.电影主页网址  # full homepage URL
    data.完整主页网址

    # In[]==========================================================================
    # Reuse the function for as many pages as you like -- just set the URL.
    # Each page of the board is addressed by an offset query parameter.
    import time

    empty_df = pd.DataFrame()
    # To save time only the first 5 pages are scraped here; raise the loop count
    # to collect more.
    for i in range(0, 5):
        url = 'https://maoyan.com/board/4?offset={}'.format(i * 10)
        print(url)  # print the URL after each page is scraped
        df = fun1(url)
        # Concatenate this page onto the combined table; ignore_index=True
        # renumbers the rows consecutively
        empty_df = pd.concat([empty_df, df], ignore_index=True)
        time.sleep(3)
        print(i + 1)

    # =============================================================================
    # When scraping, sleep a few seconds between requests so the IP does not get
    # banned:
    # import time
    # time.sleep(3)
    # =============================================================================
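
    # A usage sketch (an addition, assuming a UTF-8 CSV next to the script is
    # acceptable): persist the combined table; 'utf-8-sig' keeps the Chinese
    # column headers readable when the file is opened in Excel.
    empty_df.to_csv('maoyan_top100.csv', index=False, encoding='utf-8-sig')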

      
