zoukankan      html  css  js  c++  java
  • IMDB-TOP_250-爬虫

    这个小学期Python大作业搞了个获取IMDB TOP 250电影全部信息的爬虫。第二次写爬虫,比在暑假集训时写的熟练多了。欢迎大家评论。

      1 '''
      2 ************************************************
      3 *Time:2017.9.11       
      4 *Target:All movies' information of IMDB TOP_250
      5 *Resources:http://www.imdb.cn/IMDB250/
      6 ************************************************
      7 '''
      8 
      9 import re
     10 import requests
     11 import numpy as np
     12 import matplotlib.pyplot as plt
     13 from bs4 import BeautifulSoup
     14 
     15 num = 1 #电影计数
     16 All_txt = [] #全部电影的信息
     17 headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}#浏览器代理
     18 def  getHTMLText(url):
     19     try:
     20         #print(url)
     21         r = requests.get( url,headers = headers )
     22         #print(r)
     23         r.encoding = 'utf-8'
     24         return r.text
     25     except:
     26         return "错误"
     27 
     28 #从每一部电影的页面中获取全部信息
     29 def get_all_information(url,page):
     30     global num,All_txt
     31     txt = getHTMLText(url)
     32     if txt != "错误":
     33         print('page'+str(page)+' NO.'+str(num)+' Get it!')
     34     if num == 247:
     35         print('Finished!!!')
     36     soup = BeautifulSoup(txt,"html.parser")
     37     Cname,Ename,Score,title,Actor,Starring,Infor = '','','','','','',''
     38 
     39     #TOP250-film_Chinese_name&Score
     40     infor_1 = soup.find_all('div',class_ = 'hdd')
     41     rel = '<h3>'+'[sS]*?'+'</h3>'
     42     pattern = re.compile(rel)
     43     Cname = ''.join(pattern.findall(str(infor_1[0])))
     44     Cname = Cname.replace('<h3>','').replace('</h3>','')
     45     #print(Cname)
     46     #find_the_year & save
     47     rel = ''+'[sS]*?'+''
     48     pattern = re.compile(rel)
     49     time_ = ''.join(pattern.findall(Cname))
     50     #print(time_)
     51     with open('time.txt','a',encoding='utf-8') as t:
     52         t.write( time_.replace('','').replace('','') + '
    ' )
     53     #find_Score
     54     rel = '<i>'+'[sS]*?'+'</i>'
     55     pattern = re.compile(rel)
     56     Score = ''.join(pattern.findall(str(infor_1[0])))
     57     Score = Score.replace('<i>','').replace('</i>','')
     58     #print(Cname,Score)
     59 
     60     #TOP250-film_many_infor
     61     now = soup.find_all('div',class_ = 'bdd clear')
     62     #print(now[0])
     63     a = BeautifulSoup(str(now[0]), "html.parser")
     64     many_infor = a.find_all('li')
     65 
     66     #TOP250-film_Ename
     67     Ename = str(many_infor[0]).replace('<li>','').replace('<i>','').replace('</i>','').replace('</li>','').replace('<a>','').replace('</a>','')
     68     #TOP250-film_Actor
     69     Actor_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all('a')
     70     Actor = Actor_temp[0].get_text().replace('导演:','')
     71     #TOP250-film_Starring
     72     Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all('a')
     73     for i in Starring_temp:
     74         Starring += i.get_text().replace(' ','') + ' '
     75     #print(Starring)
     76 
     77     #Top-film_Infor
     78     for j in range(4,7):
     79         Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
     80         for i in Infor_temp.children:
     81             Infor += i.get_text().replace(' ','') + ' '
     82         Infor += '
    '
     83     #print(Infor)
     84 
     85     #TOP250-film_Synopsis
     86     content =  soup.find_all('div',class_ = 'fk-4 clear')
     87     #print(content)
     88     soup_con = BeautifulSoup(str(content[0]), "html.parser")
     89     title = soup_con.find_all('div',class_ = 'hdd')
     90     title = str(title[0]).replace('<div class="hdd">','').replace('</div>','
    ')
     91     #print(title)
     92     content_1 = soup_con.find_all('div',class_ = 'bdd clear')
     93     content_1 = str(content_1[0]).replace('<div class="bdd clear" style="font-size:15px">','').replace('</div>','')
     94     content_1 = content_1.replace('<!-- <p><a href="#">更多剧情 >></a></p>  -->','').replace('<br/>','
    ')
     95 
     96     #Save_all_information
     97     All_txt.append(''+str(num)+''+'
    ')
     98     All_txt.append( Cname+'
    ' )
     99     All_txt.append( '【英文名】'+Ename+'
    ' )
    100     All_txt.append( '【评分】'+Score+'
    ' )
    101     All_txt.append( '【导演】'+Actor+'
    ' )
    102     All_txt.append( '【主演】'+Starring+'
    ' )
    103     All_txt.append( Infor+'
    ' )
    104     All_txt.append( title+'
    '+content_1+'
    ' )
    105     All_txt.append('
    ')
    106     num += 1
    107 
    108 #在每一页中得到当前页的全部电影的url
    109 def getin_one(url,page):
    110     txt = getHTMLText(url)
    111     soup = BeautifulSoup(txt, "html.parser")
    112     #print(soup)
    113     temp = soup.find_all('div',class_="ss-3 clear")
    114     rel = '<a href="' + '[sS]*?' + '">'
    115     pattern = re.compile(rel)
    116     All_url = pattern.findall( str(temp[0]) )
    117     for i in range(len(All_url)):
    118         temp_url = 'http://www.imdb.cn'+All_url[i].replace('<a href="','').replace('">','')
    119         get_all_information(temp_url,page)
    120     #print(All_url)
    121 
    122 #将所有电影的年份统计并生成条形图
    123 def Analyze_some_infor():
    124     plt.rc('font', family='SimHei', size=13)#字体及大小
    125     #Analyze_time
    126     file = open('time.txt')
    127     a,b,c,d,e,f = 0,0,0,0,0,0
    128     for line in file:
    129         line = eval(line)
    130         if line == 0:
    131             f += 1
    132         elif line < 1940 and line >= 1920:
    133             a += 1 
    134         elif line < 1960 and line >= 1940:
    135             b += 1
    136         elif line < 1980 and line >= 1960:
    137             c += 1
    138         elif line < 2000 and line >= 1980:
    139             d += 1
    140         else:
    141             e += 1
    142     times = [a,b,c,d,e,f]
    143     range_time = ['1920-1940','1940-1960','1960-1980','1980-2000','2000-现在','无信息']
    144     idx = np.arange(len(range_time))
    145     width = 0.5
    146     plt.bar(idx,times,width,color='green')
    147     plt.xticks(idx+width/2, range_time, rotation=40)
    148     plt.xlabel('电影年代')
    149     plt.ylabel('数目')
    150     plt.savefig('time_pic.jpg')
    151     plt.show()
    152 
    153 def main():
    154     global All_txt
    155     getin_one('http://www.imdb.cn/IMDB250/',1)
    156     for i in range(2,10):
    157         getin_one( 'http://www.imdb.cn/imdb250/'+str(i) , i )
    158     #将已有内容清空
    159     with open('All_infor.txt','w',encoding='utf-8') as x:
    160         pass
    161     with open('All_infor.txt','a',encoding='utf-8') as x:
    162         for i in All_txt:
    163             x.write(i)
    164     Analyze_some_infor()
    165 
    166 main()

    作者: LB919
    出处:http://www.cnblogs.com/L1B0/
    该文章为LB919投入了时间和精力的原创;
    如有转载,荣幸之至!请随手标明出处;

  • 相关阅读:
    Redhat as 版本下启用 Telnet 和 FTP 服务
    Eclipse中设置编码的方式
    rhel3上安装Oracle(来自Oracle网站)
    home/end的快捷键~
    Red Hat Linux 9中文本模式与图形模式的切换
    Highcharts:非常漂亮的图表API
    Linux裸设备总结(ZT)
    Red Hat Linux操作系统下从文本模式切换到图形模式的方法
    pear
    Java中的asList
  • 原文地址:https://www.cnblogs.com/L1B0/p/7545073.html
Copyright © 2011-2022 走看看