zoukankan      html  css  js  c++  java
  • 记一次Python爬虫实战,豆瓣电影Top250爬虫

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import re
     4 import traceback
     5 
     6 def GetHtmlText(url):
     7     for i in range(0,1):        #尝试两次
     8         try:
     9             r=requests.get(url)
    10             r.encoding = 'utf-8'
    11             r.raise_for_status();
    12             return r.text;
    13         except:
    14             traceback.print_exc()
    15             continue
    16     return 
    17 
    18 def GetMovieInfo(url):
    19     movieDict={}
    20     for page in range(0,10):
    21         try:
    22             page_url = '?start='+str(page*25)
    23             html = GetHtmlText(url+page_url)
    24             Soup = BeautifulSoup(html, 'html.parser')
    25             movie = Soup.find(name="ol",class_='grid_view') #所有电影信息
    26             movieList = movie.find_all(name='li')  #电影信息列表
    27             for single in movieList:        #循环单页的电影信息
    28                 num = single.find(name='em').string    #电影排名
    29                 title1 = single.find_all(name='span',class_='title')
    30                 title2 = single.find(name='span',class_='other').string
    31                 if len(title1)==2:
    32                     movieTitle = title1[0].string+title1[1].string+title2.string
    33                 else:
    34                     movieTitle = title1[0].string+title2.string
    35                 classBD = single.find(name='div',class_='bd').contents    #我也不知道为什么bs给我返回7个节点
    36                 movieActor = classBD[1].text
    37                 movieRating = re.findall(r'\d?\.\d?',str(classBD[3]))[0]
    38                 movieQuote = classBD[5].text       
    39                 movieDict['num'] = num
    40                 movieDict['movieTitle'] = movieTitle
    41                 movieDict['actor'] = movieActor
    42                 movieDict['rating'] = movieRating
    43                 movieDict['quote'] = movieQuote
    44                 printMovieInfo(movieDict)
    45         except:
    46             traceback.print_exc()
    47 
    48 
    49 def printMovieInfo(Info):
    50     try:
    51         with open('/home/why/py/movieInfo.txt','a',encoding='utf-8') as f:
    52             f.write(str(Info['num']+Info['movieTitle']+'\n'+Info['actor']+'\n评分:'+Info['rating']+'\n评价:'+Info['quote']+'\n'))
    53     except:
    54         traceback.print_exc()
    55 
    56 
    57 def main():
    58     base_url = 'https://movie.douban.com/top250'
    59     GetMovieInfo(base_url)
    60 main()

    结果:

  • 相关阅读:
    [BZOJ5015][Snoi2017]礼物
    [BZOJ5016][Snoi2017]一个简单的询问
    [BZOJ4184]shallot
    上传头像
    前端基础之jQuery
    前端之html的查漏补缺
    CSS3圆角、阴影、rgba
    CSS3新增选择器
    前端基础之BOM和DOM
    前端基础之JavaScript
  • 原文地址:https://www.cnblogs.com/Alexzzzz/p/7900299.html
Copyright © 2011-2022 走看看