Python crawler: scraping every movie on Tencent Video [a must-learn hands-on project]

     

Preface
The text and images in this article come from the internet and are for learning and exchange only; they have no commercial use, and copyright remains with the original author. If there is any problem, please contact us promptly so we can handle it.
Author: Python新手学习之家

A crawler, written in Python, that scrapes every movie listed on Tencent Video.
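To run it you need Python 2, BeautifulSoup 4, pymongo, and a MongoDB server on localhost:27017. The version pin below is my assumption, chosen because the code uses the old pymongo Connection API:

    pip install beautifulsoup4 pymongo==2.8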

    # -*- coding: utf-8 -*-
    import re
    import urllib2
    from bs4 import BeautifulSoup
    import string, time
    import pymongo

    NUM = 0        # global: number of movies scraped so far
    m_type = u''   # global: movie genre currently being scraped
    m_site = u'qq' # global: movie site

    # Fetch the raw HTML for the given URL
    def gethtml(url):
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        html = response.read()
        return html
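A quick version note: everything here is Python 2 (urllib2, print statements). Purely as a sketch, not part of the original script, the same fetch helper on Python 3 would look roughly like this; the User-Agent header is my own addition:

    # Python 3 sketch of gethtml (assumes the page decodes as UTF-8)
    from urllib.request import Request, urlopen

    def gethtml3(url):
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # added header; the original sends none
        return urlopen(req).read().decode('utf-8')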
    # Extract the movie genres from the genre list page
    def gettags(html):
        global m_type
        soup = BeautifulSoup(html)  # parse the page so the genre block can be picked out
        #<ul class="clearfix _group" gname="mi_type" gtype="1">
        tags_all = soup.find_all('ul', {'class' : 'clearfix _group', 'gname' : 'mi_type'})

        #<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
        re_tags = r'<a _hot="tag.sub" class="_gtag _hotkey" href="(.+?)" title="(.+?)" tvalue="(.+?)">.+?</a>'
        p = re.compile(re_tags, re.DOTALL)

        tags = p.findall(str(tags_all[0]))
        tags_url = {}  # initialized up front so the return below never hits an unbound name
        if tags:
            for tag in tags:
                tag_url = tag[0].decode('utf-8')
                m_type = tag[1].decode('utf-8')
                tags_url[m_type] = tag_url  # map genre name -> genre listing URL
        else:
            print "Not Find"
        return tags_url
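gettags() therefore returns a dict mapping each genre name to its listing URL. Using the genre link shown in the sample markup above, a run would produce something of this shape (illustrative, not captured output):

    html = gethtml("http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html")
    tag_urls = gettags(html)
    # e.g. {u'动作': u'http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html', ...}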
    # Get the number of result pages for one genre
    def get_pages(tag_url):
        tag_html = gethtml(tag_url)
        soup = BeautifulSoup(tag_html)  # parse the page so the pager block can be picked out
        #<div class="mod_pagenav" id="pager">
        div_page = soup.find_all('div', {'class' : 'mod_pagenav', 'id' : 'pager'})

        #<a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
        re_pages = r'<a class=.+?><span>(.+?)</span></a>'
        p = re.compile(re_pages, re.DOTALL)
        pages = p.findall(str(div_page[0]))
        if len(pages) > 1:
            return pages[-2]  # second-to-last <span> carries the highest page number
        else:
            return 1
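Why pages[-2] rather than pages[-1]: the regex collects the text of every <span> in the pager, and the code assumes the final span is a navigation link rather than a page number. A toy run on made-up pager markup shows the effect:

    # Made-up pager HTML, only to illustrate the regex; not fetched from the site
    sample = ('<a class="c_txt6"><span>24</span></a>'
              '<a class="c_txt6"><span>25</span></a>'
              '<a class="c_txt6"><span>下一页</span></a>')
    print re.findall(r'<a class=.+?><span>(.+?)</span></a>', sample)[-2]  # -> 25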
    # Pull the movie list blocks out of one listing page and hand each to getmovie()
    def getmovielist(html):
        soup = BeautifulSoup(html)
        #<ul class="mod_list_pic_130">
        divs = soup.find_all('ul', {'class' : 'mod_list_pic_130'})
        for div_html in divs:
            div_html = str(div_html).replace(' ', '')  # drop spaces so the regex below matches reliably
            getmovie(div_html)


    def getmovie(html):
        global NUM
        global m_type
        global m_site

        re_movie = r'<li><a class="mod_poster_130" href="(.+?)" target="_blank" title="(.+?)"><img.+?</li>'
        p = re.compile(re_movie, re.DOTALL)
        movies = p.findall(html)
        if movies:
            conn = pymongo.Connection('localhost', 27017)
            movie_db = conn.dianying
            playlinks = movie_db.playlinks
            for movie in movies:
                NUM += 1  # count each movie once (the original incremented NUM twice per movie)
                print "%s : %d" % ("=" * 70, NUM)
                values = dict(
                    movie_title = movie[1],
                    movie_url = movie[0],
                    movie_site = m_site,
                    movie_type = m_type
                )
                print values
                playlinks.insert(values)
                print "_" * 70
        #else:
        #    print "Not Find"
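One compatibility note: pymongo.Connection is the old 2.x API and was removed in pymongo 3, and Collection.insert() is gone as of pymongo 4. On a current pymongo the same write would look roughly like the sketch below; opening the client once, outside getmovie(), would also avoid reconnecting for every listing page.

    # Sketch for pymongo >= 3 (same data, newer API)
    client = pymongo.MongoClient('localhost', 27017)
    playlinks = client.dianying.playlinks
    playlinks.insert_one(values)  # insert() was replaced by insert_one()/insert_many()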
    # Fetch extra links (e.g. featurettes) from a movie detail page; not called from main
    def getmovieinfo(url):
        html = gethtml(url)
        soup = BeautifulSoup(html)

        #pack pack_album album_cover
        divs = soup.find_all('div', {'class' : 'pack pack_album album_cover'})

        #<a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="《血滴子》独家纪录片" wl="1"> </a>
        re_info = r'<a href="(.+?)" target="new" title="(.+?)" wl=".+?"> </a>'
        p_info = re.compile(re_info, re.DOTALL)
        m_info = p_info.findall(str(divs[0]))
        if not m_info:
            print "Not find movie info"
        return m_info


    # Store one movie-info record; not called from main (a global conn would have to be set up first)
    def insertdb(movieinfo):
        global conn
        movie_db = conn.dianying_at
        movies = movie_db.movies
        movies.insert(movieinfo)
    if __name__ == "__main__":
        # the genre list page; every genre link is pulled from here
        tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
        tags_html = gethtml(tags_url)
        tag_urls = gettags(tags_html)  # {genre name: genre listing URL}

        for tag, url in tag_urls.items():
            m_type = tag  # keep the genre current so getmovie() stores the right movie_type
            url = url.encode('utf-8')
            print url
            maxpage = int(get_pages(url))
            print maxpage

            for x in range(0, maxpage):
                # e.g. http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
                m_url = url.replace('0_20_0_-1_0.html', '')
                movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)  # the 7th underscore field is the page index
                print movie_url
                movie_html = gethtml(movie_url)
                getmovielist(movie_html)
                time.sleep(0.1)  # small delay between requests to go easy on the server
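The per-page URLs are built by string surgery: the loop strips the trailing '0_20_0_-1_0.html' from the genre URL and re-appends it with the page index substituted in. Traced by hand for the 动作 genre URL:

    url = 'http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html'
    base = url.replace('0_20_0_-1_0.html', '')  # -> 'http://v.qq.com/list/1_0_-1_-1_1_0_'
    print "%s%d_20_0_-1_0.html" % (base, 3)     # page index 3 -> .../1_0_-1_-1_1_0_3_20_0_-1_0.html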
And with that, the job is done. Did everyone follow the code above? If not, keep practicing and it will come naturally.
Original post: https://www.cnblogs.com/chengxuyuanaa/p/12005765.html