zoukankan      html  css  js  c++  java
  • 电影天堂爬取详情页

    爬取电影天堂最新电影,地址https://www.dytt8.net/html/gndy/dyzz/list_23_1.html

     1 import requests
     2 from lxml import etree
     3 
     4 BASE_DOMAIN = 'https://www.dytt8.net'
     5 HEADERS = {
     6     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
     7 }
     8 
     9 
    10 def get_detail_urls(url):
    11     response = requests.get(url, headers=HEADERS)
    12     text = response.content.decode(encoding='gbk', errors='ignore')
    13     html = etree.HTML(text)
    14     detail_urls = html.xpath('//table[@class="tbspan"]//a/@href')
    15     detail_urls = map(lambda url: BASE_DOMAIN+url, detail_urls)
    16     return detail_urls
    17 
    18 
    19 def get_detail_info(url):
    20     movie = {}
    21     response = requests.get(url, headers=HEADERS)
    22     text = response.content.decode(encoding='gbk', errors='ignore')
    23     html = etree.HTML(text)
    24     movie['title'] = html.xpath('//div[@class="title_all"]//font/text()')[0]
    25     img = html.xpath("//div[@id='Zoom']//img/@src")
    26     movie['cover'] = img[0]
    27     movie['screenshot'] = img[1]
    28     infos = html.xpath('//div[@id="Zoom"]//p/text()')
    29     # 提取信息
    30     is_actors = False
    31     actors = []
    32     for info in infos:
    33         # print(info)
    34         if info.startswith('◎年  代'):
    35             movie['year'] = info.replace("◎年  代", "").strip()
    36         elif info.startswith('◎产  地'):
    37             movie['country'] = info.replace("◎产  地", "").strip()
    38         elif info.startswith('◎类  别'):
    39             movie['category'] = info.replace("◎类  别", "").strip()
    40         elif info.startswith('◎豆瓣评分'):
    41             movie['douban_rating'] = info.replace("◎豆瓣评分", "").strip()
    42         elif info.startswith('◎片  长'):
    43             movie['duration'] = info.replace("◎片  长", "").strip()
    44         elif info.startswith('◎导  演'):
    45             movie['director'] = info.replace("◎导  演", "").strip()
    46         elif info.startswith('◎主  演'):
    47             actors = [info.replace("◎主  演", "").strip()]
    48             is_actors = True
    49         elif is_actors:
    50             if info.startswith(''):
    51                 is_actors = False
    52                 movie['actors'] = actors
    53                 continue
    54             actors.append(info.strip())
    55     movie['download'] = html.xpath("//div[@id='Zoom']//tbody//a/text()")
    56     movie['magnet'] = html.xpath("//div[@id='Zoom']//a/@href")[0]
    57 
    58     return movie
    59 
    60 
    61 def spider():
    62     base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    63     movies = []
    64     for i in range(1, 2):
    65         url = base_url.format(i)
    66         detail_urls = get_detail_urls(url)
    67         for detail_url in detail_urls:
    68             # 对详情页提取信息
    69             # print(detail_url)
    70             movies.append(get_detail_info(detail_url))
    71     print(movies)
    72 
    73 
    74 if __name__ == '__main__':
    75     spider()

    学习的视频中代码有几处跟我的有不同,可以学习

    一、

    其中提取主演的代码不同,如下

    for index,info in enumerate(infos):
            if info.startswith("◎年  代"):
                info = parse_info(info,"◎年  代")
                movie['year'] = info
            # .......省略
            elif info.startswith("◎主  演"):
                info = parse_info(info,"◎主  演")
                actors = [info]
                for x in range(index+1,len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith(""):
                        break
                    actors.append(actor)
                movie['actors'] = actors

    采用的是index的方式.

    enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标,一般用在 for 循环当中。

    二、

    还有

    detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)

    这段代码没怎么用过.记录下.

    三、

    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        for i in range(1, 10):
            url = base_url.format(i)

    以前写的时候没这么写过,都是直接弄成

    url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_'+i+'.html'这样.

    四、

    在自己写代码时有个错误,也需要记录下,在movie的字典赋值的时候,

    如下

        for info in infos:
            # print(info)
            if info.startswith('◎年  代'):
                year = info.replace("◎年  代", "").strip()
            elif info.startswith('◎产  地'):
                country = info.replace("◎产  地", "").strip()
            elif info.startswith('◎类  别'):
                category = info.replace("◎类  别", "").strip()
            elif info.startswith('◎豆瓣评分'):
                douban_rating = info.replace("◎豆瓣评分", "").strip()

    赋值的时候使用

     movie = {
            'year': year,
            'country': country,
            'category': category,
            'douban_rating': douban_rating
        }

    会报错,因为其中有一个豆瓣评分是不存在的,不会对其赋值,所以movie赋值的时候会错误.

    UnboundLocalError: local variable 'douban_rating' referenced before assignment

     当然,个人觉得用正则可以更容易解决。

    在记录下爬取豆瓣正在上映的电影用xpath的代码

    import requests
    from lxml import etree
    
    headers = {
        'Referer': 'https://movie.douban.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    url = "https://movie.douban.com/cinema/nowplaying/changsha/"
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    lis = html.xpath("//div[@id='nowplaying']//ul[@class='lists']/li")
    movies = []
    for li in lis:
        # print(etree.tostring(li, encoding='utf-8').decode('utf-8'))
        title = li.xpath("@data-title")[0]
        score = li.xpath("@data-score")[0]
        duration = li.xpath("@data-duration")[0]
        director = li.xpath("@data-director")[0]
        actor = li.xpath("@data-actors")[0]
        img = li.xpath(".//img/@src")[0]
        movie = {
            'title': title,
            'score': score,
            'duration': duration,
            'director': director,
            'actor': actor,
            'img': img
        }
        movies.append(movie)
    
    print(movies)
  • 相关阅读:
    在Xampp中添加memache扩展
    使用PHP中的curl发送请求
    php开启openssl扩展
    chisel初见2
    chisel初见
    数字IC设计之DC(二):DC设置、库和对象
    数字IC设计之DC(一):DC简介
    IC基础(八):数字电路设计中常用的算法
    IC基础(七):FPGA为什么与ASIC频率差异巨大?
    IC基础(六):时序分析过程需要的相关计算以及处理方法
  • 原文地址:https://www.cnblogs.com/weiwei2016/p/10411041.html
Copyright © 2011-2022 走看看