zoukankan      html  css  js  c++  java
  • python3电影详细信息爬取-------------------电影天堂

      1 # -*- coding: utf-8 -*-
      2 # author:zxy
      3 #Date:2018-9-19
      4 
      5 import requests
      6 from lxml import etree
      7 HEADERS = {
      8     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
      9                   'AppleWebKit/537.36 (KHTML, like Gecko)'
     10                   ' Chrome/67.0.3396.99 Safari/537.36'
     11 }
     12 BASE_DOMAIN="http://www.dytt8.net"
     13 
     14 
     15 def get_detail_url(url):
     16     response = requests.get(url, headers=HEADERS) #print(response.content.decode('gbk'))
     17     text = response.text.encode("utf-8")  #拿到数据,,再解码
     18     html = etree.HTML(text)
     19     detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
     20     detail_urls=map(lambda url:BASE_DOMAIN+url,detail_urls)
     21     return detail_urls
     22 
     23 def parse_detail_page(url):
     24     movie={}
     25     response=requests.get(url,headers=HEADERS)
     26     text=response.content.decode('gbk')  #text = response.text.encode("utf-8")
     27     html=etree.HTML(text)
     28     title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
     29     # for x in title:
     30     #     print(etree.tostring(x,encoding="utf-8").decode("utf-8"))
     31     #print(title)
     32     movie['title']=title
     33     Zoome=html.xpath("//div[@id='Zoom']")[0] #return list
     34     imgs=Zoome.xpath(".//img/@src")
     35     #print(cover)
     36     cover=imgs[0]
     37     # screenshot=imgs[1]
     38     movie['cover']=cover
     39     # movie['screenshot']=screenshot  not all movie has screenshot ,so discard for this moment
     40 
     41     def parse_info(info,rule):
     42         return info.replace(rule,"").strip()
     43 
     44     infos=Zoome.xpath(".//text()")
     45     # print(infos) each line is a element of the list
     46 
     47     for index,info in enumerate(infos):
     48         if info.startswith("◎年  代"):
     49             info=parse_info(info,"◎年  代")
     50             # print(info)
     51             movie['year']=info
     52         elif info.startswith("◎产  地"):
     53             info=parse_info(info,"◎产  地")
     54             movie['country']=info
     55         elif info.startswith("◎类  别"):
     56             info=parse_info(info,"◎类  别")
     57             movie['category']=info
     58         elif info.startswith("◎语  言"):
     59             info=parse_info(info,"◎语  言")
     60             movie['language']=info
     61         elif info.startswith("◎字  幕"):
     62             info=parse_info(info,"◎字  幕")
     63             movie['sub_title']=info
     64         elif info.startswith("◎上映日期"):
     65             info=parse_info(info,"◎上映日期")
     66             movie['release_time']=info
     67         elif info.startswith("◎豆瓣评分"):
     68             info=parse_info(info,"◎豆瓣评分")
     69             movie['douban_score']=info
     70         elif info.startswith("◎片  长"):
     71             info=parse_info(info,"◎片  长")
     72             movie['length']=info
     73         elif info.startswith("◎导  演"):
     74             info=parse_info(info,"◎导  演")
     75             movie['director']=info
     76         elif info.startswith("◎主  演"):
     77             info=parse_info(info,"◎主  演")
     78             actors=[info]
     79             for x in range(index+1,len(infos)):
     80                 actor=infos[x].strip()
     81                 if actor.startswith(""):
     82                     break
     83                 actors.append(actor)
     84             movie['actors']=actors
     85         elif info.startswith("◎简  介"):
     86             info=parse_info(info,"◎简  介")
     87             profiles=[info]
     88             for x in range(index+1,len(infos)):
     89                 profile=infos[x].strip()
     90                 if profile.startswith("【下载地址】"):
     91                     break
     92                 profiles.append(profile)
     93                 movie['profiles']=profiles
     94     download_url=html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
     95     #print(download_url)
     96     movie['download_url']=download_url
     97     return movie
     98 
     99 movies=[]
    100 
    101 def spider():
    102     base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    103     for x in range(1,2):  #how much page depend on you
    104         # print("==="*30)
    105         # print(x)
    106         url=base_url.format(x)
    107         detail_urls=get_detail_url(url)
    108         for detail_url in detail_urls:
    109             # print(detail_url)
    110             movie=parse_detail_page(detail_url)
    111             movies.append(movie)
    112 
    113 if __name__ == '__main__':
    114     spider()
    115     with open('movies.txt','a',encoding='utf-8') as f:
    116         for movie in movies:
    117             f.write("="*30)
    118             f.write('
    '*2)
    119             for (key,value) in movie.items():
    120                 if(key=='actors'):
    121                     str='actors :{}'
    122                     f.write(str.format(value))
    123                     f.write('
    ')
    124                 elif(key=='profiles'):
    125                     str='profiles :{}'
    126                     f.write(str.format(value))
    127                     f.write('
    ')
    128                 else:
    129                     f.write(key+":"+value)
    130                     f.write('
    ')
    131             f.write('
    '*3)

    效果如图所示:

  • 相关阅读:
    小程序上传多张图片
    小程序倒计时遇到的问题
    taro小程序展示富文本
    taro小程序地址选择组件
    构建基于Suricata+Splunk的IDS入侵检测系统
    SQL注入学习资料总结
    常见WAF绕过思路
    业务安全漏洞挖掘归纳总结
    细说验证码安全 —— 测试思路大梳理
    验证码安全那些事
  • 原文地址:https://www.cnblogs.com/z-712/p/9693143.html
Copyright © 2011-2022 走看看