  • Crawler 11: scraping a complex site (电影天堂, dytt8.net) with lxml
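
The pages on this site are not UTF-8; the script below decodes them as GBK. The core pattern used throughout is: fetch the raw bytes with requests, decode them explicitly, and hand the resulting string to lxml's etree.HTML for XPath queries. A minimal sketch of that pattern, using the first listing page of the section crawled below (the shortened User-Agent string is just illustrative):

    import requests
    from lxml import etree

    # Fetch one listing page and decode it the same way the full script does.
    resp = requests.get("https://www.dytt8.net/html/gndy/dyzz/list_23_1.html",
                        headers={"User-Agent": "Mozilla/5.0"})
    text = resp.content.decode("gbk", "ignore")

    # Build an element tree and run an XPath query against it.
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a[2]/@href")
    print(hrefs[:5])

The full script wraps this pattern into a list-page parser and a detail-page parser: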

    import requests
    from lxml import etree

    # Base domain of the site; detail links on the list pages are relative paths.
    url_domain = "https://www.dytt8.net"
    # A browser-like User-Agent so the requests are not rejected as a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    def get_detail_urls(url):
        """Fetch one listing page and return the absolute URLs of its movie detail pages."""
        response = requests.get(url, headers=headers)
        # The site is GBK-encoded; ignore any bytes that fail to decode.
        text = response.content.decode("gbk", "ignore")
        html = etree.HTML(text)
        # Every movie entry sits in a table with class "tbspan"; the second <a> is the detail link.
        detail_urls = html.xpath("//table[@class='tbspan']//a[2]/@href")
        # The hrefs are relative, so prefix them with the domain.
        detail_urls = map(lambda path: url_domain + path, detail_urls)
        return detail_urls
    
    def parse_info(info, rule):
        """Strip the field label (e.g. "◎年  代") and surrounding whitespace from a line."""
        return info.replace(rule, "").strip()
    
    def parse_detail_url(url="https://www.dytt8.net/html/gndy/dyzz/20200306/59787.html"):
        """Parse one movie detail page into a dict of its metadata."""
        movie = {}
        response = requests.get(url, headers=headers)
        text = response.content.decode("gbk", "ignore")
        html = etree.HTML(text)
        title = html.xpath("//div[@class='title_all']//font[@color='#07519a']//text()")[0]
        movie['title'] = title
        zoomE = html.xpath("//div[@id='Zoom']")[0]
        # Use a relative XPath (".//text()") so the search stays inside the Zoom div;
        # "//text()" would start again from the document root.
        infos = zoomE.xpath(".//text()")
        # The Zoom block is a flat list of text lines; dispatch on each field label.
        for index, info in enumerate(infos):
            if info.startswith("◎年  代"):
                info = parse_info(info, "◎年  代")
                movie['year'] = info
            elif info.startswith("◎产  地"):
                info = parse_info(info, "◎产  地")
                movie['country'] = info
            elif info.startswith("◎类  别"):
                info = parse_info(info, "◎类  别")
                movie['category'] = info
            elif info.startswith("◎上映日期"):
                info = parse_info(info, "◎上映日期")
                movie['date'] = info
            elif info.startswith("◎片  长"):
                info = parse_info(info, "◎片  长")
                movie['time'] = info
            elif info.startswith("◎豆瓣评分"):
                info = parse_info(info, "◎豆瓣评分")
                movie['score'] = info
            elif info.startswith("◎导  演"):
                info = parse_info(info, "◎导  演")
                movie['director'] = info
            elif info.startswith("◎主  演"):
                info = parse_info(info, "◎主  演")
                # The cast spans several lines: collect them until the next "◎" field marker.
                actors = [info]
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                movie['actors'] = actors
        return movie
    
    def spider():
        """Crawl the first seven listing pages and parse every movie detail page found."""
        base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
        movies = []
        for x in range(1, 8):
            url = base_url.format(x)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                movie = parse_detail_url(detail_url)
                movies.append(movie)
                print(movie)
        # Return the collected list so callers can do more than print it.
        return movies

    if __name__ == '__main__':
        spider()
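
As written, the main block only prints each movie dict as it is scraped. Below is a minimal sketch of an alternative main block that persists the results to a JSON file; it relies on spider() returning the collected list as above, and the filename movies.json is just illustrative.

    import json

    if __name__ == '__main__':
        results = spider()
        # ensure_ascii=False keeps the Chinese titles readable in the output file.
        with open("movies.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)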
    

      

  • Original post: https://www.cnblogs.com/wcyMiracle/p/12468299.html