zoukankan      html  css  js  c++  java
  • python爬虫爬取某电影网站

    #coding:utf-8
    import requests
    from lxml import etree
    # Site root; hrefs scraped from list pages are appended to this.
    BASE_DOMAIN = "http://www.8080s.net/"

    # Sample list-page address (page 2). The functions visible in this file
    # receive their URL as an argument and do not read this global.
    url = "http://www.8080s.net/dm/list/----14--p2"

    # HTTP headers sent with every request made by this script.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3945.130 Safari/537.36'
        ),
    }
    
    def get_detail_urls(url):
        """Collect the absolute detail-page URLs linked from one list page.

        Args:
            url: Address of the list page to download.

        Returns:
            A list of URLs, one per ``href`` found on anchors inside the
            ``<ul class="me1 clearfix">`` element, each resolved against
            ``BASE_DOMAIN``. Empty when the page body cannot be parsed.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                exceeds the timeout.
        """
        from urllib.parse import urljoin

        # Without a timeout a single stalled connection would hang the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        if html is None:
            # lxml yields None for an empty/unparsable body; nothing to extract.
            return []

        hrefs = html.xpath("//ul[@class='me1 clearfix']//a/@href")
        # urljoin avoids the doubled slash that plain concatenation produces
        # for root-relative hrefs and leaves already-absolute links intact.
        return [urljoin(BASE_DOMAIN, href) for href in hrefs]
    #获取内容页数据
    
    def parse_detail_page(url):
        """Download one detail page and extract its title and update info.

        Args:
            url: Absolute address of the detail page.

        Returns:
            A dict with two keys:
            ``'title'``  - list of text nodes directly under ``<div class="info">``;
            ``'update'`` - list of text nodes anywhere under ``<span class="tip">``.
            Both lists are empty when the page body cannot be parsed.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                exceeds the timeout.
        """
        movie = {'title': [], 'update': []}

        # Without a timeout a single stalled connection would hang the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        if html is None:
            # lxml yields None for an empty/unparsable body.
            return movie

        # text() is the XPath text-node test; a bare "text" would look for
        # child elements literally named <text> and always come back empty.
        movie['title'] = html.xpath("//div[@class='info']/text()")
        movie['update'] = html.xpath("//span[@class='tip']//text()")
        return movie
    
    
    
    
    
    
    
    
    
    #获取列表数据
    def spider(first_page=2, last_page=8):
        """Crawl a range of list pages and parse every linked detail page.

        Each parsed record is printed as soon as it is available and also
        collected so callers can use the results afterwards.

        Args:
            first_page: Number of the first list page to fetch (inclusive).
            last_page: Number of the last list page to fetch (inclusive).
                The defaults cover pages 2 through 8.

        Returns:
            A list of the dicts returned by ``parse_detail_page``, in crawl order.
        """
        base_url = "http://www.8080s.net/dm/list/----14--p{}"
        movies = []
        for page in range(first_page, last_page + 1):
            list_url = base_url.format(page)
            for detail_url in get_detail_urls(list_url):
                movie = parse_detail_page(detail_url)
                movies.append(movie)
                print(movie)
        # Hand the collected records back instead of discarding them.
        return movies
    
    
    
    # Start the crawl only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        spider()
    
  • 相关阅读:
    基本的Dos命令
    OneCloud记录
    Wireguard笔记
    windows网络流量监控
    CoreDNS笔记
    Goland 使用[临时]
    js for循环的同步代码
    看我如何用微信上线CobaltStrike
    图数据库 Nebula Graph 在 Boss 直聘的应用
    熵池 在计算机科学与金融学中的应用
  • 原文地址:https://www.cnblogs.com/wangtanzhi/p/12403330.html
Copyright © 2011-2022 走看看