zoukankan      html  css  js  c++  java
  • 爬取套图 xpath

    # -*- coding: utf-8 -*-
    """
    Created on Sat Nov 7 19:15:04 2020
    微信公众号: 凹凸数据
    @File :spider.py
    @Author :叶庭云
    @CSDN :https://yetingyun.blog.csdn.net/
    """
    import requests
    from random import choice, randint
    from lxml import etree
    import os
    from concurrent.futures import ThreadPoolExecutor
    from time import sleep
    
    
    # Hand-built pool of browser User-Agent strings; one is picked at random for
    # each request so that consecutive requests do not all share the same header.
    user_agent = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    
    # Make sure the download root exists. exist_ok=True makes this a single
    # call with no window between "check" and "create", so it is safe to re-run
    # and safe if something else creates the folder at the same moment.
    os.makedirs('女神套图', exist_ok=True)
    
    
    # Collect the photo-set URLs from the first five listing pages.
    def get_taotu_url():
        """Return the photo-set detail-page URLs found on listing pages 1-5.

        Each listing page is fetched with a User-Agent picked at random from
        ``user_agent``. The hrefs under the ``tab_tj`` container are extracted
        and only those that are exactly 39 characters long are kept.

        Returns:
            list: the kept hrefs, in listing-page order.

        Raises:
            requests.RequestException: if a listing page cannot be fetched
                (including when the request times out).
        """
        taotu_urls = []
        for i in range(1, 6):
            url = f'http://www.win4000.com/meinvtag4_{i}.html'
            headers = {
                'User-Agent': choice(user_agent)
            }
            # The timeout keeps one stalled connection from hanging the run.
            rep = requests.get(url, headers=headers, timeout=10)
            html = etree.HTML(rep.text)
            taotu_url = html.xpath('//div[@class="tab_tj"]/div/div/ul/li/a/@href')
            # The original author filters "invalid" links by length: only hrefs
            # of exactly 39 characters are treated as photo-set pages (the
            # author's note says a listing page yields 24 of them).
            taotu_url = [item for item in taotu_url if len(item) == 39]
            taotu_urls.extend(taotu_url)

        return taotu_urls
    
    
    # Open one photo set's detail pages and download every image in it.
    def get_img(url):
        """Download all images of the photo set at *url*.

        The set title and page count are read from the ``ptitle`` block of the
        first detail page. Images are saved as
        ``./女神套图/<title>/<title>第<i>张.jpg``, one per detail page.

        Args:
            url: detail-page URL ending in ``.html``; page *i* of the set is
                requested by replacing ``.html`` with ``_<i>.html``.

        Returns:
            None. A set whose title or page count cannot be located is skipped
            with a message, as is any single page with no image link.

        Raises:
            requests.RequestException: if a request fails or times out.
        """
        headers = {
            'User-Agent': choice(user_agent)
        }
        rep = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(rep.text)

        titles = html.xpath('//div[@class="ptitle"]/h1/text()')
        max_page = html.xpath('//div[@class="ptitle"]/em/text()')
        # Guard against pages whose layout does not match the XPath instead of
        # failing with an IndexError on the [0] lookups.
        if not titles or not max_page:
            print(f'页面解析失败, 跳过: {url}')
            return
        name = titles[0]

        # makedirs(exist_ok=True) tolerates a folder left over from an earlier
        # run (os.mkdir would raise FileExistsError) and also creates the
        # parent folder if it is missing.
        os.makedirs(r'./女神套图/{}'.format(name), exist_ok=True)

        # Turn ".../xxx.html" into the template ".../xxx_{}.html" for paging.
        url1 = url.replace('.html', '_{}.html')
        for i in range(1, int(max_page[0]) + 1):
            url2 = url1.format(i)
            # Random 1-3 second pause between page requests.
            sleep(randint(1, 3))
            reps = requests.get(url2, headers=headers, timeout=10)
            dom = etree.HTML(reps.text)
            srcs = dom.xpath('//div[@class="main-wrap"]/div[1]/a/img/@data-original')
            if not srcs:
                # No image link on this page; keep going with the next one.
                print(f'未找到图片链接, 跳过: {url2}')
                continue
            file_name = name + f'第{i}张.jpg'
            img = requests.get(srcs[0], headers=headers, timeout=10).content
            with open(r'./女神套图/{}/{}'.format(name, file_name), 'wb') as f:
                f.write(img)
            print(f'成功下载图片:{file_name}')
    
    
    # Entry point: download all photo sets using a small thread pool.
    def main():
        """Fetch the photo-set URLs and download each set on a 4-worker pool.

        Each URL is submitted as its own task, and every task's outcome is
        inspected so that an exception raised inside ``get_img`` is reported
        instead of being silently dropped. The success banner is printed only
        when no task failed.
        """
        taotu_urls = get_taotu_url()
        failed = 0
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [(url, executor.submit(get_img, url)) for url in taotu_urls]
            for url, future in futures:
                # Future.exception() waits for the task to finish and returns
                # the exception it raised, or None on success.
                error = future.exception()
                if error is not None:
                    failed += 1
                    print(f'套图下载失败: {url} ({error!r})')
        if failed:
            print(f'下载结束, 有 {failed} 组套图下载失败')
        else:
            print('=================== 图片全部下载成功啦!=====================')
        
        
    # Start the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    故乡明
  • 相关阅读:
    for of 与 for in的区别
    Mac Item2 SSH免密登录Linux 服务器的两种方式
    组塞式,非阻塞式,同步异步
    Thrift_简介(基于C#)
    HTTP Error 500.22
    http协议
    IIS_部署出错
    JavaScript如何实现继承
    $(function(){})与 (function(){})() (function($){})() 的区别
    C#_反射机制
  • 原文地址:https://www.cnblogs.com/luweiweicode/p/14729356.html
Copyright © 2011-2022 走看看