  • 20200311_Latest mzitu scraper

     No more preamble, straight to the code (Python 3.6):

    import requests
    from bs4 import BeautifulSoup
    import os
    import time
    import random
    
    # pip install BeautifulSoup4 -i https://pypi.douban.com/simple
    # pip install requests -i https://pypi.douban.com/simple
    
    # HTTP request headers for the page requests
    Hostreferer = {
        'Referer': 'http://www.mzitu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    
    # The Referer in these headers defeats the image host's anti-hotlinking check
    Picreferer = {
        'Referer': 'http://www.mzitu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
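    
    # Quick sanity check (hypothetical image URL; assumes the image host still
    # enforces hotlink protection): a request without a Referer is typically
    # rejected (e.g. HTTP 403), while the same request with Picreferer succeeds.
    # requests.get('http://i.meizitu.net/a/example.jpg')                      # likely blocked
    # requests.get('http://i.meizitu.net/a/example.jpg', headers=Picreferer)  # should succeed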
    all_url = 'https://www.mzitu.com'
    # Request the mzitu home page and keep the returned HTML for parsing
    start_html = requests.get(all_url, headers=Hostreferer)
    
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    # Maximum number of index pages
    max_page = page[-2].text
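    # Why page[-2]: the pager (markup assumed) ends with a "next page" arrow,
    # so the second-to-last link carries the largest page number, e.g.:
    #   <a class="page-numbers">2</a> ... <a class="page-numbers">237</a>
    #   <a class="next page-numbers">下一页</a>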
    for n in range(1, int(max_page) + 1):
        path = 'D:/mzitu/'  # storage root
        all_url = 'https://www.mzitu.com'  # reset for every index page
        if n != 1:
            all_url = all_url + "/page/" + str(n) + "/"
        print('Crawling index page %s at %s' % (n, all_url))
        start_html = requests.get(all_url, headers=Hostreferer)
        soup = BeautifulSoup(start_html.text, "html.parser")
        # All gallery links sit under the element with id="pins"
        hrefs = soup.find(id='pins').find_all('a', target='_blank')
    
        for href in hrefs:
            img = href.find('img', class_='lazy')
            if img is None:
                # Each gallery also has a text-only title link without an <img>;
                # skip it rather than break, which would abort the whole page
                continue
            alt = img.get('alt')
            url = href.get('href')
            start_html2 = requests.get(url, headers=Hostreferer)
            soup2 = BeautifulSoup(start_html2.text, "html.parser")
            page2 = soup2.find('div', class_='pagenavi').find_all('a')
            # Second-to-last pager link again: the number of images in the gallery
            max_page2 = page2[-2].text
            # Build the gallery folder in a new variable; appending to `path`
            # itself would nest every later gallery inside the previous one
            dir_path = path + alt.strip().replace('?', '')
            os.makedirs(dir_path, exist_ok=True)
            for m in range(1, int(max_page2) + 1):  # +1 so the last image is not skipped
                # Random pause to be gentle on the server
                time.sleep(random.randint(1, 5))
                url3 = url + '/' + str(m) + '/'
                print('Crawling %s' % url3)
                start_html3 = requests.get(url3, headers=Hostreferer)
                soup3 = BeautifulSoup(start_html3.text, "html.parser")
                # The full-size image sits in <div class="main-image">
                picSrc = soup3.find('div', class_='main-image').find('a').find('img').get('src')
                html = requests.get(picSrc, headers=Picreferer)
                # Name the file after the last segment of the image URL
                file_name = dir_path + '/' + picSrc.split('/')[-1]
                # Save the image
                with open(file_name, 'wb') as f:
                    f.write(html.content)
                print('Image saved to %s' % file_name)
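
     A few hardening ideas, sketched below: add a timeout, check the HTTP status before writing to disk, and reuse one connection via a session with a simple retry. The helper name download_image, the retry count, and the timeout value are my own assumptions, not part of the original script; the sketch relies on the imports and Picreferer headers already defined above.

    # Hypothetical helper, not in the original post: a slightly more robust
    # download step built only on documented requests features.
    def download_image(session, pic_src, file_name, headers, retries=3):
        for attempt in range(retries):
            try:
                resp = session.get(pic_src, headers=headers, timeout=10)
                resp.raise_for_status()  # fail on 403/404/5xx instead of saving an error page
                with open(file_name, 'wb') as f:
                    f.write(resp.content)
                return True
            except requests.RequestException:
                time.sleep(2 ** attempt)  # simple backoff before retrying
        return False

    # Usage inside the inner loop, replacing the bare requests.get + open:
    # session = requests.Session()   # create once, outside the loops
    # download_image(session, picSrc, file_name, Picreferer)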
    

      
