zoukankan      html  css  js  c++  java
  • Python3多线程爬取meizitu的图片

    python环境:python3

    运行环境:win10和linux都可以,其他系统没测

    1 安装依赖

    pip install requests
    pip install lxml
    pip install feedparser

    2 创建一个新文件夹

    3 运行该脚本

    python mzitu.py

    源码如下:

    # -*- coding: UTF-8 -*-
    import feedparser
    import requests
    from lxml import etree
    import threading
    import random
    import os
     
    def get_url():
        """Collect the newest gallery links from the mzitu RSS feed.

        Returns:
            list[str]: up to 20 gallery front-page URLs (fewer if the
            feed currently has fewer entries).
        """
        rss_url = 'https://www.mzitu.com/feed/'
        feeds = feedparser.parse(rss_url)

        # Slice instead of a fixed range(20) index loop: a feed with
        # fewer than 20 entries no longer raises IndexError.
        return [entry['link'] for entry in feeds.entries[:20]]
     
    def download(dirname, imgurl):
        """Stream one image from ``imgurl`` into directory ``dirname``.

        The directory is created on demand; the local file name is the
        last path segment of the image URL.

        Args:
            dirname: target directory (the gallery title), relative to cwd.
            imgurl: direct URL of the image file.
        """
        headers = {
            # mzitu rejects hot-linking: a same-site referer is required.
            'referer': 'https://www.mzitu.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }

        filename = imgurl.split('/')[-1]

        # exist_ok closes the check-then-create race the original
        # exists()/mkdir() pair had when several threads share a dir,
        # and removes the duplicated write branches.
        os.makedirs(dirname, exist_ok=True)

        # Context manager guarantees the connection is released.
        with requests.get(imgurl, headers=headers, stream=True) as r:
            with open(os.path.join(dirname, filename), 'wb') as f:
                # 32-byte chunks meant one write call per 32 bytes;
                # 64 KiB still streams with low memory but far less overhead.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
            print('下载:%s中' % filename)
     
    def get_img(url):
        """Scrape one gallery and download every image in it.

        Args:
            url: gallery front-page URL (e.g. https://www.mzitu.com/12345).
        """
        r = requests.get(url)
        page = etree.HTML(r.text)
        # The last pagination link's <span> holds the total page count.
        span = page.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span')
        hs = page.xpath('//h2[@class="main-title"]')
        if not span or not hs:
            # Layout change / failed load: the original died with a
            # NameError on unbound ``title``/``pages``; bail out explicitly.
            print('无法解析页面: %s' % url)
            return
        title = hs[-1].text
        pages = int(span[-1].text)

        for i in range(1, pages + 1):
            # Page 1 is the gallery URL itself; later pages append /<n>.
            # The original started at i == 0 and fetched a bogus url + '/0'
            # while skipping page 1 entirely.
            imgpage = url if i == 1 else url + '/' + str(i)
            try:
                r1 = requests.get(imgpage)
                page1 = etree.HTML(r1.text)
                for img in page1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img'):
                    download(title, img.get('src'))
            except Exception as e:
                # Best-effort like the original, but report instead of the
                # bare ``except: pass`` that hid every error (and swallowed
                # KeyboardInterrupt); Ctrl-C now propagates.
                print('跳过 %s: %s' % (imgpage, e))
     
    def main():
        """Fetch the gallery list and download each gallery in its own thread."""
        # One thread per gallery; iterate the URLs directly instead of
        # the original ``range(len(urls))`` / ``urls[0+i]`` indexing.
        threads = [threading.Thread(target=get_img, args=(url,))
                   for url in get_url()]

        for t in threads:
            t.start()

        # Block until every gallery has finished downloading.
        for t in threads:
            t.join()

    if __name__ == '__main__':
        main()
    View Code

    如果遇到问题,源码请到百度网盘下载;百度网盘   提取码:7pv8 

    4 升级版(可下载所有组图)

    源码如下:

    # -*- coding: UTF-8 -*-
    import feedparser
    import requests
    from lxml import etree
    import threading
    import random
    import os
     
    def get_url2():
        """Scrape the archive page and return the URLs of ALL galleries.

        Returns:
            list[str]: one front-page URL per gallery listed on /all/.
        """
        rss_url = 'https://www.mzitu.com/all/'
        r = requests.get(rss_url)
        page = etree.HTML(r.text)
        result = page.xpath('/html/body/div[2]/div[1]/div[2]/ul/li/p[2]/a')
        # The original print string was mojibake ('鏈?d缁勫浘') whose %d
        # directive was destroyed, so the %-format raised TypeError.
        print('共%d组图' % len(result))
        return [a.get('href') for a in result]
     
    def download(dirname, imgurl):
        """Stream one image from ``imgurl`` into directory ``dirname``.

        The directory is created on demand; the local file name is the
        last path segment of the image URL.

        Args:
            dirname: target directory (the gallery title), relative to cwd.
            imgurl: direct URL of the image file.
        """
        headers = {
            # mzitu rejects hot-linking: a same-site referer is required.
            'referer': 'https://www.mzitu.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }

        filename = imgurl.split('/')[-1]

        # exist_ok closes the check-then-create race the original
        # exists()/mkdir() pair had when several threads share a dir.
        os.makedirs(dirname, exist_ok=True)

        # Context manager guarantees the connection is released.
        with requests.get(imgurl, headers=headers, stream=True) as r:
            with open(os.path.join(dirname, filename), 'wb') as f:
                # 32-byte chunks meant one write call per 32 bytes;
                # 64 KiB still streams with low memory but far less overhead.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
            # The original line was mojibake with a corrupted closing quote
            # ("'涓嬭浇:%s涓? % filename)") — an unterminated string literal,
            # i.e. a SyntaxError. Restored to match the first script.
            print('下载:%s中' % filename)
     
    def get_img(url):
        """Scrape one gallery and download every image in it.

        Args:
            url: gallery front-page URL (e.g. https://www.mzitu.com/12345).
        """
        r = requests.get(url)
        page = etree.HTML(r.text)
        # The last pagination link's <span> holds the total page count.
        span = page.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span')
        hs = page.xpath('//h2[@class="main-title"]')
        if not span or not hs:
            # Layout change / failed load: the original died with a
            # NameError on unbound ``title``/``pages``; bail out explicitly.
            print('无法解析页面: %s' % url)
            return
        title = hs[-1].text
        pages = int(span[-1].text)

        for i in range(1, pages + 1):
            # Page 1 is the gallery URL itself; later pages append /<n>.
            # The original started at i == 0 and fetched a bogus url + '/0'
            # while skipping page 1 entirely.
            imgpage = url if i == 1 else url + '/' + str(i)
            try:
                r1 = requests.get(imgpage)
                page1 = etree.HTML(r1.text)
                for img in page1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img'):
                    download(title, img.get('src'))
            except Exception as e:
                # Best-effort like the original, but report instead of the
                # bare ``except: pass`` that hid every error (and swallowed
                # KeyboardInterrupt); Ctrl-C now propagates.
                print('跳过 %s: %s' % (imgpage, e))
     
    def main():
        """Fetch ALL gallery URLs and download each gallery in its own thread.

        NOTE(review): this spawns one thread per gallery (hundreds on /all/),
        which is why the post warns about memory usage — a bounded pool would
        be kinder, but the one-thread-per-URL behavior is kept as-is.
        """
        # Iterate the URLs directly instead of the original
        # ``range(len(urls))`` / ``urls[0+i]`` indexing.
        threads = [threading.Thread(target=get_img, args=(url,))
                   for url in get_url2()]

        for t in threads:
            t.start()

        # Block until every gallery has finished downloading.
        for t in threads:
            t.join()

    if __name__ == '__main__':
        main()
    View Code

    如果遇到问题,源码请到百度网盘下载;百度网盘  提取码:nxoo 

    注意:经测试,4 升级版在运行时,会大量占用内存,内存小的电脑估计扛不住。。

  • 相关阅读:
    ThinkPHP Model+数据库的切换使用
    关于SSD安装系统的一些设置(PE安装win 7)
    PHP实现文件下载:header
    Thinkphp 使用PHPExcel导入,栗子
    Ueditor 的使用(这里以php+ci为例)
    js获取鼠标选中的文字内容
    WNMP 下 Nginx 配置 (使用了phpfind一键安装环境)
    javascript 实现 trim
    javascript 获取 CSS 样式表属性
    javascript 删除节点问题
  • 原文地址:https://www.cnblogs.com/zkfopen/p/10822332.html
Copyright © 2011-2022 走看看