  • Scraping the Doutula (doutula.com) meme site with Python

     First, let's briefly analyze the site. What we want is the images themselves, so the plan is to extract each image's download URL from the list pages and then download the files.

    The download URL for every image sits in each list item's img tag (in its data-original attribute), so let's get started. A quick single-page check of the selectors is sketched below.
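    The following is only a minimal sketch (the XPath expressions are the ones used in the full script further down; the page number is just an example) that fetches one list page and prints the alt text / image URL pairs, so you can confirm the selectors before running the full crawler:

    import requests
    from lxml import etree

    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get("https://www.doutula.com/photo/list/?page=1", headers=headers)
    html = etree.HTML(resp.text)
    # alt text is later used as the file name; data-original holds the real image URL
    alts = html.xpath('//a[@class="col-xs-6 col-sm-3"]//img[@referrerpolicy="no-referrer"]/@alt')
    srcs = html.xpath('//a[@class="col-xs-6 col-sm-3"]//img[@referrerpolicy="no-referrer"]/@data-original')
    for alt, src in zip(alts, srcs):
        print(alt, src)

    With the selectors confirmed, the full multi-threaded script looks like this: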

    from lxml import etree
    import requests
    from urllib import request
    import re
    import os
    import threading
    from queue import Queue
    
    # Producer thread: takes list-page URLs from page_queue and pushes
    # (image URL, file name) pairs onto img_queue.
    class Producer(threading.Thread):
        def __init__(self, page_queue, img_queue, *args, **kwargs):
            super(Producer, self).__init__(*args, **kwargs)
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
            self.page_queue = page_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                if self.page_queue.empty():
                    break
                url = self.page_queue.get()
                self.parse_page(url)
    
    
        def parse_page(self, url):
            response = requests.get(url, headers=self.headers)
            html = etree.HTML(response.text)
            # alt text becomes the file name; data-original holds the real image URL
            alts = html.xpath('//a[@class="col-xs-6 col-sm-3"]//img[@referrerpolicy="no-referrer"]/@alt')
            imgs = html.xpath('//a[@class="col-xs-6 col-sm-3"]//img[@referrerpolicy="no-referrer"]/@data-original')

            for alt, img in zip(alts, imgs):
                # Strip punctuation that is awkward or illegal in file names
                name = re.sub(r'[\??,,。.!!、\*]', '', alt)
                ext = os.path.splitext(img)[1]
                filename = name + ext
                self.img_queue.put((img, filename))
    
    # Consumer thread: downloads the (image URL, file name) pairs that the
    # producers have put on img_queue.
    class Consumer(threading.Thread):
        def __init__(self, page_queue, img_queue, *args, **kwargs):
            super(Consumer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                if self.img_queue.empty() and self.page_queue.empty():
                    break
                img, filename = self.img_queue.get()
                request.urlretrieve(img, "C:\\Users\\Administrator\\Desktop\\imgs\\" + filename)
                print("%s downloaded" % filename)
    
    def main():
        # Queues for the 3391 list-page URLs and the (image URL, file name) pairs
        page_queue = Queue(3391)
        img_queue = Queue(2000)
        for i in range(1, 3392):
            url = "https://www.doutula.com/photo/list/?page={}".format(i)
            page_queue.put(url)

        # 20 producer threads parse pages, 30 consumer threads download images
        for x in range(20):
            t = Producer(page_queue, img_queue)
            t.start()

        for h in range(30):
            t = Consumer(page_queue, img_queue)
            t.start()
    
    
    
    
    if __name__ == '__main__':
        main()
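    One caveat with the consumer's exit check above: the emptiness test can race with the producers, so a consumer may break early while images are still being parsed, or block forever in img_queue.get() if the producers finish without putting anything more on the queue. A common, more robust pattern is a sentinel-based shutdown. The following is only a sketch of that idea; SENTINEL and the producers/consumers lists are hypothetical names, not part of the original script:

    import threading
    from urllib import request

    SENTINEL = (None, None)   # hypothetical "stop" marker, one per consumer

    class Consumer(threading.Thread):
        def __init__(self, img_queue, *args, **kwargs):
            super(Consumer, self).__init__(*args, **kwargs)
            self.img_queue = img_queue

        def run(self):
            while True:
                img, filename = self.img_queue.get()   # blocks until an item arrives
                if (img, filename) == SENTINEL:
                    break                              # main() enqueues one sentinel per consumer
                request.urlretrieve(img, "C:\\Users\\Administrator\\Desktop\\imgs\\" + filename)
                print("%s downloaded" % filename)

    # In main(), after starting the threads:
    #     for p in producers:
    #         p.join()                      # wait until every page has been parsed
    #     for _ in consumers:
    #         img_queue.put(SENTINEL)       # wake and stop each consumer

    With sentinels, the consumers no longer need to look at page_queue at all, and they stop exactly once every parsed image has been downloaded.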
  • Original post: https://www.cnblogs.com/wocaonidaye/p/12725460.html