zoukankan      html  css  js  c++  java
  • Python多线程爬取某网站表情包

    # 爬取网络图片
    import requests
    from lxml import etree
    from urllib import request
    from queue import Queue # 导入队列
    import threading
    import os
    import re

    class Producer(threading.Thread):
        """Worker thread that turns listing-page URLs into image download jobs.

        Each producer repeatedly takes a page URL from ``page_Queue``, downloads
        that page, and puts one ``(image_url, filename)`` tuple on
        ``image_Queue`` for every matching ``<img>`` element it finds.
        """

        # Browser-like User-Agent sent with every page request.
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }
        # Seconds to wait for a page response before giving up on that page.
        request_timeout = 10

        def __init__(self, page_Queue, image_Queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``.

            Args:
                page_Queue: queue of listing-page URLs to fetch.
                image_Queue: queue that receives ``(image_url, filename)`` tuples.
            """
            super().__init__(*args, **kwargs)
            self.page_Queue = page_Queue
            self.image_Queue = image_Queue

        def run(self):
            """Fetch and parse page URLs until the page queue is drained."""
            # Local import: the file-level import only brings in ``Queue``.
            from queue import Empty

            while True:
                # A non-blocking get avoids the check-then-get race: with a
                # separate ``empty()`` test another thread could take the last
                # URL in between and leave this thread blocked forever.
                try:
                    url = self.page_Queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Download one listing page and enqueue every image found on it.

            Args:
                url: address of the listing page to fetch.

            A failed request is reported and the page is skipped, so one bad
            page does not terminate the worker thread.
            """
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.request_timeout
                )
            except requests.RequestException as exc:
                print('Failed to fetch %s: %s' % (url, exc))
                return

            html = etree.HTML(response.text)
            if html is None:
                # Defensive: skip pages for which the parser yields no root.
                return

            imgs = html.xpath("//div[@class='page-content text-center']//a//img")
            for img in imgs:
                img_url = img.get("data-original")
                if not img_url:
                    # Without the attribute there is nothing to download.
                    continue
                filename = self._build_filename(img_url, img.get("alt"))
                self.image_Queue.put((img_url, filename))

        @staticmethod
        def _build_filename(img_url, alt):
            """Return a local file name built from the image's alt text and URL.

            Args:
                img_url: image address; its extension becomes the file suffix.
                alt: the ``alt`` text used as the file stem; may be ``None``.

            Characters that are not allowed in Windows file names (and spaces)
            are removed from the alt text. If nothing is left, the stem of the
            URL's last path component is used instead.
            """
            suffix = os.path.splitext(img_url)[1]
            stem = re.sub(r'[\\/ :*?"<>|]', '', alt or '')
            if not stem:
                stem = os.path.splitext(os.path.basename(img_url))[0]
            return stem + suffix

    class Consumer(threading.Thread):
        """Worker thread that downloads queued images to the local disk.

        Each consumer repeatedly takes an ``(image_url, filename)`` tuple from
        ``image_Queue`` and saves the image under ``save_dir``.
        """

        # Destination directory. The backslashes are escaped: written as
        # 'E:\image\' the trailing backslash escapes the closing quote and the
        # literal never terminates.
        save_dir = 'E:\\image\\'
        # Seconds to wait for a new download job before checking for shutdown.
        poll_timeout = 3

        def __init__(self, page_Queue, image_Queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``.

            Args:
                page_Queue: queue of listing-page URLs; only checked for emptiness.
                image_Queue: queue supplying ``(image_url, filename)`` tuples.
            """
            super().__init__(*args, **kwargs)
            self.page_Queue = page_Queue
            self.image_Queue = image_Queue

        def run(self):
            """Download queued images until no more work is expected.

            The thread stops once ``image_Queue`` has stayed empty for
            ``poll_timeout`` seconds while ``page_Queue`` is empty as well.
            NOTE(review): a page that takes longer than ``poll_timeout`` to
            fetch and parse after the page queue has emptied can still be
            missed; raise ``poll_timeout`` if that is observed.
            """
            # Local import: the file-level import only brings in ``Queue``.
            from queue import Empty

            while True:
                # A timed get replaces the ``empty()`` check followed by a
                # blocking ``get()``, which could hang forever when another
                # consumer took the last item in between.
                try:
                    image_url, filename = self.image_Queue.get(
                        timeout=self.poll_timeout
                    )
                except Empty:
                    if self.page_Queue.empty():
                        break
                    continue

                try:
                    self._save(image_url, filename)
                except (OSError, ValueError) as exc:
                    # Report and move on so one bad image does not end the thread.
                    print('Failed to download %s: %s' % (image_url, exc))
                    continue
                print('正在存储文件%s'%filename)

        def _save(self, image_url, filename):
            """Fetch ``image_url`` and write it to ``save_dir`` as ``filename``."""
            # Create the target directory on first use; urlretrieve does not.
            os.makedirs(self.save_dir, exist_ok=True)
            request.urlretrieve(image_url, os.path.join(self.save_dir, filename))

    def main(first_page=1, last_page=3, producer_count=5, consumer_count=5):
        """Crawl a range of listing pages and download their images.

        Args:
            first_page: number of the first listing page to crawl (inclusive).
            last_page: number of the last listing page to crawl (inclusive).
            producer_count: number of page-parsing threads to start.
            consumer_count: number of image-downloading threads to start.

        With the defaults this crawls pages 1 to 3 using five producers and
        five consumers.
        """
        page_urls = [
            'https://www.doutula.com/photo/list/?page=%d' % page
            for page in range(first_page, last_page + 1)
        ]

        # The page queue is filled before any producer starts, so it must be
        # able to hold every URL; a smaller bound would make put() block forever.
        page_Queue = Queue(max(10, len(page_urls)))
        image_Queue = Queue(10)

        for url in page_urls:
            page_Queue.put(url)

        for _ in range(producer_count):
            Producer(page_Queue, image_Queue).start()

        for _ in range(consumer_count):
            Consumer(page_Queue, image_Queue).start()

    # Start the crawl only when the file is executed as a script.
    if __name__ == '__main__':
        main()

  • 相关阅读:
    python操作excel文件一(xlrd读取文件)
    pytest 1.简单介绍一,安装和如何运行
    request鉴权的处理和判断
    Struts2 easy UI插件
    Struts2 JQuery UI常用插件
    Struts2 JSON
    Struts2 使用jQuery实现Ajax
    Struts2 Ajax校验
    oracle连接方式、创建数据库用户、忘记数据库密码、用户锁定
    第二次考试:错题总结
  • 原文地址:https://www.cnblogs.com/Teachertao/p/14111708.html
Copyright © 2011-2022 走看看