zoukankan html css js c++ java

python多线程爬取斗图啦数据

python多线程爬取斗图啦网的表情数据

使用到的技术点

requests请求库
re 正则表达式
pyquery解析库,python实现的jquery
threading 线程
queue 队列

'''
斗图啦多线程方式

'''

import   requests,time,re,os
from  pyquery  import  PyQuery as jq
from requests.exceptions import   RequestException
from urllib import  request
# 导入线程类
import threading
# 导入队列类
from queue import Queue
# HTTP headers sent with every listing-page request.
# The header name must be spelled "User-Agent" (with a hyphen); the previous
# "User_Agent" key was transmitted as an unrelated header, so the server still
# saw the default python-requests user agent.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
}

# Create the download folder next to this script.
pt = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(pt, "斗图啦")
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(path, exist_ok=True)

'''
生产者类
继承自线程类 threading.Thread
重写init方法和run方法
'''
class Producer(threading.Thread):
    """Producer thread: turns listing-page URLs into image download jobs.

    Takes page URLs from ``url_queue``, parses each page, and puts one
    ``(filepath, src)`` tuple per image found onto ``img_queue``.

    Args:
        img_queue: queue that receives ``(filepath, src)`` download jobs.
        url_queue: queue of listing-page URLs still to be parsed.
        *args, **kwargs: forwarded unchanged to ``threading.Thread``.
    """

    def __init__(self, img_queue, url_queue, *args, **kwargs):
        # kwargs must be unpacked with ** -- a single * would pass the keyword
        # *names* as positional arguments to Thread.__init__.
        super(Producer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.url_queue = url_queue

    def run(self):
        """Parse pages until ``url_queue`` is drained, then return."""
        from queue import Empty

        while True:
            # A non-blocking get replaces the empty()/get() pair: with several
            # producers another thread could take the last URL between the
            # two calls and leave this thread blocked forever.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except RequestException as err:
                # One failing page must not kill the whole worker thread.
                print('%s解析失败: %s' % (url, err))

    def parse_page(self, url):
        """Fetch one listing page and enqueue every image found on it.

        Args:
            url: address of the listing page.

        Raises:
            requests.exceptions.RequestException: if the page cannot be
                fetched or the server answers with an HTTP error status.
        """
        res = requests.get(url, headers=head, timeout=10)
        res.raise_for_status()
        doc = jq(res.text)
        # Every <a> inside .page-content is a candidate image entry.
        for a in doc.find(".page-content a").items():
            src = a.find("img.img-responsive").attr("data-original")
            if not src:
                # Link without a lazily loaded image (e.g. navigation): skip,
                # otherwise os.path.splitext(None) would raise.
                continue
            title = a.find("p").text()
            # Split the path to obtain the file extension.
            pathtype = os.path.splitext(src)[1]
            # Strip punctuation and characters that are illegal in file names.
            patitle = re.sub(r'[.。，?？*!！/~\\:"<>|]', "", title)
            filename = patitle + pathtype
            filepath = os.path.join(path, filename)
            # Hand the job over to the consumer threads.
            self.img_queue.put((filepath, src))





'''
消费者
与生产者同理：继承 threading.Thread，重写 init 方法和 run 方法
'''
class Customer(threading.Thread):
    """Consumer thread: downloads the images queued by the producers.

    Args:
        img_queue: queue of ``(filepath, src)`` download jobs.
        url_queue: queue of listing-page URLs; used only to decide whether
            more jobs can still be expected.
        *args, **kwargs: forwarded unchanged to ``threading.Thread``.
        idle_timeout: seconds to wait for a new job before checking whether
            the crawl is finished (keyword-only, default 5.0).
    """

    def __init__(self, img_queue, url_queue, *args, idle_timeout=5.0, **kwargs):
        super(Customer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.url_queue = url_queue
        self.idle_timeout = idle_timeout

    def run(self):
        """Download jobs until no work is left.

        The thread stops once no job has arrived for ``idle_timeout`` seconds
        and ``url_queue`` is empty. Waiting with a timeout (instead of
        checking ``empty()`` and then blocking on ``get()``) prevents both the
        premature exit while producers are still parsing and the permanent
        hang when another consumer takes the last job.

        NOTE: a producer that needs longer than ``idle_timeout`` to parse the
        very last page can still be missed; raise ``idle_timeout`` if needed.
        """
        from queue import Empty

        while True:
            try:
                # Take the target path and the image link from the queue.
                filepath, src = self.img_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.url_queue.empty():
                    break
                continue
            print('%s开始下载,链接%s' % (filepath, src))
            try:
                img = requests.get(src, timeout=10)
                # Do not store an HTTP error page as if it were an image.
                img.raise_for_status()
                # .content is the raw bytes of the response body.
                with open(filepath, "wb") as f:
                    f.write(img.content)
            except (RequestException, OSError) as err:
                # A single failed download must not kill the worker thread.
                print('%s下载失败: %s' % (filepath, err))
                continue
            print('%s下载完成' % filepath)




def main(page_count=100, producer_count=5, consumer_count=3):
    """Run the crawl and return once every worker thread has finished.

    Args:
        page_count: number of listing pages to crawl, starting at page 1.
        producer_count: number of page-parsing (producer) threads.
        consumer_count: number of image-downloading (consumer) threads.
    """
    # Build the URL queue and the image-job queue.
    url_queue = Queue(100000)
    img_queue = Queue(100000)

    # Fill the producer queue with the listing-page URLs.
    for page in range(1, page_count + 1):
        url_queue.put("https://www.doutula.com/photo/list/?page=" + str(page))

    # Producers are created (and started) before the consumers.
    workers = [Producer(img_queue, url_queue) for _ in range(producer_count)]
    workers += [Customer(img_queue, url_queue) for _ in range(consumer_count)]
    for worker in workers:
        worker.start()

    # Wait for all threads; without join() this function would return
    # immediately, before any page has been processed.
    for worker in workers:
        worker.join()


if __name__ == "__main__":
    # Runs only when the file is executed as a script, not when imported.
    print("爬虫调度启动---------")
    main()
    print("爬虫调度完成---------")

查看全文

相关阅读:
maven工程下的“run as application”
Spark机器配置计算
 数学思路
 关联和依赖
 spark数据倾斜
 windows的DOS窗口如何修改大小
 MySQL的索引创建、删除
 使用composer命令创建laravel项目命令详解
 Windows平台查看端口占用情况
 使用composer安装laravel

原文地址：https://www.cnblogs.com/HiLzd/p/11246116.html