# 没个图都不好意思玩微信 @.@
# 需求:想要多点搞笑图为了微信斗斗图
# 抓取时注意:1.实现生产者消费者模式 2.抓取url不重复 3.解析url也不能重复
# 多线程抓取多个url
# coding:utf-8  # NOTE: PEP 263 encoding declarations only take effect on line 1 or 2; harmless here since Python 3 defaults to UTF-8.
from threading import Thread
from bs4 import BeautifulSoup
import requests
import time
import os
import threading
from queue import Queue
# Shared state for the producer/consumer threads -- plain lists instead of a Queue.
# Base listing URL; pages 1 and 2 are scraped.
base_url = 'http://www.doutula.com/article/list/?page='
# URLs still waiting to be fetched by producer threads.
url_list = [base_url + str(page) for page in range(1, 3)]
# print(url_list)
# Browser-like User-Agent so the site serves the normal page.
header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
# Raw HTML pages fetched by producers, awaiting parsing by consumers.
page_source_list = []
# Guards access to the shared lists above.
lock = threading.Lock()
# Producer -- fetch pages
class Producer(Thread):
    """Producer thread: pop a URL from the shared ``url_list``, download it,
    and append the raw HTML bytes to ``page_source_list`` for consumers.
    """

    def run(self):
        # The original printed the current thread in the class body, which
        # executes once at import time -- log per thread inside run() instead.
        print ('当前生产线程-----%s' %threading.currentThread())
        # Drain the shared URL list rather than fetching a single page, so
        # all URLs are processed even with fewer threads than URLs.
        while True:
            # Atomic test-and-pop: the original checked len() and popped
            # without the lock (the acquire/release were commented out),
            # which races between producers -- duplicate fetches or
            # IndexError on an emptied list.
            with lock:
                if not url_list:
                    return
                target_url = url_list.pop()
            print(target_url)
            # Network I/O happens outside the lock so downloads can overlap.
            page = requests.get(target_url,headers=header).content
            # Hand the fetched page off to the consumer threads.
            page_source_list.append(page)
            print ('---111----%s' %len(page_source_list))
# Consumer -- parse pages and save images
class Consumer(Thread):
    """Consumer thread: pop raw HTML from ``page_source_list``, extract the
    lazy-loaded image URLs, and save each image under ``./images/``.
    """

    def run(self):
        # The original printed the current thread in the class body, which
        # executes once at import time -- log per thread inside run() instead.
        print ('当前消费线程-----%s' %threading.currentThread())
        # Give the producers a head start so there is something to consume.
        time.sleep(2)
        print ('------222-----%s' %(page_source_list))
        # Drain the shared list rather than parsing a single page.
        while True:
            # Atomic test-and-pop. The original held the lock across every
            # image download (serializing consumers) and never released it
            # in a finally, so an exception while parsing would deadlock
            # all other consumers. `with lock` releases it on any exit.
            with lock:
                if not page_source_list:
                    return
                page_source = page_source_list.pop()
            html = BeautifulSoup(page_source,'lxml')
            img_list = html.find_all('img',attrs={'class':'lazy image_dtb img-responsive'})
            img_dir = os.path.join(os.getcwd(), 'images')
            # exist_ok avoids the check-then-mkdir race between consumers.
            os.makedirs(img_dir, exist_ok=True)
            for img in img_list:
                src = img['data-original']
                data = requests.get(src, headers=header).content
                # Derive the filename (with its real extension) from the URL
                # instead of the fixed slice src[-15:-8] + '.jpg', which
                # broke on short URLs and saved PNG images as .jpg.
                img_name = os.path.basename(src)
                print(img_name)
                with open(os.path.join(img_dir, img_name),'wb') as f:
                    f.write(data)
# Entry point: start three producer threads, then two consumers staggered
# so the producers have pages ready before parsing begins.
if __name__ == '__main__':
    for _ in range(3):
        Producer().start()
    for _ in range(2):
        time.sleep(2)
        Consumer().start()
# 代码运行环境python3,图片抓取有些粗糙,png格式的没有做区分,都保存为jpg格式的,大体思路大家可以参考下哦~
# (NOTE: this trailing note was bare prose, which is a SyntaxError in a .py file -- kept as a comment.)