# 爬取百度和Flickr图像 (Crawl images from Baidu and Flickr)
import hashlib
import os
import re
import time
import urllib.request
from threading import Lock, Thread

import requests
class BaiDu:
    """Crawl thumbnail images from Baidu image search.

    Typical use: construct with a keyword and a page count, call
    ``queryset()`` to query every result page (one download thread is queued
    per page), then ``run()`` to start the threads and wait for them.  A
    summary line is printed when the instance is garbage-collected.
    """

    # Number of results requested per page (sent as the ``rn`` parameter).
    PAGE_SIZE = 60
    # Seconds before an HTTP request is abandoned, so a stalled connection
    # cannot block a worker forever.
    REQUEST_TIMEOUT = 30
    # Captures the value of every "thumbURL" field that ends in ".jpg".
    # ``[^"]`` keeps the match inside a single JSON string value.
    _THUMB_RE = re.compile(r'"thumbURL"\s*:\s*"([^"]*?\.jpg)')

    def __init__(self, name, page, save_dir='./images'):
        """Store the search settings and reset the counters.

        Args:
            name: Search keyword sent as the ``word`` query parameter.
            page: Number of result pages to request (converted with ``int``).
            save_dir: Directory the images are written to; created on demand.
        """
        self.start_time = time.time()
        self.name = name
        self.page = page
        self.save_dir = save_dir
        self.url = 'https://image.baidu.com/search/acjson'
        # Set this to your own browser's User-Agent string.
        self.header = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
        self.num = 0  # images actually saved
        self.all_num = 0  # images requested (PAGE_SIZE per queried page)
        self.thread_all = []  # one download thread per result page
        # ``self.num`` is updated from several download threads.
        self._num_lock = Lock()

    def _page_params(self, page_index):
        """Return the query parameters for the zero-based result page."""
        return {
            'word': self.name,
            # Offset of the first result on this page: 0, 60, 120, ...
            'pn': self.PAGE_SIZE * page_index,
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'rn': self.PAGE_SIZE,
        }

    def queryset(self):
        """Request every result page and queue a download thread for each."""
        for i in range(int(self.page)):
            params = self._page_params(i)
            self.all_num += self.PAGE_SIZE
            self.getrequest(i, self.url, params)

    @classmethod
    def _extract_links(cls, response):
        """Return the ``thumbURL`` image links found in a response body."""
        return cls._THUMB_RE.findall(response)

    def getrequest(self, index, url, data):
        """Fetch one result page and queue a thread that downloads its images.

        Args:
            index: Zero-based page index, forwarded to ``build_thread``.
            url: Search endpoint.
            data: Query parameters for the request.

        A page that cannot be fetched or does not answer with HTTP 200 is
        reported and skipped; no thread is queued for it.
        """
        print('[INFO]: 开始发送请求:' + url)
        try:
            ret = requests.get(url, headers=self.header, params=data,
                               timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print('[INFO]: request failed: {}, {}'.format(url, err))
            return
        if ret.status_code != 200:
            print('[INFO]: request {}, {}'.format(ret.status_code, ret.url))
            return
        print('[INFO]: request 200 ok :' + ret.url)
        # Undecodable bytes are replaced rather than aborting the whole page.
        response = ret.content.decode('utf-8', errors='replace')
        self.build_thread(index, self._extract_links(response))

    def _download(self, link):
        """Download *link* into ``save_dir``; return True when the file was written."""
        # The MD5 of the URL gives a stable, filesystem-safe file name.
        name = hashlib.md5(link.encode()).hexdigest()
        filename = os.path.join(self.save_dir, name + '.jpg')
        try:
            ret = requests.get(link, headers=self.header,
                               timeout=self.REQUEST_TIMEOUT)
            # Do not store an HTTP error page as if it were an image.
            ret.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(ret.content)
        except (requests.RequestException, OSError) as err:
            print('[INFO]: failed to save {}: {}'.format(link, err))
            return False
        return True

    def saveimage(self, links):
        """Download every link in *links*, counting only successful saves.

        Empty links are skipped.  A failed download is reported and does not
        stop the remaining links from being processed.
        """
        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)
        for link in links:
            if not link:
                continue
            if self._download(link):
                with self._num_lock:
                    self.num += 1

    def run(self):
        """Start all queued download threads and wait for them to finish."""
        for thred_p in self.thread_all:
            thred_p.start()
        for thred_p in self.thread_all:
            thred_p.join()

    def build_thread(self, i, links):
        """Queue a thread that downloads *links*; *i* is the page index (unused)."""
        self.thread_all.append(Thread(target=self.saveimage, args=(links,)))

    def __del__(self):
        """Print the requested/downloaded totals and the elapsed time."""
        end_time = time.time()
        print('request total images: {}, actual download images: {}, time cost {} second'.format(self.all_num, self.num, (end_time - self.start_time)))
def main():
    """Crawl ten Baidu result pages (60 images per page) for each keyword."""
    page_count = 10
    for keyword in ('人脸', 'head', 'arm'):
        crawler = BaiDu(keyword, page_count)
        crawler.queryset()
        crawler.run()
# Run the Baidu crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
#coding:utf-8
import flickrapi
import urllib
import os
from threading import Thread
from tqdm import tqdm
class CrawlFlickr:
    """Collect photo URLs from a Flickr search and download them with threads.

    The constructor runs the search, gathers the URLs (``get_url``) and
    prepares one download thread per URL chunk (``build_thread``).  Call
    ``run()`` to start the downloads and wait for them to finish.
    """

    def __init__(self, API_KEY="", API_SECRET="", SavePath="",
                 PerPage=10, Text="", Tags="", ThreadNum=4,
                 MaxCounter=10):
        """Search Flickr and prepare the download threads.

        Args:
            API_KEY: Flickr API key passed to ``flickrapi.FlickrAPI``.
            API_SECRET: Flickr API secret passed to ``flickrapi.FlickrAPI``.
            SavePath: Directory the images are written to.
            PerPage: ``per_page`` value passed to the search.
            Text: Free-text search query.
            Tags: Tags to search for (combined with ``tag_mode='all'``).
            ThreadNum: Number of download threads to build.
            MaxCounter: Maximum number of URLs to collect.

        Raises:
            ValueError: From ``build_thread`` when ``ThreadNum`` is not
                smaller than the number of collected URLs.
        """
        self.urls = []
        self.ThreadNum = ThreadNum
        self.SavePath = SavePath
        self.Thread_All = []
        self.MaxCounter = MaxCounter
        flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, cache=True)
        self.photos = flickr.walk(text=Text,
                                  tag_mode='all',
                                  tags=Tags,
                                  extras='url_c',
                                  per_page=PerPage,  # other page sizes may be worth trying
                                  sort='relevance')
        self.get_url()
        self.build_thread()

    def get_url(self):
        """Collect up to ``MaxCounter`` ``url_c`` values from ``self.photos``.

        Photos without a ``url_c`` attribute are skipped and do not count
        towards the limit.
        """
        for i, photo in enumerate(self.photos):
            url = photo.get('url_c')
            if url is None:
                continue
            self.urls.append(url)
            if len(self.urls) >= self.MaxCounter:
                break
            if i % 200 == 0:
                print('get {} url, max {}\n'.format(len(self.urls), self.MaxCounter))
        print('\nget {} url finish.....\n'.format(len(self.urls)))

    @staticmethod
    def _split_urls(urls, parts):
        """Split *urls* into *parts* contiguous chunks; the last takes the remainder."""
        size = len(urls) // parts
        chunks = [urls[k * size:(k + 1) * size] for k in range(parts - 1)]
        chunks.append(urls[(parts - 1) * size:])
        return chunks

    def build_thread(self):
        """Create one download thread per URL chunk in ``self.Thread_All``.

        ``self.urls`` is left untouched; each thread receives its own slice.

        Raises:
            ValueError: If ``ThreadNum`` is smaller than 1, or is not smaller
                than the number of collected URLs.
        """
        url_count = len(self.urls)
        if self.ThreadNum < 1:
            raise ValueError(f"Input Thread number must be at least 1: {self.ThreadNum}")
        if self.ThreadNum >= url_count:
            raise ValueError(f"Input Thread number is large: {self.ThreadNum},"
                             f"while data is small: {url_count}")
        for chunk in self._split_urls(self.urls, self.ThreadNum):
            self.Thread_All.append(Thread(target=self.get_img, args=(chunk,)))
        print('build thread finish...\n')

    def run(self):
        """Start every download thread and block until all have finished."""
        for thred in self.Thread_All:
            thred.start()
        for thred in self.Thread_All:
            thred.join()
        print('download image finish...\n')

    def get_img(self, urls):
        """Download each ``.jpg``/``.png`` URL in *urls* into ``SavePath``.

        URLs whose last path segment contains neither extension are ignored.
        A failed download is reported and the remaining URLs are still tried.
        """
        if self.SavePath:
            os.makedirs(self.SavePath, exist_ok=True)
        for url in urls:
            img_name = url.split('/')[-1]
            if '.jpg' not in img_name and '.png' not in img_name:
                continue
            target = os.path.join(self.SavePath, img_name)
            try:
                urllib.request.urlretrieve(url, target)
            except OSError as err:
                # urllib's URLError/HTTPError are OSError subclasses.
                print('download {} failed: {}\n'.format(url, err))
                continue
            print('download {}\n'.format(target))
# Run the Flickr crawler only when this file is executed as a script.
if __name__ == "__main__":
    # Keyword arguments for CrawlFlickr; the API credentials are blank here.
    param = {
        "API_KEY": "",
        "API_SECRET": "",
        "SavePath": "./images",
        "PerPage": 10,
        "Text": "human pose",
        "Tags": "",
        "ThreadNum": 8,
        "MaxCounter": 500,
    }
    crawl_flickr = CrawlFlickr(**param)
    crawl_flickr.run()