  • Crawling Baidu and Flickr images

    Two small scripts for crawling images: the first pulls thumbnails from Baidu image search, the second downloads photos from Flickr through its API.

    import requests
    from threading import Thread
    import re
    import os
    import time
    import hashlib
    
    class BaiDu:
        """
        爬取百度图片
        """
        def __init__(self, name, page):
            self.start_time = time.time()
            self.name = name
            self.page = page
            #self.url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&rn=60&'
            self.url = 'https://image.baidu.com/search/acjson'
            self.header = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}# 添加为自己的浏览器版本,具体操作网上一大推
            self.num = 0
            self.all_num = 0
            self.thread_all = [] # thread num
    
        def queryset(self):
            """
            Build the query parameters for each result page and fire the requests.
            """
            for i in range(int(self.page)):
                pn = 60 * i  # page offset; the original `pn += 60 * i` accumulated and skipped pages
                params = {'word': self.name, 'pn': pn, 'tn': 'resultjson_com', 'ipn': 'rj', 'rn': 60}
                self.all_num += 60
                self.getrequest(i, self.url, params)
    
        def getrequest(self, index, url, data):
            """
            Send the search request and extract the thumbnail URLs.
            """
            print('[INFO]: sending request: ' + url)
            ret = requests.get(url, headers=self.header, params=data)

            if ret.status_code == 200:
                print('[INFO]: request 200 ok: ' + ret.url)
            else:
                print('[INFO]: request {}, {}'.format(ret.status_code, ret.url))

            response = ret.content.decode()
            # the response is JSON text; pull the thumbnail links out with a regex
            links = re.findall(r'"thumbURL":"(.*?\.jpg)"', response)
            self.build_thread(index, links)
    
        def saveimage(self, links):
            """
            Download each image and save it under the MD5 of its URL
            (stable, unique names, so re-runs overwrite instead of duplicating).
            """
            for link in links:
                if not link:
                    continue
                name = hashlib.md5(link.encode()).hexdigest()
                try:
                    ret = requests.get(link, headers=self.header)
                    filename = './images/' + name + '.jpg'
                    with open(filename, 'wb') as f:
                        f.write(ret.content)
                    self.num += 1  # count only successful downloads
                except Exception:
                    pass  # skip images that fail to download
    
        def run(self):
            for t in self.thread_all:
                t.start()
            for t in self.thread_all:
                t.join()

        def build_thread(self, i, links):
            """Queue one download thread per result page."""
            self.thread_all.append(Thread(target=self.saveimage, args=(links,)))
            
    
        def __del__(self):
            # crude timing report, printed when the object is garbage-collected
            end_time = time.time()
            print('requested {} images, actually downloaded {}, took {:.1f} seconds'.format(
                self.all_num, self.num, end_time - self.start_time))
    
    def main():
        keywords = ['人脸', 'head', 'arm']  # search terms; '人脸' is Chinese for "face"
        for name in keywords:
            page = 10  # number of result pages to crawl (60 images per page)
            baidu = BaiDu(name, page)
            baidu.queryset()
            baidu.run()
    
    
    if __name__ == '__main__':
        main()
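
    A side note on parsing: the regex extraction in getrequest is brittle. Since the acjson endpoint answers with JSON, a sturdier variant parses the body directly. A minimal sketch, assuming the response carries a 'data' list whose entries may hold a 'thumbURL' key (get_thumb_urls is an illustrative helper, and Baidu can change the format at any time):

    import requests

    def get_thumb_urls(word, pn=0, rn=60):
        """Fetch one page of Baidu image-search results and return thumbnail URLs."""
        params = {'word': word, 'pn': pn, 'rn': rn, 'tn': 'resultjson_com', 'ipn': 'rj'}
        header = {'User-Agent': 'Mozilla/5.0'}  # use a real browser UA in practice
        ret = requests.get('https://image.baidu.com/search/acjson',
                           headers=header, params=params)
        try:
            items = ret.json().get('data', [])
        except ValueError:  # Baidu sometimes emits invalid JSON escapes
            return []
        # the 'data' list is padded with empty dicts, hence the guard
        return [it['thumbURL'] for it in items if it.get('thumbURL')]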
    
    The second script pulls photos from Flickr through the official flickrapi package; you need to supply your own API key and secret.

    #coding:utf-8
    
    import flickrapi
    import urllib.request
    import os
    from threading import Thread
    
    class CrawlFlickr:
        def __init__(self, API_KEY="", API_SECRET="", SavePath="",
                     PerPage=10, Text="", Tags="", ThreadNum=4,
                     MaxCounter=10):
            self.urls = []
            self.ThreadNum = ThreadNum
            self.SavePath = SavePath
            self.Thread_All = []
            self.MaxCounter = MaxCounter
            os.makedirs(self.SavePath, exist_ok=True)  # urlretrieve needs the target dir to exist

            flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, cache=True)
            self.photos = flickr.walk(text=Text,
                                      tag_mode='all',
                                      tags=Tags,
                                      extras='url_c',    # request the medium-size image url with each result
                                      per_page=PerPage,  # results per API call; try different values
                                      sort='relevance')
            self.get_url()
            self.build_thread()
    
        def get_url(self):
            for i, photo in enumerate(self.photos):
                url = photo.get('url_c')
                if url is None:
                    continue
                self.urls.append(url)
                if i >= self.MaxCounter:
                    break
                if i % 200 == 0:
                    print('got {} urls, max {}\n'.format(len(self.urls), self.MaxCounter))
            print('\ncollected {} urls in total.....\n'.format(len(self.urls)))
        
        def build_thread(self):
            if self.ThreadNum >= len(self.urls):
                raise ValueError(f"Thread number is too large: {self.ThreadNum}, "
                                 f"while the url list is small: {len(self.urls)}")

            # split the urls into ThreadNum chunks, one thread per chunk;
            # walking i in reverse lets the first-built thread absorb the remainder
            part = len(self.urls) // self.ThreadNum
            for i in range(self.ThreadNum)[::-1]:
                self.Thread_All.append(Thread(target=self.get_img, args=(self.urls[i * part:],)))
                self.urls = self.urls[:i * part]
            print('build thread finish...\n')
    
        def run(self):
            for t in self.Thread_All:
                t.start()
            for t in self.Thread_All:
                t.join()
            print('download image finish...\n')
    
        def get_img(self, urls):
            for url in urls:
                img_name = url.split('/')[-1]
                if img_name.endswith(('.jpg', '.png')):
                    urllib.request.urlretrieve(url, os.path.join(self.SavePath, img_name))
                    print('download {}\n'.format(os.path.join(self.SavePath, img_name)))
    
    if __name__ == "__main__":
        param = dict(
            API_KEY="", 
            API_SECRET="", 
            SavePath="./images", 
            PerPage=10,
            Text="human pose", 
            Tags="", 
            ThreadNum=8,
            MaxCounter=500
        )
        crawl_flickr = CrawlFlickr(**param)
        crawl_flickr.run()
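
    The manual list slicing in build_thread works, but the standard library's ThreadPoolExecutor divides the work (remainder included) for you. A minimal sketch of an equivalent downloader (download_all is an illustrative name, not part of the class above):

    import os
    import urllib.request
    from concurrent.futures import ThreadPoolExecutor

    def download_all(urls, save_path, workers=8):
        """Download every URL using a pool of worker threads."""
        os.makedirs(save_path, exist_ok=True)

        def fetch(url):
            img_name = url.split('/')[-1]
            if img_name.endswith(('.jpg', '.png')):
                urllib.request.urlretrieve(url, os.path.join(save_path, img_name))

        with ThreadPoolExecutor(max_workers=workers) as pool:
            # map is lazy; list() drains it, and the with-block waits for completion
            list(pool.map(fetch, urls))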
    

  • Original post: https://www.cnblogs.com/wjy-lulu/p/13539962.html