zoukankan      html  css  js  c++  java
  • python爬虫(爬取图片)

    python爬虫爬图片

    第一步

    载入爬虫模块

    from requests_html import HTMLSession            #load the crawler module
    

    第二步

    创建session对象

    from requests_html import HTMLSession            #load the crawler module
    session =HTMLSession() #session object created
    

    第三步

    发现百度图片搜索的 URL 规律,据此发起请求并匹配到图片的 url

    http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=我们搜图片的关键字

    # Load the crawler module and create one session for all requests.
    from requests_html import HTMLSession
    session = HTMLSession()
    # Request the Baidu image-search page (keyword "二傻子" as the example).
    search_url = 'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子'
    response = session.get(search_url)
    # Parse template matching each thumbnail URL embedded in the page source.
    img_url_regex = '"thumbURL":"{}",'
    # Collect every match into a list of parse Result objects.
    img_url_list = response.html.search_all(img_url_regex)
    

    第四步

    访问图片url并且保存下来

    # Load the crawler module and create a session.
    from requests_html import HTMLSession
    session = HTMLSession()
    # Request the Baidu image-search page (keyword: 二傻子).
    response = session.get('http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子')
    # Parse template matching each thumbnail URL in the page source.
    img_url_regex = '"thumbURL":"{}",'
    # Extract every thumbnail URL as a parse Result object.
    img_url_list = response.html.search_all(img_url_regex)

    # Download each thumbnail and save it locally, numbering files from 1.
    for mun, match in enumerate(img_url_list, start=1):
        # match[0] is the URL string captured by the {} placeholder.
        img_response = session.get(match[0])
        # Write the raw bytes straight to disk.
        with open(f'第{mun}张.jpg', 'wb') as fw:
            fw.write(img_response.content)
    

    第五步

    类的封装

    from requests_html import HTMLSession


    class BaiDuImg:
        """Interactive Baidu image scraper.

        Prompts the user for a keyword, fetches the first search-results
        page, extracts every thumbnail URL, and saves each image locally
        as 第N张.jpg.
        """

        # Parse template that captures each thumbnail URL in the page source.
        img_url_regex = '"thumbURL":"{}",'

        def __init__(self):
            # Per-instance state. Previously these were mutable class
            # attributes, so every instance shared one session/url/list —
            # a second scraper would inherit the first one's collected URLs.
            self.session = HTMLSession()
            self.url = ''
            self.img_url_list = []

        def get_search(self):
            """Ask the user for a keyword and build the search-page URL."""
            search = input('请输入你要搜索的图片')
            self.url = f'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word={search}'

        def get_img_url_list(self):
            """Fetch the search page and extract all thumbnail URLs."""
            response = self.session.get(self.url)
            self.img_url_list = response.html.search_all(self.img_url_regex)

        def save_img(self):
            """Download every collected thumbnail, saving as 第N张.jpg."""
            for mun, match in enumerate(self.img_url_list, start=1):
                # match[0] is the URL string captured by the {} placeholder.
                response = self.session.get(match[0])
                with open(f'第{mun}张.jpg', 'wb') as fw:
                    fw.write(response.content)

        def run(self):
            """Full pipeline: prompt -> collect URLs -> download."""
            self.get_search()
            self.get_img_url_list()
            self.save_img()


    if __name__ == '__main__':
        baidu = BaiDuImg()
        baidu.run()
    

    后来有个研一的小姐姐说要把全部爬完那就改改

    from requests_html import HTMLSession


    class BaiDuImg:
        """Baidu image scraper that pages through the acjson endpoint.

        Unlike the single-page variant, this walks the JSON search API 30
        results at a time so it can collect (nearly) all results for a
        keyword — Baidu caps the feed around 450 images.
        """

        def __init__(self):
            # Per-instance state. As class attributes, img_url_list was a
            # shared mutable default: every instance (and every run) appended
            # into the SAME list, so a second scrape re-downloaded the
            # first scrape's URLs too.
            self.session = HTMLSession()
            self.url = ''
            self.img_url_list = []

        def get_search(self):
            """Ask for a keyword and build the paged acjson request URL.

            Only the key parameters (queryWord/word) are filled in; the
            rest are carried over verbatim from a captured browser request.
            """
            search = input('请输入你要搜索的图片')
            self.url = f'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={search}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word={search}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&rn=30&gsm='

        def get_img_url_list(self):
            """Page through results 30 at a time, collecting thumbURLs.

            Stops when Baidu reports bdIsClustered == '2' (no more pages)
            or when a request/parse error occurs; whatever was collected
            before the error is kept.
            """
            pn = 0  # result offset; Baidu serves 30 results per page
            try:
                while True:
                    res = self.session.get(f'{self.url}&pn={pn}')
                    payload = res.json()  # parse once per page, not per field
                    if payload['bdIsClustered'] == '2':
                        break
                    pn += 30
                    for dic in payload['data']:
                        # Entries without a thumbURL (ads/placeholders) are skipped.
                        img_url = dic.get('thumbURL')
                        if img_url:
                            self.img_url_list.append(img_url)
            except Exception as e:
                # Best-effort: keep what we have, but surface the failure
                # instead of silently swallowing it with a bare `pass`.
                print(f'抓取中断: {e}')

        def save_img(self):
            """Download each collected thumbnail, saving as 第N张.jpg."""
            for mun, url in enumerate(self.img_url_list, start=1):
                response = self.session.get(url)
                with open(f'第{mun}张.jpg', 'wb') as fw:
                    fw.write(response.content)
                    print(f'第{mun}张保存本地完毕')

        def run(self):
            """Full pipeline: prompt -> collect all URLs -> download."""
            self.get_search()
            self.get_img_url_list()
            print(len(self.img_url_list))
            self.save_img()


    if __name__ == '__main__':
        baidu = BaiDuImg()
        baidu.run()
    
  • 相关阅读:
    240. Search a 2D Matrix II
    239. Sliding Window Maximum
    238. Product of Array Except Self
    237. Delete Node in a Linked List
    SCR文件的关联被AutoCAD所取代的解决办法
    香草世界
    write something
    ArcGIS 网络分析[2.3] 最近设施点
    ArcGIS 网络分析[2.2] 服务区分析
    ArcGIS 网络分析[2.1] 最短路径
  • 原文地址:https://www.cnblogs.com/pythonywy/p/10856508.html
Copyright © 2011-2022 走看看