zoukankan      html  css  js  c++  java
  • python爬虫(爬取图片)

    python爬虫爬图片

    第一步

    载入爬虫模块

    from requests_html import HTMLSession            #载入爬虫模块
    

    第二步

    创建session对象

    from requests_html import HTMLSession            # load the crawler library
    # Create the session object that will issue every HTTP request below.
    session =HTMLSession() # session created
    

    第三步

    找出百度图片搜索的url规律,发起请求,并从返回的页面中匹配出图片的url

    http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=我们搜图片的关键字

    from requests_html import HTMLSession            # load the crawler library
    session =HTMLSession() # session created
    # Request the search result page; the keyword after `word=` is just an example.
    response = session.get('http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子')
    # Pattern used to locate each thumbnail URL in the page source; `{}` marks the captured part.
    # NOTE(review): despite the variable name this looks like a parse-style template
    # rather than a regular expression — confirm against the requests_html docs.
    img_url_regex = '"thumbURL":"{}",'
    # Collect every match of the pattern in the response HTML.
    img_url_list = response.html.search_all(img_url_regex)
    

    第四步

    访问图片url并且保存下来

    from requests_html import HTMLSession            # load the crawler library

    # One session is reused for the search page and for every image download.
    session = HTMLSession()
    # Request the search result page; the keyword after `word=` is just an example.
    response = session.get('http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子')
    # Pattern passed to `search_all`; `{}` marks the thumbnail URL to capture.
    img_url_regex = '"thumbURL":"{}",'
    # Every match found in the page; element [0] of a match is the captured URL.
    img_url_list = response.html.search_all(img_url_regex)

    # Number the files from 1 in the order the URLs were found.
    for mun, url in enumerate(img_url_list, start=1):
        # Use a separate name so the search-page `response` is not overwritten.
        img_response = session.get(url[0])
        # An HTTP error body is not an image; skip it instead of saving it as .jpg.
        if not img_response.ok:
            print(f'第{mun}张下载失败: {url[0]}')
            continue
        # Write the raw bytes to a local file.
        with open(f'第{mun}张.jpg', 'wb') as fw:
            fw.write(img_response.content)
    

    第五步

    类的封装

    from requests_html import HTMLSession    
    
    class BaiDuImg:
        """Interactive downloader for Baidu image-search thumbnails.

        ``run`` asks for a keyword, fetches the search result page, extracts
        every ``thumbURL`` value from it and saves each image into the current
        working directory as ``第<n>张.jpg``.
        """

        # One HTTP session shared by the search request and all downloads.
        session = HTMLSession()
        # Pattern handed to ``search_all``; ``{}`` captures the thumbnail URL.
        img_url_regex = '"thumbURL":"{}",'
        # Search page URL; filled in by ``get_search``.
        url = ''
        # Matches found by ``get_img_url_list``; replaced on every call.
        img_url_list = []

        def get_search(self):
            """Prompt for a keyword and store the search URL in ``self.url``."""
            from urllib.parse import quote

            # Percent-encode the keyword so characters such as '&', '#' or '+'
            # cannot break out of the ``word`` query parameter.
            search = quote(input('请输入你要搜索的图片'), safe='')
            self.url = f'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word={search}'

        def get_img_url_list(self):
            """Fetch ``self.url`` and store all thumbnail matches in ``self.img_url_list``."""
            response = self.session.get(self.url)
            self.img_url_list = response.html.search_all(self.img_url_regex)

        def save_img(self):
            """Download every collected thumbnail and save it as ``第<n>张.jpg``.

            Files are numbered from 1 in match order. A request that returns an
            HTTP error status is reported and skipped, so its number is unused.
            """
            for mun, url in enumerate(self.img_url_list, start=1):
                # Element [0] of a match is the captured thumbnail URL.
                response = self.session.get(url[0])
                # An error page is not image data; do not save it as .jpg.
                if not response.ok:
                    print(f'第{mun}张下载失败: {url[0]}')
                    continue
                # Write the raw bytes to a local file.
                with open(f'第{mun}张.jpg', 'wb') as fw:
                    fw.write(response.content)

        def run(self):
            """Run the whole pipeline: prompt, collect URLs, download."""
            self.get_search()
            self.get_img_url_list()
            self.save_img()
            
    if __name__ == '__main__':
        # Script entry point: build the downloader and run the full pipeline.
        BaiDuImg().run()
    

    后来有个研一的小姐姐说想把搜索到的图片全部爬下来,那就改一改:

    from requests_html import HTMLSession
    
    class BaiDuImg:
        """Interactive downloader that pages through Baidu's image-search JSON endpoint.

        ``run`` asks for a keyword, requests result pages 30 entries at a time,
        collects each entry's ``thumbURL`` and saves the images into the
        current working directory as ``第<n>张.jpg``.
        """

        # One HTTP session shared by the paging requests and all downloads.
        session = HTMLSession()
        # Unused by this version, which reads JSON; kept so the attribute still exists.
        img_url_regex = '"thumbURL":"{}",'
        # Base URL of the JSON endpoint without the ``pn`` offset; set by ``get_search``.
        url = ''
        # Class-level default only; ``get_img_url_list`` gives each instance its own list.
        img_url_list = []

        def get_search(self):
            """Prompt for a keyword and store the paging base URL in ``self.url``."""
            from urllib.parse import quote

            # Percent-encode the keyword so characters such as '&', '#' or '+'
            # cannot corrupt the query string.
            search = quote(input('请输入你要搜索的图片'), safe='')
            # Only the keyword parameters are filled in; the remaining
            # parameters are reproduced from the original request without analysis.
            self.url = f'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={search}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word={search}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&rn=30&gsm='

        def get_img_url_list(self):
            """Page through the endpoint and collect thumbnail URLs.

            Paging stops when a page reports ``bdIsClustered == '2'`` or when
            any request or parsing step fails; URLs gathered up to that point
            are kept in ``self.img_url_list``.
            """
            # Start from a fresh per-instance list so results are never appended
            # to the list shared on the class.
            self.img_url_list = []
            pn = 0
            try:
                # The original author notes Baidu only serves roughly 450-480
                # results this way, so the loop ends after a limited number of pages.
                while True:
                    # Parse the body once and reuse it for every lookup.
                    payload = self.session.get(f'{self.url}&pn={pn}').json()
                    print(payload['bdIsClustered'])
                    if payload['bdIsClustered'] == '2':
                        break
                    pn += 30
                    for dic in payload['data']:
                        img_url = dic.get('thumbURL')
                        if img_url:
                            self.img_url_list.append(img_url)
            except Exception as exc:
                # Best-effort crawl: keep what was collected, but say why it stopped
                # instead of hiding the failure.
                print(f'抓取在 pn={pn} 处停止: {exc!r}')

        def save_img(self):
            """Download every collected thumbnail and save it as ``第<n>张.jpg``.

            Files are numbered from 1 in collection order. A request that
            returns an HTTP error status is reported and skipped.
            """
            for mun, url in enumerate(self.img_url_list, start=1):
                response = self.session.get(url)
                # An error page is not image data; do not save it as .jpg.
                if not response.ok:
                    print(f'第{mun}张下载失败: {url}')
                    continue
                # Write the raw bytes to a local file.
                with open(f'第{mun}张.jpg', 'wb') as fw:
                    fw.write(response.content)
                    print(f'第{mun}张保存本地完毕')

        def run(self):
            """Run the whole pipeline: prompt, collect URLs, report the count, download."""
            self.get_search()
            self.get_img_url_list()
            print(len(self.img_url_list))
            self.save_img()
    
    
    if __name__ == '__main__':
        # Script entry point: build the downloader and run the full pipeline.
        BaiDuImg().run()
    
  • 相关阅读:
    Python (一)Tkinter窗口组件:Label
    Python (八)Tkinter窗口组件:Scrollbar
    Python (四)Tkinter窗口组件:Radiobutton
    Python (五)Tkinter窗口组件:LabelFrame
    Python (三)Tkinter窗口组件:Checkbutton
    Scrapy安装及相关知识点概括
    Python (九)Tkinter窗口组件:Scale
    Python (六)Tkinter窗口组件:Entry
    电脑通过蓝牙适配器连接手机与蓝牙耳机之经验
    Noi2018 归途
  • 原文地址:https://www.cnblogs.com/pythonywy/p/10856508.html
Copyright © 2011-2022 走看看