zoukankan      html  css  js  c++  java
  • Python_爬虫_百度图片

    百度图片的部分返回结果存在编码问题,暂时无法爬取;遇到这种情况可以多换几个试试。

    #思路:抓取图片地址,根据地址转存图片(注意名称);难点:转码
    
    # -*- coding:utf-8 -*-
    import json
    import os
    import re
    from urllib import error, request
    
    
    # for page in range(4):
    #     url = "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%93%88%E5%A3%AB%E5%A5%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=%E5%93%88%E5%A3%AB%E5%A5%87&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn="+str(page*30)+"&rn=30&gsm=1e&1520997016315="
    #     try:
    #         response = request.urlopen(url).read().decode("utf-8")
    #         print(type(response))
    #
    #     except error.URLError as e:
    #         print(e.reason)
    #
    class BaiduImg(object):
        """Fetch Baidu image-search results and save the images locally.

        The search URL (including its URL-encoded query word) is hard-coded in
        ``request``. Images are written to the ``images/`` directory relative
        to the current working directory and are named after each result's
        ``fromPageTitleEnc`` field, passed through the module-level ``strip``
        filter.
        """

        def __init__(self):
            super(BaiduImg, self).__init__()
            print('开始采集图片')
            # Result offset sent as the ``pn`` query parameter; advanced by 30
            # (the ``rn`` page size used in the URL) after every request.
            self.page = 30

        def request(self):
            """Fetch result pages while ``self.page <= 30`` and save their images.

            With the initial offset of 30 this performs exactly one request.
            The ``data`` list of the decoded JSON body is handed to
            ``download2``.
            """
            headers = {
                'user-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
            }
            while self.page <= 30:
                # The offset comes from self.page so that the paging counter
                # actually selects the page being fetched.
                request_url = (
                    'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%93%88%E5%A3%AB%E5%A5%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=%E5%93%88%E5%A3%AB%E5%A5%87&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn='
                    + str(self.page)
                    + '&rn=30&gsm=1e&1520997014923='
                )
                req = request.Request(request_url, headers=headers)
                with request.urlopen(req) as f:
                    if f.status == 200:
                        content = f.read().decode('utf-8')
                        content_dict = json.loads(content)
                        # A body without a "data" list means nothing to save.
                        self.download2(content_dict.get('data') or [])
                self.page += 30

        @staticmethod
        def _pick_url(image, keys):
            """Return the first non-empty value of ``keys`` in ``image``, or ''."""
            for key in keys:
                url = image.get(key)
                if url:
                    return url
            return ''

        @staticmethod
        def _file_path(image):
            """Return the destination path for ``image``, creating ``images/``."""
            # open()/urlretrieve() do not create missing parent directories.
            os.makedirs('images', exist_ok=True)
            image_name = strip(image['fromPageTitleEnc'])
            return 'images/' + image_name + '.jpg'

        def dowload(self, data):
            """Save every image in ``data`` by reading its bytes and writing a file.

            (The misspelled method name is kept for existing callers.)

            Args:
                data: iterable of result dicts. The image URL is taken from the
                    first non-empty of ``middleURL``, ``thumbURL`` and
                    ``hoverURL``; entries with none of them are skipped.

            A failed download is reported on stdout and skipped so that one
            bad URL does not abort the remaining images.
            """
            for image in data:
                url = self._pick_url(image, ('middleURL', 'thumbURL', 'hoverURL'))
                if not url:
                    continue
                try:
                    # Close the response as soon as the bytes have been read.
                    with request.urlopen(url) as resp:
                        payload = resp.read()
                except (OSError, ValueError) as exc:
                    # URLError/HTTPError are OSError subclasses; ValueError is
                    # raised by urlopen for a malformed URL.
                    print('skip %s: %s' % (url, exc))
                    continue
                with open(self._file_path(image), 'wb') as f:
                    f.write(payload)

        def download2(self, data):
            """Save every image in ``data`` using ``urllib.request.urlretrieve``.

            Args:
                data: iterable of result dicts. The image URL is taken from
                    ``middleURL``, falling back to ``thumbURL``; entries with
                    neither are skipped.

            A failed download is reported on stdout and skipped.
            """
            for image in data:
                url = self._pick_url(image, ('middleURL', 'thumbURL'))
                if not url:
                    continue
                file_path = self._file_path(image)
                try:
                    request.urlretrieve(url, file_path)
                except (OSError, ValueError) as exc:
                    print('skip %s: %s' % (url, exc))
    #过滤函数
    def strip(path):
        """Return ``str(path)`` with characters unsafe in file names removed.

        Removes ``\\ / : * ? " < > |`` and ``!``. The previous pattern escaped
        the ``*`` instead of matching a backslash, so backslashes were left in
        the result.

        Args:
            path: value to sanitise; converted with ``str()`` first.

        Returns:
            The sanitised string (possibly empty).
        """
        return re.sub(r'[\\/:*?"<>|!]', '', str(path))
    if __name__ == '__main__':
        # Script entry point: construct the crawler and run its fetch loop.
        BaiduImg().request()
  • 相关阅读:
    python依赖包整体迁移方法
    ubuntu关于ssh协议登录问题
    k8s部署02-----kubeadm部署k8s
    k8s部署01-----what is k8s?
    兼容到ie10的js文件导出、下载到本地
    webstorm减少内存占用
    foxmail占cpu 100%解决办法
    原生js返回顶部
    js字符串驼峰和下划线互相转换
    element-ui的rules中正则表达式
  • 原文地址:https://www.cnblogs.com/hellangels333/p/8591684.html
Copyright © 2011-2022 走看看