  • python scrapy baidu image [repost]

    Original source: https://github.com/vivianLL/baidupictures

    #!/usr/bin/env python
    # coding=utf-8
    #__author__ = 'leilu'
    
    
    import json
    import itertools
    import urllib
    import requests
    import os
    import re
    import codecs
    import sys
    import imghdr
    
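    # Python 2 only: reload sys to restore setdefaultencoding, then force UTF-8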
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
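    # Baidu obfuscates the objURL field in its image-search JSON; the two tables
    # below reverse that substitution (multi-character tokens, then single characters)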
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }
    
    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }
    
    # str's translate() method needs the decimal Unicode code point of a single character as the key;
    # an integer value is likewise treated as a decimal code point and converted back to a character
    # (a string can also be used directly as the value)
    char_table = {ord(key): ord(value) for key, value in char_table.items()}
    
    # Decode an obfuscated image URL
    def decode(url):
        # Replace the multi-character tokens first
        for key, value in str_table.items():
            url = url.replace(key, value)
        # Then map the remaining single characters
        return url.translate(char_table)
    
    # Generate the paged search URLs (60 results per page; pn is the result offset)
    def buildUrls(word):
        word = urllib.quote(word)
        url = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2&nc=1&pn={pn}&rn=60"
        urls = (url.format(word=word, pn=x) for x in itertools.count(start=0, step=60))
        return urls
    
    # Parse the JSON response and extract the (still obfuscated) image URLs
    re_url = re.compile(r'"objURL":"(.*?)"')
    def resolveImgUrl(html):
        imgUrls = [decode(x) for x in re_url.findall(html)]
        return imgUrls
    
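    # Download one image to a temporary file, skip 4xx responses and tiny files,
    # then save it again under a filename with the detected image extension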
    def downImg(imgUrl, dirpath, imgName):
        tempfile = os.path.join(dirpath, imgName)
        try:
            res = requests.get(imgUrl, timeout=15)
            if str(res.status_code)[0] == "4":
                print(str(res.status_code), ":" , imgUrl)
                return False
        except Exception as e:
            print("抛出异常:", imgUrl)
            print(e)
            return False
        with open(tempfile, "wb") as f:
            f.write(res.content)
            if os.path.getsize(tempfile) < 1024*10:
                print("small file")
                return False
            kind = imghdr.what(tempfile)
            if kind == 'png':
                filename = os.path.join(dirpath, imgName + '.png')
            elif kind == 'jpeg':  # imghdr reports .jpg/.jpeg files as 'jpeg'
                filename = os.path.join(dirpath, imgName + '.jpg')
            elif kind == 'bmp':
                filename = os.path.join(dirpath, imgName + '.bmp')
            else:
                print(kind)
                return False
            os.remove(tempfile)
        with open(filename, "wb") as f:
            f.write(res.content)
        return True
    
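    # Create the per-keyword output directory if it does not exist and return its path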
    def mkdir(path):
        path = path.strip()
        dirpath = os.path.join("/home/aimhabo/getBaiduiImage/images/",path)
        if not os.path.exists(dirpath):
            print ("新建文件夹")        
            os.makedirs(dirpath)
            print (dirpath)
        return dirpath
    
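    # Search one keyword, page through the results 60 at a time, and save images
    # numbered from index up to index_max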
    if __name__ == '__main__':
        #f = open('/home/aimhabo/getBaiduiImage/images.txt', 'r')
        #for line in f:
        #    word = line.strip().decode('utf-8')
            word = 'cats猫'
            print("正在搜索:", word)
            dirpath = mkdir(word)
    
            word = str(word)
            urls = buildUrls(word)
            index = 1000
            index_max = 1050
            for url in urls:
                print("正在请求:", url)
                html = requests.get(url, timeout=10).content.decode('utf-8')
                imgUrls = resolveImgUrl(html)
                if len(imgUrls) == 0:  # stop when no more images are returned
                    break
                for url in imgUrls:
                    if downImg(url, dirpath, str(index)):
                        index += 1
                        print("index now on %s" % index)
                    if index > index_max:
                        break
                if index > index_max:
                    break
    
        #f.close()
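
    Note: the script above targets Python 2 (reload(sys), sys.setdefaultencoding, urllib.quote). As a rough reference, a minimal Python 3 sketch of the same decode()/buildUrls() helpers could look like the following; it assumes Baidu's objURL obfuscation scheme and acjson query parameters are unchanged, and either may break at any time:

    #!/usr/bin/env python3
    # Minimal Python 3 sketch of decode()/buildUrls() from the script above.
    # Assumption: Baidu's objURL obfuscation and acjson parameters still work this way.
    import itertools
    from urllib.parse import quote

    STR_TABLE = {'_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
    CHAR_TABLE = str.maketrans('wkv1ju2it3hs4g5rq6fp7eo8dn9cm0bla',
                               'abcdefghijklmnopqrstuvw1234567890')

    def decode(url):
        # Replace the multi-character tokens first, then map single characters.
        for key, value in STR_TABLE.items():
            url = url.replace(key, value)
        return url.translate(CHAR_TABLE)

    def buildUrls(word):
        # Each page holds up to 60 results; pn is the result offset.
        word = quote(word)
        url = ("http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
               "&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8"
               "&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2&nc=1"
               "&pn={pn}&rn=60")
        return (url.format(word=word, pn=pn) for pn in itertools.count(0, 60))
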
  • Original post: https://www.cnblogs.com/aimhabo/p/8919845.html