zoukankan      html  css  js  c++  java
  • 汇图网爬虫

    import requests
    from requests.adapters import HTTPAdapter
    import re
    from urllib import parse
    import os
    
    def getpiclist(page, kw):
        """Fetch one search-result page from soso.huitu.com and extract image URLs.

        Parameters:
            page: 1-based page number of the search results.
            kw:   URL-encoded search keyword (output of urllib.parse.quote).

        Returns:
            list[str] of thumbnail URLs on HTTP 200, otherwise the sentinel
            string "not find url" (callers distinguish success by type —
            kept for backward compatibility with main()).
        """
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'dy2018=1; statinfo=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DiqbEA3_P4ej6UVrv29ikk9Rs4ZXRHyh4c28G8AeYd37%26wd%3D%26eqid%3Dd6bb34b50005e6a2000000065be52571; Hm_lvt_a4fa2a41b865534a782ceef2185fffaf=1541744017; Hm_lpvt_a4fa2a41b865534a782ceef2185fffaf=1541744029',
            'Host': 'soso.huitu.com',
            'Referer': 'http://soso.huitu.com/search?kw='+kw+'&page=1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        url = "http://soso.huitu.com/search/?kw="+kw+"&page=" + str(page) + "&listtype=2"
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Raw single-quoted pattern so the embedded double quotes and \d
            # escapes survive intact; matches thumbnail URLs such as
            # http://pic123.huitu.com/pic/20181109/1058907_20181109_0.jpg
            # (the original line had unescaped quotes and stripped backslashes,
            # which is a SyntaxError / never-matching pattern).
            piclist = re.findall(
                r'<img src="(http://pic\d+\.huitu\.com/pic/\d+/\d+_\d+_0\.jpg)" />',
                response.text)
        else:
            # Sentinel string kept (not an exception) so main()'s type check works.
            piclist = "not find url"
        return piclist
    
    def downloadpic(url, savepath, s):
        """Download one image through session *s* and save it under *savepath*.

        Parameters:
            url:      direct image URL; the segment after the last '/' becomes
                      the local filename.
            savepath: directory path ending with '/' where the file is written.
            s:        requests.Session (main() mounts retry adapters on it).

        Network errors are caught and reported to stdout; they never propagate,
        so one bad image does not abort the whole crawl.
        """
        try:
            pic = s.get(url, timeout=5)
            picname = url.rsplit("/", 1)[1]
            # 'with' guarantees the handle is closed even if write() raises
            # (the original fp.close() was skipped on any non-requests error).
            with open(savepath + picname, 'wb') as fp:
                fp.write(pic.content)
        except requests.exceptions.ConnectionError:
            print(url,"【错误】当前图片无法下载")
        except requests.exceptions.ReadTimeout:
            print(url, "【错误】超时")
        except requests.exceptions.ChunkedEncodingError:
            print(url, "【错误】远程主机强迫关闭了一个现有的连接")
        except requests.exceptions.RequestException as e:
            print(url, "【错误】", e)
    
    def main():
        """Crawl huitu.com search results for each keyword and save thumbnails.

        For every keyword: create ./<keyword>/, open a retrying HTTP session,
        fetch `pagenum` result pages starting at `start`, and download every
        image URL found. Stops early when getpiclist() signals failure by
        returning a non-list sentinel.
        """
        words = ["芒果"]   # keywords to crawl, one output directory each
        for word in words:
            start = 1        # first result page to fetch
            pagenum = 10     # number of pages to fetch
            kw = parse.quote(str(word))
            savepath = "./" + word + "/"
            if not os.path.exists(savepath):
                os.makedirs(savepath)
            # Session with retry adapters so transient failures are retried
            # instead of aborting the run.
            s = requests.Session()
            s.mount('http://', HTTPAdapter(max_retries=2))
            s.mount('https://', HTTPAdapter(max_retries=2))
            # start + pagenum so exactly `pagenum` pages are fetched — the
            # original range(start, pagenum) fetched one page too few.
            for i in range(start, start + pagenum):
                picturelist = getpiclist(i, kw)
                if type(picturelist) is list:
                    print(f"爬取第 {str(i)} 页内容")
                    # enumerate replaces the O(n) .index() lookup, which also
                    # reported the wrong position when a URL appeared twice.
                    for idx, picurl in enumerate(picturelist, 1):
                        print(f"正在下载第 {str(i)} 页第 {idx} 张图片")
                        downloadpic(picurl, savepath, s)
                else:
                    print("爬取完成")
                    break
    
    if __name__ == "__main__":
        main()
  • 相关阅读:
    $P5240 Derivation$
    $P2504 [HAOI2006]聪明的猴子$
    $P1194 买礼物$
    $P2690 接苹果$
    $CF1141C Polycarp Restores Permutation$
    $CF1141B Maximal Continuous Rest$
    $CF1141A Game 23$
    $P1215 [USACO1.4]母亲的牛奶 Mother's Milk$
    $luogu2375[NOI2014]$
    poj 1419 (最大独立集)
  • 原文地址:https://www.cnblogs.com/xypbk/p/10556860.html
Copyright © 2011-2022 走看看