zoukankan      html  css  js  c++  java
  • 图虫爬虫

    import requests
    from requests.adapters import HTTPAdapter
    import re
    from urllib import parse
    import os
    
    def getpiclist(kw):
        """Search Tuchong's free stock library and build download URLs for the hits.

        Args:
            kw: Search term, already URL-encoded by the caller; it is concatenated
                directly into the request URL and the 'path' header.

        Returns:
            A list of unique image URLs (in order of first appearance) when the
            search page is fetched with HTTP 200. Otherwise the string
            "not find url"; callers tell the two cases apart by type.
        """
        # NOTE(review): 'accept-encoding' advertises 'br'; requests can only decode
        # Brotli bodies when a Brotli decoder is installed - confirm in deployment.
        headers = {
            'authority': 'stock.tuchong.com',
            'method': 'GET',
            'path': '/search?term='+kw+'&use=0&source=extbaidudkey68',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': '_ga=GA1.2.286933693.1548990624; weilisessionid=e54ec13b6b6a18a62bf33ab9d0400623; wluuid=WLGEUST-970FA996-29E6-75FD-8FF6-A428AD814CCC; wlsource=extbaidudkey68; qimo_seosource_e7dfc0b0-b3b6-11e7-b58e-df773034efe4=%E5%85%B6%E4%BB%96%E7%BD%91%E7%AB%99; qimo_seokeywords_e7dfc0b0-b3b6-11e7-b58e-df773034efe4=%E6%9C%AA%E7%9F%A5; href=https%3A%2F%2Fstock.tuchong.com%2Fsearch%3Fterm%3D%25E5%25B8%2583%25E4%25B8%2581%26use%3D0%26source%3Dextbaidudkey68; accessId=e7dfc0b0-b3b6-11e7-b58e-df773034efe4; bad_ide7dfc0b0-b3b6-11e7-b58e-df773034efe4=79be0961-4966-11e9-9fd8-1d264daba2e8; nice_ide7dfc0b0-b3b6-11e7-b58e-df773034efe4=79be0962-4966-11e9-9fd8-1d264daba2e8; webp_enabled=0; pageViewNum=4',
            'referer': 'https://tc.ftsm-vip.com/?source=extbaidudkey68&utm_source=extbaidudkey68',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        url = "https://stock.tuchong.com/free/search/?term=" + kw
        try:
            # A timeout keeps an unresponsive server from hanging the crawler.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            # Report the failure and fall back to the same sentinel as a non-200 reply.
            print(url, "【错误】", e)
            return "not find url"
        if response.status_code != 200:
            return "not find url"
        # Image ids are embedded in the page as "imageId":"<digits>".
        image_ids = re.findall(r'"imageId":"(\d+)"', response.text)
        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(
            "https://p3a.pstatp.com/weili/l/" + image_id + ".jpg"
            for image_id in image_ids
        ))
    
    def downloadpic(url, savepath, s):
        """Download one image and save it under its URL basename in *savepath*.

        Args:
            url: Full image URL; the text after the last "/" becomes the file name.
            savepath: Directory to write into (must already exist).
            s: Session-like object whose ``get(url, timeout=..., headers=...)``
                returns the response.

        Request failures, including non-2xx HTTP statuses, are printed and the
        image is skipped; nothing is written in that case.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'p3a.pstatp.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        try:
            pic = s.get(url, timeout=5, headers=headers)
            # Turn 4xx/5xx replies into HTTPError so an error page is never saved as a .jpg.
            pic.raise_for_status()
            data = pic.content
        except requests.exceptions.ConnectionError:
            print(url,"【错误】当前图片无法下载")
        except requests.exceptions.ReadTimeout:
            print(url, "【错误】超时")
        except requests.exceptions.ChunkedEncodingError:
            print(url, "【错误】远程主机强迫关闭了一个现有的连接")
        except requests.exceptions.RequestException as e:
            print(url, "【错误】", e)
        else:
            picname = url.rsplit("/", 1)[1]
            # The context manager closes the file even if the write fails.
            with open(os.path.join(savepath, picname), 'wb') as fp:
                fp.write(data)
    def main():
        """Download every search hit for each configured keyword into ./<keyword>/."""
        words = ["桃子"]   # keywords to search for and download
        # One session, with adapters that retry up to 2 times, is shared by all
        # downloads and closed when the crawl ends.
        with requests.Session() as s:
            s.mount('http://', HTTPAdapter(max_retries=2))
            s.mount('https://', HTTPAdapter(max_retries=2))
            for word in words:
                kw = parse.quote(str(word))
                savepath = "./" + word + "/"
                os.makedirs(savepath, exist_ok=True)
                picturelist = getpiclist(kw)
                if not isinstance(picturelist, list):
                    # getpiclist returned its non-list failure sentinel.
                    # NOTE(review): this stops the whole crawl instead of moving on
                    # to the next keyword - confirm that is intended.
                    print("爬取完成")
                    break
                # enumerate replaces the per-item list.index() scan and numbers from 1.
                for number, picurl in enumerate(picturelist, start=1):
                    print(f"正在下载 {word} 第 {number} 张图片 {picurl}")
                    downloadpic(picurl, savepath, s)
    
    # Run the crawler only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    数据分析入门_char01
    [转]在Google安装谷歌插件Postman
    Ubuntu 14.04 Fiddler Android抓包
    Ubuntu14.04 install appium
    【转】ubuntu修改时区和时间的方法
    MongoDB权威指南<2> 1-2 MongoDB 介绍
    python数据类型-字典
    python数据类型-列表
    python数据类型-字符串
    python编码以及格式化输出
  • 原文地址:https://www.cnblogs.com/xypbk/p/10556855.html
Copyright © 2011-2022 走看看