zoukankan      html  css  js  c++  java
  • python爬虫练习--爬取今日头条街拍图片

    #! /usr/bin/env python
    # coding: utf-8
    
    import requests
    import os
    from hashlib import md5
    from multiprocessing.pool import Pool
    
    headers = {
    
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    
    }
    
    
    
    def get_pase(url,offset):
    
        params = {
            'offset':offset,
            'format':'json',
            'keyword':'街拍',
            'autoload':'true',
            'count':'20',
            'cur_tab':'1',
            'from':'search_tab'
        }
    
        response = requests.get(url,headers=headers,params=params)
        try:
            if response.status_code == 200:
                return response.json()
    
        except Exception as e:
            print(e.args)
    
    def get_images(json):
        if json:
            items = json.get('data')
            for item in items:
                title = item.get('title')
                images_list = item.get('image_list')
                if images_list:
                    for image in images_list:
                        yield {
                            'title': title,
                            'images': 'http:' + image.get('url')
                        }
    
    
    def save_image(item):
        if not os.path.exists(item.get('title')):
            os.mkdir(item.get('title'))
        try:
    
            image_url = item.get('images')
            new_image_url = image_url.replace('list','large')
            response = requests.get(new_image_url)
            if response.status_code == 200:
                file_path = '{0}/{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb')as f:
                        f.write(response.content)
                else:
                    print('Already Downloaded', file_path)
        except requests.ConnectionError:
            print('Failed to save image')
    
    
    def main(offset):
        url = 'https://www.toutiao.com/search_content/'
        json = get_pase(url,offset)
        for item in get_images(json):
            print('33[32;1m%s33[0m' % item)
            save_image(item)
    
    GROUP_START = 1
    GROUP_END = 5
    
    if __name__ == '__main__':
        pool = Pool()
        groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
        pool.map(main, groups)
        pool.close()
        pool.join()
  • 相关阅读:
    2,ActiveMQ-入门
    1,Java消息服务-JMS
    2,Executor线程池
    Thymeleaf-模板引擎
    1,线程基础
    8,HashMap子类-LinkedHashMap
    7,HashMap
    ncnn编译安装-20190415
    VS2015 C#项目工程配置emgucv依赖的方法
    java jni 调用c++ opencv代码成功范例
  • 原文地址:https://www.cnblogs.com/watchslowly/p/9148051.html
Copyright © 2011-2022 走看看