zoukankan      html  css  js  c++  java
  • requests库+ajax数据分析+多线程爬取头条图集

    '''
    Download Toutiao (今日头条) image galleries matching a search keyword.
    '''
    
    
    import os
    import requests
    from urllib.parse import quote
    from hashlib import md5
    from multiprocessing.pool import Pool
    
    # Search endpoint queried by get_page(); the query string is supplied
    # separately through the ``params`` argument of requests.get().
    baseurl = 'https://www.toutiao.com/search_content/?'
    # Number of result pages to crawl: the entry point maps main() over
    # range(total), so pages 0 .. total-1 are fetched.
    total = 1
    
    
    def get_page(page, keyword):
        """Fetch one page of gallery search results and return the parsed JSON.

        Args:
            page: Zero-based page index; the request offset is ``page * 20``.
            keyword: Search keyword (unquoted; it is URL-quoted for the
                referer header and passed raw in the query parameters).

        Returns:
            The decoded JSON payload, or ``None`` when the request fails,
            the status code is not 200, or the body is not valid JSON.
        """
        keyword_h = quote(keyword)

        headers = {
            'referer': 'https://www.toutiao.com/search/?keyword=' + keyword_h,
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/67.0.3396.99 Chrome/67.0.3396.99 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
        }

        params = {
            'offset': page * 20,
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': '20',
            'cur_tab': '3',
            'from': 'gallery',
        }

        try:
            # A timeout keeps a stalled connection from hanging the worker.
            response = requests.get(baseurl, headers=headers, params=params,
                                    timeout=10)
        except requests.RequestException as e:
            # requests' exceptions do not derive from the builtin
            # ConnectionError, so the requests hierarchy must be caught here.
            print('error', e.args)
            return None

        if response.status_code != 200:
            return None

        try:
            return response.json()
        except ValueError as e:
            # Raised when the body is not valid JSON (e.g. an HTML error page).
            print('error', e.args)
            return None
          
       
    def get_image(json):
        """Yield the title and full-size URL of every image in a result page.

        Args:
            json: Parsed search response (a dict with a ``'data'`` list), or
                ``None`` when the page could not be fetched.

        Yields:
            Dicts of the form ``{'url': ..., 'title': ...}``, one per image.
            Items without an ``image_list`` and images without a ``url`` are
            skipped instead of raising.
        """
        if not json:
            return
        for item in json.get('data') or []:
            title = item.get('title')
            # Not every search hit is guaranteed to carry an image list.
            for image in item.get('image_list') or []:
                raw_url = image.get('url')
                if not raw_url:
                    continue
                # 'http:' is prepended to the listed URL and every 'list'
                # substring is swapped for 'origin' (presumably thumbnail ->
                # original-size path; confirm against the live API).
                yield {
                    'url': 'http:' + raw_url.replace('list', 'origin'),
                    'title': title,
                }
                    
                    
    def save_image(image):
        """Download one image into a directory named after its gallery title.

        The file name is the MD5 hex digest of the image bytes plus ``.jpg``,
        so identical images are stored only once per directory.

        Args:
            image: Dict with ``'url'`` (image address) and ``'title'``
                (gallery title used as the directory name).
        """
        title = image.get('title')
        url = image.get('url')
        if not title or not url:
            print('Failed to Save Image')
            return

        # Path separators inside a title would be interpreted as nested
        # directories; flatten them so the title maps to a single folder.
        folder = title
        for sep in (os.sep, os.altsep):
            if sep:
                folder = folder.replace(sep, '_')

        # exist_ok avoids the exists()/mkdir() race between pool workers.
        os.makedirs(folder, exist_ok=True)

        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            # Covers connection errors as well as the timeout set above.
            print('Failed to Save Image')
            return

        if response.status_code != 200:
            return

        content = response.content
        file_path = '{0}/{1}.{2}'.format(folder, md5(content).hexdigest(), 'jpg')
        if os.path.exists(file_path):
            print('Already Downloaded', file_path)
            return
        with open(file_path, 'wb') as f:
            f.write(content)
            
        
    def main(page, keyword='小姐姐'):
        """Crawl one result page for *keyword* and save all of its images.

        Images are written beneath a directory named after the keyword; the
        process working directory is switched there for the duration of the
        call and restored afterwards, even if a download raises.

        Args:
            page: Zero-based page index passed to ``get_page``.
            keyword: Search keyword; also used as the output directory name.
        """
        start_dir = os.getcwd()
        # exist_ok avoids the exists()/mkdir() race between pool workers.
        os.makedirs(keyword, exist_ok=True)
        os.chdir(keyword)
        try:
            payload = get_page(page, keyword)
            # get_page returns None on failure; there is nothing to parse then.
            if payload:
                for item in get_image(payload):
                    save_image(item)
        finally:
            os.chdir(start_dir)
              
              
    if __name__ == '__main__':
        # Crawl the pages in parallel. Note this uses a pool of worker
        # processes (multiprocessing.Pool), not threads.
        pool = Pool()
        try:
            pool.map(main, range(total))
        finally:
            # Always release the workers, even when a task raised.
            pool.close()
            pool.join()
                   
    
  • 相关阅读:
    hmset
    java 调用mongo 判断大于等于 并且小约等于<=
    Maven项目,别人的没问题,自己机器一直有问题
    linux 时间datetimectl 问题
    真正手把手教 git
    0324-SQLMAP使用参数备注
    安全推荐网址:
    JavaScript Base64 作为文件上传的实例代码解析
    学习笔记|变量的解构赋值
    学习笔记|let 和 const 命令
  • 原文地址:https://www.cnblogs.com/Wang-Y/p/9384279.html
Copyright © 2011-2022 走看看