zoukankan      html  css  js  c++  java
  • [Python爬虫]头条图集爬取

    import requests
    from urllib.parse import urlencode
    import os
    from hashlib import md5
    from multiprocessing.pool import Pool
    
    def get_page(offset):
        """Fetch one page of Toutiao search results for the keyword '街拍'.

        Args:
            offset: Value sent as the ``offset`` query parameter (result offset).

        Returns:
            The decoded JSON payload when the server answers with HTTP 200.
            An empty dict when the request fails, the status is not 200, or
            the body is not valid JSON, so callers can always use ``.get``
            on the result without a type check.
        """
        # NOTE(review): the cookie below embeds a captured login session
        # (sessionid, sso tokens). It is kept verbatim because the request
        # depends on it, but it should be loaded from configuration instead
        # of living in source control.
        headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
            'cookie':'tt_webid=6724223385113069069; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6724223385113069069; csrftoken=9e9d6c3be6aabc313dce0c4f1a116047; sso_uid_tt=27219b1c2d00b8a6021444d85d83dc38; toutiao_sso_user=7562e682c093b193cce298f25dd396ba; login_flag=8391d980bfc8a8908e7c6c80596a016c; __tea_sdk__ssid=undefined; _ga=GA1.2.931504366.1565662966; sid_guard=7562e682c093b193cce298f25dd396ba%7C1565663040%7C5126263%7CFri%2C+11-Oct-2019+10%3A21%3A43+GMT; uid_tt=27219b1c2d00b8a6021444d85d83dc38; sid_tt=7562e682c093b193cce298f25dd396ba; sessionid=7562e682c093b193cce298f25dd396ba; uuid="w:443dcb551552404fbfde212f1054c781"; __tasessionId=i5j7qcydf1569292028372; s_v_web_id=1e7e3b52d7bc46698bb26079c99fd83d',
            'pragma':'no-cache',
            'referer':'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
            'x-requested-with':'XMLHttpRequest',
        }
        # The 'from' and 'pd' query parameters are intentionally not sent.
        params = {
            'aid':'24',
            'app_name':'web_search',
            'offset':offset,
            'format':'json',
            'keyword':'街拍',
            'autoload':'true',
            'count':'20',
            'en_qc':'1',
            'cur_tab':'1',
        }
        query = urlencode(params)
        print(query)
        url = 'https://www.toutiao.com/api/search/content/?' + query
        try:
            # A timeout keeps a stalled connection from blocking the worker forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # Covers connection errors as well as the timeout introduced above.
            return {}
        if response.status_code != 200:
            return {}
        try:
            # Decode once and reuse the result for both the log line and the return.
            payload = response.json()
        except ValueError:
            # Body was not valid JSON (e.g. an HTML error or captcha page).
            return {}
        print(payload)
        return payload
    
    def get_image(json):
        """Yield one ``{'image': url, 'title': title}`` dict per image in a search payload.

        Args:
            json: Decoded search response. Expected to be a dict whose ``'data'``
                entry is a list of result items; anything else (``None``, an
                error string, a dict without data) is treated as "nothing to
                parse".

        Yields:
            A dict with the image's ``'url'`` value under ``'image'`` and the
            owning item's title under ``'title'``, for every image of every
            item that has both a title and a non-empty ``image_list``.
        """
        # A failed fetch may hand us a non-dict value; guard before calling .get.
        entries = json.get('data') if isinstance(json, dict) else None
        if not entries:
            print('Not parse')
            return
        for item in entries:
            if not isinstance(item, dict) or 'title' not in item:
                continue
            images = item.get('image_list')
            # Skip items with a missing, null or empty image list.
            if not images:
                continue
            title = item.get('title')
            for image in images:
                print(title)
                print(image)
                yield {
                    'image': image.get('url'),
                    'title': title,
                }
    def save_image(item):
        """Download one image and store it in a directory named after its title.

        The file name is the MD5 hex digest of the downloaded bytes plus a
        ``.jpg`` extension, so identical content is written only once per
        directory.

        Args:
            item: Dict with the image URL under ``'image'`` and the target
                directory name under ``'title'``.

        Returns:
            None. Request failures are reported with a printed message and
            non-200 responses are skipped silently.
        """
        folder = item.get('title')
        # exist_ok avoids the check-then-create race when several pool workers
        # handle items that share a title.
        os.makedirs(folder, exist_ok=True)
        try:
            # A timeout keeps a stalled download from blocking the worker forever.
            response = requests.get(item.get('image'), timeout=10)
        except requests.RequestException:
            # Covers connection errors, timeouts and malformed/missing URLs.
            print('Failed to Save Image')
            return
        if response.status_code != 200:
            return
        content = response.content
        file_path = '{0}/{1}.{2}'.format(folder, md5(content).hexdigest(), 'jpg')
        if os.path.exists(file_path):
            print('Already Downloaded', file_path)
            return
        with open(file_path, 'wb') as f:
            f.write(content)
    
    def main(offset):
        """Fetch the result page at ``offset`` and save every image found on it.

        Args:
            offset: Result offset forwarded to ``get_page``.
        """
        page = get_page(offset)
        for entry in get_image(page):
            print(entry)
            save_image(entry)
    # Inclusive range of page groups to crawl; each group is one result page.
    GROUP_START = 1
    GROUP_END = 1

    if __name__ == '__main__':
        # Convert group indices into result offsets in steps of 20.
        offsets = [index * 20 for index in range(GROUP_START, GROUP_END + 1)]
        workers = Pool()
        # One main(offset) call per offset, spread across the worker processes.
        workers.map(main, offsets)
        workers.close()
        workers.join()
  • 相关阅读:
    那些创业的艰辛整理
    一个成功的研发团队应具备的9大属性
    如何将 Linux 系统转移至 LVM 卷
    如何在 Linux 上永久挂载一个 Windows 共享
    怎样在 Chromebook 上安装 Linux 系统?
    1087 有多少不同的值 (20 分)C语言
    1052 卖个萌 (20 分)C语言
    1064 朋友数 (20 分)C语言
    1045 快速排序 (25 分)C语言
    1048 数字加密 (20 分)C语言
  • 原文地址:https://www.cnblogs.com/lightmonster/p/11577909.html
Copyright © 2011-2022 走看看