zoukankan      html  css  js  c++  java
  • python爬今日头条(ajax分析)

    爬取今日头条中以关键字“街拍”搜索得到的图片

    import os
    from multiprocessing.pool import Pool
    import requests
    from urllib.parse import urlencode
    from hashlib import md5
    
    # Request headers used by get_page when querying the Toutiao search endpoint.
    headers = {
        'Host': 'www.toutiao.com',
        'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    
    def get_page(offset):
        """Fetch one page of Toutiao search results for the keyword '街拍'.

        Args:
            offset: Value sent as the ``offset`` query parameter.

        Returns:
            The decoded JSON body when the server answers with HTTP 200;
            ``None`` on any other status code or on a connection error.
        """
        params = {
            'offset': offset,
            'format': 'json',
            'keyword': '街拍',
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
        }
        url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
        try:
            # Pass the headers by keyword: the second positional argument of
            # requests.get is ``params``, so a positional dict would be
            # appended to the query string instead of being sent as headers.
            response = requests.get(url, headers=headers)
        except requests.ConnectionError:
            return None
        if response.status_code == 200:
            return response.json()
        return None
    
    
    def get_images(json):
        """Yield one record per image found in a decoded search response.

        Args:
            json: Decoded response mapping, or ``None`` / an empty mapping,
                in which case nothing is yielded.

        Yields:
            dict: ``{'image': <the image entry's 'url'>, 'title': <item title>}``
            for every entry in each item's ``'image_detail'`` list.
        """
        if not json:
            # Nothing to iterate; avoids AttributeError on a None payload.
            return
        for item in json.get('data') or []:
            title = item.get('title')
            # An item without 'image_detail' would make the inner loop raise
            # TypeError (iterating None), so treat it as having no images.
            for image in item.get('image_detail') or []:
                yield {
                    'image': image.get('url'),
                    'title': title
                }
    
    # The '头条图片' parent folder and the per-title folder are created on demand.
    def save_image(item):
        """Download one image and store it as '头条图片/<title>/<md5>.jpg'.

        Args:
            item: Mapping with an ``'image'`` key (the URL to download) and a
                ``'title'`` key (used as the sub-folder name).

        The file name is the MD5 hex digest of the downloaded bytes, so an
        image whose content was already saved is reported instead of being
        rewritten. Connection errors are reported on stdout and swallowed.
        """
        folder = '头条图片/' + item.get('title')
        # makedirs with exist_ok=True also creates the parent folder and does
        # not fail if another pool worker created the same folder first.
        os.makedirs(folder, exist_ok=True)
        try:
            response = requests.get(item.get('image'))
            if response.status_code == 200:
                # Write into the folder created above; the original wrote to
                # '头条图片1/...', a directory that was never created.
                file_path = '{0}/{1}.{2}'.format(folder, md5(response.content).hexdigest(), 'jpg')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                else:
                    print('Already Downloaded', file_path)
        except requests.ConnectionError:
            print('Failed to Save Image')
    
    
    def main(offset):
        """Crawl one result page: fetch it, then print and save every image.

        Args:
            offset: Result offset forwarded to ``get_page``.
        """
        page = get_page(offset)
        if page is None:
            # get_page returns None on a connection error or a non-200
            # response; there is nothing to extract in that case.
            return
        for item in get_images(page):
            print(item)
            save_image(item)
    
    
    # Inclusive bounds of the page groups to crawl; each group index is
    # multiplied by 20 to obtain the request offset.
    GROUP_START = 1
    GROUP_END = 20

    if __name__ == '__main__':
        # Offsets GROUP_START*20, (GROUP_START+1)*20, ..., GROUP_END*20.
        offsets = list(range(GROUP_START * 20, GROUP_END * 20 + 1, 20))
        workers = Pool()
        workers.map(main, offsets)
        workers.close()
        workers.join()
    

    具体详见
    参考网址:
    崔大牛崔师兄博客:
    http://cuiqingcai.com/4320.html
    https://github.com/Python3WebSpider/Jiepai/blob/master/spider.py

  • 相关阅读:
    python_控制台输出带颜色的文字方法
    模拟数据库作业
    js笔记
    CSS 笔记
    html 笔记
    必备技能-Git 使用规范流程
    python 闭包
    30个python编程技巧!
    python 面向对象
    python 线程
  • 原文地址:https://www.cnblogs.com/zswbky/p/8454104.html
Copyright © 2011-2022 走看看