zoukankan      html  css  js  c++  java
  • python3爬虫-分析Ajax,抓取今日头条街拍美图

    # coding=utf-8
    from urllib.parse import urlencode
    import requests
    from requests.exceptions import RequestException,Timeout
    import json
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    from multiprocessing import Pool
    import os
    import string
    from hashlib import md5
    
    
    def get_response(url):
        """Send a GET request with a desktop-browser User-Agent header.

        Args:
            url: Address to request.

        Returns:
            The ``requests`` response object (whatever its status code), or
            ``None`` when the request did not complete within 5 seconds.

        Raises:
            requests.exceptions.RequestException: Request errors other than
                a timeout are not handled here and propagate to the caller.
        """
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
        }
        # Keep the try body down to the single call that can time out.
        try:
            response = requests.get(url, headers=headers, timeout=5)
        except Timeout:
            print(url, 'request timeout')
            # Make the "no response" result explicit for callers.
            return None
        print(url, 'request success')
        return response
    
    
    def get_page_index(offset, keyword):
        """Fetch one page of Toutiao search results as raw JSON text.

        Args:
            offset: Value sent as the ``offset`` query parameter.
            keyword: Search term sent as the ``keyword`` query parameter.

        Returns:
            The response body as text when the server answers with HTTP 200;
            ``None`` on timeout, on a request error, or on any other status.
        """
        params = {
            "offset": offset,
            "format": "json",
            "keyword": keyword,
            "autoload": "true",
            "count": "20",
            "cur_tab": "1",
            "from": "search_tab",
        }
        url = "https://www.toutiao.com/search_content/?" + urlencode(params)
        print(url)

        try:
            response = get_response(url)
        except RequestException:
            print('request error')
            return None

        # get_response() hands back None when the request timed out; without
        # this guard the attribute access below raises AttributeError.
        if response is None:
            return None

        print(response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    
    def conn_mongodb():
        """Open a client to the local MongoDB server and return the collection.

        Returns:
            The ``jiepai`` collection inside the ``jiepai`` database.
        """
        mongo_client = MongoClient('localhost', 27017)
        return mongo_client['jiepai']['jiepai']
    
    def save_image_url(data):
        """Insert or refresh one image-group document, keyed by its title.

        Args:
            data: Dict describing one group; its ``title`` value is used as
                the match key and all of its fields are written with ``$set``.
        """
        jiepai = conn_mongodb()
        # Collection.update() was deprecated in PyMongo 3.0 and removed in
        # 4.0; update_one() with upsert=True is the supported equivalent for
        # this single-document upsert.
        jiepai.update_one(
            {'title': data.get('title')},
            {'$set': data},
            upsert=True,
        )
    
    def get_image_url():
        """Query every stored image group from MongoDB.

        Returns:
            The cursor produced by ``find``; the projection keeps ``title``
            and ``images_list`` and drops ``_id``.
        """
        projection = {'title': 1, 'images_list': 1, '_id': 0}
        return conn_mongodb().find({}, projection)
    
    
    def download_image(data):
        """Download the images of one or more stored image groups to disk.

        Every image is written to
        ``<directory of this script>/jiepai/<cleaned title>/<md5 of content>.jpg``.

        Args:
            data: Either a single document (a dict with ``title`` and
                ``images_list`` keys) or an iterable of such documents.
        """
        # Pool.map() passes the elements of its iterable one at a time, so a
        # lone document must be accepted as well as a list of documents.
        if isinstance(data, dict):
            data = [data]

        base_dir = os.path.abspath(os.path.dirname(__file__))
        # os.path.join supplies the separator the original string
        # concatenation was missing and works on every platform.
        root_dir = os.path.join(base_dir, 'jiepai')
        # exist_ok avoids the check-then-create race between pool workers.
        os.makedirs(root_dir, exist_ok=True)

        # Characters that may not appear in a Windows file or directory name.
        forbidden = '\\/:*?"<>|'

        for item in data:
            title = item.get('title')
            images_list = item.get('images_list') or []
            print(title)
            print('images_list', images_list)
            if not title:
                # Without a title there is no directory name to store under.
                continue

            # One directory per group, named after the cleaned-up title.
            trimmed = title.strip(string.punctuation)
            file_name = ''.join(ch for ch in trimmed if ch not in forbidden).strip()
            if not file_name:
                # Title consisted only of punctuation: fall back to a hash.
                file_name = md5(title.encode('utf-8')).hexdigest()
            file_path = os.path.join(root_dir, file_name)
            os.makedirs(file_path, exist_ok=True)

            for image_url in images_list:
                print(image_url)
                try:
                    response = get_response(image_url)
                except RequestException:
                    print('request error')
                    continue
                if response is None:
                    # Timed out; skip this image and keep going.
                    continue
                content = response.content
                # Naming by content hash stores identical images only once.
                image_name = md5(content).hexdigest() + '.jpg'
                with open(os.path.join(file_path, image_name), 'wb') as f:
                    f.write(content)
                print('download success')
    
    
    def parse_page_index(html):
        """Extract image groups from a search-result JSON page and store them.

        For every entry of the top-level ``data`` list that has a title, a
        document ``{'title': ..., 'images_list': [...]}`` is built from the
        ``url`` fields of its ``image_detail`` entries and handed to
        ``save_image_url``.

        Args:
            html: JSON text of one result page, or ``None`` / empty when the
                page could not be fetched.

        Returns:
            None. Missing or malformed input is skipped instead of raising.
        """
        if not html:
            return
        try:
            payload = json.loads(html)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            print('invalid json')
            return
        if not isinstance(payload, dict):
            return

        # "data" may be absent or null in the payload.
        for item in payload.get('data') or []:
            if not isinstance(item, dict):
                continue
            title = item.get('title')
            if title is None:
                # Title-less entries would all collapse onto the same upsert
                # key and cannot be turned into a directory name later.
                continue
            urls = [image.get('url') for image in item.get('image_detail') or []]
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # entries without a url are dropped.
            images_list = [url for url in dict.fromkeys(urls) if url]
            image_group = {'title': title, 'images_list': images_list}
            print(image_group)
            save_image_url(image_group)
    
    
    def main(offset):
        """Crawl one search-result page for the keyword '街拍' and store its links.

        Args:
            offset: Result offset forwarded to ``get_page_index``.
        """
        html = get_page_index(offset, '街拍')
        # get_page_index() returns None on timeout, request error or a
        # non-200 status; there is nothing to parse in that case.
        if html is None:
            print('no page content for offset', offset)
            return
        parse_page_index(html)
    
    
    if __name__ == "__main__":
        # Step 1 (disabled): crawl the result pages with several processes and
        # store the image links in MongoDB.
        # groups = [x*20 for x in range(0,5)]
        # pool = Pool()
        # pool.map(main, groups)

        # Step 2: read the links back from MongoDB and download the images
        # with several processes.
        # Materialise the cursor once; a cursor that has already been
        # iterated yields nothing when it is handed to pool.map afterwards.
        documents = list(get_image_url())
        # download_image() iterates over its argument, so every task is a
        # one-document batch.
        batches = [[document] for document in documents]

        # The context manager shuts the pool down once map() has returned.
        with Pool() as pool:
            pool.map(download_image, batches)
    

      

  • 相关阅读:
    Internet protocol optimizer
    SQl常用语句总结(持续更新……)
    让 步( 写的太好了!)
    让 步( 写的太好了!)
    让 步( 写的太好了!)
    $.ajax()参数详解及标准写法
    $.ajax()参数详解及标准写法
    $.ajax()参数详解及标准写法
    JQuery函数attr()和prop()的区别
    公司来了个傻员工,改变了所有聪明的员工
  • 原文地址:https://www.cnblogs.com/royfans/p/8329902.html
Copyright © 2011-2022 走看看