zoukankan      html  css  js  c++  java
  • 今日头条街拍

    spider.py

    import json
    import os
    from urllib.parse import urlencode
    import pymongo
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import ConnectionError
    import re
    from multiprocessing import Pool
    from hashlib import md5
    from json.decoder import JSONDecodeError
    from config import *
    
    # Shared MongoDB client for the whole module.  ``connect=False`` is passed
    # so the client is not asked to connect at construction time -- presumably
    # so that each Pool worker opens its own connection after the fork; confirm
    # against the PyMongo documentation.
    client = pymongo.MongoClient(MONGO_URL, connect=False)
    # Database selected by the MONGO_DB setting; save_to_mongo writes through it.
    db = client[MONGO_DB]
    
    
    def get_page_index(offset, keyword):
        """Fetch one page of search results as raw JSON text.

        Args:
            offset: Value sent as the ``offset`` query parameter.
            keyword: Search term sent as the ``keyword`` query parameter.

        Returns:
            The response body as text when the server answers with status 200,
            otherwise ``None`` (non-200 status or any request failure).
        """
        data = {
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3,
            'format': 'json',
            'keyword': keyword,
            'offset': offset,
        }
        url = 'http://www.toutiao.com/search_content/' + '?' + urlencode(data)
        try:
            # Without a timeout a stalled server would block this pool worker
            # forever.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # RequestException is the base class of every requests error, so
            # timeouts and redirect loops are handled as well as connection
            # failures instead of escaping and killing the worker.
            print('Error occurred')
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def download_image(url):
        """Download one image and hand its bytes to ``save_image``.

        Args:
            url: Address of the image to fetch.

        Returns:
            Always ``None``.  Non-200 responses and request failures (including
            a missing or malformed ``url``) are skipped rather than raised.
        """
        print('Downloading', url)
        try:
            # Bound the wait so one unresponsive image host cannot hang the
            # crawl.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Covers timeouts and invalid URLs too, which the narrower
            # ConnectionError handler let propagate.
            print('Failed to download', url)
            return None
        if response.status_code == 200:
            save_image(response.content)
        return None
    
    
    def save_image(content):
        """Write image bytes to ``<cwd>/<md5-of-content>.jpg`` unless it exists.

        The file is named after the MD5 digest of its bytes, so identical
        content always maps to the same path and is not written twice.

        Args:
            content: Raw image bytes.
        """
        digest = md5(content).hexdigest()
        file_path = '{0}/{1}.{2}'.format(os.getcwd(), digest, 'jpg')
        print(file_path)
        if os.path.exists(file_path):
            return
        # The context manager closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as image_file:
            image_file.write(content)
    
    
    def parse_page_index(text):
        """Yield article URLs from a search-result JSON document.

        Args:
            text: JSON text as returned by ``get_page_index``; may be ``None``
                when that request failed.

        Yields:
            The ``article_url`` of every entry in the top-level ``data`` list
            that has a non-empty one.  Nothing is yielded for missing,
            malformed or unexpectedly shaped input.
        """
        # get_page_index returns None on failure; json.loads(None) would raise
        # TypeError, which the JSONDecodeError handler does not cover.
        if not text:
            return
        try:
            data = json.loads(text)
        except JSONDecodeError:
            return
        if not isinstance(data, dict):
            return
        entries = data.get('data')
        # The key may be present but null (or not a list at all).
        if not isinstance(entries, list):
            return
        for item in entries:
            if not isinstance(item, dict):
                continue
            article_url = item.get('article_url')
            # Entries without a URL would otherwise be yielded as None and
            # break the fetch that follows.
            if article_url:
                yield article_url
    
    
    def get_page_detail(url):
        """Fetch an article page and return its markup.

        Args:
            url: Address of the article page.

        Returns:
            The response body as text when the server answers with status 200,
            otherwise ``None`` (non-200 status or any request failure).
        """
        try:
            # Without a timeout a stalled server would block this pool worker
            # forever.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Base class of every requests error: also covers timeouts,
            # redirect loops and invalid URLs, not just connection failures.
            print('Error occurred')
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def parse_page_detail(html, url):
        """Extract the title and gallery image URLs from an article page.

        Every image URL found is also downloaded through ``download_image``.

        Args:
            html: Page markup as returned by ``get_page_detail``; may be
                ``None`` when that request failed.
            url: Address the page was fetched from, echoed into the result.

        Returns:
            A dict with ``title``, ``url`` and ``images`` keys when the page
            embeds a gallery object containing ``sub_images``; otherwise
            ``None``.
        """
        if not html:
            return None
        soup = BeautifulSoup(html, 'lxml')
        result = soup.select('title')
        title = result[0].get_text() if result else ''
        # The parentheses of the JavaScript call are literal text and must be
        # escaped; left unescaped they form a regex group and the pattern can
        # never match ``JSON.parse("...")``.  The lookbehind stops at the first
        # quote that is not backslash-escaped, i.e. the end of the JS string.
        images_pattern = re.compile(
            r'gallery: JSON\.parse\("(.*?)(?<!\\)"\)', re.S)
        result = images_pattern.search(html)
        if not result:
            return None
        try:
            # The payload is JSON inside a JS string literal, so its quotes and
            # slashes are backslash-escaped; drop the backslashes to recover it.
            data = json.loads(result.group(1).replace('\\', ''))
        except JSONDecodeError:
            return None
        if not isinstance(data, dict) or 'sub_images' not in data:
            return None
        sub_images = data.get('sub_images') or []
        # Skip entries without a URL so None is never downloaded or stored.
        images = [item.get('url') for item in sub_images if item.get('url')]
        for image in images:
            download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images
        }
    
    
    def save_to_mongo(result):
        """Insert one parsed article into the ``MONGO_TABLE`` collection.

        Args:
            result: Document to store (the dict built by
                ``parse_page_detail``).

        Returns:
            ``True`` when the insert produced a document id, ``False``
            otherwise.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is its replacement for a single document.
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.inserted_id is not None:
            print('Successfully Saved to Mongo', result)
            return True
        return False
    
    
    def main(offset):
        """Crawl one search-result page and store every gallery found on it.

        Fetches the result page for ``KEYWORD`` at ``offset``, then fetches and
        parses each article it lists and saves the parsed ones to MongoDB.

        Args:
            offset: Search-result offset handled by this call.
        """
        text = get_page_index(offset, KEYWORD)
        # A failed index request returns None; there is nothing to parse.
        if text is None:
            return
        for url in parse_page_index(text):
            # Entries lacking an article URL cannot be fetched.
            if not url:
                continue
            html = get_page_detail(url)
            # Skip articles whose page could not be retrieved instead of
            # handing None to the HTML parser.
            if html is None:
                continue
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)
    
    
    if __name__ == '__main__':
        # Fan the crawl out over a process pool, one task per search page.
        workers = Pool()
        # Pages GROUP_START..GROUP_END inclusive, expressed as request offsets
        # (each page number times 20, matching the 'count' sent per request).
        offsets = list(range(GROUP_START * 20, (GROUP_END + 1) * 20, 20))
        workers.map(main, offsets)
        workers.close()
        workers.join()
    

    config.py

    # Host handed to pymongo.MongoClient by spider.py.
    MONGO_URL = 'localhost'
    # Name of the database spider.py selects on the client.
    MONGO_DB = 'toutiao'
    # Name of the collection save_to_mongo inserts into.
    MONGO_TABLE = 'toutiao'
    
    # First and last search-page numbers to crawl (inclusive); spider.py
    # multiplies each by 20 to obtain the request offset.
    GROUP_START = 1
    GROUP_END = 20
    # Search term sent as the 'keyword' query parameter.
    KEYWORD='街拍'
    

    来源于微信公众号: 进击的Coder (ID:FightingCoder)

  • 相关阅读:
    POJ 1905 Expanding Rods 木棍膨胀
    [JSOI2007] 文本生成器
    18.09.22模拟赛T2 历史
    [USACO18OPEN] Talent Show
    [国家集训队] 整数的lqp拆分
    [HNOI2008] GT考试
    读入优化效果测试
    Trie图 模板
    manacher算法 详解+模板
    [洛谷P4299] 首都
  • 原文地址:https://www.cnblogs.com/hankleo/p/11489793.html
Copyright © 2011-2022 走看看