zoukankan      html  css  js  c++  java
  • 今日头条街拍

    spider.py

    import json
    import os
    from urllib.parse import urlencode
    import pymongo
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import ConnectionError
    import re
    from multiprocessing import Pool
    from hashlib import md5
    from json.decoder import JSONDecodeError
    from config import *
    
    # Shared MongoDB handles used by save_to_mongo().
    # connect=False is passed so the client does not open its connection at
    # construction time (per PyMongo's MongoClient documentation); the client
    # object is created before the multiprocessing Pool is started below.
    client = pymongo.MongoClient(MONGO_URL, connect=False)
    # Database selected by the name configured in config.MONGO_DB.
    db = client[MONGO_DB]
    
    
    def get_page_index(offset, keyword, timeout=10):
        """Fetch one page of search results as raw response text.

        Args:
            offset: Value sent as the ``offset`` query parameter.
            keyword: Value sent as the ``keyword`` query parameter.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or any request failure).
        """
        data = {
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3,
            'format': 'json',
            'keyword': keyword,
            'offset': offset,
        }
        url = 'http://www.toutiao.com/search_content/' + '?' + urlencode(data)
        try:
            # Without a timeout a stalled server would block the caller forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # RequestException covers connection errors as well as the
            # timeouts enabled above.
            print('Error occurred')
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def download_image(url, timeout=10):
        """Download the image at *url* and hand its bytes to ``save_image``.

        Args:
            url: Address of the image to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            Always ``None``; the image is saved only when the server answers
            with HTTP 200. Request failures are reported and skipped.
        """
        print('Downloading', url)
        try:
            # Without a timeout a stalled server would block the caller forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best effort: report the failure like the other fetchers do and
            # carry on instead of failing silently or aborting the crawl.
            print('Error occurred')
            return None
        if response.status_code == 200:
            save_image(response.content)
        return None
    
    
    def save_image(content):
        """Write image bytes to the current working directory.

        The file is named ``<md5 hex digest of content>.jpg``, so identical
        content always maps to the same file name. An existing file with that
        name is left untouched.

        Args:
            content: Raw image bytes.
        """
        file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
        file_path = os.path.join(os.getcwd(), file_name)
        print(file_path)
        if not os.path.exists(file_path):
            # The with-statement closes the file; no explicit close() needed.
            with open(file_path, 'wb') as f:
                f.write(content)
    
    
    def parse_page_index(text):
        """Yield the ``article_url`` of every entry in a search-result payload.

        Args:
            text: JSON text of a search-result page, or ``None`` when the
                fetch failed.

        Yields:
            Each non-empty ``article_url`` found under the top-level ``data``
            list. Nothing is yielded for missing, malformed or unexpectedly
            shaped input.
        """
        if not text:
            # A failed fetch hands us None; json.loads would raise TypeError.
            return
        try:
            data = json.loads(text)
        except JSONDecodeError:
            return
        if not isinstance(data, dict):
            return
        # 'data' may be absent or null in the payload; treat both as empty.
        for item in data.get('data') or []:
            if not isinstance(item, dict):
                continue
            article_url = item.get('article_url')
            # Entries without a URL cannot be fetched, so skip them rather
            # than yielding None to the caller.
            if article_url:
                yield article_url
    
    
    def get_page_detail(url, timeout=10):
        """Fetch the HTML of an article page.

        Args:
            url: Address of the article page.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or any request failure).
        """
        try:
            # Without a timeout a stalled server would block the caller forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # RequestException also covers timeouts and malformed URLs, which
            # the narrower ConnectionError would let propagate.
            print('Error occurred')
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def parse_page_detail(html, url):
        """Extract the title and gallery image URLs from an article page.

        Every image URL found is also passed to ``download_image``.

        Args:
            html: HTML text of the article page, or ``None`` when the fetch
                failed.
            url: Address the page was fetched from; copied into the result.

        Returns:
            A dict with keys ``title``, ``url`` and ``images`` when the page
            embeds a gallery containing ``sub_images``; otherwise ``None``.
        """
        if not html:
            return None
        soup = BeautifulSoup(html, 'lxml')
        result = soup.select('title')
        title = result[0].get_text() if result else ''
        # The parentheses and dot of the literal ``JSON.parse("...")`` call
        # must be escaped; only the quoted argument is captured.
        images_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
        match = images_pattern.search(html)
        if not match:
            return None
        try:
            # The payload is a JSON document embedded in a JS string literal;
            # dropping the backslashes undoes that escaping before decoding.
            data = json.loads(match.group(1).replace('\\', ''))
        except JSONDecodeError:
            return None
        if not isinstance(data, dict) or 'sub_images' not in data:
            return None
        sub_images = data.get('sub_images') or []
        # Skip entries without a URL: they cannot be downloaded.
        images = [item.get('url') for item in sub_images if item.get('url')]
        for image in images:
            download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images
        }
    
    
    def save_to_mongo(result):
        """Insert *result* into the configured MongoDB collection.

        Args:
            result: Document (dict) to store.

        Returns:
            ``True`` when the insert reported an id for the new document,
            ``False`` otherwise.
        """
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.inserted_id is not None:
            print('Successfully Saved to Mongo', result)
            return True
        return False
    
    
    def main(offset):
        """Crawl one search-result page and store every gallery found on it.

        Args:
            offset: Result offset passed to ``get_page_index``.
        """
        text = get_page_index(offset, KEYWORD)
        if not text:
            # The index fetch failed; there is nothing to parse.
            return
        for url in parse_page_index(text):
            if not url:
                # Entries without an article URL cannot be fetched.
                continue
            html = get_page_detail(url)
            if not html:
                # The detail fetch failed; move on to the next article.
                continue
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)
    
    
    if __name__ == '__main__':
        # One offset per result page: page numbers from GROUP_START through
        # GROUP_END (inclusive), each multiplied by 20.
        offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
        # Run main() over all offsets in a process pool, then shut the pool
        # down and wait for its workers to exit.
        workers = Pool()
        workers.map(main, offsets)
        workers.close()
        workers.join()
    

    config.py

    # Host passed to pymongo.MongoClient in spider.py.
    MONGO_URL = 'localhost'
    # Name of the database selected on the client.
    MONGO_DB = 'toutiao'
    # Name of the collection that scraped results are inserted into.
    MONGO_TABLE = 'toutiao'
    
    # First and last page numbers to crawl (inclusive); spider.py multiplies
    # each by 20 to obtain the request offset.
    GROUP_START = 1
    GROUP_END = 20
    # Search keyword sent with every index request.
    KEYWORD='街拍'
    

    来源于微信公众号: 进击的Coder (ID:FightingCoder)

  • 相关阅读:
    S5P4418iNand清空方法
    使用 GIT 获得Linux Kernel的代码并查看,追踪历史记录
    Linux3.4内核的基本配置和编译
    uboot---linux
    TestNG的简单使用
    java selenium webdriver处理JS操作窗口滚动条
    testNG入门详解
    零成本实现接口自动化测试 – Java+TestNG 测试Restful service
    Selenium Webdriver——操作隐藏的元素(二)display属性
    python selenium webdriver处理浏览器滚动条
  • 原文地址:https://www.cnblogs.com/hankleo/p/11489793.html
Copyright © 2011-2022 走看看