zoukankan      html  css  js  c++  java
  • 今日头条街拍

    spider.py

    import json
    import os
    from urllib.parse import urlencode
    import pymongo
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import ConnectionError
    import re
    from multiprocessing import Pool
    from hashlib import md5
    from json.decoder import JSONDecodeError
    from config import *
    
    client = pymongo.MongoClient(MONGO_URL, connect=False)
    db = client[MONGO_DB]
    
    
    def get_page_index(offset, keyword):
        data = {
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3,
            'format': 'json',
            'keyword': keyword,
            'offset': offset,
        }
        params = urlencode(data)
        base = 'http://www.toutiao.com/search_content/'
        url = base + '?' + params
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            print('Error occurred')
            return None
    
    
    def download_image(url):
        print('Downloading', url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                save_image(response.content)
            return None
        except ConnectionError:
            return None
    
    
    def save_image(content):
        file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
        print(file_path)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
                f.close()
    
    
    def parse_page_index(text):
        try:
            data = json.loads(text)
            if data and 'data' in data.keys():
                for item in data.get('data'):
                    yield item.get('article_url')
        except JSONDecodeError:
            pass
    
    
    def get_page_detail(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            print('Error occurred')
            return None
    
    
    def parse_page_detail(html, url):
        soup = BeautifulSoup(html, 'lxml')
        result = soup.select('title')
        title = result[0].get_text() if result else ''
        images_pattern = re.compile('gallery: JSON.parse("(.*)")', re.S)
        result = re.search(images_pattern, html)
        if result:
            data = json.loads(result.group(1).replace('\', ''))
            if data and 'sub_images' in data.keys():
                sub_images = data.get('sub_images')
                images = [item.get('url') for item in sub_images]
                for image in images: download_image(image)
                return {
                    'title': title,
                    'url': url,
                    'images': images
                }
    
    
    def save_to_mongo(result):
        if db[MONGO_TABLE].insert(result):
            print('Successfully Saved to Mongo', result)
            return True
        return False
    
    
    def main(offset):
        text = get_page_index(offset, KEYWORD)
        urls = parse_page_index(text)
        for url in urls:
            html = get_page_detail(url)
            result = parse_page_detail(html, url)
            if result: save_to_mongo(result)
    
    
    if __name__ == '__main__':
        pool = Pool()
        groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
        pool.map(main, groups)
        pool.close()
        pool.join()
    

    config.py

    MONGO_URL = 'localhost'
    MONGO_DB = 'toutiao'
    MONGO_TABLE = 'toutiao'
    
    GROUP_START = 1
    GROUP_END = 20
    KEYWORD='街拍'
    

    来源于微信公众号: 进击的Coder (ID:FightingCoder)

  • 相关阅读:
    webservice的接口做自动化
    “国货之光”!国产弱网测试神器 QNET,比主流弱网测试工具强在哪?
    面试被问到:fiddler在工作中有哪些应用?怎么破?
    接口阶段总结(下)
    接口阶段总结(上)
    loadrunner 关联原来这么简单
    cookie+token+接口文档解读+接口用例设计+requests入门+postman使用
    如何修改 BeautifulReport 中每一个用例的描述
    如何测试(五)N95口罩如何测试?
    JMeter_响应数据为空以及中文乱码
  • 原文地址:https://www.cnblogs.com/hankleo/p/11489793.html
Copyright © 2011-2022 走看看