zoukankan      html  css  js  c++  java
  • python抓取头条文章

    Python抓取头条美文并存储到MongoDB

    # Author:song
    from multiprocessing import Pool
    from urllib.parse import urlencode
    import requests
    import json
    from requests import RequestException
    from bs4 import BeautifulSoup
    import re
    import pymongo
    # Module-level MongoDB handles shared by every function in this script.
    # NOTE(review): connect=False presumably defers the actual connection until
    # first use, which matters because worker processes are forked later --
    # confirm against the PyMongo documentation.
    client = pymongo.MongoClient(host='localhost', connect=False)
    # Database that holds the scraped articles.
    db = client['toutiaowenzhang']
    
    def get_index(offset):
        """Fetch one page of search results for the keyword as raw JSON text.

        Args:
            offset: Paging offset passed to the search endpoint.

        Returns:
            The response body as a string when the server answers 200,
            otherwise ``None`` (non-200 status or any requests-level failure).
        """
        data = {
            'offset': offset,
            'format': 'json',
            'keyword': '美文',
            'autoload': 'true',
            'count': 20,
            'cur_tab': 1,
            'from':'search_tab'
        }
        url = 'https://www.toutiao.com/search_content/?'+urlencode(data)
        try:
            # The request itself must be inside the try block: connection
            # errors and timeouts are raised by requests.get(), not by
            # reading the status code afterwards.
            # A timeout keeps a stalled connection from blocking the worker
            # process forever (requests applies no timeout by default).
            response = requests.get(url, timeout=10)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def get_urls(html):
        """Yield the ``article_url`` of every entry in a search-result payload.

        Args:
            html: JSON text returned by the search endpoint, or ``None`` /
                empty when the fetch failed.

        Yields:
            The ``article_url`` value of each item under the top-level
            ``data`` key; ``None`` is yielded for items lacking that key.
            Nothing is yielded when the payload is missing, is not valid
            JSON, is not a JSON object, or has no usable ``data`` list.
        """
        if not html:
            # The index fetch failed; there is nothing to parse.
            return
        try:
            data = json.loads(html)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass: the body was not
            # JSON (for example an HTML error page).
            return
        if not isinstance(data, dict):
            return
        # "data" may be absent or explicitly null; treat both as empty.
        for item in data.get('data') or []:
            yield item.get('article_url')
    
    def get_index_detail(url):
        """Download a single article page.

        Args:
            url: Absolute URL of the article.

        Returns:
            The page body as a string when the server answers 200, otherwise
            ``None`` (non-200 status or any requests-level failure).
        """
        try:
            # The request must be inside the try block so that connection
            # errors and timeouts are actually caught; the timeout prevents a
            # stalled connection from hanging the worker indefinitely.
            response = requests.get(url, timeout=10)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_detail(html):
        """Extract the title and article text from an article page.

        Args:
            html: Page source as a string, or ``None`` / empty when the
                download failed.

        Returns:
            A dict with keys ``'title'`` and ``'article'``, or ``None`` when
            the page is missing or has no ``<title>`` element (e.g. a 404
            page).
        """
        if not html:
            # Download failed upstream; nothing to parse.
            return None
        try:
            soup = BeautifulSoup(html,'lxml')
            # IndexError here means the page has no <title> element.
            title = soup.select('title')[0].get_text()
            # NOTE(review): the literal "<div&gt" looks like it may have been
            # altered by HTML-entity decoding when this script was published;
            # confirm the pattern against a real article page.
            compile_allarticle= re.compile('content.*?<div&gt(.*?)</div>',re.S)
            allarticle = re.findall(compile_allarticle,html)
            # Strip every ASCII letter, digit and the listed punctuation from
            # the stringified match list, leaving the remaining characters.
            article =re.sub('[a-zA-Z0-9/#;&._]','',str(allarticle)).strip()
        except (TypeError, IndexError):
            # Unparseable input or an error page without a title.
            return None
        return {
            'title':title,
            'article':article
        }
    def save_to_mongodb(result):
        """Insert one parsed article into the ``toutiaowenzhang`` collection.

        Prints ``successful`` when the document was stored and ``fail``
        otherwise.

        Args:
            result: The document to store, or ``None`` when parsing failed.
        """
        if result is None:
            # Nothing was parsed; passing None to the driver would raise.
            print('fail')
            return
        # insert_one replaces the deprecated Collection.insert (removed in
        # PyMongo 4).
        outcome = db['toutiaowenzhang'].insert_one(result)
        if outcome.inserted_id is not None:
            print('successful')
        else:
            print('fail')
    
    def main(offset):
        """Scrape one page of search results and store every article found.

        Args:
            offset: Paging offset of the search-result page to process.
        """
        html = get_index(offset)
        if not html:
            # The index page could not be fetched; skip this offset instead of
            # crashing the worker.
            return
        for article_url in get_urls(html):
            if not article_url:
                continue
            detail_html = get_index_detail(article_url)
            result = parse_detail(detail_html)
            # parse_detail returns None for failed downloads and error pages;
            # only store real results.
            if result:
                save_to_mongodb(result)
    if __name__=='__main__':
        # One offset per result page: 0, 20, 40.
        groups = [x*20 for x in range(3)]
        # The context manager releases the worker processes once map() has
        # returned, instead of leaving the pool open until interpreter exit.
        with Pool() as pool:
            pool.map(main,groups)
  • 相关阅读:
    ACM的算法分类 2015-04-16 14:25 22人阅读 评论(0) 收藏
    初学Laravel 2014-08-21 11:24 90人阅读 评论(0) 收藏
    初学PHP&MySQL 2014-05-31 12:40 92人阅读 评论(0) 收藏
    codeforces 570 E. Pig and Palindromes (dp)
    codeforces 570 D. Tree Requests (dfs序)
    poj 2157 Maze (bfs)
    cf 570 C. Replacement (暴力)
    cf 570B B. Simple Game(构造)
    cf 570 A. Elections
    hdu 1429胜利大逃亡(续) (bfs+状态压缩)
  • 原文地址:https://www.cnblogs.com/master-song/p/8922850.html
Copyright © 2011-2022 走看看