zoukankan      html  css  js  c++  java
  • python抓取头条文章

    python抓取头条美文并存储到mongodb

    # Author:song
    from multiprocessing import Pool
    from urllib.parse import urlencode
    import requests
    import json
    from requests import RequestException
    from bs4 import BeautifulSoup
    import re
    import pymongo
    client = pymongo.MongoClient('localhost',connect=False)
    db = client['toutiaowenzhang']
    
    def get_index(offset):
        data = {
            'offset': offset,
            'format': 'json',
            'keyword': '美文',
            'autoload': 'true',
            'count': 20,
            'cur_tab': 1,
            'from':'search_tab'
        }
        url = 'https://www.toutiao.com/search_content/?'+urlencode(data)
        response = requests.get(url)
        try:
            if response.status_code == 200:
                return response.text
            else:
                return None
        except RequestException:
            return None
    
    def get_urls(html):
        data = json.loads(html)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    
    def get_index_detail(url):
        response = requests.get(url)
        try:
            if response.status_code == 200:
                return response.text
            else:
                return None
        except RequestException:
            return None
    
    def parse_detail(html):
        try:
            soup = BeautifulSoup(html,'lxml')
            title = soup.select('title')[0].get_text()
            compile_allarticle= re.compile('content.*?<div&gt(.*?)</div>',re.S)
            allarticle = re.findall(compile_allarticle,html)
            # article =re.sub('(<.*?<span>)','',allarticle[0])#正则匹配上不需要的那部分
            article =re.sub('[a-zA-Z0-9/#;&._]','',str(allarticle)).strip()#直接把字母数字全部替换
            data = {
                'title':title,
                'article':article
            }
            return data
        except TypeError:#解决出现了404界面
            pass
    def save_to_mongodb(result):
        if db['toutiaowenzhang'].insert(result):
            print('successful')
        else:
            print('fail')
    
    def main(offset):
        html = get_index(offset)
        items = get_urls(html)
        for item in items:
            if item:
                ab = get_index_detail(item)
                result = parse_detail(ab)
                save_to_mongodb(result)
    if __name__=='__main__':
        groups = [x*20 for x in range(3)]
        pool = Pool()
        pool.map(main,groups)
  • 相关阅读:
    取消Git每次拉取、提交推送都要输入密码
    input输入框的oninput和onchange事件
    NPM学习笔记
    命令行语法格式中常用符号的含义
    npm卸载模块报错
    软件的Alpha、Beta、GM、OEM、LTS等版本的含义(转)
    MySQL常用指令
    Git常用指令
    .net Core 解决Form value count limit 1024 exceeded. (文件上传过大)
    EF Core 迁移整理
  • 原文地址:https://www.cnblogs.com/master-song/p/8922850.html
Copyright © 2011-2022 走看看