  • Crawling Tencent News with the Scrapy Framework

       I wrote this Tencent news crawler last night in my spare time and am posting it here; feel free to use it as a reference and improve on it.
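
       The spider imports NewsItem from the project's items.py, which the post does not include. Here is a minimal sketch of what it presumably contains, inferred from the three fields the spider fills (my reconstruction, not the author's actual file):

    # items.py -- hypothetical reconstruction; the original post omits this file
    import scrapy

    class NewsItem(scrapy.Item):
        url = scrapy.Field()      # article page URL
        title = scrapy.Field()    # headline text
        content = scrapy.Field()  # concatenated body paragraphs

       The spider itself: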

       

    # -*- coding: utf-8 -*-
    import json
    
    from scrapy import Spider
    from scrapy.http import Request, Response, FormRequest
    from bs4 import BeautifulSoup
    
    from ..items import NewsItem
    
    # Tencent's news recommendation feed endpoint (returns JSON/JSONP article lists)
    TencentNewsUrl = 'https://pacaio.match.qq.com/irs/rcd'
    
    # "Top news" feed: https://pacaio.match.qq.com/pc/topNews?callback=__jp0
    # Paged feed example: https://pacaio.match.qq.com/irs/rcd?cid=108&ext=&token=349ee24cdf9327a050ddad8c166bd3e3&page=1&expIds=&callback=__jp1
    # Old-style article URL: https://new.qq.com/cmsn/20180726/20180726A0QOLA00
    # Equivalent /omn/ page: https://new.qq.com/omn/20180726/20180726A0QOLA.html
    
    class TencentSpider(Spider):
        name = 'tencent'
    
        def start_requests(self):
            # yield Request(
            #     url='https://pacaio.match.qq.com/pc/topNews?callback=__jp0',
            #     callback=self.parse_contents
            # )
    
            # POST the feed query to the recommendation API; cid and token select
            # a channel feed ("milite" in ext suggests the military channel) and
            # "page" drives pagination.
            yield FormRequest(
                url=TencentNewsUrl,
                formdata={
                    "cid": "58",
                    "token": "c232b098ee7611faeffc46409e836360",
                    "ext": "milite",
                    "page": "0",
                    "expIds": "",
                    "callback": "__jp0"
                },
                callback=self.parse_contents,
                meta={
                    "page": "0",
                    "field": ""
                }
            )
    
        def parse_contents(self, response: Response):
            # The endpoint returns either plain JSON or a JSONP payload such as
            # __jp0({...}); in the latter case, strip the callback wrapper first.
            try:
                data = json.loads(response.text)
            except ValueError:
                data = json.loads(response.text[(response.text.find('(') + 1):response.text.rfind(')')])
    
            # Some responses nest the article list under a "data" key; unwrap it.
            try:
                data = data['data']
            except (TypeError, KeyError):
                pass
            for item in data:
                omn = item['vurl']
                # Map old-style /cmsn/ article URLs to the equivalent
                # /omn/....html pages (see the example URLs above).
                if omn.endswith('00') and '/cmsn/' in omn:
                    omn = omn.replace('/cmsn/', '/omn/')
                    omn = omn[:omn.rfind('00')] + '.html'
                    print(omn)
                yield Request(
                    url=omn,
                    callback=self.parse_news
                )
                break  # debug limiter: only the first article is crawled; remove to fetch all
    
        def parse_news(self, response: Response):
            news = NewsItem()
            news['url'] = response.url
            soup = BeautifulSoup(response.text, "lxml")
            # The headline lives in the <h1> of the div with class "LEFT".
            news['title'] = soup.find('div', class_='LEFT').h1.text
            # Body paragraphs carry the "one-p" class; concatenate their text.
            news['content'] = ''.join(p.text for p in soup.find_all('p', class_='one-p'))
            return news
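
       With a standard Scrapy project around it, the spider runs as usual, e.g. scrapy crawl tencent -o news.json to dump the scraped items to a JSON file. (The post does not show settings.py; a browser-like User-Agent and ROBOTSTXT_OBEY = False are typically needed for feed endpoints like these, though that is an assumption about the author's setup.)

       One loose end: each request stores "page" in its meta, but the code above never advances it, so only page 0 of the feed is fetched. Here is a rough sketch of how parse_contents could request the next page (my addition, not the author's code; it reuses the formdata from start_requests):

    # Hypothetical pagination follow-up -- not part of the original post.
    # Appended at the end of parse_contents, after the article requests:
    next_page = str(int(response.meta['page']) + 1)
    yield FormRequest(
        url=TencentNewsUrl,
        formdata={
            "cid": "58",
            "token": "c232b098ee7611faeffc46409e836360",
            "ext": "milite",
            "page": next_page,
            "expIds": "",
            "callback": "__jp0"
        },
        callback=self.parse_contents,
        meta={"page": next_page, "field": ""}
    )

       A stopping condition (for example, an empty article list) would also be needed so the spider does not keep requesting pages forever.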
    

      

  • Original post: https://www.cnblogs.com/68xi/p/9381200.html