  • Crawling Tencent News with the Scrapy Framework

       Here is a Tencent News crawler I wrote last night in my spare time. I'm posting it here as a reference; feel free to improve on it.

       

    # -*- coding: utf-8 -*-
    import json
    
    from scrapy import Spider
    from scrapy.http import Request
    from scrapy.http import Response
    from scrapy.http import FormRequest
    from bs4 import BeautifulSoup
    
    from ..items import NewsItem
    
    TencentNewsUrl = 'https://pacaio.match.qq.com/irs/rcd'
    
    # Top news feed: https://pacaio.match.qq.com/pc/topNews?callback=__jp0
    # https://pacaio.match.qq.com/irs/rcd?cid=108&ext=&token=349ee24cdf9327a050ddad8c166bd3e3&page=1&expIds=&callback=__jp1
    # https://new.qq.com/cmsn/20180726/20180726A0QOLA00
    # https://new.qq.com/omn/20180726/20180726A0QOLA.html
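    # The endpoints above return JSONP: the JSON payload is wrapped in the
    # callback named by the `callback` query parameter (e.g. __jp0({...})),
    # which parse_contents strips before decoding. Feed URLs under /cmsn/
    # that end in "00" map to static article pages under /omn/ with an
    # ".html" suffix, as the last two example URLs illustrate.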
    
    class TencentSpider(Spider):
        name = 'tencent'
    
        def start_requests(self):
            # yield Request(
            #     url='https://pacaio.match.qq.com/pc/topNews?callback=__jp0',
            #     callback=self.parse_contents
            # )
    
            # Request the first page of the article feed; meta carries paging
            # state, though pagination is not implemented in parse_contents yet.
            yield FormRequest(
                url=TencentNewsUrl,
                formdata={
                    "cid": "58",
                    "token": "c232b098ee7611faeffc46409e836360",
                    "ext": "milite",
                    "page": "0",
                    "expIds": "",
                    "callback": "__jp0"
                },
                callback=self.parse_contents,
                meta={
                    "page": "0",
                    "field": ""
                }
            )
    
        def parse_contents(self, response: Response):
            # The API may return plain JSON or a JSONP wrapper such as
            # __jp0({...}); fall back to stripping the callback if needed.
            try:
                data = json.loads(response.text)
            except json.JSONDecodeError:
                data = json.loads(response.text[(response.text.find('(') + 1):response.text.rfind(')')])
    
            # Some responses nest the article list under a 'data' key
            try:
                data = data['data']
            except (KeyError, TypeError):
                pass
            for item in data:
                omn = item['vurl']
                # Rewrite /cmsn/...00 feed URLs to their static /omn/....html pages
                if omn.endswith('00') and '/cmsn/' in omn:
                    omn = omn.replace('/cmsn/', '/omn/')
                    omn = omn[:omn.rfind('00')] + '.html'
                    print(omn)  # debug output
                yield Request(
                    url=omn,
                    callback=self.parse_news
                )
                break  # only the first article is fetched; remove this to crawl the full list
    
        def parse_news(self, response: Response):
            # Extract the headline and the body paragraphs from the static article page
            news = NewsItem()
            news['url'] = response.url
            soup = BeautifulSoup(response.text, "lxml")
            news['title'] = soup.find('div', class_='LEFT').h1.text
            news['content'] = ''.join(p.text for p in soup.find_all('p', class_='one-p'))
            return news
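
The spider imports NewsItem from the project's items.py, which the original post does not include. A minimal sketch consistent with the three fields used above (url, title, content) could look like this:

    # items.py -- a minimal sketch; not part of the original post
    import scrapy

    class NewsItem(scrapy.Item):
        url = scrapy.Field()      # article page URL
        title = scrapy.Field()    # headline text
        content = scrapy.Field()  # concatenated body paragraphs

With that in place, the spider can be run from the project root with, for example, "scrapy crawl tencent -o news.json" to dump the scraped items to a JSON file.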
    

      

  • Original post: https://www.cnblogs.com/68xi/p/9381200.html