zoukankan      html  css  js  c++  java
  • 使用Scrapy框架爬取腾讯新闻

         昨晚没事写的爬取腾讯新闻代码,在此贴出,可以参考完善。

       

    # -*- coding: utf-8 -*-
    import json
    
    from scrapy import Spider
    from scrapy.http import Request
    from scrapy.http import Response
    from scrapy.http import FormRequest
    from scrapy.selector import Selector
    from bs4 import BeautifulSoup
    
    from ..items import NewsItem
    
    # Endpoint that TencentSpider.start_requests submits its form request to.
    TencentNewsUrl = 'https://pacaio.match.qq.com/irs/rcd'
    
    # 要闻 https://pacaio.match.qq.com/pc/topNews?callback=__jp0
    # https://pacaio.match.qq.com/irs/rcd?cid=108&ext=&token=349ee24cdf9327a050ddad8c166bd3e3&page=1&expIds=&callback=__jp1
    # https://new.qq.com/cmsn/20180726/20180726A0QOLA00
    # https://new.qq.com/omn/20180726/20180726A0QOLA.html
    
    class TencentSpider(Spider):
        """Spider that fetches a Tencent news listing and scrapes a linked article.

        ``start_requests`` submits one form request to ``TencentNewsUrl``;
        ``parse_contents`` decodes the JSON/JSONP listing and schedules an
        article request; ``parse_news`` converts the article page into a
        ``NewsItem`` carrying ``url``, ``title`` and ``content``.
        """

        name = 'tencent'

        def start_requests(self):
            """Yield the initial listing request, handled by ``parse_contents``."""
            # Alternative listing source kept from the original for reference:
            # yield Request(
            #     url='https://pacaio.match.qq.com/pc/topNews?callback=__jp0',
            #     callback=self.parse_contents
            # )

            yield FormRequest(
                url=TencentNewsUrl,
                formdata={
                    "cid": "58",
                    "token": "c232b098ee7611faeffc46409e836360",
                    "ext": "milite",
                    "page": "0",
                    "expIds": "",
                    "callback": "__jp0"
                },
                callback=self.parse_contents,
                meta={
                    "page": "0",
                    "field": ""
                }
            )

        def parse_contents(self, response: Response):
            """Decode the listing response and yield a request for an article.

            The body is parsed as plain JSON first; if that fails it is treated
            as JSONP and the text between the first ``(`` and the last ``)`` is
            parsed instead.

            Raises:
                ValueError: if the body is neither valid JSON nor valid JSONP.
            """
            text = response.text
            try:
                # json.loads (not json.load): the payload is a str, not a file.
                data = json.loads(text)
            except ValueError:
                # JSONP such as ``__jp0({...})``: strip the callback wrapper.
                data = json.loads(text[text.find('(') + 1:text.rfind(')')])

            # The entries may be wrapped in a dict under 'data' or be the
            # top-level list itself.
            if isinstance(data, dict):
                data = data.get('data') or []

            for entry in data:
                omn = entry.get('vurl') if isinstance(entry, dict) else None
                if not omn:
                    # Nothing to follow for entries without an article URL.
                    continue
                if omn.endswith('00') and '/cmsn/' in omn:
                    # Rewrite .../cmsn/<id>00 into .../omn/<id>.html.
                    omn = omn.replace('/cmsn/', '/omn/')
                    omn = omn[:omn.rfind('00')] + '.html'
                    self.logger.debug('Rewrote article URL to %s', omn)
                yield Request(
                    url=omn,
                    callback=self.parse_news
                )
                # NOTE(review): only the first article is followed, exactly as
                # in the original code; this looks like a debugging limit.
                # Remove the break to crawl every entry of the listing.
                break

        def parse_news(self, response: Response):
            """Build a ``NewsItem`` from an article page.

            The title is the ``<h1>`` inside ``<div class="LEFT">`` (empty string
            when that element is absent); the content is the concatenated text
            of every ``<p class="one-p">``.
            """
            news = NewsItem()
            news['url'] = response.url
            soup = BeautifulSoup(response.text, "lxml")

            # find() returns None when the element is missing, so guard each
            # step instead of letting an AttributeError abort the callback.
            container = soup.find('div', class_='LEFT')
            heading = container.h1 if container is not None else None
            if heading is None:
                self.logger.warning('No title found on %s', response.url)
            news['title'] = heading.text if heading is not None else ''

            paragraphs = soup.find_all('p', class_='one-p')
            news['content'] = ''.join(p.text for p in paragraphs)
            return news
    

      

  • 相关阅读:
    10.23 JSTL
    10.22 EL执行表达式
    10.21 EL表达式(只能在jsp中使用)
    10.20 网站访问量统计(application)
    10.19 JSP内置对象作用域
    10.16 Session和Cookie的区别
    10.15 转发与重定向
    剑指Offer_26_二叉搜索树与双向链表
    剑指Offer_25_复杂链表的复制
    剑指Offer_24_二叉树中和为某一值的路径.md
  • 原文地址:https://www.cnblogs.com/68xi/p/9381200.html
Copyright © 2011-2022 走看看