  • Scraping NetEase News

    Based on the requests module

    # Scrape the dynamically loaded JS data
    import requests
    import re
    from lxml import etree
    import json

    url = 'https://temp.163.com/special/00804KVA/cm_war.js?callback=data_callback'
    js_data = requests.get(url=url).text
    # The payload is wrapped as data_callback(...); escape the parentheses and capture what is inside
    ex = r'data_callback\((.*?)\)'
    list_str = re.findall(ex, js_data, re.S)[0]
    list_obj = json.loads(list_str)

    for dic in list_obj:
        title = dic['title']
        detail_url = dic['docurl']
        page_text = requests.get(url=detail_url).text
        tree = etree.HTML(page_text)
        content = tree.xpath('//*[@id="endText"]//text()')
        content = ''.join(content).replace(' ', '').replace('\n', '')
        print(content)
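
    The same pattern works for the site's other data_callback JS endpoints. Below is a minimal sketch (not from the original post) that wraps the steps above into a reusable helper and dumps the result to a JSON file; the function name fetch_callback_list, the User-Agent header, and the output filename are illustrative assumptions.

    import json
    import re
    import requests

    def fetch_callback_list(js_url):
        """Download a data_callback(...) JS file and return the embedded list of dicts."""
        headers = {'User-Agent': 'Mozilla/5.0'}  # assumed header; helps against basic UA checks
        js_data = requests.get(url=js_url, headers=headers).text
        # Same regex as above: capture the JSON payload inside data_callback(...)
        list_str = re.findall(r'data_callback\((.*?)\)', js_data, re.S)[0]
        return json.loads(list_str)

    if __name__ == '__main__':
        items = fetch_callback_list('https://temp.163.com/special/00804KVA/cm_war.js?callback=data_callback')
        with open('news.json', 'w', encoding='utf-8') as f:
            json.dump(items, f, ensure_ascii=False, indent=2)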

    Below is how Scrapy is used together with Selenium

    # spider.py
    # -*- coding: utf-8 -*-
    import scrapy
    from Net163.items import Net163Item
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    
    class NetPageSpider(scrapy.Spider):
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Create a browser instance
        bro = webdriver.Chrome(executable_path=r'C:\spider\scrapy1\chromedriver.exe', options=option)
        name = 'net_page'
        model_urls = []  # holds the URLs of the four news-section pages
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://news.163.com']
    
        # Article detail page
        def content_parse(self, response):
            item = response.meta['item']
            # Parse the data and store it in the item
            content_lst = response.xpath('//div[@id="endText"]//text()').extract()
            # extract() returns a list of strings
            item['desc'] = ''.join(content_lst).replace(' ', '').replace('\n', '').replace('\t', '')  # join into one str
            yield item
    
        # Section page
        def detail_parse(self, response):
            div_lst = response.xpath('//div[@class="ndi_main"]/div')
            for div in div_lst:
                item = Net163Item()
                title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
                new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
    
                item['title'] = title
                # meta is a dict; every key/value pair in it is passed to the specified callback
                yield scrapy.Request(url=new_detail_url, callback=self.content_parse,meta={'item':item})
    
        # Entry callback for start_urls
        def parse(self, response):
            li_lst = response.xpath('//div[@class="ns_area list"]/ul/li')
            indexs = [3,4,6,7]
            model_lst = []  # section <li> nodes
            for index in indexs:
                li = li_lst[index]
                model_lst.append(li)
            # Parse each section URL
            for li in model_lst:
                model_url = li.xpath('./a/@href').extract_first()
                self.model_urls.append(model_url)
    
                # Request each section URL to fetch that section's page content
                yield scrapy.Request(url=model_url, callback=self.detail_parse)
    
        # Close the browser when the spider finishes
        def closed(self, reason):
            self.bro.quit()
    # items.py
    import scrapy
    class Net163Item(scrapy.Item):
        # define the fields for your item here like:
        title = scrapy.Field()
        desc = scrapy.Field()
    # pipelines.py
    # Pipelines handle persistence (txt, MySQL, Redis, MongoDB, ...); you can define multiple pipeline classes and register them in settings.py
    class Net163Pipeline(object):
        def process_item(self, item, spider):
            print(item['title'],len(item['desc']))
            return item
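
    # As mentioned above, persistence can also target txt/MySQL/Redis/MongoDB, with several pipeline
    # classes registered side by side in ITEM_PIPELINES. The class below is only an illustrative
    # sketch (not part of the original project): it appends each item to a local txt file.
    class Net163TxtPipeline(object):
        def open_spider(self, spider):
            self.fp = open('news.txt', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            self.fp.write(item['title'] + '\n' + item['desc'] + '\n\n')
            return item

        def close_spider(self, spider):
            self.fp.close()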
    # Changes in settings.py
    USER_AGENT = ''           # UA check (fill in a real browser User-Agent)
    ROBOTSTXT_OBEY = False        # ignore the robots.txt protocol
    DOWNLOADER_MIDDLEWARES = {    # downloader middleware
       'Net163.middlewares.Net163DownloaderMiddleware': 543,
    }
    ITEM_PIPELINES = {    # item pipeline classes
       'Net163.pipelines.Net163Pipeline': 300,
    }
    LOG_LEVEL = 'ERROR'   # log level
    # middlewares.py
    # -*- coding: utf-8 -*-
    from time import sleep
    from scrapy import signals
    from scrapy.http import HtmlResponse

    class Net163DownloaderMiddleware(object):
        # Class method: used by Scrapy to create the middleware and hook up signals
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        # Process requests
        def process_request(self, request, spider):
            return None

        # This method intercepts every response object (we only need to handle certain specific ones)
        def process_response(self, request, response, spider):
            # Find the specific responses that need processing:
            # a response is located via its request, and the request via its URL
            model_urls = spider.model_urls
            bro = spider.bro
            if request.url in model_urls:
                # The URL identifies the request, and the request identifies the original
                # response, which does not meet our needs. Manually create four new responses
                # containing the required (dynamically loaded) data and use them to replace
                # the original ones.
                bro.get(request.url)  # request the section URL with the browser
                sleep(2)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                bro.execute_script(js)
                sleep(2)
                # The page source now contains the dynamically loaded data
                page_text = bro.page_source
                # Manually create a new response object wrapping page_text;
                # the body parameter is the response data
                return HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
            # Old (unchanged) response objects
            return response

        # Handle exceptions
        def process_exception(self, request, exception, spider):
            pass

        # Spider opened
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
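
    With the driver path and settings adapted to your environment, the project runs in the usual way with "scrapy crawl net_page" (matching the spider's name attribute). One caveat: on Selenium 4+ the executable_path argument used above is deprecated. Below is a minimal sketch of the equivalent browser setup using a Service object, assuming the same local chromedriver path; adjust it to your own driver location.

    # Selenium 4 style: pass the driver path through a Service object instead of executable_path
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from selenium.webdriver.chrome.service import Service

    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    bro = webdriver.Chrome(service=Service(r'C:\spider\scrapy1\chromedriver.exe'), options=option)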
  • Original post: https://www.cnblogs.com/zhangchen-sx/p/10834494.html