zoukankan      html  css  js  c++  java
  • Scrapy爬虫-spider.py

     1 class xiaoshuoSpider(scrapy.Spider):
     2     name = "freenovel"
     3     headers={
     4         'Upgrade - Insecure - Requests': '1',
     5     }
     6     def start_requests(self):
     7         #完本、免费小说
     8         start_url=["url起始网址"]
     9         for url in start_url:
    10             yield scrapy.Request(url=url, headers=self.headers,callback=self.first_parse)
    11 
    12     def first_parse(self, response):
    13        sel=Selector(response)
    14        category=sel.css('div[class="select-list"] div ul[type="category"] li a::text').extract()
    15        category_url=sel.css('div[class="select-list"] div ul[type="category"] li a::attr(href)').extract()
    16        items=[]
    17        for i in range(1,len(category_url)):
    18            item=XiaoshuoItem()
    19            item['category']=category[i]
    20            item['category_url']="https:"+category_url[i]
    21            items.append(item)
    22        for item in items:
    23            yield scrapy.Request(url=item['category_url'],meta={"category":item['category']},callback=self.second_parse,headers=self.headers)
    24 
    25     def second_parse(self,response):
    26         sel=Selector(response)
    27         novel_url=sel.css('div[class="book-mid-info"] h4 a::attr(href)').extract()
    28         item=XiaoshuoItem()
    29         item['category']=response.meta['category']
    30         yield scrapy.Request(url="https:" + novel_url[1] + "#Catalog",callback=self.article_parse,
    31                              headers=self.headers)
    32         for i in range(len(novel_url)):
    33             novel_url[i]="https:" + novel_url[i] + "#Catalog"
    34             yield scrapy.Request(url=novel_url[i], meta={"category":item['category']},callback=self.article_parse, headers=self.headers)
    35 
    36     def article_parse(self, response):
    37         sel=Selector(response)
    38         article_name=sel.xpath('//h1/em/text()').extract_first()
    39         article_url=sel.css(
    40             'div[id="j-catalogWrap"] div[class="volume-wrap"] div[class="volume"] ul li a::attr(href)').extract_first()
    41         article_url="https:" + article_url
    42         item=XiaoshuoItem()
    43         item['article_name']=article_name
    44         item['category']=response.meta['category']
    45         yield scrapy.Request(url=article_url, meta={'article_name': item['article_name'],"category":item['category']}, callback=self.detail_parse,
    46                              headers=self.headers)
    47 
    48     def detail_parse(self, response):
    49         sel=Selector(response)
    50         content=""
    51         item=XiaoshuoItem()
    52         content_list=sel.css(
    53             'div[id="j_chapterBox"] div[class="text-wrap"] div[class="main-text-wrap"] div[class="read-content j_readContent"] p::text').extract()
    54         content_name=sel.css('h3[class="j_chapterName"]::text').extract_first()
    55         next_page=sel.css('a[id="j_chapterNext"]::attr(href)').extract_first()
    56         for content_one in content_list:
    57             content+=content_one
    58         item['content']=content
    59         item['content_name']=content_name
    60         item['article_name']=response.meta['article_name']
    61         item['category']=response.meta['category']
    62         yield item
    63         if next_page is not None:
    64             next_page="https:" + next_page
    65             yield scrapy.Request(url=next_page, meta={'article_name': item['article_name'],"category":item['category']}, callback=self.detail_parse,
    66                                  headers=self.headers)
  • 相关阅读:
    VS2010导入DLL的总结
    [转]C#事件简单示例
    VS2010中实现TreeView和Panel的动态更新
    【JZOJ1282】打工
    【NOIP2016提高A组五校联考2】tree
    【NOIP2016提高A组五校联考2】running
    【NOIP2016提高A组五校联考2】string
    8月~9月学习总结
    NOIP2016提高A组五校联考2总结
    NOIP2016提高A组五校联考1总结
  • 原文地址:https://www.cnblogs.com/ShadowXie/p/9699921.html
Copyright © 2011-2022 走看看