zoukankan      html  css  js  c++  java
  • 投诉网站爬虫 (Scrapy crawler for the Sunshine Hotline complaint website)

     1 # -*- coding: utf-8 -*-
     2 import scrapy
     3 from yg.items import YgItem
     4 
     5 class YgSpiderSpider(scrapy.Spider):
     6     name = 'yg_spider'
     7     allowed_domains = ['wz.sun0769.com']
     8     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']
     9 
    10     def parse(self, response):
    11         tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
    12         for tr in tr_list:
    13             item = YgItem()
    14             item["title"] = tr.xpath("./td[2]/a[2]/@title").extract_first()
    15             item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
    16             item["update_time"] = tr.xpath("./td[last()]/text()").extract_first()
    17             # print(item)
    18 
    19             yield scrapy.Request(
    20                 item["href"],
    21                 callback=self.parse_detail,
    22                 meta={"item":item}
    23             )
    24 
    25         next_url = response.xpath("//a[text()='>']/@href").extract_first()
    26         if next_url is not None:
    27             yield scrapy.Request(
    28                 next_url,
    29                 callback=self.parse
    30             )
    31 
    32     def parse_detail(self,response): #处理详情页
    33         item = response.meta["item"]
    34         item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
    35         item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
    36         item["content_img"] = ["http://wz.sun0769.com"+i for i in item["content_img"]]
    37         # print(item)
    38         yield item
     1 # -*- coding: utf-8 -*-
     2 
     3 # Define your item pipelines here
     4 #
     5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     7 import re
     8 import json
     9 
    10 class YgPipeline(object):
    11     def process_item(self, item, spider):
    12         item["content"] = self.process_content(item["content"])
    13         with open("yg.txt", "a", encoding="utf-8") as f:
    14             f.write(json.dumps(dict(item), ensure_ascii=False, indent=4))
    15             f.write("
    ")
    16         return item
    17 
    18     def process_content(self, content):
    19         content = [re.sub(r'xa0|s',"",i) for i in content]
    20         content = [i for i in content if len(i)>0]
    21         return content
     1 # -*- coding: utf-8 -*-
     2 
     3 # Define here the models for your scraped items
     4 #
     5 # See documentation in:
     6 # https://doc.scrapy.org/en/latest/topics/items.html
     7 
     8 import scrapy
     9 
    10 
    11 class YgItem(scrapy.Item):
    12     # define the fields for your item here like:
    13     title = scrapy.Field()
    14     update_time = scrapy.Field()
    15     href = scrapy.Field()
    16     content = scrapy.Field()
    17     content_img = scrapy.Field()
    18     # pass
  • 相关阅读:
    Scoket简介
    AOP
    Windows服务
    Nginx 教程 (1):基本概念
    异步编程
    并发编程
    常用排序
    序列化
    MSBuild/Projectjson
    不汇报是职场发展的绊脚石
  • 原文地址:https://www.cnblogs.com/sure-feng/p/10092283.html
Copyright © 2011-2022 走看看