zoukankan      html  css  js  c++  java
  • scrapy爬取西刺网站ip

    # scrapy爬取西刺网站ip
    # -*- coding: utf-8 -*-
    import scrapy
    
    from xici.items import XiciItem
    
    
    class XicispiderSpider(scrapy.Spider):
        """Spider that scrapes proxy ``ip:port`` pairs from the xicidaili list page.

        Each row of the ``#ip_list`` table is read; the text of the second and
        third cells is joined as ``"<ip>:<port>"`` and emitted in a ``XiciItem``
        under the ``ip_port`` key.
        """

        name = "xicispider"
        # allowed_domains expects bare host names (no scheme, no path); an entry
        # carrying a path would not match the host of any follow-up request.
        allowed_domains = ["www.xicidaili.com"]
        start_urls = ['http://www.xicidaili.com/nn/']

        def parse(self, response):
            """Yield one ``XiciItem`` per table row that has both an IP and a port.

            Args:
                response: The downloaded listing page.

            Yields:
                XiciItem: An item whose ``ip_port`` field is ``"<ip>:<port>"``.
            """
            for each in response.css('#ip_list tr'):
                ip = each.css('td:nth-child(2)::text').extract_first()
                port = each.css('td:nth-child(3)::text').extract_first()
                # extract_first() returns None when the selector matches nothing
                # (presumably the header row, which has no <td> cells); skip any
                # row missing either value instead of concatenating with None.
                if not ip or not port:
                    continue
                # Build a fresh item per row: reusing one instance would hand
                # the same mutable object to every downstream consumer.
                item = XiciItem()
                item['ip_port'] = ip + ':' + port
                yield item
    import pymongo
    
    class XiciPipeline(object):
        """Item pipeline that stores every scraped item in a MongoDB collection.

        The connection URI and database name are read from the crawler settings
        ``MONGO_URI`` and ``MONGO_DB``. The client is opened when the spider
        starts and closed when it finishes.
        """

        # Name of the MongoDB collection that receives the items.
        collection_name = 'scrapy_items'

        def __init__(self, mongo_uri, mongo_db):
            """Remember the connection parameters; no connection is opened here.

            Args:
                mongo_uri: MongoDB connection URI passed to ``MongoClient``.
                mongo_db: Name of the database to write into.
            """
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        # Note: the hook name must be spelled exactly ``from_crawler``; the
        # "from" part is easy to mistype.
        @classmethod
        def from_crawler(cls, crawler):
            """Build the pipeline from the crawler's ``MONGO_URI``/``MONGO_DB`` settings."""
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DB')
            )

        def open_spider(self, spider):
            """Open the MongoDB client and select the configured database."""
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def close_spider(self, spider):
            """Close the MongoDB client opened in ``open_spider``."""
            self.client.close()

        def process_item(self, item, spider):
            """Insert the item into the collection and pass it on unchanged.

            Args:
                item: The scraped item; it is converted with ``dict(item)``
                    before being stored.
                spider: The spider that produced the item (unused).

            Returns:
                The same ``item`` object, so later pipelines can process it.
            """
            # insert_one replaces Collection.insert, which was deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            self.db[self.collection_name].insert_one(dict(item))
            return item
  • 相关阅读:
    杂记-2
    2019CSP-S游记
    csp2020
    特征根法小记
    csp模拟赛低级错误及反思
    备份
    黑科技——树剖两次dfs转一次dfs!
    输出天干地支
    蓝桥0531-输出Y
    蓝桥0615-判断四位数字前后两组是否相等
  • 原文地址:https://www.cnblogs.com/themost/p/7110378.html
Copyright © 2011-2022 走看看