  • sunPro

    Techniques used in this project

    1. Random User-Agents via fake_useragent
    2. Dynamic proxy IPs from http://www.goubanjia.com/
    3. A random time.sleep(delay) between requests, to lower the risk of being flagged by anti-crawling measures (a standalone sketch of points 1-3 follows this list)
    4. Scraped data stored in a MySQL database
    5. In my own testing it crawled 2000 pages of the 阳光政务 (Sunshine Hotline) listing at http://wz.sun0769.com/political/index/politicsNewest; crawling the whole site is well within reach
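
    As a quick illustration of how points 1-3 fit together outside Scrapy, here is a minimal standalone sketch. It is not part of the project code; fetch and proxy_api are made-up names, and the proxy API is assumed to return one IP per line, like the goubanjia endpoint used in middlewares.py below.

    # Standalone sketch of points 1-3: random UA, proxy pulled from an API, random delay.
    # proxy_api is a placeholder; the project itself uses a goubanjia endpoint with its own key.
    import random
    import time

    import requests
    from fake_useragent import UserAgent

    ua = UserAgent()

    def fetch(url, proxy_api=None):
        headers = {'User-Agent': ua.random}        # 1. fresh random User-Agent per request
        time.sleep(random.randint(0, 3))           # 3. random 0-3 second delay
        proxies = None
        if proxy_api:                              # 2. pick one proxy IP returned by the API
            ip = random.choice(requests.get(proxy_api).text.split())
            proxies = {'http': 'http://%s' % ip, 'https': 'https://%s' % ip}
        return requests.get(url, headers=headers, proxies=proxies, timeout=10)

    # fetch('http://wz.sun0769.com/political/index/politicsNewest')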

    Project structure

    sunPro
        sunPro
            spiders
                sun.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
    

    sun.py

    # -*- coding: utf-8 -*-
    import scrapy
    import os
    import random
    # The current path would be:
    # base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # print(base_path)
    import sys
    import logging
    
    from sunPro.items import SunproItem
    
    
    # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
    
    class SunSpider(scrapy.Spider):
        name = 'sun'
        allowed_domains = ['wz.sun0769.com']
        start_urls = ['http://wz.sun0769.com/political/index/politicsNewest', ]
        # start_urls = ['http://wz.sun0769.com/political/index/politicsNewest',
        #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2',
        #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=3',
        #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=4',
        #               'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=5']
    
        # urls used for pagination
        page_urls = []

        # http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2

        # Next-page link: <a href="/political/index/politicsNewest?id=1&amp;page=3" class="arrow-page prov_rota"></a>
        def parse(self, response):
            # Schedule the listing pages to be fetched and parsed

            # for i in range(1, 3):
            #     page = i+1
            #     print('-----> current page number:', page)
            #     yield scrapy.Request(f'http://wz.sun0769.com/political/index/politicsNewest?id=1&page={page}',
            #                          callback=self.parse)

            # if 200 <= response.status <= 300:

            # new_urls = 'http://wz.sun0769.com' + response.xpath('//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
            # print('-----> url of the next page:', new_urls)
            # if new_urls:
            #     self.page_urls.append(new_urls)
            for page in range(1, 1000):
                url = 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=%d' % page
                print('------> url', url)
                yield scrapy.Request(url=url, callback=self.parse_item)
            # logging.info('currently visiting %s' % response.url)
    
            # Earlier attempt: paginate by following the page links
            # new_urls = []
            # try:
            #     new_urls = response.xpath('/html/body/div[2]/div[3]/div[3]/div//a/@href').extract()
            #     # '/html/body/div[2]/div[3]/div[3]/div/a[2]'
            #     print('-----> new url list:', new_urls)
            # except Exception as e:
            #     print('failed to parse the next-page link | already on the last page, no next-page link left', e)
            # if len(new_urls) > 0:
            #     for i in range(len(new_urls)):
            #         page_url = new_urls[i]
            #         print('-----> page_url', page_url)
            #         yield scrapy.Request('http://wz.sun0769.com' + page_url, callback=self.parse)

            # 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1'
            # for url in response.xpath('/html/body/div[2]/div[3]/div[3]/div//a/@href').extract():
            #     print('-----> each url is', url)
            #     yield scrapy.Request('http://wz.sun0769.com'+url, callback=self.parse)

            # i.e. extract the next-page link, build the absolute url and request it
    
        def parse_item(self, response):
            if 200 <= response.status <= 300:
                print('-----> now parsing the listing page:', response.url)
                li_lists = response.xpath('/html/body/div[2]/div[3]/ul[2]//li')
                for li in li_lists:
                    item = SunproItem()
                    sid = li.xpath('./span[1]/text()').extract_first()
                    status = li.xpath('./span[2]/text()').extract_first()
                    ask_title = li.xpath('./span[3]/a/text()').extract_first()
                    rep_time = li.xpath('./span[4]/text()').extract_first()
                    ask_time = li.xpath('./span[5]/text()').extract_first()

                    item["sid"] = sid
                    item["status"] = status
                    item["title"] = ask_title
                    item["rep_time"] = rep_time
                    item["ask_time"] = ask_time
                    print('-----> number:', sid)
                    print('-----> status:', status)
                    print('-----> inquiry title:', ask_title)
                    print('-----> asked at:', ask_time)
                    print('-----> replied at:', rep_time)
                    # Fetch the detail page
                    detail_url = 'http://wz.sun0769.com' + li.xpath('./span[3]/a/@href').extract_first()
                    print('-----> detail page url:', detail_url)
                    yield scrapy.Request(url=detail_url, meta={'item': item},
                                         callback=self.parse_detail)
            else:
                print('-----> unexpected status code', response.status)
    
        def parse_detail(self, response):
            item = response.meta['item']
            details = response.xpath('//div[@class="details-box"]/pre/text()').extract_first()
            # extract_first() may return None, so fall back to '未知' (unknown)
            content = details or ''
            if content == '':
                content = '未知'
            print('-----> detail content:', content)
            # The responding department, shown as "部门:xxx"
            department = response.xpath('//div[@class="mr-three clear"]/div[@class="fl politics-fl"]/text()').extract_first()
            # '/html/body/div[3]/div[2]/div[2]/div[3]/div[1]'
            department = department.split(':')[-1] if department else ''
            if department == '':
                department = '未知'
            print('-----> responding department:', department)
            item['content'] = content
            item['department'] = department
            yield item
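
    The parse method above simply schedules pages 1-999 up front. The commented-out code hints at an alternative: follow the "next page" arrow until it disappears. A rough sketch of that approach, with the XPath pieced together from the commented-out attempt and the next-page snippet noted above (so verify it against the live page before relying on it), could be:

    # Sketch only: a drop-in replacement for SunSpider.parse() that paginates by
    # following the next-page arrow instead of pre-generating every page URL.
    def parse(self, response):
        # parse the current listing page for items
        yield from self.parse_item(response)
        # follow the "next page" arrow if there is one
        next_href = response.xpath(
            '//div[@class="mr-three paging-box"]/a[@class="arrow-page prov_rota"]/@href'
        ).extract_first()
        if next_href:
            yield response.follow(next_href, callback=self.parse)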
    

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class SunproItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        sid = scrapy.Field()     # inquiry number
        status = scrapy.Field()     # processing status
        title = scrapy.Field()  # inquiry title
        rep_time = scrapy.Field()   # reply time
        ask_time = scrapy.Field()   # time the inquiry was filed
        content = scrapy.Field()    # inquiry content
        department = scrapy.Field()  # responding department
    

    middlewares.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    import time
    import urllib3
    from scrapy import signals
    import requests
    import random
    from fake_useragent import UserAgent
    
    
    class SunproSpiderMiddleware:
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Request, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    
    # http = ["123.179.161.230:41128",
    #         '171.80.187.163:38367',
    #         '112.240.182.233:44590',
    #         '113.226.97.93:55387',
    #         '117.26.221.137:52067',
    #         '182.244.168.247:38904',
    #         '1.199.193.249:35786',
    #         '183.141.154.179:46691',
    #         '183.141.154.179:46691',
    #         ]
    
    
    # https = ['49.70.17.204:9999',
    #          '49.89.87.85:9999',
    #          '49.70.85.154:9999',
    #          '49.70.85.53:9999',
    #          '120.83.121.172:9999',
    #          '113.195.156.158:9999']
    
    class SunproDownloaderMiddleware:
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        def __init__(self):
            '''
            self.user_agent_list = [
                'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
                'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) '
                'Chrome/10.0.648.133 Safari/534.16',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
                'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36)'
            ]
            '''
    
            # Switch the random UA to fake_useragent: easier to obtain and kept up to date in real time
            self.ua = UserAgent(verify_ssl=False)
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        # Downloader middleware: modify every outgoing request
        def process_request(self, request, spider):
            # Set a random User-Agent
            # request.headers['User-Agent'] = random.choice(self.user_agent_list)
            request.headers['User-Agent'] = self.ua.random

            # Random time.sleep delay to slow the crawl down and avoid an IP ban
            delay = random.randint(0, 3)
            time.sleep(delay)

            # Dynamic proxy IPs: this API url returns fresh proxy IPs on each call
            apiUrl = 'http://api.goubanjia.com/dynamic/get/9ec90437c17a332fb83067f5cb7539e4.html?sep=3'
            # I use proxy IPs from http://www.goubanjia.com/ (全网代理IP)
            # targetUrl = "http://1212.ip138.com/ic.asp";
            # Fetch the IP list
            res = requests.get(apiUrl).text.strip("\n")
            # Split the returned text into IPs, one per line
            ips = res.split("\n")
            # Pick one IP at random
            proxy_ip = random.choice(ips)

            # Point the request at the dynamically obtained proxy
            if request.url.startswith("http://"):
                request.meta['proxy'] = "http://%s" % proxy_ip  # http proxy
            elif request.url.startswith("https://"):
                request.meta['proxy'] = "https://%s" % proxy_ip  # https proxy
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either;
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql
    from scrapy.exceptions import DropItem
    import xlrd
    import xlwt
    import xlutils
    
    from xlutils.copy import copy
    
    
    class SunproPipeline:
        def __init__(self):
            self.ids_seen = set()
    
        def process_item(self, item, spider):
            if item['sid'] in self.ids_seen:
                raise DropItem('Duplicate item found: %s' % item)
            else:
                self.ids_seen.add(item['sid'])
            # What to do with the scraped data:
            # 1. write it to the database
            # 2. save it to a spreadsheet
            print(item)
    
            return item
            # index = len(value)  # number of rows of data to write
            '''
            try:
                workbook = xlrd.open_workbook('详情.xls')  # open the workbook
                sheets = workbook.sheet_names()  # list all sheets in the workbook
                if len(sheets) == 0:
                    sheet = workbook.add_sheet('相关信息')
                else:
                    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet in the workbook
                rows_old = worksheet.nrows  # number of rows already present in the sheet
                new_workbook = copy(workbook)  # copy the xlrd object into an xlwt object
                new_worksheet = new_workbook.get_sheet(0)  # first sheet of the copied workbook
                value = list(item.values())
                for i in range(len(value)):
                    new_worksheet.write(rows_old, i, value[i])  # append the data, starting at row rows_old
                new_workbook.save('详情.xls')  # save the workbook
                print("appended the data to the xls spreadsheet!")
            except Exception as e:
                workbook = xlwt.Workbook()
                sheet = workbook.add_sheet('相关信息')
                # header row
                for i in range(len(item.keys())):
                    sheet.write(0, i, list(item.keys())[i])
                # data row
                for i in range(len(item.values())):
                    sheet.write(1, i, list(item.values())[i])
                workbook.save('详情.xls')

            return item
            '''
    
    
    class mysqlPipeline(object):
        def __init__(self):
            # self.ids_seen = set()
            self.db = pymysql.connect(
                host='localhost',  # connect to the local MySQL server
                user='root',  # your MySQL user name
                passwd='rootpwd',  # your MySQL password
                db='sun',  # name of the database
                charset='utf8mb4',  # character set for the connection
                cursorclass=pymysql.cursors.DictCursor)
            self.cursor = self.db.cursor()
    
        def process_item(self, item, spider):
            '''
            :param item: the scraped item
            :param spider: the spider that produced it
            :return: the item, for any later pipelines
            '''
            # if item['sid'] in self.ids_seen:
            #     raise DropItem('Duplicate item found: %s' % item)
            # else:
            #     self.ids_seen.add(item['sid'])
            # Save the scraped information to MySQL
            # Pull the fields out of the item
            # title = item['title']
            # link = item['link']
            # posttime = item['posttime']
    
            sid = int(item["sid"])
            status = item["status"]
            title = item["title"]
            rep_time = item["rep_time"]
            ask_time = item["ask_time"]
            content = item['content']
            department = item['department']
    
            # The connection to the local database was opened in __init__
    
            try:
                # The operation cursor was already obtained with cursor() in __init__
                # cursor = self.db.cursor()
                # SQL insert statement, parameterized so that quotes in the
                # scraped text cannot break the query
                sql = ("INSERT INTO sun (sid, status, title, content, department, ask_time, rep_time) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                print('-----> executing sql:', sql)
                self.cursor.execute(sql, (sid, status, title, content, department, ask_time, rep_time))
                # Commit the insert
                self.db.commit()
            except Exception as e:
                print('-----> failed to store the data', e)
                # Roll back the failed insert but keep the connection open
                # so that later items can still be written
                self.db.rollback()

            return item

        def close_spider(self, spider):
            # Close the cursor and the connection when the spider finishes
            self.cursor.close()
            self.db.close()
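
    mysqlPipeline expects a MySQL database named sun with a table, also named sun, holding the seven columns used in the INSERT. The original post does not show the schema, so the following is only a guessed setup script that reuses the same connection settings; the column types and lengths are assumptions.

    # Guessed schema for the `sun` table that mysqlPipeline writes to.
    # Assumes the `sun` database already exists; run once before crawling.
    import pymysql

    CREATE_TABLE = """
    CREATE TABLE IF NOT EXISTS sun (
        sid        INT PRIMARY KEY,   -- inquiry number
        status     VARCHAR(32),       -- processing status
        title      VARCHAR(255),      -- inquiry title
        content    TEXT,              -- inquiry content
        department VARCHAR(128),      -- responding department
        ask_time   VARCHAR(64),       -- time the inquiry was filed
        rep_time   VARCHAR(64)        -- time of the reply
    ) DEFAULT CHARACTER SET utf8mb4
    """

    db = pymysql.connect(host='localhost', user='root', passwd='rootpwd',
                         db='sun', charset='utf8mb4')
    with db.cursor() as cursor:
        cursor.execute(CREATE_TABLE)
    db.commit()
    db.close()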
    

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for sunPro project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'sunPro'
    
    SPIDER_MODULES = ['sunPro.spiders']
    NEWSPIDER_MODULE = 'sunPro.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    
    LOG_LEVEL = 'ERROR'
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    CONCURRENT_REQUESTS_PER_DOMAIN = 8
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'sunPro.middlewares.SunproSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
        'sunPro.middlewares.SunproDownloaderMiddleware': 543,
    }
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'sunPro.pipelines.SunproPipeline': 300,
        'sunPro.pipelines.mysqlPipeline': 302,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
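
    With the table created and the MySQL credentials in pipelines.py adjusted, the crawl is started from the project root in the usual Scrapy way:

        scrapy crawl sun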