  Python Web Scraping

      

    I. Python Modules

    1. Basic usage of urllib.request

    import urllib.request
    
    # 1. Specify the URL
    url = 'http://www.baidu.com'
    # 2. Send the request: urlopen sends a request to the given URL and returns a response object
    response = urllib.request.urlopen(url=url)
    # 3. Get the page data: read() returns the page data stored in the response object (bytes)
    response_text = response.read()
    # 4. Persist it to disk
    with open('./baidu.html','wb') as f:
        f.write(response_text)
        print('write succeeded')

    Handling non-ASCII (Chinese) characters in URLs with urllib.request

    import urllib.request
    import urllib.parse
    
    # 1. Specify the URL
    # url = 'https://tieba.baidu.com/f?ie=utf-8&kw=你好&fr=search'
    url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%s&fr=search'
    # The URL may not contain non-ASCII characters, so percent-encode them first
    wd = urllib.parse.quote("你好")
    new_url = url % wd
    print(new_url)
    # 2. Send the request: urlopen sends a request to the given URL and returns a response object
    response = urllib.request.urlopen(url=new_url)
    # 3. Get the page data: read() returns the page data stored in the response object (bytes)
    response_text = response.read()
    # 4. Persist it to disk
    with open('./baidu1.html','wb') as f:
        f.write(response_text)
        print('write succeeded')

    Spoofing the User-Agent request header with urllib.request

    import urllib.request
    
    # 1. Specify the URL
    url = 'http://www.baidu.com'
    
    # Build a custom request object
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    }
    # This request object carries a spoofed UA
    request = urllib.request.Request(url=url,headers=headers)
    
    # 2. Send the request: urlopen sends the custom request and returns a response object
    response = urllib.request.urlopen(request)
    # 3. Get the page data: read() returns the page data stored in the response object (bytes)
    response_text = response.read()
    # 4. Persist it to disk
    with open('./baidu.html','wb') as f:
        f.write(response_text)
        print('write succeeded')

    Sending a POST request with urllib.request

    import urllib.request
    import urllib.parse
    
    # 1. Specify the URL
    url = 'https://fanyi.baidu.com/sug'
    
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    }
    
    # This request object carries a spoofed UA
    request = urllib.request.Request(url=url,headers=headers)
    
    # Pack the POST parameters into a dict
    data = {
        'kw':'你好',
    }
    # URL-encode the dict with the parse module
    data = urllib.parse.urlencode(data)
    # Convert data to bytes
    data = data.encode()
    
    # 2. Send the request: urlopen sends the custom request and returns a response object
    response = urllib.request.urlopen(request,data=data)
    # 3. Get the page data: read() returns the page data stored in the response object (bytes)
    response_text = response.read()
    # 4. Persist it to disk
    with open('./baidu.html','wb') as f:
        f.write(response_text)
        print('write succeeded')

    Using a proxy with urllib.request

    import urllib.request
    
    # Create a handler object that wraps the proxy IP and port
    handler = urllib.request.ProxyHandler(proxies={"http":"118.172.211.3:58535"})
    # Create an opener object from the handler and use it to send requests
    opener = urllib.request.build_opener(handler)
    
    # 1. Specify the URL
    url = 'http://www.baidu.com/s?ie=utf-8&wd=ip'
    
    # Build the request object
    request = urllib.request.Request(url=url)
    
    # 2. Send the request through the opener; it returns a response object
    response = opener.open(request)
    # 3. Get the page data: read() returns the page data stored in the response object (bytes)
    response = response.read()
    # 4. Persist it to disk
    with open('./baiduip.html','wb') as f:
        f.write(response)
        print('write succeeded')

    Carrying cookies with urllib.request (unverified)

    # Log in to renren.com using cookiejar
    import urllib.request
    import urllib.parse
    import http.cookiejar
    
    # Cookies from responses are stored in the cj object automatically
    cj = http.cookiejar.CookieJar()
    # Create a handler that carries the cookiejar object
    handler = urllib.request.HTTPCookieProcessor(cj)
    # Create an opener object that carries the handler
    opener = urllib.request.build_opener(handler)
    
    # Let the cookiejar capture the cookies from the login request
    url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018110912156'
    # Build a request object to pass to the opener's open()
    data={
        "email": "aaa",
        "icode": "",
        "origURL": "http://www.renren.com/home",
        "domain": "renren.com",
        "key_id": 1,
        "captcha_type": "web_login",
        "password": "1ae3b707652ccb767340abb542af0616c42fc942beb275069939d6c4cc461e5c",
        "rkey": "948db2d8639bcd2664994c49454256d1",
        "f":"" ,
    }
    data = urllib.parse.urlencode(data).encode()
    request = urllib.request.Request(url,data=data)
    opener.open(request)
    
    # Request a second-level page of the logged-in user
    s_url = 'http://www.renren-inc.com/zh/product/renren.html'
    
    # This request carries the cookies automatically
    response = opener.open(s_url)
    with open('./renren.html','wb') as f:
        f.write(response.read())

    2. requests (pip install requests)

    Sending a GET request with requests

    import requests
    
    # 1. Specify the URL
    url = 'https://www.baidu.com'
    
    # 2. Send a GET request; a response object is returned
    response = requests.get(url=url)
    
    # Optionally force the encoding to utf-8
    # response.encoding = 'utf-8'
    
    # 3. Get the response body: .text is str, .content is bytes
    response_content = response.content
    
    with open('./re3.html',"wb") as f:
        f.write(response_content)

    Common attributes of the response object

    import requests
    
    # 1. Specify the URL
    url = 'https://www.baidu.com'
    
    # 2. Send a GET request; a response object is returned
    response = requests.get(url=url)
    
    # Body as str
    # print(response.text)
    
    # Body as bytes
    # print(response.content)
    
    # Response status code
    print(response.status_code)
    
    # Response headers
    print(response.headers)
    
    # The requested URL
    print(response.url)

    Sending a GET request with parameters and custom headers

    Method 1: put the parameters directly in the URL, e.g. url = 'https://www.baidu.com/s?ie=utf-8&wd=你好'

    Method 2: use the params argument

    import requests
    
    # 1. Specify the URL
    url = 'https://tieba.baidu.com/f'
    
    # GET parameters
    params = {
        "ie": "utf-8",
        "kw": "你好",
        "fr": "search",
    }
    
    # Custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    
    # 2. Send the GET request with params and headers; a response object is returned
    response = requests.get(url=url,params=params,headers=headers)
    
    # 3. Get the response body: .text is str, .content is bytes
    response_content = response.content
    
    # Print the requested URL
    print(response.url)
    
    with open('./re3.html',"wb") as f:
        f.write(response_content)

    Sending a POST request with requests

    import requests
    
    # 1. Specify the URL
    url = 'https://accounts.douban.com/login'
    
    # POST parameters
    data = {
        "source": "movie",
        "redir": "https://movie.douban.com/",
        "from_email": "xxx",
        "from_password":"xxx",
        "login":"登录",
    }
    
    # Custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    
    # 2. Send the POST request
    response = requests.post(url=url,data=data,headers=headers)
    
    # 3. Get the response body: .text is str, .content is bytes
    response_content = response.content
    
    # Print the requested URL
    print(response.url)
    
    with open('./re3.html',"wb") as f:
        f.write(response_content)

    Cookie handling with requests (session)

    import requests
    
    session = requests.session()
    
    # 1. Specify the URL
    url = 'https://accounts.douban.com/login'
    
    # POST parameters
    data = {
        "source": "movie",
        "redir": "https://movie.douban.com/",
        "from_email": "xxx",
        "from_password":"xxx",
        "login":"登录",
    }
    
    # Custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    
    # 2. Send the POST request through the session; the cookies are stored on the session object
    login_response = session.post(url=url,data=data,headers=headers)
    
    # Request the personal home page through the session (which carries the cookies)
    url = 'https://www.douban.com/people/xxxx/'
    response = session.get(url=url,headers=headers)
    
    # 3. Get the response body: .text is str, .content is bytes
    response_content = response.content
    
    # Print the requested URL
    print(response.url)
    
    with open('./re3.html',"wb") as f:
        f.write(response_content)

    Using proxies with requests

    Websites that list free proxy IPs: www.goubanjia.com, 快代理 (Kuaidaili), 西祠代理 (Xici)

    import requests
    
    # 1. Specify the URL. Note: the URL scheme must match the proxy scheme (both http or both https)
    url = 'http://www.baidu.com/s?ie=utf-8&wd=ip'
    # Custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    # Proxy IP
    proxy={
        "http":"118.172.211.3:58535"
    }
    # Send the request
    response = requests.get(url=url,proxies=proxy,headers=headers)
    
    # 3. Get the response body: .text is str, .content is bytes
    response_content = response.content
    
    # 4. Persist the data
    with open('./re3.html',"wb") as f:
        f.write(response_content)
        print("write succeeded")

    3. selenium
    
    Install: pip install selenium
    
    Download the driver that matches your browser:
    
    Chrome: http://chromedriver.storage.googleapis.com/index.html
    
    Version mapping table: https://blog.csdn.net/huilan_same/article/details/51896672
    
    Coding workflow:

    from selenium import webdriver
    from time import sleep
    # Create a browser object
    bro = webdriver.Chrome('./chromedriver.exe')
    # get() tells the browser to request the given URL
    bro.get('https://www.baidu.com')
    sleep(1)
    # Have Baidu search for a given keyword
    text = bro.find_element_by_id('kw')  # locate the text input box
    text.send_keys('人民币')  # send_keys types the given content into the box
    sleep(1)
    button = bro.find_element_by_id('su')
    button.click()  # click performs a mouse click
    sleep(3)
    bro.quit()  # close the browser

    Using the headless PhantomJS browser
    
    Download PhantomJS: http://phantomjs.org/download.html
    
    Coding workflow:

    from selenium import webdriver
    import time
    # Create a browser object
    bro = webdriver.PhantomJS('./phantomjs-2.1.1-windows/bin/phantomjs.exe')
    # get() tells the browser to request the given URL
    bro.get('https://www.baidu.com')
    
    # Screenshot
    bro.save_screenshot('./1.png')
    
    # Have Baidu search for a given keyword
    text = bro.find_element_by_id('kw')  # locate the text input box
    text.send_keys('人民币')  # send_keys types the given content into the box
    
    # Screenshot
    bro.save_screenshot('./2.png')
    
    button = bro.find_element_by_id('su')
    button.click()  # click performs a mouse click
    
    # Wait two seconds for the page to finish loading
    time.sleep(2)
    
    # Screenshot
    bro.save_screenshot('./3.png')
    
    bro.quit()  # close the browser

    Extra: scroll the page down to the bottom
    
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    # Have the browser object execute the JS code
    bro.execute_script(js)

    II. Frameworks (Scrapy)
    
    1. Installation

    If you’re using Anaconda or Miniconda, you can install the package from the conda-forge channel, which has up-to-date packages for Linux, Windows and OS X.

    conda install -c conda-forge scrapy


    Windows installation, method 2:
    
    1. pip install wheel
    
    2. Download Twisted:
    
      https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    
      Download the .whl file that matches your Python version and architecture

     

      Install it:

        pip install   **.whl

    3. pip install pywin32
    
    4. pip install scrapy

    To make PyCharm pick up Scrapy, change the project's Python environment:
    
    File --> Settings, then point the project's Python interpreter at the environment in which Scrapy was installed.

    2. Creating a project
    
    In cmd, cd into the directory where you want the project and run: scrapy startproject projectname

    3. Creating a spider
    
    a. Basic spider: cd into the project directory and run scrapy genspider <spidername> <domain>
    
    b. Crawl spider: cd into the project directory and run scrapy genspider -t crawl <spidername> <domain>

    4. Preliminary settings
    
    a. Comment out allowed_domains in the spider file
    
    b. In settings.py (line 22), change ROBOTSTXT_OBEY = True to ROBOTSTXT_OBEY = False
    
    c. In settings.py (line 19), set USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    
    d. Enable the pipeline (lines 67-69):

    ITEM_PIPELINES = {
        'mytestscrapy.pipelines.MytestscrapyPipeline': 300,
    }

    Persistence:
    
    I. Disk files
    
    1. Via a terminal command
    
      Make sure the parse method returns an iterable object (holding the parsed page content)
    
      Use a terminal command to write the data to a file on disk, as sketched below
    
      scrapy crawl <spider file name> -o <output file>.<ext> (e.g. xxoo.csv)
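    A minimal sketch of a parse method that works with the -o export; the spider name, target site, and XPath expressions are illustrative assumptions, not from the original:

    import scrapy
    
    class QuotesSpider(scrapy.Spider):
        name = 'quotes'  # hypothetical spider name
        start_urls = ['http://quotes.toscrape.com/']  # common practice site, used only as an example
    
        def parse(self, response):
            # Yielding dicts makes parse() produce an iterable of items, which the
            # -o exporter can serialize to disk.
            for div in response.xpath('//div[@class="quote"]'):
                yield {
                    'author': div.xpath('.//small[@class="author"]/text()').extract_first(),
                    'content': div.xpath('./span[@class="text"]/text()').extract_first(),
                }

    Running scrapy crawl quotes -o quotes.csv would then write the yielded dicts to quotes.csv.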

    2. Via pipelines
    
      Store the parsed page data in an item object
    
      Use the yield keyword to hand the item to the pipeline file for processing

    def parse(self, response):
        item = MytestscrapyItem()
        item['author'] = 'hg'
        item['content'] = 'hg_content'
        yield item

      Write the storage logic in the pipeline file

    class MytestscrapyPipeline(object):
        def open_spider(self,spider):
            self.f = open('t.txt','w')
        def process_item(self, item, spider):
            author = item['author']
            content = item['content']
            # Avoid shadowing the built-in str
            line = "author:" + author + " " + "content:" + content
            self.f.write(line)
    
            return item
        def close_spider(self,spider):
            self.f.close()

      Enable the pipeline in the settings file (settings.py, lines 67-69)

    ITEM_PIPELINES = {
       'mytestscrapy.pipelines.MytestscrapyPipeline': 300,
    }

    II. Databases
    
    1. MySQL

    import pymysql
    
    class MytestscrapyPipeline(object):
        def open_spider(self,spider):
            self.db = pymysql.connect("localhost","root","123456","scrapy_test")
        def process_item(self, item, spider):
            author = item['author']
            content = item['content']
            # Use a parameterized query instead of hard-coded values
            sql = 'insert into ac(author,content) values(%s,%s)'
            cursor = self.db.cursor()
            try:
                cursor.execute(sql, (author, content))
                self.db.commit()
            except Exception:
                self.db.rollback()
            return item
        def close_spider(self,spider):
            self.db.close()

    2. Redis
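    A minimal sketch of a pipeline that pushes items into Redis using the redis-py client; the host, port, and the key name scrapy_items are assumptions, adjust them to your setup:

    import json
    import redis
    
    class RedisPipeline(object):
        def open_spider(self, spider):
            # Assumed local Redis instance; add password=... if your server requires it
            self.conn = redis.Redis(host='127.0.0.1', port=6379)
    
        def process_item(self, item, spider):
            # Serialize the item to JSON and push it onto a Redis list
            self.conn.lpush('scrapy_items', json.dumps(dict(item), ensure_ascii=False))
            return item

    Enable it in ITEM_PIPELINES the same way as the MySQL pipeline above.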

    III. Practical examples
    
    1. Sending a POST request

    class PostspiderdemoSpider(scrapy.Spider):
        name = 'postSpiderDemo'
        # allowed_domains = ['www.baidu.com']
        start_urls = ['https://fanyi.baidu.com/sug']
    
        def start_requests(self):
            for url in self.start_urls:
                # POST option 1:
                # scrapy.Request(url=url,callback=self.parse,method='post')
                # POST option 2 (recommended):
                data = {
                    "kw": "dog"
                }
                yield scrapy.FormRequest(url=url,callback=self.parse,formdata=data)
        def parse(self, response):
            print(response.text)

    2. Cookie handling
    
    Scrapy saves cookies automatically when a POST request is sent; subsequent GET requests carry them automatically, as in the sketch below.
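    A minimal sketch under the assumption of a generic form login; the URLs and form fields are placeholders, not a verified site:

    import scrapy
    
    class CookieDemoSpider(scrapy.Spider):
        name = 'cookieDemo'
        start_urls = ['https://example.com/login']  # placeholder login page
    
        def parse(self, response):
            # Cookies set by this POST response are kept by Scrapy's cookies middleware
            yield scrapy.FormRequest(
                url='https://example.com/login',  # placeholder
                formdata={'email': 'xxx', 'password': 'xxx'},
                callback=self.after_login,
            )
    
        def after_login(self, response):
            # This GET request automatically carries the cookies obtained above
            yield scrapy.Request(url='https://example.com/profile', callback=self.parse_profile)
    
        def parse_profile(self, response):
            print(response.text)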

    3. Proxies
    
    (1) Add a new class in middlewares.py, for example:

    class MyProxy(object):
        def process_request(self, request, spider):
            request.meta['proxy'] = "http://198.1.122.29:80"

    (2) In settings.py (lines 56-58), uncomment the downloader middlewares section, for example:

    DOWNLOADER_MIDDLEWARES = {
       'postDemo.middlewares.MyProxy': 543,
    }

    4. Log level and writing logs to a file: edit settings.py and add these two lines:

    LOG_LEVEL = 'ERROR'
    LOG_FILE = 'log.txt'

    5. Passing data between requests (meta)

    class RequestArgsSpider(scrapy.Spider):
        name = 'request_args'
        # allowed_domains = ['www.baidu.com']
        start_urls = ['http://www.hao6v.com/gvod/zx.html']
    
        def movie_details(self,response):
            item = response.meta['item']
            print(item['name'])
            print(item['i'])
        i=0
        def parse(self, response):
            li_list = response.xpath('//*[@id="main"]/div[1]/div/ul/li')
            print(len(li_list))
            for li in li_list:
                name = li.xpath('./a//text()').extract_first()
                url = li.xpath('./a/@href').extract_first()
                item = PostdemoItem()
                self.i += 1
                item['name'] = name
                item['i'] = self.i
                # Pass the item to the callback via meta
                yield scrapy.Request(url=url,callback=self.movie_details,meta={'item':item})

    6. CrawlSpider

    class TestcrawlSpider(CrawlSpider):
        name = 'testcrawl'
        # allowed_domains = ['https://www.baidu.com']
        start_urls = ['https://dig.chouti.com/']
    
        rules = (
            Rule(LinkExtractor(allow=r'/all/hot/recent/\d+'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            print(response)
    A fuller CrawlSpider example that follows paginated list pages and extracts rows:
    
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    import time,random
    from mytestscrapy.items import MytestscrapyItem
    
    class TestcrawlSpider(CrawlSpider):
        name = 'testcrawl'
        # allowed_domains = ['https://www.baidu.com']
        start_urls = ['https://cc.yyss.icu/thread0806.php?fid=2&search=&page=1']
    
        rules = (
            Rule(LinkExtractor(allow=r'thread0806\.php\?fid=2&search=&page=[0-9]{1,2}$'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            print(response.url)
            time.sleep(random.randint(2,4))
            if response.url == 'https://cc.yyss.icu/thread0806.php?fid=2&search=&page=1':
                tr_ele = response.xpath('//table[@id="ajaxtable"]/tbody[@style="table-layout:fixed;"]/tr[@class="tr3 t_one tac"]')[2:]
            else:
                tr_ele=response.xpath('//table[@id="ajaxtable"]/tbody[@style="table-layout:fixed;"]/tr[@class="tr3 t_one tac"]')
    
            for tr in tr_ele:
                count = tr.xpath('./td[4]/text()').extract_first()
                if int(count) < 5:
                    continue
                text = tr.xpath('./td[2]//a/text()').extract_first()
                url = 'https://cc.yyss.icu/'+tr.xpath('./td[2]//a/@href').extract_first()
                item = MytestscrapyItem()
                item['urlname'] = text
                item['urladdr'] = url
                item['commentsNum'] = count
                yield item

    7. Distributed crawling with RedisCrawlSpider
    
    (1) Concept: the same spider can run on multiple machines, crawling a site's data in a distributed fashion.
    
    (2) Plain Scrapy cannot do distributed crawling on its own:
    
      a: the scheduler cannot be shared
    
      b: the pipeline cannot be shared
    
    (3) scrapy-redis: a set of components developed specifically for Scrapy that lets it crawl in a distributed way
    
      a: install it with pip install scrapy-redis
    
    (4) Distributed crawling workflow
    
      a: edit the Redis configuration file
    
        comment out bind 127.0.0.1
    
        change protected-mode yes to no
    
      b: start the Redis server with that configuration file
    
      c: create the Scrapy project, then create a CrawlSpider-based spider file
    
      d: import the RedisCrawlSpider class and change the spider to inherit from it
    
      e: replace start_urls with redis_key = 'xxx'
    
      f: point the project's pipeline and scheduler at the scrapy-redis components
    
      g: run the spider file: scrapy runspider xxx.py
    
      h: push the start URL onto the scheduler queue from redis-cli: lpush <queue name (redis_key)> <start url>

    Steps:
    
    a. Create the project:
    
      1. scrapy startproject RedisCrawlSpider
    
      2. scrapy genspider -t crawl redis_crawl_spider www.baidu.com
    
    b. Install scrapy-redis
    
      1. pip install scrapy-redis
    
    c. Start the Redis server
    
      1. ./src/redis-server redis.conf
    
    d. redis_crawl_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy_redis.spiders import RedisCrawlSpider
    from RedisCrawlSpider.items import RediscrawlspiderItem
    
    # Inherit from RedisCrawlSpider
    class RedisCrawlSpiderSpider(RedisCrawlSpider):
        name = 'redis_crawl_spider'
        # allowed_domains = ['www.baidu.com']
        # start_urls = ['http://www.baidu.com/']
        redis_key = 'qiubaispider'
        rules = (
            Rule(LinkExtractor(allow=r'/pic/page/\d+'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            div_list = response.xpath('//div[@id="content-left"]/div')
            for div in div_list:
                img_url = "https:"+div.xpath('.//div[@class = "thumb"]/a/img/@src').extract_first()
                item = RediscrawlspiderItem()
                item['img_url'] = img_url
                yield item

    e. settings.py
    
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {
        # 'RedisCrawlSpider.pipelines.RediscrawlspiderPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }
    # Use the scrapy-redis dedup filter instead of Scrapy's default one
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # Use the scrapy-redis scheduler instead of the default scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Allow pause/resume; request records kept in Redis are not lost
    SCHEDULER_PERSIST = True
    # Redis host IP
    REDIS_HOST = "192.168.9.4"
    # Redis port
    REDIS_PORT = 6379
    # Redis password, if required:
    # REDIS_PARAMS = {"password":"123456"}

    8. Distributed crawling with RedisSpider
    
      a. Code changes (spider class):
    
        Import: from scrapy_redis.spiders import RedisSpider
    
        Change the spider's parent class to RedisSpider
    
        Comment out the start_urls list and add a redis_key attribute (the name of the scheduler queue)

    # -*- coding: utf-8 -*-
    import scrapy
    import time
    from selenium import webdriver
    from wangyipro.items import WangyiproItem
    from scrapy_redis.spiders import RedisSpider
    
    class WangyispiderSpider(RedisSpider):
        name = 'wangyispider'
        # allowed_domains = ['https://news.163.com/']
        # start_urls = ['https://news.163.com/']
        redis_key = "wangyi"
        def __init__(self):
            # Instantiate a browser object (only once)
            options = webdriver.ChromeOptions()
            options.add_argument('--ignore-certificate-errors')
            self.bro = webdriver.Chrome('./chromedriver.exe',chrome_options=options)
    
        def closed(self,spider):
            print("Spider finished")
            self.bro.quit()
    
        def parse(self, response):
            menu_list = ["menu_guonei","menu_guoji","menu_war","menu_hangkong"]
            li_list = []
            for menu in menu_list:
                li = response.xpath("//div[@class='ns_area list']/ul/li[@class=$val]",val=menu)
                li_list.append(li)
            for li in li_list:
                title = li.xpath('./a/text()').extract_first()
                url = li.xpath('./a/@href').extract_first()
                yield scrapy.Request(url=url,callback=self.secondPage,meta={"title":title})
    
        def secondPage(self,response):
            title = response.meta['title']
            div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
            for div in div_list:
                head = div.xpath('.//h3/a/text()').extract_first()
                url = div.xpath('.//h3/a/@href').extract_first()
                imgUrl = div.xpath('./a[@class="na_pic"]/img/@src').extract_first()
                tag_list = div.xpath('.//div[@class="news_tag"]/div/a/text()').extract()
                # strip() returns a new string, so rebuild the list before joining
                tag_list = [tag.strip() for tag in tag_list]
                tag = ",".join(tag_list)
                item = WangyiproItem()
                item['title'] = title
                item['head'] = head
                item['url'] = url
                item['imgUrl'] = imgUrl
                item['tag'] = tag
                yield scrapy.Request(url=url,callback=self.getContent,meta={"item":item})
    
        def getContent(self,response):
            item = response.meta['item']
            content_list = response.xpath('//div[@id="endText"]/p//text()').extract()
            # strip() does not modify in place; build a cleaned list and join it
            content_list = [content.strip() for content in content_list]
            content = "\n\t".join(content_list)
            item['content'] = content
            yield item
    The corresponding middlewares.py, with a User-Agent pool, a proxy pool, and a downloader middleware that returns selenium-rendered pages for the news section URLs:
    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    from scrapy.http import HtmlResponse
    import time
    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # scrapy.contrib is deprecated
    import random
    # User-Agent pool
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        #safari 5.1 – Windows
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        #IE 9.0
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
        #IE 8.0
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        #IE 7.0
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        #IE 6.0
        " Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        # Firefox 4.0.1 – MAC
        " Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        # Firefox 4.0.1 – Windows
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        # Opera 11.11 – MAC
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        # Opera 11.11 – Windows
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        # Chrome 17.0 – MAC
        " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        # Maxthon
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        # Tencent TT
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        # The World 2.x
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        # The World 3.x
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        # Sogou Browser 1.x
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        # 360 Browser
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        # Avant
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        # Green Browser
        " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        # Mobile devices:
        # safari iOS 4.33 – iPhone
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        # safari iOS 4.33 – iPod Touch
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        # safari iOS 4.33 – iPad
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        # Android N1
        " Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        # QQ Browser for Android
        " MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        # Android Opera Mobile
        " Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        # Android Pad Moto Xoom
        " Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        # BlackBerry
        " Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        # WebOS HP Touchpad
        " Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        # Nokia N97
        " Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        # Windows Phone Mango
        " Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        # UCWEB (plain)
        " UCWEB7.0.2.37/28/999",
        # UC standard
        " NOKIA5700/ UCWEB7.0.2.37/28/999",
        # UCOpenwave
        " Openwave/ UCWEB7.0.2.37/28/999",
        # UC Opera
        " Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    ]
    
    class RandomUserAgent(UserAgentMiddleware):
        def process_request(self, request, spider):
            ua = random.choice(user_agent_list)
            request.headers.setdefault('User-Agent',ua)
    
    # Proxy pool
    proxies_http = [
        "182.53.197.24:56383",
        "78.58.136.55:39232",
        "188.17.156.26:30773",
    ]
    proxies_https = [
        "103.19.110.177:8080",
        "193.33.101.152:34611",
        "119.176.80.220:9999",
    ]
    
    class Proxy(object):
        def process_request(self, request, spider):
            ph = request.url.split(":")[0]
            if ph == "http":
                proxy = random.choice(proxies_http)
                request.meta['proxy'] = 'http://'+proxy
            else:
                proxy = random.choice(proxies_https)
                request.meta['proxy'] = 'https://' + proxy
    
    
    
    class WangyiproSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    
    class WangyiproDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None
        # Intercept the response object (the response the downloader passes to the spider)
        # request: the request that produced this response
        # response: the intercepted response object
        # spider: the spider instance defined in the spider file
        def process_response(self, request, response, spider):
            if request.url in ["http://news.163.com/domestic/","http://news.163.com/world/","http://war.163.com/","http://news.163.com/air/"]:
                spider.bro.get(url = request.url)
                # Optionally scroll the browser to the bottom of the page:
                # js = 'window.scrollTo(0,document.body.scrollHeight)'
                # spider.bro.execute_script(js)
                # time.sleep(3)
                page_text = spider.bro.page_source
                print(page_text)
                return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request)
            else:
                return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)

      b. Edit the redis.conf configuration file and start the Redis server with it
    
        Comment out bind 127.0.0.1
    
        Change protected-mode yes to no
    
      c. Edit settings.py

    ITEM_PIPELINES = {
       # 'RedisCrawlSpider.pipelines.RediscrawlspiderPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 400,
    
    }
    
    # Use the scrapy-redis dedup filter instead of Scrapy's default one
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # Use the scrapy-redis scheduler instead of the default scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Allow pause/resume; request records kept in Redis are not lost
    SCHEDULER_PERSIST = True
    
    # Redis host IP
    REDIS_HOST = "192.168.9.4"
    # Redis port
    REDIS_PORT = 6379
    # Redis password
    REDIS_PARAMS = {"password":"123456"}

      

      d: run the spider file: scrapy runspider xxx.py

      e: push the start URL onto the scheduler queue from redis-cli: lpush <queue name (redis_key)> <start url>, for example as shown below
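      For the RedisSpider above, whose redis_key is "wangyi", the command would look like the following; the start URL is taken from the commented-out start_urls in that spider and is only an example:

        lpush wangyi https://news.163.com/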

        
