  • Web scraping review

    Related functions

    # Don't use a mutable object as a default parameter value
    
    # def foo(arg, li=[]):
    #     li.append(arg)
    #     return li
    #
    # list1 = foo(21)
    # list2 = foo(21, [1,])
    # list3 = foo(28)
    #
    # print(list1)
    # print(list2)
    # print(list3)
    
    # li.append() returns None
    def foo(arg, li=[]):
        return li.append(arg)
    
    list1 = foo(21)
    list2 = foo(21, [1,])
    list3 = foo(28)
    
    print(list1)
    print(list2)
    print(list3)
    
    # list5 = [11, 22, 33, 44, 55]
    # print(list5[10:])
    
    
    # Shuffle the list's order
    # import random
    # random.shuffle(list5)
    # print(list5)

    The core of scraping: take whatever the packet capture shows, work out the request interface, and construct the corresponding request.
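
    For example, a minimal sketch of replaying a captured request with requests (the endpoint, headers, and parameters below are placeholders standing in for whatever the capture actually shows):

    import requests
    
    # Hypothetical values copied out of a packet capture
    url = 'https://example.com/api/search'
    headers = {
        'User-Agent': 'Mozilla/5.0',        # mirror the browser's User-Agent
        'Referer': 'https://example.com/',  # some endpoints check the referer
    }
    params = {'keyword': 'python', 'page': 1}  # query string seen in the capture
    
    response = requests.get(url=url, headers=headers, params=params)
    print(response.status_code, len(response.text))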

    Proxy scraping: http://www.goubanjia.com/ is a site where you can find free proxies.

    import requests
    
    url = 'http://www.baidu.com/s?&wd=ip'
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    proxy2 = {
        'http':'101.4.136.34:81'
    }
    response = requests.get(url=url, proxies=proxy2, headers=headers)
    
    with open('./daili.html','w',encoding='utf-8') as fp:
        fp.write(response.text)
        
    
    print('123')

    190815 

      https://www.luffycity.com/micro/play/5070/3074

      1. Crawling a specified number of pages

      2. What cookies do; use requests.session() to simulate fetching data that sits behind a login (see the sketch after this list)

      3. Using proxies

      4. Solving captchas with a third-party platform

      5. Regex review

      (1. specify the url; 2. send the request; 3. get the page data; 4. parse the data; 5. persist the results)
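
    A minimal sketch of point 2, using requests.Session() so the cookies set by the login response are sent on later requests automatically (the login url and form fields are placeholders, not a real site's interface):

    import requests
    
    session = requests.Session()  # the session object stores cookies across requests
    
    login_url = 'https://example.com/login'          # placeholder login endpoint
    data = {'username': 'me', 'password': 'secret'}  # placeholder form fields
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    # the login response's Set-Cookie headers are kept on the session
    session.post(login_url, data=data, headers=headers)
    
    # later requests reuse those cookies, so pages behind the login become reachable
    profile = session.get('https://example.com/profile', headers=headers)
    print(profile.status_code)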

    Basic Zhihu image scraping with the re module:

    import requests, re
    import os,time
    
    url = 'https://www.zhihu.com/question/308457217'
    if not os.path.exists('./zhihuImg'):
        os.mkdir('zhihuImg')
    headers = {
        # holds arbitrary request header info
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
    
    }
    response = requests.get(url=url, headers=headers)
    
    # print(response.text)
    
    pic_list = re.findall('<noscript>.*?<img src=".*?">.*?</noscript>', response.text, re.S)
    # print(pic_list)
    new_list = []
    for li in pic_list:
        st = (re.findall('https.*?_hd.jpg', li))[0]
    #     st = st.replace('_hd', '_r')
        new_list.append(st)
    # print(new_list)
    img_path = './zhihuImg/'
    inx = 0
    for index, li in enumerate(new_list):
    #     time.sleep(5)
        inx = inx + 1
        img_data = requests.get(url=li, headers=headers).content
        img_name = img_path + str(inx) + '.jpg'
        with open(img_name, 'wb') as f:
            f.write(img_data)
    print('over')

    Basic Zhihu image scraping with the lxml module:

    import requests,os
    from lxml import etree
    
    url = 'https://www.zhihu.com/question/308457217'
    
    headers = {
        # holds arbitrary request header info
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
    }
    
    tree = etree.HTML(requests.get(url=url, headers=headers).text)
    pic_li = tree.xpath('//div[@class="List-item"]//img/@data-original')
    pic_list = []
    for li in pic_li:
        if li not in pic_list:
            pic_list.append(li)
    img_path = './zhihuImg2'
    if not os.path.exists(img_path):
        os.mkdir(img_path)
    inx = 0
    for li in pic_list:
        inx = inx + 1
        img_data = requests.get(url=li, headers=headers).content
        img_name = os.path.join(img_path, str(inx) + '.jpg')
        with open(img_name,'wb') as f:
            f.write(img_data)
    print('over')

     Basic Zhihu image scraping with the bs4 module:

    import requests,os
    from bs4 import BeautifulSoup
    
    def down(picurl,dirname,filename):
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open('%s/%s.jpg'%(dirname,filename), 'wb') as f:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            # time.sleep(3)
            response = requests.get(url=picurl,headers=headers)
            if response:
                f.write(response.content)
                print(filename)
    
    url = 'https://www.zhihu.com/question/285321190'
    
    headers = {
        # holds arbitrary request header info
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
    }
    numb = 0
    res = requests.get(url = url ,headers = headers)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, features="html.parser")
    x = soup.find_all('img')
    print(x)
    for index,imgs in enumerate(x):
        numb = numb + 1
        urls = imgs.attrs.get('src')
        url = urls.replace('\"','').replace('com/50/','com/').replace('_hd','_r').replace('_ipico','_r').replace('_120x160','_r').replace('_180x120','_r')
        print(url)
        if url.startswith('http'):
            print(index,url)
            down(url,'女生有一双好看的腿是怎样的体验',str(numb)+str(index))

    The Zhihu API:

    url = https://www.zhihu.com/api/v4/questions/68381376/answers?sort_by=default&include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&limit=20&offset=
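
    A rough sketch of paging through an answers API like the one above by stepping the offset parameter; the response layout ('data', 'paging.is_end') is assumed from how this kind of paged API usually looks, not verified against Zhihu:

    import requests
    
    api = ('https://www.zhihu.com/api/v4/questions/68381376/answers'
           '?sort_by=default&include=data[*].content&limit=20&offset={}')
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    offset = 0
    while True:
        payload = requests.get(api.format(offset), headers=headers).json()
        for answer in payload.get('data', []):   # assumed: answers live under 'data'
            print(answer.get('id'))
        if payload.get('paging', {}).get('is_end', True):  # assumed end-of-pages flag
            break
        offset += 20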

    Simulating a browser: you need to download the matching driver (drivers are backward compatible): http://chromedriver.storage.googleapis.com/index.html

    from selenium import webdriver
    import time, os
    
    executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/chromedriver.exe'
    if not os.path.exists(executable_path):
        print('Driver not found! Instead of the try/except below, putting the code in an else branch also works')
    try:
        bro = webdriver.Chrome(executable_path=executable_path)
    
        bro.get('https://www.baidu.com')
    
        text = bro.find_element_by_id('kw')
    
        text.send_keys('人民币')
    
        button = bro.find_element_by_id('su')
    
        button.click()
        time.sleep(5)
        bro.quit()
    except:
        print('Driver not found!')

     phantomJS:

    from selenium import webdriver
    import time, os
    
    executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/phantomjs.exe'
    if not os.path.exists(executable_path):
        print('Driver not found! Instead of try/except, putting the code in an else branch also works')
    else:
        bro = webdriver.PhantomJS(executable_path=executable_path)
        bro.get('https://www.baidu.com')
        bro.save_screenshot('./1.png')  # screenshot
        text = bro.find_element_by_id('kw')
        text.send_keys('人民币')
        button = bro.find_element_by_id('su')
        button.click()
        time.sleep(4)
        bro.save_screenshot('./2.png')
        bro.quit()

     Simulating a browser with PhantomJS:

    from selenium import webdriver
    import time, os
    
    # executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/chromedriver.exe'
    executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/phantomjs.exe'
    if not os.path.exists(executable_path):
        print('Driver not found! Instead of try/except, putting the code in an else branch also works')
    else:
        dir_name = '0817'
        dir_path = os.path.join('./', dir_name)
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        # bro = webdriver.Chrome(executable_path=executable_path)
        bro = webdriver.PhantomJS(executable_path=executable_path)
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        bro.get('https://www.zhihu.com/question/21471417')
        print('sleep001')
        time.sleep(1)
        bro.save_screenshot('./知乎001.png')  # screenshot
        bro.execute_script(js)
        print('sleep002')
        time.sleep(1)
        bro.save_screenshot('./知乎002.png')  # screenshot
        bro.quit()
        print('exec_done')

     Installing scrapy:

    i.    Linux / macOS: pip install scrapy
    ii.    Windows:
    1.    pip install wheel
    2.    download Twisted from https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    pip install <the downloaded .whl>
    3.    pip install pywin32
    4.    pip install scrapy

    Scrapy persistence (two approaches: files and pipelines): firstScrapy.zip

    MySQL-backed storage: qiubaiPro.zip

    A mistake I really dug into while writing this myself: when connecting to the database with pymysql, port must be 3306 (an int), not '3306' (a string). I spent a whole afternoon puzzling over errors I couldn't make sense of!
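
    A minimal pymysql sketch of that point (host, credentials, and database name are placeholders): port must be an int.

    import pymysql
    
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,          # must be an int; port='3306' fails with a confusing error
        user='root',
        password='123456',  # placeholder credentials
        db='qiubai',        # placeholder database name
        charset='utf8'
    )
    cursor = conn.cursor()
    cursor.execute('SELECT VERSION()')
    print(cursor.fetchone())
    cursor.close()
    conn.close()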

    To have scrapy store the scraped data in three ways at the same time, define the corresponding classes in pipelines.py and register each of them in the settings file (a rough sketch follows). Note: the approaches in the example are all valid, but the redis syntax may be slightly off and might not store successfully. qiubaiPro.zip
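
    A rough sketch of that layout (class and field names are placeholders, not the ones in qiubaiPro.zip): one class per storage target in pipelines.py, each registered in ITEM_PIPELINES.

    # pipelines.py -- one pipeline class per storage target
    class FilePipeline(object):
        def open_spider(self, spider):
            self.fp = open('qiubai.txt', 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            self.fp.write(item['author'] + ':' + item['content'] + '\n')
            return item  # hand the item on to the next pipeline
    
        def close_spider(self, spider):
            self.fp.close()
    
    # settings.py -- a lower number means the pipeline runs earlier
    ITEM_PIPELINES = {
        'qiubaiPro.pipelines.FilePipeline': 300,
        # 'qiubaiPro.pipelines.MysqlPipeline': 400,
        # 'qiubaiPro.pipelines.RedisPipeline': 500,
    }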

    Looping over several urls that follow a pattern in one run (a sketch follows): firstScrapy多个url.zip
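
    A minimal sketch of that idea (the url pattern and page count are placeholders): build the next page's url inside parse and yield another Request with the same callback.

    # -*- coding: utf-8 -*-
    import scrapy
    
    class PagedSpider(scrapy.Spider):
        name = 'paged'
        url_tpl = 'https://www.qiushibaike.com/text/page/%d/'  # placeholder url pattern
        page = 1
        start_urls = [url_tpl % page]
    
        def parse(self, response):
            # ... parse the current page here ...
            if self.page < 5:  # stop after the first 5 pages
                self.page += 1
                yield scrapy.Request(url=self.url_tpl % self.page, callback=self.parse)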

    POST requests (add a start_requests() method in front of the parse() function): postPro.zip

    # -*- coding: utf-8 -*-
    import scrapy
    
    # Goal: fetch the translation result for a given term from Baidu Translate
    class PostdemoSpider(scrapy.Spider):
        name = 'postDemo'
        #allowed_domains = ['www.baidu.com']
        start_urls = ['https://fanyi.baidu.com/sug']
        # this method actually comes from the parent class: it sends GET requests for the elements of start_urls
        # to send a POST instead:
            # 1. set the Request method's "method" parameter to POST
            # 2. or use FormRequest(), which sends a POST (recommended)
        def start_requests(self):
            print('start_requests()')
            # parameters of the POST request
            data = {
                'kw': 'dog',
            }
            for url in self.start_urls:
                # formdata: the dict of request parameters
                yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse)
    
        def parse(self, response):
            print(response.text)

     After logging in via POST and then requesting the personal homepage, the scrapy framework automatically carries over the cookies from the previous request: doubanPro.zip

    class DoubanSpider(scrapy.Spider):
        name = 'douban'
        #allowed_domains = ['www.douban.com']
        start_urls = ['https://www.douban.com/accounts/login']
    
        # override the start_requests method
        def start_requests(self):
            # wrap the request parameters in a dict
            data = {
                'source': 'index_nav',
                'form_email': '15027900535',
                'form_password': 'bobo@15027900535'
            }
            for url in self.start_urls:
                yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse)
        # parse the data of the personal homepage
        def parseBySecondPage(self,response):
            fp = open('second.html', 'w', encoding='utf-8')
            fp.write(response.text)
    
            # here you could run whatever parsing you need on the current user's homepage data
    
        def parse(self, response):
            # store the page data returned after a successful login
            fp = open('main.html','w',encoding='utf-8')
            fp.write(response.text)
    
            # fetch the current user's personal homepage
            url = 'https://www.douban.com/people/185687620/'
            yield scrapy.Request(url=url,callback=self.parseBySecondPage)

    Using a proxy (proxyPro.zip): proxies go through a downloader middleware; write your own class in middlewares.py (with object as the parent class) and override the process_request method.

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    
    # a custom downloader middleware class; implement process_request in it (it handles the requests the middleware intercepts)
    class MyProxy(object):
        def process_request(self,request,spider):
            # swap the request's IP
            request.meta['proxy'] = "https://178.128.90.1:8080"

    Then enable this downloader middleware in settings.py:

    DOWNLOADER_MIDDLEWARES = {
        'proxyPro.middlewares.MyProxy': 543,  # project.module.class
    }

     Log levels: set in settings.py

    Log levels (kinds):
        ERROR: errors
        WARNING: warnings
        INFO: general information
        DEBUG: debug information (the default)
    To output only one level of log messages:
        settings: LOG_LEVEL = 'ERROR'
    To write the log to a file instead of showing it in the terminal:
        settings: LOG_FILE = 'log.txt'

     Passing parameters along with a request (i.e. when part of the data has to be fetched from a child page): moviePro.zip

    # Key point: when the main parser hands off to the child-page parser, pass the values along via the meta parameter
    # -*- coding: utf-8 -*-
    import scrapy
    from moviePro.items import MovieproItem
    
    class MovieSpider(scrapy.Spider):
        name = 'movie'
        #allowed_domains = ['www.id97.com']
        start_urls = ['http://www.id97.com/movie']
    
        # dedicated to parsing the data on the second-level child page
        def parseBySecondPage(self,response):
            actor = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
            language = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[6]/td[2]/text()').extract_first()
            longTime = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()').extract_first()
    
            # retrieve the dict passed in via the Request's meta parameter (response.meta)
            item = response.meta['item']
            item['actor'] = actor
            item['language'] = language
            item['longTime'] = longTime
            # hand the item over to the pipeline
            yield item
        def parse(self, response):
            # name, genre, director, language, running time
            div_list = response.xpath('/html/body/div[1]/div[1]/div[2]/div')
            for div in div_list:
                name = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
                # the xpath call below returns a list, and the list has 4 elements
                kind = div.xpath('.//div[@class="otherinfo"]//text()').extract()
                # join the kind list into a single string
                kind = "".join(kind)
                url = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
    
                print(kind)
                # create the item object
                item = MovieproItem()
                item['name'] = name
                item['kind'] = kind
                # Question: how do we get the remaining movie-detail data into the item object? (meta)
                # we need to request that url, fetch the page, and parse the specified data
                # the meta parameter only accepts a dict (so wrap the item object in a dict first)
                yield scrapy.Request(url=url,callback=self.parseBySecondPage,meta={'item':item})

     CrawlSpider:crawlSpider代码.zip

    Question: what if we want to crawl a whole site's data?
    Solutions:
    1.    send the follow-up requests manually
    2.    CrawlSpider (recommended)
    CrawlSpider concept: CrawlSpider is simply a subclass of Spider, with more power (link extractor, rule parser).
    
    Code:
    1.    create a spider file based on CrawlSpider
    a)    scrapy genspider -t crawl <spider name> <start url>
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    
    class ChoutiSpider(CrawlSpider):
        name = 'chouti'
        #allowed_domains = ['dig.chouti.com']
        start_urls = ['https://dig.chouti.com/']
    
        # instantiate a link extractor object
        # link extractor: extracts the specified links (urls)
        # allow: takes a regular expression
        # the extractor pulls every link matching the regex out of the page
        # all extracted links are handed to the rule parser
        link = LinkExtractor(allow=r'/all/hot/recent/\d+')
        rules = (
            # instantiate a rule parser object
            # after receiving the links from the extractor, the rule requests each of them, fetches the page content, and parses the specified data according to the given rule
            # callback: the parsing rule (a method/function)
            # follow: whether to keep applying the link extractor to the pages reached through the extracted links
            Rule(link, callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            print(response)

    Creating a project and other terminal commands:

    scrapy startproject firstScrapy    # create the project
    scrapy genspider (-t crawl) firstScrapy www.baidu.com    # -t crawl creates a CrawlSpider; firstScrapy is the spider name; www.baidu.com is the start url
    scrapy crawl firstScrapy --nolog    # run the spider (--nolog suppresses logging)

     Distributed crawling: wangyiPro.zip

    Using the selenium module inside the scrapy framework:

    The second form of distributed crawling, based on scrapy-redis:
    1.    A distributed spider based on RedisSpider (NetEase news)
    a)    Code changes to the spider class, sketched after the UA list below:
    i.    import: from scrapy_redis.spiders import RedisSpider
    ii.    change the spider's parent class to RedisSpider
    iii.    comment out the start_urls list and add a redis_key attribute (the name of the scheduler queue)
    b)    redis configuration file (redisxxx.conf):
    i.    #bind 127.0.0.1
    ii.    protected-mode no
    c)    configure the project's settings:
    i.    
    # ip and port of the redis service
    REDIS_HOST = '<ip of the redis service>'
    REDIS_PORT = 6379
    #REDIS_PARAMS = {'password':'123456'}
    
    ii.    
    # use the scrapy-redis dedupe queue
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # use the scrapy-redis scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # allow pausing and resuming
    SCHEDULER_PERSIST = True
    
    iii.    use the shareable pipeline
    ITEM_PIPELINES = {
        #'wangyiPro.pipelines.WangyiproPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }
    
    d)    start the redis server: redis-server <config file>
    e)    run the spider file: scrapy runspider wangyi.py
    f)    push a start url onto the scheduler's queue:
    i.    open the redis client
    ii.    push a start url onto the scheduler queue
    lpush wangyi https://news.163.com
    2.    UA pool:
    a)    import in the middleware module:
    from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
    b) wrap a class based on UserAgentMiddleware and override its process_request method
    3.    Proxy pool: pay attention to whether the requested url's scheme is http or https
    4.    How selenium is wired into scrapy:
    a)    import webdriver in the spider file
    b)    instantiate the browser in the spider class's constructor
    c)    close the browser in the spider class's closed method
    d)    run the browser automation in the downloader middleware's process_response method
    Goal: crawl text-based news data (domestic, international, military, aviation)
    PROXY = [
        '173.82.219.113:3128',
        '92.243.6.37:80',
        '117.102.96.59:8080',
        '213.234.28.94:8080',
        '101.51.123.88:8080',
        '158.58.131.214:41258' ]
    
    
    user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
           ]
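
    A minimal spider-class sketch for steps 1.a.i-iii above (the spider name, redis_key, and parsing are placeholders):

    # -*- coding: utf-8 -*-
    from scrapy_redis.spiders import RedisSpider  # step i: import RedisSpider
    
    class WangyiSpider(RedisSpider):              # step ii: the parent class is now RedisSpider
        name = 'wangyi'
        # step iii: start_urls is commented out; the start url comes from redis instead
        # start_urls = ['https://news.163.com']
        redis_key = 'wangyi'                      # name of the scheduler queue
    
        def parse(self, response):
            # placeholder parsing logic
            print(response.url)
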
    # A browser object is instantiated in the spider class's constructor
    # Then process_response is overridden in the middleware to intercept the original response (the page content is loaded dynamically, so selenium has to drive a browser here)
    
    class WangyiproDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
    
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None
        # intercept the response object (the response the downloader passes to the Spider)
        # request: the request object corresponding to the response
        # response: the intercepted response object
        # spider: the instance of the spider class in the spider file
        def process_response(self, request, response, spider):
            # tamper with the page data stored in the response object
            # five request urls: one start url plus four section links we want to crawl; only the four section links are loaded dynamically
            if request.url in['http://news.163.com/domestic/','http://news.163.com/world/','http://news.163.com/air/','http://war.163.com/']:
                spider.bro.get(url=request.url)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                spider.bro.execute_script(js)
                time.sleep(2)  # give the browser some buffer time to load the data
                # the page source now contains the dynamically loaded news data
                page_text = spider.bro.page_source
                # swap in a forged response object
                return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request)
            else:
                return response

      Why a UA pool: the code fires a few hundred requests within seconds, which anti-scraping systems will recognize, so a UA pool and a proxy pool are used to spread the load. This also lives in the downloader middleware. 基于RedisSpider的分布式爬虫.zip

    from scrapy import signals
    
    from scrapy.http import HtmlResponse
    import time
    from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
    import random
    # UA pool code (a downloader middleware class wrapped just for the UA pool)
    # 1. import the UserAgentMiddleware class
    class RandomUserAgent(UserAgentMiddleware):
    
        def process_request(self, request, spider):
            # pick a random UA value from the list
            ua = random.choice(user_agent_list)
            # write that UA into the currently intercepted request
            request.headers.setdefault('User-Agent',ua)
    
    # swap IPs for the intercepted requests in bulk
    class Proxy(object):
        def process_request(self, request, spider):
            # check the scheme of the intercepted request's url (http or https)
            # request.url looks like: http://www.xxx.com
            h = request.url.split(':')[0]  # the request's scheme
            if h == 'https':
                ip = random.choice(PROXY_https)
                request.meta['proxy'] = 'https://'+ip
            else:
                ip = random.choice(PROXY_http)
                request.meta['proxy'] = 'http://' + ip

     Scraping Zhihu with selenium and the Chrome driver; you have to log in manually first:

    from selenium import webdriver
    import time, os, requests
    
    
    def down(picurl, dirname, filename):
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open('%s/%s.jpg' % (dirname, filename), 'wb') as f:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            time.sleep(1)
            response = requests.get(url=picurl, headers=headers)
            if response:
                f.write(response.content)
                print('downloaded once at %s' % time.time())
    
    
    executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/chromedriver.exe'
    if not os.path.exists(executable_path):
        print('Driver not found! Instead of the try/except below, putting the code in an else branch also works')
    try:
        bro = webdriver.Chrome(executable_path=executable_path)
    
        # bro.get('https://www.zhihu.com/question/68381376')
        bro.get('https://www.zhihu.com/')
        time.sleep(15)
        bro.get('https://www.zhihu.com/question/360400273')
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        print('sleep001')
        bro.execute_script(js)
        for wait_time in range(1, 30):
            print('here? [%s]' % wait_time)
            time.sleep(1)
            bro.execute_script(js)
        text = bro.find_elements_by_tag_name('img')
        for val in text:
            if val.get_attribute('data-original'):
                urls = val.get_attribute('data-original')
                if urls.startswith('https'):
                    url = urls.replace('\"', '').replace('com/50/', 'com/').replace('_hd', '_r').replace('_ipico',
                                                                                                          '_r').replace(
                        '_120x160', '_r').replace('_180x120', '_r')
                    t = time.time()
                    name2 = int(round(t * 1000))
                    print(url)
                    down(url, '360400273', name2)
            else:
                continue
        time.sleep(5)
        bro.quit()
    except:
        print('Driver not found!')
    A second pass at the same scrape, reworked to collect the image urls first and then report progress while downloading:
    from selenium import webdriver
    import time, os, requests
    
    
    def down(picurl, dirname, filename):
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open('%s/%s.jpg' % (dirname, filename), 'wb') as f:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            time.sleep(1)
            response = requests.get(url=picurl, headers=headers)
            if response:
                f.write(response.content)
                print('downloaded once at %s' % time.time())
    
    
    executable_path = 'C:/Users/Simon/Downloads/chromedriver_win32/chromedriver.exe'
    if not os.path.exists(executable_path):
        print('Driver not found! Instead of the try/except below, putting the code in an else branch also works')
    try:
    
        url_list = []
        bro = webdriver.Chrome(executable_path=executable_path)
    
        # bro.get('https://www.zhihu.com/question/68381376')
        bro.get('https://www.zhihu.com/')
        time.sleep(15)
        bro.get('https://www.zhihu.com/question/357826995')
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        print('sleep001')
        bro.execute_script(js)
        for wait_time in range(1, 30):
            print('here? [%s]' % wait_time)
            time.sleep(1)
            bro.execute_script(js)
        text = bro.find_elements_by_tag_name('img')
        for val in text:
            if val.get_attribute('data-original'):
                urls = val.get_attribute('data-original')
                if urls.startswith('https'):
                    url = urls.replace('\"', '').replace('com/50/', 'com/').replace('_hd', '_r').replace('_ipico',
                                                                                                          '_r').replace(
                        '_120x160', '_r').replace('_180x120', '_r')
                    url_list.append(url)
            else:
                continue
        if len(url_list) > 0:
            len_url = len(url_list)
            for index, url in enumerate(url_list):
                t = time.time()
                name2 = int(round(t * 1000))
                print('%s images in total; this is number %s; current url: %s' % (len_url, index + 1, url))
                down(url, '357826995', name2)
        time.sleep(5)
        bro.quit()
    except:
        print('Driver not found!')
    Use this second version; it prints progress as it goes.