  • A log of the budejie (不得姐) video-collection process, rebuilt with Python 3 and the Scrapy framework on macOS

    1. Set up a Python 3 virtual environment (Virtualenvwrapper)

    See http://www.cnblogs.com/it-tsz/p/pyhton.html
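    A minimal sketch of the environment setup, assuming virtualenvwrapper is already installed and configured (the environment name scrapy3 is just an example):

    mkvirtualenv --python=python3 scrapy3
    workon scrapy3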

    2. Install Scrapy

    First make sure pip and setuptools are installed, then install the following modules:

     pip install lxml

    pip install twisted

    pip install pyopenssl

    On Windows you also need to install pywin32 (pip install pywin32).

    Finally, install Scrapy itself:

    pip install scrapy
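    As a quick sanity check (not part of the original log), the following command should print the installed version if everything went well:

    scrapy version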

    3. Generate the Scrapy spider project template

    scrapy startproject <project_name> [project_dir]

    For example:

    scrapy startproject budejie
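    This writes out a project skeleton; with a default Scrapy install the generated layout looks roughly like this:

    budejie/
        scrapy.cfg            # deploy configuration
        budejie/              # the project's Python module
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py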

    4. Generate the spider module

    scrapy genspider [options] <name> <domain>

    For example:

    cd budejie 

    scrapy genspider getbudejievideo budejie.com
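    genspider drops a minimal spider skeleton into budejie/spiders/getbudejievideo.py, roughly like the following (the default basic template; exact contents depend on the Scrapy version):

    # -*- coding: utf-8 -*-
    import scrapy


    class GetbudejievideoSpider(scrapy.Spider):
        name = 'getbudejievideo'
        allowed_domains = ['budejie.com']
        start_urls = ['http://budejie.com/']

        def parse(self, response):
            pass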

    5. Edit the spider module (getbudejievideo.py)

    # -*- coding: utf-8 -*-
    import scrapy
    import os
    import urllib.request
    from lxml import etree


    # Callback for urlretrieve() that reports the current download progress.
    # a: number of data blocks downloaded so far
    # b: size of each data block
    # c: total size of the remote file
    def jindu(a, b, c):
        if not a:
            print("Connection opened")
        if c < 0:
            print("The file to download has size 0")
        else:
            per = 100 * a * b / c
            if per > 100:
                per = 100
            print(" Current download progress: " + '%.2f%%' % per, end='')
            if per == 100:
                return True


    ''' def __init__(self):
        self.headers = {
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            # 'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
    '''


    class GetbudejievideoSpider(scrapy.Spider):
        name = 'getbudejievideo'
        allowed_domains = ['budejie.com']
        start_urls = ['http://budejie.com/video']

        cur_page = 1

        def parse(self, response):
            print('*' * 100)

            # Create the directory that the videos are saved into
            path = os.path.join(os.path.abspath(os.path.curdir), 'videos')
            if not os.path.exists(path):
                os.mkdir(path)

            # Collect the URLs of all videos on the current page
            try:
                data = etree.HTML(response.text)
                video_urls = data.xpath('//div[@class="j-video-c"]/div[@data-mp4]')

                # <a href="2" class="pagenxt">下一页</a>
                nextpage = data.xpath('//a[@class="pagenxt"]')
                if nextpage:
                    nextpage = nextpage[0].get('href')
            except Exception:
                print('lxml parse failed------------------------------')
                return

            if not video_urls:
                return

            # Download the video file behind every video URL on this page
            for v in video_urls:
                video_url = v.get('data-mp4')
                print('Downloading: {}'.format(video_url))
                p = os.path.join(path, video_url.split('/')[-1])

                print(p)

                if not os.path.exists(p):
                    try:
                        urllib.request.urlretrieve(video_url, p, jindu)
                    except Exception:
                        print(" Failed to download file: {}".format(video_url))

            # Check whether there is a next page
            if nextpage:
                # the last page links back to page 1, so stop there
                if nextpage == '1':
                    return
                nextpage_url = self.start_urls[0] + '/' + nextpage
                self.cur_page += 1
                print(' Downloading page {} of videos: {}'.format(self.cur_page, nextpage_url))
                # Yield a new Request so that parse() is called again on the next page
                yield scrapy.Request(nextpage_url, callback=self.parse)
            else:
                return
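    One caveat: urllib.request.urlretrieve blocks Scrapy's event loop, so the crawl stalls while each file downloads. A more idiomatic alternative (a sketch, not part of the original log) is Scrapy's built-in FilesPipeline, which downloads through Scrapy's own asynchronous downloader:

    # settings.py: enable the built-in file-download pipeline
    ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
    FILES_STORE = 'videos'  # download target directory

    # in parse(), yield items instead of calling urlretrieve:
    #     yield {'file_urls': [video_url]}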
    6. Edit the following options in the configuration file settings.py

    ...
    # The following makes the crawler identify itself as a regular browser
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'budejie (+http://www.budejie.com)'
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

    # Obey robots.txt rules (set to False here to skip the robots.txt check)
    ROBOTSTXT_OBEY = False
    ...
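    Before launching the full crawl, the XPath expressions can be verified interactively with Scrapy's shell (an optional check using standard Scrapy tooling; the selector line mirrors the spider's query):

    scrapy shell 'http://www.budejie.com/video'
    >>> response.xpath('//div[@class="j-video-c"]/div[@data-mp4]')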
    7. Start the data collection with Scrapy

    scrapy crawl getbudejievideo
    8. Tested successfully.

    Through this exercise I learned more about Scrapy, XPath, and related topics, and the hands-on work improved my ability to analyze and solve problems. Keep it up!

     

    The spider module (getbudejievideo.py) modified to avoid the external lxml XPath library:

    # -*- coding: utf-8 -*-
    import scrapy
    import os
    import urllib.request
    import re


    # Callback for urlretrieve() that reports the current download progress.
    # a: number of data blocks downloaded so far
    # b: size of each data block
    # c: total size of the remote file
    def jindu(a, b, c):
        if not a:
            print("Connection opened")
        if c < 0:
            print("The file to download has size 0")
        else:
            per = 100 * a * b / c
            if per > 100:
                per = 100
            print(" Current download progress: " + '%.2f%%' % per, end='')
            if per == 100:
                return True


    class GetbudejievideoSpider(scrapy.Spider):
        name = 'getbudejievideo'
        allowed_domains = ['www.budejie.com']
        start_urls = ['http://www.budejie.com/video']

        cur_page = 1

        def parse(self, response):
            print('Downloading url: {}'.format(response.url))
            # Create the directory that the videos are saved into
            path = os.path.join(os.path.abspath(os.path.curdir), 'videos')
            if not os.path.exists(path):
                os.mkdir(path)
            # Collect the URLs of all videos on the current page
            print('-' * 100)
            try:
                # data = etree.HTML(response.text)
                video_urls = response.xpath('//div[@class="j-video-c"]/div[@data-mp4]').extract()
                v_urls = []
                for i in video_urls:
                    v = re.findall(r'data-mp4="(.*?)"', i, re.M)
                    if v:
                        v_urls.append(v[0])
                # <a href="2" class="pagenxt">下一页</a>
                nextpage = response.xpath('//a[@class="pagenxt"]').extract()
                if nextpage:
                    nextpage = re.findall(r'href="(.*?)"', nextpage[0])
            except Exception:
                print('xpath parse failed:')
                return
            if not v_urls:
                return
            # Download the video file behind every URL found on this page
            for v in v_urls:
                print('Downloading: {}'.format(v))
                p = os.path.join(path, v.split('/')[-1])

                print(p)

                if not os.path.exists(p):
                    try:
                        urllib.request.urlretrieve(v, p, jindu)
                    except Exception:
                        print(" Failed to download file: {}".format(v))

            # Check whether there is a next page
            if nextpage:
                # the last page links back to page 1, so stop there
                if nextpage[0] == '1':
                    return

                nextpage_url = self.start_urls[0] + '/' + nextpage[0]
                print(nextpage_url)

                self.cur_page += 1
                print(' Downloading page {} of videos: {}'.format(self.cur_page, nextpage_url))
                # Yield a new Request so that parse() is called again on the next page
                yield scrapy.Request(nextpage_url, callback=self.parse)
            else:
                return
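    As a side note, Scrapy's selectors can read attribute values directly in the XPath expression, which would make the regex post-processing unnecessary; a small untested sketch:

    v_urls = response.xpath('//div[@class="j-video-c"]/div/@data-mp4').extract()
    nextpage = response.xpath('//a[@class="pagenxt"]/@href').extract_first()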

    Auxiliary debug module (debug.py):

    from scrapy import cmdline

    if __name__ == '__main__':
        cmdline.execute('scrapy crawl getbudejievideo'.split(' '))

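    Running python debug.py from the project root starts the same crawl as scrapy crawl getbudejievideo; the benefit is that the spider can then be stepped through in an IDE debugger.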
