zoukankan      html  css  js  c++  java
  • 9.python爬虫--pyspider

    pyspider简介

    PySpider:一个国人编写的强大的网络爬虫系统并带有强大的WebUI。采用Python语言编写,分布式架构,支持多种数据库后端,强大的WebUI支持脚本编辑器,任务监视器,项目管理器以及结果查看器。在线示例:http://demo.pyspider.org/,学习教程:http://www.pyspider.cn/page/1.html

    项目需求:

    爬取 http://www.adquan.com/ 上面的文章供公司内部学习

    简单实现,初步版(命名规范忽略):

    # -*- coding: utf-8 -*-
    #__author:jiangjing
    #date:2018/2/2
    # !/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2017-12-07 13:40:43
    # Project: adquan
    
    from pyspider.libs.base_handler import *

    import datetime
    import logging
    import os
    import re
    import urlparse
    import uuid

    import requests
    
    UPLOAD_IMAGE_URL = "http://10.32.64.194:8233/api/NoLogin/UploadImage"       #上传图片至目标服务器
    ADD_WEIBO_ARTICLE_URL = "http://10.32.64.194:8233/api/NoLogin/AddDraft"     #把当前文章添加到草稿
    WEIBO_IMAGE_URL = "/upload/image"                                               #地址拼接用
    PLAY_VIDEO_URL = "http://10.32.64.196:10001/hls/video/catch"                         #播放视频的地址
    IMAGE_DIR_PATH = "/var/hls/image/catch"                                         #图片存放地址
    VIDEO_DIR_PATH = "/var/hls/video/catch"                                         #视频存放地址
    
    class Handler(BaseHandler):
        """pyspider handler that crawls adquan.com articles, mirrors their
        images and videos locally, and posts the rewritten article to an
        internal "weibo" draft service.
        """
        crawl_config = {
        }

        def __init__(self):
            # Deal (defined below in this file) provides path helpers and
            # HTML cleanup; it also creates today's storage directories.
            self.deal = Deal()

        @every(minutes=24 * 60 * 3)
        def on_start(self):
            """Entry point: re-crawl the site front page every 3 days."""
            self.crawl('http://www.adquan.com', callback=self.index_page)

        @config(age=100 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every article linked from the front-page work list."""
            for each in response.doc('.work_list_left .w_l_inner_img a').items():
                self.crawl(each.attr.href, callback=self.detail_page)

        @config(priority=2)
        def detail_page(self, response):
            """Scrape one article page: cover image, inline images and videos,
            then push the rewritten article to the draft service."""
            today_str = datetime.date.today().strftime("%Y-%m-%d")

            # Cover image: downloaded synchronously so cover_guid is known
            # before the article itself is submitted below.
            cover_guid = ''
            for img in response.doc('.con_pic_title img').items():
                url = img.attr.src
                if url:
                    image_path = self.deal.getImageDirPath()
                    extension = self.deal.getExtension(url)
                    guid = str(uuid.uuid1()).replace('-', '')
                    file_name = "origin_" + guid + '.' + extension
                    file_path = image_path + '/' + file_name
                    content = requests.get(url).content
                    self.deal.saveImg(content, file_path)
                    self.upload_image_to_weibo(file_path, guid, file_name)
                    cover_guid = guid

            # Body images: fetched asynchronously via self.crawl; the <img src>
            # is rewritten now to point at the mirrored location.
            for img in response.doc('.con_Text img').items():
                url = img.attr.src
                if url:
                    extension = self.deal.getExtension(url)
                    guid = str(uuid.uuid1()).replace('-', '')
                    file_name = "origin_" + guid + '.' + extension
                    self.crawl(img.attr.src, callback=self.save_img,
                               save={'file_name': file_name, 'guid': guid})
                    img.attr.src = '%s/%s/%s' % (
                        WEIBO_IMAGE_URL,
                        datetime.date.today().strftime("%Y%m%d"),
                        file_name)

            # Embedded videos: extract the Tencent video id from the iframe
            # URL and replace the iframe with a <video> tag on our mirror.
            for video in response.doc('.con_Text iframe').items():
                width = video.attr.width
                if not width:  # FIX: original line was truncated ("if not ")
                    width = 600
                iframe_url = str(video.attr('data-src')).strip()
                if not video.attr('data-src'):
                    iframe_url = str(video.attr.src).strip()
                if not iframe_url:
                    continue
                ret = urlparse.urlparse(iframe_url)
                # FIX: restore the backslashes stripped from the pattern.
                vids = re.findall(r'vid=(\d|\w*)&?', ret.query)
                if not vids or not vids[0].strip():
                    # FIX: 'logger' was undefined and 'url' was a stale
                    # variable left over from the image loops.
                    logging.error("get video id failed, url:%s", iframe_url)
                    continue
                guid = str(uuid.uuid1()).replace('-', '')
                play_url = '%s/%s/%s.mp4' % (PLAY_VIDEO_URL, today_str, guid)
                cover_img = '%s/%s/%s.jpg' % (PLAY_VIDEO_URL, today_str, guid)
                # FIX: quote attribute values so the generated HTML is valid;
                # the poster is part of the replacement markup, so the old
                # post-replace 'video.attr.poster' assignment was dead code.
                video.replaceWith(
                    '<video controls="1" src="%s" width="%s" poster="%s"></video>'
                    % (play_url, str(width), cover_img))
                self.download_video(vids[0].strip(), guid)

            if response.doc('.text_title').text() != '':
                html_content = response.doc('.con_Text').html()
                text_content = self.deal.filterTag(html_content)
                self.add_article_to_weibo(response.doc('.text_title').text(),
                                          html_content, text_content, 2,
                                          cover_guid)

        def add_article_to_weibo(self, title, content, contentText, articleType, picguid):
            """POST one article (HTML + plain text + cover guid) as a draft."""
            data = {'title': title, "content": content, "contentText": contentText,
                    "articleType": articleType, "picguid": picguid}
            response = requests.post(ADD_WEIBO_ARTICLE_URL, data=data)
            return {
                "add_article_to_weibo": response.text
            }

        def download_video(self, vid, guid):
            """Resolve a Tencent video id to a vkey, download the mp4 and
            store it (plus a poster frame) under today's video directory.

            Best effort: a failed download must not abort the page crawl.
            """
            data = {
                "otype": "xml",
                "platform": 11,
                "format": 2,
                "vid": vid,
                "filename": "1.mp4",
                "appver": '3.2.19.333'
            }
            try:
                response = requests.post('http://vv.video.qq.com/getkey', data=data)
                # FIX: restore the backslashes stripped from the pattern.
                keys = re.findall(r'<key>([\s\S]+?)</key>', response.text)
                if keys:
                    video_url = ('http://videohy.tc.qq.com/video.dispatch.tc.qq.com/'
                                 '%s.mp4?vkey=%s&ocid=2692093356'
                                 % (vid, keys[0].strip()))
                    response = requests.get(video_url)
                    self.save_vedio(response.content, guid)
            except Exception:
                # FIX: the original 'try/finally: pass' did nothing; log the
                # failure explicitly instead of crashing the whole page.
                logging.exception("download video failed, vid:%s", vid)

        def save_img(self, response):
            """pyspider callback: persist a fetched body image and upload it."""
            content = response.content
            image_path = self.deal.getImageDirPath()
            file_name = response.save['file_name']
            guid = response.save['guid']
            file_path = image_path + '/' + file_name
            self.deal.saveImg(content, file_path)
            self.upload_image_to_weibo(file_path, guid, file_name)

        def upload_image_to_weibo(self, file_path, guid, file_name):
            """Upload a stored image to the target server.

            FIX: the file handle was never closed; use a with-block.
            """
            data = {'guid': guid, "fileName": file_name}
            with open(file_path, 'rb') as fp:
                response = requests.post(UPLOAD_IMAGE_URL, data=data,
                                         files={'file': fp})
            return {
                "upload_image": response.text
            }

        def save_vedio(self, content, guid):
            """Write video bytes to <video dir>/<guid>.mp4 and grab a poster
            frame at second 1 via ffmpeg. (Name kept for existing callers.)"""
            file_name = guid + ".mp4"
            video_path = self.deal.getVideoDirPath()
            file_path = video_path + '/' + file_name
            self.deal.saveVedio(content, file_path)
            os.system('ffmpeg -i %s -y -f  image2  -ss 1 -vframes 1  %s'
                      % (file_path, file_path.replace('.mp4', '.jpg')))

        def cut_video(self, shell):
            # Thin wrapper; 'shell' is built by trusted internal code.
            os.system(shell)
    
    
    class Deal:
        """Filesystem and HTML-cleanup helpers used by Handler."""

        def __init__(self):
            # Ensure today's image and video directories exist up front.
            today_str = datetime.date.today().strftime("%Y-%m-%d")
            self.mkDir('%s/%s' % (IMAGE_DIR_PATH, today_str))
            self.mkDir('%s/%s' % (VIDEO_DIR_PATH, today_str))

        def getImageDirPath(self):
            """Return today's image directory path, e.g. <IMAGE_DIR_PATH>/2018-02-02."""
            today_str = datetime.date.today().strftime("%Y-%m-%d")
            return '%s/%s' % (IMAGE_DIR_PATH, today_str)

        def getVideoDirPath(self):
            """Return today's video directory path."""
            today_str = datetime.date.today().strftime("%Y-%m-%d")
            return '%s/%s' % (VIDEO_DIR_PATH, today_str)

        def mkDir(self, path):
            """Create *path* (stripped) if it does not exist; return it."""
            path = path.strip()
            if not os.path.exists(path):
                os.makedirs(path)
            return path

        def saveImg(self, content, path):
            """Write binary image *content* to *path* (FIX: close via with)."""
            with open(path, 'wb') as f:
                f.write(content)

        def saveVedio(self, content, path):
            """Write binary video *content* to *path* (name kept for callers)."""
            with open(path, 'wb') as f:
                f.write(content)

        def getExtension(self, url):
            """Return the text after the last '.' in *url*."""
            return url.split('.')[-1]

        def filterTag(self, htmlstr):
            """Strip tags, scripts, styles and comments from an HTML string,
            collapse whitespace, and decode common character entities.

            FIX: every pattern below had lost its backslashes to extraction
            damage (e.g. '[sS]' instead of '[\\s\\S]', 'w+' instead of '\\w+'),
            and the newline literals were split across physical lines; all
            are restored as raw strings.
            """
            re_cdata = re.compile(r'<!DOCTYPE HTML PUBLIC[^>]*>', re.I)
            re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # strip scripts
            re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # strip styles
            re_br = re.compile(r'<br\s*?/?>')
            re_h = re.compile(r'</?\w+[^>]*>')  # any remaining tag
            re_comment = re.compile(r'<!--[\s\S]*-->')
            s = re_cdata.sub('', htmlstr)
            s = re_script.sub('', s)
            s = re_style.sub('', s)
            s = re_br.sub('\n', s)  # <br> becomes a line break
            s = re_h.sub(' ', s)
            s = re_comment.sub('', s)
            s = re.compile(r'\n+').sub('\n', s)  # collapse blank lines
            s = re.sub(r'\s+', ' ', s)  # collapse all whitespace runs
            return self.replaceCharEntity(s)

        def replaceCharEntity(self, htmlstr):
            """Replace common HTML character entities (named or numeric, e.g.
            &lt; / &#60;) with literal characters; unknown entities are dropped.
            Extend CHAR_ENTITIES to handle more entities.

            FIX: 'quot' mapped to two quote characters ('"''"') and the
            pattern lacked the backslash in \\w+; both restored. nbsp/160 now
            map to a plain space.
            """
            CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                             'lt': '<', '60': '<',
                             'gt': '>', '62': '>',
                             'amp': '&', '38': '&',
                             'quot': '"', '34': '"'}
            # Named group 'name' captures the entity body between '&'/'&#' and ';'.
            re_charEntity = re.compile(r'&#?(?P<name>\w+);')
            sz = re_charEntity.search(htmlstr)
            while sz:
                key = sz.group('name')
                try:
                    # Replace only the first match, then rescan from the start.
                    htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
                except KeyError:
                    # Unknown entity: drop it.
                    htmlstr = re_charEntity.sub('', htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
            return htmlstr
  • 相关阅读:
    Android中Scrollview、ViewPager冲突问题汇总(已解决)
    Android 关于ZXing的使用
    startActivityForResult用法详解
    SVN的使用(服务端与客户端)
    Genymotion安装常见问题
    Android Viewpager实现图片轮播(仿优酷效果)
    Android Shape 详解
    Android apktool反编译资源文件为空解决办法(测试天猫、淘宝等apk成功)
    查看CentOS版本
    新建git仓库并与github同步
  • 原文地址:https://www.cnblogs.com/jiangjing/p/8427680.html
Copyright © 2011-2022 走看看