zoukankan      html  css  js  c++  java
  • 一点资讯 视频抓取 phantomjs

    # _*_ coding: utf-8 _*_
    
    """
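    Approach:
    1. Load each list page with PhantomJS to simulate a browser click/visit.
    2. For every list link, scrape only the first page (9-10 items) and de-duplicate by title.
    3. Schedule as a timed job that runs once a day at 08:00.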
    """
    
    import MySQLdb
    import redis
    import sys
    import os
    import re
    import urllib
    import requests
    import time
    import hashlib
    import traceback
    import urlparse
    import random
    import signal
    # import multiprocessing
    import matplotlib
    matplotlib.use("Agg")
    import shutil
    import socket #图片下载延迟的
    socket.setdefaulttimeout(30)
    import multiprocessing
    from config import IConfig
    from video_list import ydzx_url_list
    from bs4 import BeautifulSoup
    from upload_images import UploadFile
    from moviepy.editor import VideoFileClip
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    
    # Python 2 only: sys.setdefaultencoding is removed from the sys module at
    # interpreter start-up, so the module is reloaded to get it back.  The
    # default codec is then switched to UTF-8 so that implicit str/unicode
    # conversions of the Chinese text used throughout this script
    # (e.g. str(title.text)) do not raise UnicodeEncodeError.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class WxpnVideo(multiprocessing.Process):
        """Worker process that crawls Yidian Zixun video pages and republishes them.

        The flow implemented by :meth:`run` is:

        1. seed a redis set with the list-page URLs from ``ydzx_url_list``;
        2. render every list page with PhantomJS and yield the articles whose
           title hash is not yet in the "seen titles" redis set;
        3. for every such article download the thumbnail and the video, upload
           both through ``UploadFile.upload_to_oss``, insert the rows into
           MySQL and call the publish API.

        A lock file prevents two runs from overlapping and a run stops picking
        up new articles after ``_MAX_RUNTIME`` seconds.
        """

        # (unit marker, seconds per unit) pairs recognised in relative
        # publish-time labels such as "3天前" or "5小时前".
        _RELATIVE_UNITS = (
            ('个月', 30 * 24 * 3600),
            ('天', 24 * 3600),
            ('小时', 3600),
            ('分', 60),
        )

        # Maximum wall-clock time of one run, in seconds (3 hours).
        _MAX_RUNTIME = 10800

        _PHANTOMJS_PATH = '/usr/local/phantomjs/bin/phantomjs'
        _PHANTOMJS_UA = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        )

        def __init__(self):
            """Load configuration and open the redis, MySQL and OSS handles."""
            multiprocessing.Process.__init__(self)

            self.redisConf = IConfig.load('resource.redis')
            self.redisServer = redis.Redis(
                host=self.redisConf['host'],
                port=self.redisConf['port'],
                db=self.redisConf['db'],
                password=self.redisConf['passwd'])

            self.dbConfig = IConfig.load('resource.mysql')
            self.conn = MySQLdb.connect(
                user=self.dbConfig['user'],
                passwd=self.dbConfig['password'],
                db=self.dbConfig['dbname'],
                host=self.dbConfig['host'],
                charset="utf8",
                use_unicode=True)

            self.conn.ping(True)
            self.cursor = self.conn.cursor()

            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
                'Host': 'www.yidianzixun.com',
                'X-Requested-With': 'XMLHttpRequest',
                'Connection': 'keep-alive',
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
            }

            self.domain = IConfig.load('resource.domain')
            self.apiConf = IConfig.load('resource.apiurl')

            # Redis set of list-page URLs still to crawl, and redis set of
            # md5(title) values of videos that were already imported.
            self.key_video_list = 'wxpn:video:list'
            self.key_title = 'wxpn:video:title'

            self.storeConfig = IConfig.load('resource.store')
            self.thumb_path = self.storeConfig['images_path']

            self.ossConf = IConfig.load('resource.oss')
            self.key_id = self.ossConf['access_key_id']
            self.key_secret = self.ossConf['access_key_secret']
            # 'endponit' (sic) is the key name used by the configuration file.
            self.endponit = self.ossConf['endponit']

            self.img_upload = UploadFile()
            self.auth = self.img_upload.auth_oss(self.key_id, self.key_secret)

            self.videoConf = self.apiConf
            # Stored as ``video_publish_api`` rather than ``video_publish``: an
            # instance attribute with the latter name would shadow the
            # video_publish() method and make it uncallable.
            self.video_publish_api = self.videoConf['video_publish_api']
            self.ydzx_page_api = self.videoConf['ydzx_page_api']

            self.start_time = int(time.time())

        @staticmethod
        def _md5_hex(text):
            """Return the hex MD5 digest of *text* (text is encoded as UTF-8)."""
            if not isinstance(text, bytes):
                text = text.encode('utf-8')
            return hashlib.md5(text).hexdigest()

        def store_video_list_redis(self, video_list):
            """Add every URL in *video_list* to the pending list-page set.

            Returns ``False`` when *video_list* is empty, ``True`` otherwise.
            """
            if not video_list:
                return False

            # SADD ignores members that already exist, so no SISMEMBER
            # round-trip is needed first.
            for link in video_list:
                self.redisServer.sadd(self.key_video_list, link)
            return True

        def _render_page(self, link):
            """Return the PhantomJS-rendered HTML of *link*, or ``None`` on failure."""
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = self._PHANTOMJS_UA

            driver = None
            try:
                driver = webdriver.PhantomJS(
                    desired_capabilities=dcap,
                    executable_path=self._PHANTOMJS_PATH)

                # Random pauses before and after loading the page.
                time.sleep(random.randrange(3, 8))
                driver.get(link)
                time.sleep(random.randrange(2, 6))

                return driver.page_source
            except Exception:
                print(traceback.format_exc())
                return None
            finally:
                # Always tear the browser down, including when get() raised,
                # so that no phantomjs process is left behind.
                if driver is not None:
                    try:
                        driver.service.process.send_signal(signal.SIGTERM)
                    except Exception:
                        print(traceback.format_exc())
                    try:
                        driver.quit()
                    except Exception:
                        print(traceback.format_exc())

        def get_video_para(self):
            """Yield ``(title_md5, docid)`` for every not-yet-imported list entry.

            List-page URLs are popped from the pending redis set until it is
            empty.  Entries whose title hash is already in the seen-titles set
            are skipped.
            """
            while True:
                # SPOP returns None once the set is empty.
                link = self.redisServer.spop(self.key_video_list)
                if not link:
                    break
                print(link)

                text = self._render_page(link)
                if text is None:
                    continue

                soup = BeautifulSoup(text, 'lxml')
                title_list = soup.select('div.channel-news div.doc-title')
                itemid_list = soup.select('div.channel-news a.style-content-middle')

                if not (title_list and itemid_list):
                    print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))
                    continue

                # zip() stops at the shorter list, so a length mismatch between
                # titles and anchors cannot raise IndexError.
                for title, anchor in zip(title_list, itemid_list):
                    try:
                        print(title.text)
                        psw = self._md5_hex(title.text.strip())
                        itemid = anchor['data-docid']
                        seen = self.redisServer.sismember(self.key_title, psw)
                    except Exception:
                        # One malformed entry must not discard the rest of the page.
                        print(traceback.format_exc())
                        continue

                    if not seen:
                        yield psw, itemid

        def time_cycle(self, origin_time):
            """Convert a publish-time label into a Unix timestamp.

            Understands "昨天", "前天", relative labels containing a number
            followed by 个月 / 天 / 小时 / 分, and absolute dates formatted
            as ``%Y.%m.%d``.  Months are counted as 30 days.

            Returns the timestamp as an ``int``, or ``None`` (after printing
            the traceback) when the label cannot be interpreted.
            """
            now = int(time.time())

            try:
                label = origin_time.strip()

                if label == '昨天':
                    return now - 24 * 3600
                if label == '前天':
                    return now - 2 * 24 * 3600

                for unit, seconds in self._RELATIVE_UNITS:
                    if unit not in label:
                        continue
                    match = re.search(r'(\d+)\s*' + unit, label)
                    if match:
                        return now - int(match.group(1)) * seconds

                return int(time.mktime(time.strptime(label, "%Y.%m.%d")))
            except Exception:
                print(traceback.format_exc())
                return None

        def _store_thumbnail(self, psw, thumb_src):
            """Download the poster image, upload it and return its public URL."""
            query = urlparse.parse_qs(urlparse.urlparse(thumb_src).query, True)
            if 'wx_fmt' in query:
                ext_name = '.' + query['wx_fmt'][0]
            else:
                ext_name = '.png'

            thumb_dir = self.thumb_path + 'video/thumb'
            if not os.path.isdir(thumb_dir):
                # makedirs: the intermediate 'video' directory may not exist yet.
                os.makedirs(thumb_dir)

            file_name = psw[:20] + ext_name
            local_path = thumb_dir + '/' + file_name
            urllib.urlretrieve(thumb_src, local_path)

            if os.path.exists(local_path):
                self.img_upload.upload_to_oss(
                    self.auth, self.endponit, self.ossConf['video_thumb_path'],
                    file_name, local_path)

            return self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name

        def _fetch_video(self, video_src):
            """Stream *video_src* to the local video directory.

            Returns the local path, or ``None`` when the download failed.
            """
            video_dir = self.thumb_path + 'video'
            if not os.path.isdir(video_dir):
                os.makedirs(video_dir)
            video_path = video_dir + '/' + str(video_src).split('/')[-1]

            try:
                # Stream in chunks so the file is never held in memory as a
                # whole; the timeout keeps a stalled server from hanging the run.
                resp = requests.get(video_src, stream=True, timeout=60)
                try:
                    with open(video_path, 'wb') as handle:
                        for chunk in resp.iter_content(chunk_size=65536):
                            if chunk:
                                handle.write(chunk)
                finally:
                    resp.close()
            except Exception:
                print(traceback.format_exc())
                return None

            return video_path

        def _notify_publish(self, contentid):
            """Call the publish API for *contentid*; failures are only logged."""
            api_url = self.video_publish_api + str(contentid)
            try:
                resp = urllib.urlopen(api_url)
                try:
                    resp.read()
                finally:
                    resp.close()
            except Exception:
                print('connect failed')

        def download_video(self, psw, itemid):
            """Import one article: download, upload, store in MySQL and publish.

            :param psw: MD5 hex digest of the article title (de-duplication key).
            :param itemid: document id appended to the article URL.
            :returns: ``True`` when the video was imported, ``False`` when it
                was skipped or a step failed.
            """
            now = int(time.time())

            url = 'http://www.yidianzixun.com/article/' + itemid
            print(url)
            headers = dict(self.headers)
            headers['Referer'] = url

            try:
                res = requests.get(url=url, headers=headers, timeout=60)
            except Exception:
                print('小链接连接失败')
                return False

            print(res.status_code)
            if res.status_code != 200:
                print('一点资讯视频主链接请求失败,请及时查看原因')
                return False

            soup = BeautifulSoup(res.text, 'lxml')

            video_tags = soup.select('div.video-wrapper > video')
            if not video_tags or not video_tags[0].get('src'):
                print('此篇为文章,不是视频')
                return False
            video_src = video_tags[0]['src']
            thumb_src = video_tags[0].get('poster')
            if not thumb_src:
                print('video poster not found: ' + url)
                return False

            title_tags = soup.select('div.left-wrapper > h2')
            if not title_tags:
                print('article title not found: ' + url)
                return False
            title = title_tags[0].text

            meta_links = soup.select('body.page-article .left-wrapper > .meta > a')
            meta_spans = soup.select('body.page-article .left-wrapper > .meta > span')

            if meta_links:
                source = meta_links[0].text
            elif meta_spans:
                # No link: the first span carries the source behind a "来源"
                # prefix.  The full-width colon variant is stripped as well.
                source = str(meta_spans[0].text).replace('来源:', '').replace('来源：', '')
            else:
                source = ''

            # time_cycle() returns None for labels it cannot read; fall back to
            # the import time in that case.
            timestamp = None
            if meta_spans:
                timestamp = self.time_cycle(str(meta_spans[0].text))
            if timestamp is None:
                timestamp = now

            thumb_url = self._store_thumbnail(psw, thumb_src)
            psw_thumb = self._md5_hex(thumb_url)

            duration = re.search(r'"duration":\s*(\d+)', res.text)
            playtime = duration.group(1) if duration else None

            video_path = self._fetch_video(video_src)
            if video_path is None:
                return False

            # Opening the file with moviepy checks that it is a readable video.
            try:
                clip = VideoFileClip(video_path)
            except Exception:
                print(traceback.format_exc())
                return False
            try:
                print(clip.duration)
            finally:
                # Release the reader if this moviepy version offers close().
                close_clip = getattr(clip, 'close', None)
                if close_clip is not None:
                    close_clip()

            video_file = str(video_src).split('/')[-1]
            video_name = video_file[10:]
            video_id = video_file[10:-4]

            status = self.img_upload.upload_to_oss(
                self.auth, self.endponit, self.ossConf['video_path'],
                video_name, video_path)
            if status != 'success':
                return False

            print('视频上传成功')
            video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name

            # Resolved before the inserts below because it commits on its own.
            sourceid = self.get_article_sourceid(source) if source else 0

            # The three rows belong together: commit them as one unit so a
            # failure cannot leave a topic without content or a content row
            # without its video.
            try:
                self.cursor.execute(
                    "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')",
                    (title, thumb_url, now, psw_thumb))
                topicid = self.cursor.lastrowid

                self.cursor.execute("""
                            insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby)
                            values(%s, %s, %s, %s, %s, %s, %s, %s, 60, %s, %s, %s, %s, %s, %s)
                            """, (topicid, sourceid, 47, 4, title, None, title, '', 3, now, 0, timestamp, thumb_url, 0))
                lastrowid = self.cursor.lastrowid

                self.cursor.execute(
                    "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)",
                    (lastrowid, video_link, playtime, source, video_id, 0))
                self.conn.commit()
            except Exception:
                print(traceback.format_exc())
                self.conn.rollback()
                return False

            # Only mark the title as imported once the rows are stored.
            self.redisServer.sadd(self.key_title, psw)
            self._notify_publish(lastrowid)
            return True

        def get_article_sourceid(self, source, medias=None):
            """Return the ``cmstop_source`` id for *source*, creating the row if needed.

            :param source: source (publisher) name.
            :param medias: optional collection of UTF-8 encoded names that have
                a signed contract.
            :returns: the source id, or ``0`` when the row could not be created.
            """
            source = source.strip()
            sourceid = 0

            has_signed_contract = 0
            if medias and source.encode('utf-8') in medias:
                has_signed_contract = 1

            # The name comes from a scraped page, so it is passed as a bound
            # parameter instead of being concatenated into the statement.
            found = self.cursor.execute(
                'select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`=%s',
                (source,))

            if found:
                data = self.cursor.fetchone()
                sourceid = data[0]

                # NOTE(review): when the caller passes no medias the stored flag
                # is reset to 0 here -- confirm this is intended.
                if data[2] != has_signed_contract:
                    try:
                        self.cursor.execute("""
                            update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s
                            """, (has_signed_contract, sourceid))
                        self.conn.commit()
                    except Exception:
                        print(traceback.format_exc())
                        self.conn.rollback()
            else:
                try:
                    self.cursor.execute("""
                        insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`)
                        values(%s, %s, %s, %s, %s)
                        """, (source, '', '', '', has_signed_contract))
                    self.conn.commit()
                    sourceid = self.cursor.lastrowid
                except Exception:
                    print(traceback.format_exc())
                    self.conn.rollback()

            return sourceid

        def run(self):
            """Process entry point: crawl until the queue is empty or time is up.

            Returns ``False`` without doing any work when the lock file already
            exists.
            """
            os.system('pkill phantomjs')

            lockConf = IConfig.load('resource.lock')
            lock_file = lockConf['lock_path_ydzx']

            # O_EXCL makes "check and create" a single atomic step.
            try:
                lock_fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            except OSError:
                if os.path.exists(lock_file):
                    print('lock file exists')
                    return False
                raise
            os.close(lock_fd)

            try:
                self.store_video_list_redis(ydzx_url_list)

                for psw, itemid in self.get_video_para():
                    print(psw)

                    if int(time.time()) - self.start_time >= self._MAX_RUNTIME:
                        break

                    try:
                        self.download_video(psw=psw, itemid=itemid)

                        time.sleep(random.uniform(2, 8))

                        os.system('pkill ffmpeg-osx-v3.2.4')
                    except Exception:
                        print(traceback.format_exc())
                        continue
            finally:
                # Runs on normal completion, on time-out and on unexpected
                # errors, so a crash cannot leave a stale lock behind.
                try:
                    self.del_file(self.thumb_path + 'video')
                finally:
                    if os.path.exists(lock_file):
                        os.remove(lock_file)

        def video_publish(self, min_contentid=3430851, max_contentid=3528920):
            """Call the publish API for every video whose content id is in range.

            :param min_contentid: lowest content id to publish (inclusive).
            :param max_contentid: highest content id to publish (inclusive).
            """
            sql = 'select contentid from cmstop_video where contentid<=%s and contentid>=%s'
            self.cursor.execute(sql, (max_contentid, min_contentid))

            for row in self.cursor.fetchall():
                self._notify_publish(row[0])

        def del_file(self, path):
            """Delete everything inside directory *path*, keeping the directory.

            Does nothing when *path* is not an existing directory.  The current
            working directory of the process is left untouched.
            """
            if not os.path.isdir(path):
                return

            for name in os.listdir(path):
                target = os.path.join(path, name)
                if os.path.isdir(target) and not os.path.islink(target):
                    shutil.rmtree(target)
                else:
                    # Regular files and symlinks (including broken ones).
                    os.remove(target)
    
    if __name__ == '__main__':
        # Launch a single crawler worker in its own process and block until
        # that process has finished.
        crawler = WxpnVideo()
        crawler.start()
        crawler.join()
  • 相关阅读:
    python读取excel保存到mysql
    python读取mysql返回json
    在C#后台使用MD5值对文件进行加
    使用文件流的形式上传大文件
    IE8兼容性问题
    解决 CentOS 下找不到库文件的问题
    openssl/ossl_typ.h:没有那个文件或目录
    解决 VSCode 进行 C/C++ 开发时 gcc 依赖缺失问题
    VSCode 中进行 C/C++ 开发需要的配置文件
    记一下使用 WeBASE 搭建自己的联盟链过程
  • 原文地址:https://www.cnblogs.com/19921019yy/p/9355369.html
Copyright © 2011-2022 走看看