zoukankan html css js c++ java
一点资讯视频抓取 phantomjs

# _*_ coding: utf-8 _*_

"""
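Approach:
1. Use PhantomJS to simulate clicks on the list pages.
2. For each link, crawl only the first page (9-10 items) and deduplicate by title.
3. Schedule a timed job that runs once a day at 8 o'clock.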
"""

import MySQLdb
import redis
import sys
import os
import re
import urllib
import requests
import time
import hashlib
import traceback
import urlparse
import random
import signal
# import multiprocessing
import matplotlib
matplotlib.use("Agg")
import shutil
import socket #图片下载延迟的
socket.setdefaulttimeout(30)
import multiprocessing
from config import IConfig
from video_list import ydzx_url_list
from bs4 import BeautifulSoup
from upload_images import UploadFile
from moviepy.editor import VideoFileClip
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Python 2 only: reload(sys) makes sys.setdefaultencoding reachable again so
# that implicit str <-> unicode conversions in this script (e.g. the
# str(title.text) calls below) use UTF-8 instead of ASCII.
# NOTE(review): neither call exists on Python 3 - confirm the target runtime.
reload(sys)
sys.setdefaultencoding('utf-8')

class WxpnVideo(multiprocessing.Process):

    def __init__(self):
        """Load every ``IConfig`` section the crawler needs and open its connections.

        Sets up, in this order: the Redis client, the MySQL connection and
        cursor, the HTTP headers used for article requests, domain/API
        settings, the Redis key names, the local storage path, the OSS
        credentials and uploader, the publish/page API URLs and the start
        timestamp that ``run`` uses for its run-time limit.
        """
        # Redis client (holds the list-page queue and the stored-title set).
        redis_conf = IConfig.load('resource.redis')
        self.redisConf = redis_conf
        self.redisServer = redis.Redis(
            host=redis_conf['host'],
            port=redis_conf['port'],
            db=redis_conf['db'],
            password=redis_conf['passwd'],
        )

        # MySQL connection and a single shared cursor.
        mysql_conf = IConfig.load('resource.mysql')
        self.dbConfig = mysql_conf
        connect_kwargs = {
            'user': mysql_conf['user'],
            'passwd': mysql_conf['password'],
            'db': mysql_conf['dbname'],
            'host': mysql_conf['host'],
            'charset': "utf8",
            'use_unicode': True,
        }
        self.conn = MySQLdb.connect(**connect_kwargs)
        self.conn.ping(True)
        self.cursor = self.conn.cursor()

        # Headers sent with every article-page request.
        self.headers = dict([
            ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'),
            ('Host', 'www.yidianzixun.com'),
            ('X-Requested-With', 'XMLHttpRequest'),
            ('Connection', 'keep-alive'),
            ('Accept', '*/*'),
            ('Accept-Encoding', 'gzip, deflate'),
            ('Accept-Language', 'zh-CN,zh;q=0.9'),
        ])

        self.domain = IConfig.load('resource.domain')
        self.apiConf = IConfig.load('resource.apiurl')

        # Redis key names.
        self.key_video_list, self.key_title = 'wxpn:video:list', 'wxpn:video:title'

        # Local directory that downloaded thumbnails and videos are written under.
        store_conf = IConfig.load('resource.store')
        self.storeConfig = store_conf
        self.thumb_path = store_conf['images_path']

        # OSS credentials; 'endponit' is the spelling used by the config key.
        oss_conf = IConfig.load('resource.oss')
        self.ossConf = oss_conf
        self.key_id = oss_conf['access_key_id']
        self.key_secret = oss_conf['access_key_secret']
        self.endponit = oss_conf['endponit']

        uploader = UploadFile()
        self.img_upload = uploader
        self.auth = uploader.auth_oss(self.key_id, self.key_secret)

        # NOTE: this instance attribute has the same name as the
        # video_publish() method and therefore hides it on instances.
        api_conf = IConfig.load('resource.apiurl')
        self.videoConf = api_conf
        self.video_publish = api_conf['video_publish_api']
        self.ydzx_page_api = api_conf['ydzx_page_api']

        self.start_time = int(time.time())
        super(WxpnVideo, self).__init__()

    def store_video_list_redis(self, video_list):

        if video_list:
            for per_list in video_list:
                if not self.redisServer.sismember(self.key_video_list, per_list):
                    self.redisServer.sadd(self.key_video_list, per_list)
        else:
            return False

    def get_video_para(self):
        """Yield ``(title_md5, docid)`` for each not-yet-stored video on the queued list pages.

        List-page URLs are popped from the Redis set ``self.key_video_list``
        until it is empty. Each page is rendered with PhantomJS, then every
        ``div.doc-title`` under ``div.channel-news`` is paired by position
        with the ``data-docid`` attribute of the matching
        ``a.style-content-middle`` element. Titles whose MD5 is already in the
        Redis set ``self.key_title`` are skipped.

        A page that fails to load or parse is logged and skipped; the
        generator then moves on to the next queued URL.

        :return: generator of ``(md5_hex_of_title, docid)`` tuples.
        """
        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        )

        while True:
            if self.redisServer.scard(self.key_video_list) == 0:
                break

            link = self.redisServer.spop(self.key_video_list)
            if link is None:
                # The set was emptied between scard() and spop().
                break
            print(link)

            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = user_agent

            driver = None
            try:
                driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path='/usr/local/phantomjs/bin/phantomjs')

                time.sleep(random.randrange(3, 8))
                driver.get(link)
                time.sleep(random.randrange(2, 6))

                text = driver.page_source
            except Exception:
                print(traceback.format_exc())
                continue
            finally:
                # Always tear the browser down, including when the page load
                # failed; otherwise the phantomjs process is left running.
                if driver is not None:
                    try:
                        driver.service.process.send_signal(signal.SIGTERM)
                        driver.quit()
                    except Exception:
                        print(traceback.format_exc())

            soup = BeautifulSoup(text, 'lxml')
            title_list = soup.select('div.channel-news div.doc-title')
            itemid_list = soup.select('div.channel-news a.style-content-middle')

            if title_list and itemid_list:

                try:
                    for num, title in enumerate(title_list):
                        m = hashlib.md5()
                        m.update(str(title.text).strip())
                        psw = m.hexdigest()

                        print(title.text)
                        # Raises IndexError/KeyError when the two selections
                        # do not line up; the rest of the page is then skipped.
                        itemid = itemid_list[num]['data-docid']

                        if not self.redisServer.sismember(self.key_title, psw):

                            yield psw, itemid
                except Exception:
                    print(traceback.format_exc())
                    continue

            else:
                print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))

    def time_cycle(self, origin_time):
        """Convert a publish-time label from the article page into a Unix timestamp.

        Understood forms: ``昨天`` (yesterday), ``前天`` (two days ago),
        ``N天…`` (days), ``N小时…`` (hours), ``N分…`` (minutes), ``N个月…``
        (months, counted as 30 days) and absolute dates formatted as
        ``YYYY.MM.DD``.

        :param origin_time: the label text.
        :return: integer Unix timestamp, or ``None`` (after printing the
            traceback) when the label cannot be parsed.
        """
        now = int(time.time())
        day = 24 * 3600

        # (marker, seconds per unit); checked in this order.
        relative_units = (
            ('天', day),
            ('小时', 3600),  # was 24*60, which under-counted every hour
            ('分', 60),
            ('个月', 30 * day),
        )

        try:
            if origin_time == '昨天':
                return now - day
            if origin_time == '前天':
                return now - 2 * day

            for marker, unit_seconds in relative_units:
                if marker in origin_time:
                    # Capture only the digits directly in front of the unit so
                    # that any text before the number does not break int().
                    amount = re.search(r'(\d+)\s*' + marker, origin_time)
                    if amount is None:
                        raise ValueError('no number before %s in %r' % (marker, origin_time))
                    return now - int(amount.group(1)) * unit_seconds

            timeArray = time.strptime(origin_time, "%Y.%m.%d")
            return int(time.mktime(timeArray))
        except Exception:
            print(traceback.format_exc())
            return None

    def download_video(self, psw, itemid):
        """Mirror one yidianzixun video article to OSS and record it in MySQL.

        Steps: fetch the article page; read the title, source, publish time
        and ``<video>`` tag; download the poster and the video file and upload
        both to OSS; insert the topic, content and video rows; add the title
        hash to the Redis set ``self.key_title``; call the publish API for the
        new content id.

        :param psw: MD5 hex digest of the list-page title. Stored in Redis as
            the dedupe key; its first 20 characters name the poster file.
        :param itemid: document id appended to the article URL.
        :return: ``False`` when the item is skipped or a step fails; ``None``
            otherwise (including a non-200 article response).
        """
        now = int(time.time())

        url = 'http://www.yidianzixun.com/article/' + itemid
        print(url)
        self.headers['Referer'] = url
        try:
            res = requests.get(url=url, headers=self.headers, timeout=60)
            print(res.status_code)
        except Exception:
            # No response means nothing to parse; stop here instead of
            # failing later on an unbound `res`.
            print('小链接连接失败')
            return False

        if res.status_code != 200:
            print('一点资讯视频主链接请求失败,请及时查看原因')
            return None

        # ---- parse the article page -------------------------------------
        soup = BeautifulSoup(res.text, 'lxml')

        title = soup.select('div.left-wrapper > h2')[0].text

        video_tags = soup.select('div.video-wrapper > video')
        if not video_tags or not video_tags[0].get('src'):
            # Plain article without a player: nothing to mirror.
            print('此篇为文章,不是视频')
            return False
        video_src = video_tags[0]['src']
        thumb_src = video_tags[0]['poster']

        meta_links = soup.select('body.page-article .left-wrapper > .meta > a')
        meta_spans = soup.select('body.page-article .left-wrapper > .meta > span')
        if meta_links:
            source = meta_links[0].text
        else:
            source = re.sub('来源：', '', str(meta_spans[0].text))

        publishtime = meta_spans[0].text
        try:
            timestamp = self.time_cycle(str(publishtime))
        except Exception:
            timestamp = None
        if timestamp is None:
            # time_cycle reports an unparseable label by returning None.
            timestamp = now

        # ---- poster image ------------------------------------------------
        img_url_parts = urlparse.urlparse(thumb_src)
        img_url_query = urlparse.parse_qs(img_url_parts.query, True)
        if 'wx_fmt' in img_url_query:
            ext_name = '.' + img_url_query['wx_fmt'][0]
        else:
            ext_name = '.png'

        thumb_p = self.thumb_path + 'video/thumb'
        if not os.path.exists(thumb_p):
            # makedirs also creates the parent 'video' directory on a fresh host.
            os.makedirs(thumb_p)

        file_name = psw[:20] + ext_name
        img_down_local_path = thumb_p + '/' + file_name
        urllib.urlretrieve(thumb_src, img_down_local_path)

        if os.path.exists(img_down_local_path):
            images_path = self.ossConf['video_thumb_path']
            # The poster upload result is not checked (unchanged behaviour).
            self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, file_name, img_down_local_path)

        thumb_src = self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name
        m = hashlib.md5()
        m.update(str(thumb_src))
        psw_thumb = m.hexdigest()

        # ---- duration embedded in the page -------------------------------
        duration_match = re.search(r'"duration":(\d+)', str(res.text))
        playtime = duration_match.group(1) if duration_match else None

        # ---- video file --------------------------------------------------
        video_file = str(video_src).split('/')[-1]
        video_path = self.thumb_path + 'video/' + video_file

        # Stream to disk with a timeout rather than buffering the whole
        # video in memory on an unbounded request.
        video_resp = requests.get(video_src, stream=True, timeout=60)
        try:
            with open(video_path, 'wb') as f:
                for chunk in video_resp.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        finally:
            video_resp.close()

        # Opening the file with moviepy doubles as a validity check.
        clip = None
        try:
            clip = VideoFileClip(video_path)
            print(clip.duration)
        except Exception:
            print(traceback.format_exc())
            return False
        finally:
            # NOTE(review): close() exists only on newer moviepy releases,
            # hence the hasattr guard - confirm against the installed version.
            if clip is not None and hasattr(clip, 'close'):
                try:
                    clip.close()
                except Exception:
                    print(traceback.format_exc())

        if not os.path.exists(video_path):
            return False

        # The first 10 characters of the remote file name are dropped.
        video_name = video_file[10:]
        images_path = self.ossConf['video_path']
        status = self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, video_name, video_path)

        print('')
        if status != 'success':
            return False

        print('视频上传成功')
        video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name
        video_id = video_file[10:-4]

        # ---- database ----------------------------------------------------
        # Resolve the source first: get_article_sourceid commits on its own,
        # which would otherwise split the three inserts below.
        sourceid = self.get_article_sourceid(source)

        try:
            sql = "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')"
            params = (title, thumb_src, now, psw_thumb)
            self.cursor.execute(sql, params)
            topicid = self.cursor.lastrowid

            self.cursor.execute("""
                            insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby)
                            values(%s, %s, %s, %s, %s, %s, %s, %s, 60, %s, %s, %s, %s, %s, %s)
                            """, (topicid, sourceid, 47, 4, title, None, title, '', 3, now, 0, timestamp, thumb_src, 0))
            lastrowid = self.cursor.lastrowid

            sql = "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)"
            print(sql)
            params = (lastrowid, video_link, playtime, source, video_id, 0)
            self.cursor.execute(sql, params)

            self.conn.commit()
        except Exception:
            # Do not mark the title as stored or publish a row that does not
            # exist; returning here lets a later run retry the item.
            print(traceback.format_exc())
            self.conn.rollback()
            return False

        self.redisServer.sadd(self.key_title, psw)

        # ---- publish -----------------------------------------------------
        api_url = self.video_publish + str(lastrowid)
        try:
            resp = urllib.urlopen(api_url)
            try:
                resp.read()
            finally:
                resp.close()
        except Exception:
            print('connect failed')

    def get_article_sourceid(self, source, medias = []):
        source = source.strip()
        sourceid = 0

        """
        print source
        print set([source.encode('utf-8')])
        print medias
        """

        result = self.cursor.execute('select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`="' + source + '"')
        has_signed_contract = 0

        if medias and (set([source.encode('utf-8')]) & medias):
            has_signed_contract = 1

        if result:
            data = self.cursor.fetchone()
            sourceid = data[0]

            if data[2] != has_signed_contract:
                try:
                    result = self.cursor.execute("""
                        update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s
                        """, (has_signed_contract, sourceid))
                    self.conn.commit()
                except:
                    self.conn.rollback()
        else:
            try:
                result = self.cursor.execute("""
                    insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`)
                    values(%s, %s, %s, %s, %s)
                    """, (source, '', '', '', has_signed_contract))
                self.conn.commit()
                sourceid = self.cursor.lastrowid
            except:
                self.conn.rollback()

        return sourceid

    def run(self):
        """Process entry point: crawl the queued list pages under a lock file.

        Kills stray ``phantomjs`` processes, takes the lock file named by the
        ``resource.lock`` config (returning ``False`` if it already exists),
        queues ``ydzx_url_list`` in Redis and calls ``download_video`` for
        every item yielded by ``get_video_para``. Stops once the process has
        been alive for three hours. The download directory is emptied and the
        lock file removed on every exit path, including errors.

        :return: ``False`` when another run holds the lock, otherwise ``None``.
        """
        import errno  # local import: only used for the lock-file check

        os.system('pkill phantomjs')

        lockConf = IConfig.load('resource.lock')
        lock_file = lockConf['lock_path_ydzx']

        # O_EXCL makes "check and create" a single atomic step, so two
        # processes starting together cannot both acquire the lock.
        try:
            os.close(os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644))
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('lock file exists')
                return False
            raise

        try:
            self.store_video_list_redis(ydzx_url_list)

            for psw, itemid in self.get_video_para():

                print(psw)

                # Run-time budget: three hours, measured from construction.
                if int(time.time()) - self.start_time >= 10800:
                    break

                try:
                    self.download_video(psw=psw, itemid=itemid)

                    time.sleep(random.uniform(2, 8))

                    os.system('pkill ffmpeg-osx-v3.2.4')

                except Exception:
                    print(traceback.format_exc())
                    continue
        finally:
            # A leftover lock file would block every later run, so release it
            # even when cleanup of the download directory fails.
            try:
                self.del_file(self.thumb_path + 'video')
            except Exception:
                print(traceback.format_exc())
            try:
                os.remove(lock_file)
            except OSError:
                print(traceback.format_exc())

    def video_publish(self):
        sql = 'select contentid from cmstop_video where contentid<=3528920 and contentid>=3430851'
        self.cursor.execute(sql)
        data = self.cursor.fetchall()

        for num in data:
            api_url = self.video_publish + str(num[0])
            try:
                resp = urllib.urlopen(api_url)
                result = resp.read()
            except:
                print 'connect failed'

    def del_file(self, path):
        """Delete everything inside ``path`` while keeping ``path`` itself.

        Entries are addressed by their full path, so the working directory of
        the process is left unchanged.

        :param path: directory to empty.
        :raises OSError: if ``path`` cannot be listed or an entry cannot be removed.
        """
        for entry in os.listdir(path):
            target = os.path.join(path, entry)

            # Real directories are removed recursively; files and symlinks
            # (including links to directories) are simply unlinked.
            if os.path.isdir(target) and not os.path.islink(target):
                shutil.rmtree(target)
            else:
                os.remove(target)

if __name__ == '__main__':
    # Run the crawler in one child process and wait for it to finish.
    worker = WxpnVideo()
    worker.start()
    worker.join()
查看全文
相关阅读:
DB-MySQL：MySQL 正则表达式
 DB-MySQL：MySQL 事务
 DB-MySQL：MySQL 索引
 DB-MySQL：MySQL 临时表
 DB-MySQL：MySQL 复制表
 DB-MySQL：MySQL 序列使用
 DB-MySQL：MySQL 处理重复数据
 DB-MySql：MySQL 及 SQL 注入
 mysql
PHP+jQuery 注册模块的改进之一：验证码存入SESSION
原文地址：https://www.cnblogs.com/19921019yy/p/9355369.html
一点资讯 视频抓取 phantomjs

一点资讯视频抓取 phantomjs