zoukankan      html  css  js  c++  java
  • 自动爬取ZiMuZu的内容发布到Wordpress

    先说一下大致的步骤. 首先需要模拟浏览器登录网站才能看到相应电影信息, 然后通过正则表达式从网页源代码中筛选出所需要的电影, 最后通过python-wordpress-xmlrpc将信息逐条发布到Wordpress. 以下是代码:

    # coding: utf-8
    import re
    import requests
    import datetime
    import sys
    from wordpress_xmlrpc import Client, WordPressPost
    from wordpress_xmlrpc.methods import posts
    
    # Python 2 uses ASCII for implicit str/unicode conversions; force UTF-8 so
    # the Chinese text handled below can be concatenated and printed.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Settings. str(date) is ISO "YYYY-MM-DD", so [5:] keeps "MM-DD", the format
    # used by the day="..." attribute of the table rows.
    today = str(datetime.date.today())[5:]
    yesterday = str(datetime.date.today() + datetime.timedelta(days=-1))[5:]
    login_url = "http://www.zimuzu.tv/User/Login/ajaxLogin"
    today_url = "http://www.zimuzu.tv/today"
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    }
    payload = {
        "account": "your_id",
        "password": "your_pwd",
        "remember": 0,
        "url_back": "http://www.zimuzu.tv/"
    }
    # Log in and fetch the page source through the same session, so the login
    # cookies are sent with the second request.
    s = requests.session()
    login_result = s.post(login_url, headers=head, data=payload)
    r = s.get(today_url, headers=head)
    # Row pattern. It is compiled with re.X, which ignores literal whitespace
    # in the pattern, so every whitespace character that must be matched is
    # written as \s.
    # Groups: 1 = title; 2-5 = optional first link (magnet URL/label or ed2k
    # URL/label); 6-9 = second link (ed2k URL/label or magnet URL/label);
    # 10 = the cell after the links (the file size in the sample row).
    regexp = r'''<tr\s.*?day="''' + yesterday + r'''".*?>\s+
    <td\s.*?</td>\s+
    <td\s.*?</td>\s+
    <td><a\shref=".*?"\starget="_blank">(.*?)</a></td>\s+
    <td\sclass="dr_ico">.*?(?:<a\shref="(magnet.*?)"[^>]+>(磁)</a>|<a\shref="(ed2k.*?)"[^>]+>(驴)</a>)?(?:<a\shref="(ed2k.*?)"[^>]+>(驴)</a>|<a\shref="(magnet.*?)"[^>]+>(磁)</a>).*?</td>\s+
    <td>(.*?)</td>\s+
    <td\s.*?</td>\s+
    </tr>
    '''
    pattern = re.compile(regexp, re.M | re.X)
    result = re.findall(pattern, r.content)  # result = [(), (), ...]
    # findall returns tuples; convert them to lists so the one-character link
    # labels can be swapped for readable captions.
    new_result = [list(item) for item in result]  # new_result = [[], [], ...]
    # Publish to the blog
    client = Client('https://mvstarblog.wordpress.com/xmlrpc.php', 'your_id', 'your_pwd')
    replacement = {'磁': '磁力下载链接: ', '驴': '电驴下载链接: '}
    for item in new_result:
        tr = [replacement.get(x, x) for x in item]
        # A fresh post object per item keeps the id / post_status assigned
        # below from carrying over into the next NewPost call.
        post = WordPressPost()
        post.title = tr[0]
        # ed2k caption + link, a blank line, then magnet caption + link.
        post.content = tr[6] + '\n' + tr[5] + '\n\n' + tr[2] + '\n' + tr[1]
        post.id = client.call(posts.NewPost(post))
        post.post_status = 'publish'
        client.call(posts.EditPost(post.id, post))
        print(tr[0] + ' has been published')

    2016-4-18更新正则表达式:

    今天发现第一版的正则表达式对于下列文本会出现匹配结果的序列问题:

    <tr class="list " channel="tv" area="美国" day="04-17" format="720P">
                    <td class="d1">美剧</td>
                    <td class="d2">720P</td>
                    <td><a href="/resource/28688" target="_blank">Banshee.S04E03.720p.HDTV.x264-KILLERS.mkv</a></td>
                    <td class="dr_ico"><a href="ed2k://|file|Banshee.S04E03.720p.HDTV.x264-KILLERS.mkv|1737182740|CF3E15B3C2D53BCA2ECA0AE1C8287091|h=2JMYLXADG7T2QP4W5ELF7Z3M75IMPC3D|/" target="_blank" class="l">驴</a><a oncontextmenu="ThunderNetwork_SetHref(this)" onclick="return OnDownloadClick_Simple(this,2,4);" href="thunder://QUFlZDJrOi8vfGZpbGV8QmFuc2hlZS5TMDRFMDMuNzIwcC5IRFRWLngyNjQtS0lMTEVSUy5ta3Z8MTczNzE4Mjc0MHxDRjNFMTVCM0MyRDUzQkNBMkVDQTBBRTFDODI4NzA5MXxoPTJKTVlMWEFERzdUMlFQNFc1RUxGN1ozTTc1SU1QQzNEfC9aWg==" thunderrestitle="Banshee.S04E03.720p.HDTV.x264-KILLERS.mkv" thundertype="" thunderpid="37361" thunderhref="thunder://QUFlZDJrOi8vfGZpbGV8QmFuc2hlZS5TMDRFMDMuNzIwcC5IRFRWLngyNjQtS0lMTEVSUy5ta3Z8MTczNzE4Mjc0MHxDRjNFMTVCM0MyRDUzQkNBMkVDQTBBRTFDODI4NzA5MXxoPTJKTVlMWEFERzdUMlFQNFc1RUxGN1ozTTc1SU1QQzNEfC9aWg==" class="x">迅</a><a class="m" xmhref="ed2k://|file|Banshee.S04E03.720p.HDTV.x264-KILLERS.mkv|1737182740|CF3E15B3C2D53BCA2ECA0AE1C8287091|h=2JMYLXADG7T2QP4W5ELF7Z3M75IMPC3D|/" rel="xiaomi" target="_blank">&nbsp;小米路由</a><a class="d" target="_blank" rel="yun">云播</a><a class="d" target="_blank" rel="xuan">旋播</a></td>
                    <td>1.62GB</td>
                    <td class="d6">22:14</td>
                </tr>

    结果如下:

    1.	[210-251]	`Banshee.S04E03.720p.HDTV.x264-KILLERS.mkv`
    4.	[305-440]	`ed2k://|file|Banshee.S04E03.720p.HDTV.x264-KILLERS.mkv|1737182740|CF3E15B3C2D53BCA2ECA0AE1C8287091|h=2JMYLXADG7T2QP4W5ELF7Z3M75IMPC3D|/`
    5.	[468-469]	`驴`
    10.	[1418-1424]	`1.62GB`
    11.	[1461-1466]	`22:14`

    这里的序列为1 4 5 10 11, 但我希望的序列应该是1 6 7 10 11, 所以我将代码改成如下:

    <略>
    # Revised row pattern: the magnet link and the ed2k link each get their own
    # optional group, so a given kind of link always lands in the same group
    # numbers. Whitespace to be matched is written as \s because the pattern is
    # compiled with re.X, which ignores literal whitespace.
    regexp = r'''<tr\s.*?day="''' + yesterday + r'''".*?>\s+
    <td\s.*?</td>\s+
    <td\s.*?</td>\s+
    <td><a\shref=".*?"\starget="_blank">(.*?)</a></td>\s+
    <td\sclass="dr_ico">.*?(?:<a\shref="(magnet.*?)"[^>]+>(磁)</a>)?(?:<a\shref="(ed2k.*?)"[^>]+>(驴)</a>)?.*?</td>\s+
    <td>(.*?)</td>\s+
    <td\s.*?</td>\s+
    </tr>
    '''
    <略>
    # ed2k caption + link, a blank line, then magnet caption + link.
    post.content = tr[4] + '\n' + tr[3] + '\n\n' + tr[2] + '\n' + tr[1]
    <略>

    2016-4-26更新正则表达式匹配中文字符:

    对于那些"生肉", 我就不抓了, 于是稍稍修改代码, 只抓取标题中有中文的剧集:

    <略>
    regexp = ur'''<trs.*?day="''' + yesterday + ur'''".*?>s+
    <tds.*?</td>s+
    <tds.*?</td>s+
    <td><ashref=".*?"starget="_blank">(.*?[u4e00-u9fa5]+.*?)</a></td>s+
    <tdsclass="dr_ico">.*?(?:<ashref="(magnet.*?)"[^>]+>(磁)</a>|<ashref="(ed2k.*?)"[^>]+>(驴)</a>)?(?:<ashref="(ed2k.*?)"[^>]+>(驴)</a>|<ashref="(magnet.*?)"[^>]+>(磁)</a>).*?</td>s+
    <td>(.*?)</td>s+
    <tds.*?</td>s+
    </tr>
    '''
    <略>
    # Convert the response body to unicode before matching, because the pattern is now a unicode (ur) string.
    result = re.findall(pattern, unicode(r.content))
    <略>
    # Unicode keys and values: map the one-character link labels captured by the pattern to readable captions.
    replacement = {u'磁': u'磁力下载链接: ', u'驴': u'电驴下载链接: '}
    <略>

    2016-4-26更新源代码:

    # coding: utf-8
    import re
    import requests
    import datetime
    import sys
    from wordpress_xmlrpc import Client, WordPressPost
    from wordpress_xmlrpc.methods import posts
    
    # python默认ascii编码, 此处强制它为utf-8编码以实现中文输出
    reload(sys)
    sys.setdefaultencoding('utf8')
    # 变量声明
    today = str(datetime.date.today())[5:]
    yesterday = str(datetime.date.today() + datetime.timedelta(days=-1))[5:]
    current = datetime.datetime.now().strftime('%H:%M')
    justnow = (datetime.datetime.now() - datetime.timedelta(hours=1)).strftime('%H:%M')
    login_url = "http://www.zimuzu.tv/User/Login/ajaxLogin"
    today_url = "http://www.zimuzu.tv/today"
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
    }
    payload = {
        "account": "your_id",
        "password": "your_pwd",
        "remember": 0,
        "url_back": "http://www.zimuzu.tv/"
    }
    # 登录网页,获取网页源代码
    s = requests.session()
    login_result = s.post(login_url, headers=head, data=payload)
    r = s.get(today_url, headers=head)
    # 替换"磁", "驴"
    r2 = r.content.replace(u'',u'磁力下载链接:').replace(u'', u'电驴下载链接:')
    # 正则表达式
    regexp = ur'''<trs.*?day="''' + yesterday + ur'''".*?>s+
    <tds.*?</td>s+
    <tds.*?</td>s+
    <td><ashref=".*?"starget="_blank">(.*?[u4e00-u9fa5]+.*?)</a></td>s+
    <tdsclass="dr_ico">.*?(?:<ashref="(magnet.*?)"[^>]+>(磁力下载链接:)</a>)?(?:<ashref="(ed2k.*?)"[^>]+>(电驴下载链接:)</a>)?.*?</td>s+
    <td>(.*?)</td>s+
    <tds.*?</td>s+
    </tr>
    '''
    pattern = re.compile(regexp, re.M | re.X)
    result = re.findall(pattern, unicode(r2))
    # 发布博客
    client = Client('https://mvstarblog.wordpress.com/xmlrpc.php', 'your_id', 'your_pwd')
    post = WordPressPost()
    for item in result:
        post.title = item[0]
        post.content = item[4] + '
    ' + item[3] + '
    
    ' + item[2] + '
    ' + item[1]
        post.id = client.call(posts.NewPost(post))
        post.post_status = 'publish'
        client.call(posts.EditPost(post.id, post))
        print item[0] + ' has been published'
  • 相关阅读:
    分布式系统实践解读丨详解高内聚低耦合
    Git:改变世界的一次代码提交
    一分钟带你认识深度学习中的知识蒸馏
    99%的人都能看懂的分布式系统「补偿」机制
    一大波人气博主袭来,现场直播华为全联接2020!
    快速了解前端开发HTML的正确姿势
    华为云IoT智简联接,开启物联世界新纪元
    从一个小程序说起2 C++快速入门03
    从一个小程序说起 C++快速入门02
    PE格式详细讲解6(上) 系统篇06|解密系列
  • 原文地址:https://www.cnblogs.com/IvanChen/p/5377070.html
Copyright © 2011-2022 走看看