  • python: crawling Baidu Netdisk share feeds

    This is a crawler I wrote a while ago. Before Baidu Netdisk was redesigned, many resource enthusiasts shared their files through their Netdisk activity feed. I followed a batch of accounts that shared movie and TV resources, and the program crawled their feeds on a schedule, saving the Baidu Netdisk links they shared into my own database. Before writing to the database it checked for duplicates and filtered out titles containing bad keywords; a web page or app on the other end then displayed the resources from the database. The early netdisk search sites on the market worked on exactly this principle. Because Baidu Netdisk was later redesigned and the share feed was removed, the program no longer runs; this post is a record of the approach.

    The main entry point. All the logic for crawling the Baidu Netdisk share feed lives in this file, which also calls functions from the other modules; running this script keeps the crawler going continuously.

    #   Main program
    import requests,re, json, time
    import random
    from mysql_db import *
    import threading
    from aidy_pc import *
    from yszx import *
    from defs import *
    
    header = {
    "Cookie": "",
    "Host": "pan.baidu.com",
    "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
    }
    # Collect the uk IDs of the accounts I follow
    list_uk = ['2489863899']
    def getShareUser():
        start = 0
        for _ in range(100):
            try:
                url = 'https://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=2489863899&limit=24&start=%d' % start
                follows_json = (requests.get(url, headers=header)).json()

                if len(follows_json['follow_list']) == 0:  # stop when no more data comes back
                    break
                lists = follows_json['follow_list']  # full list of followed accounts
                for i in lists:  # walk the list and pull out each followed user
                    list_uk.append(i['follow_uk'])  # collect the uk
                start = start + 24
                time.sleep(random.randint(10, 25))
            except Exception:
                continue

        if not list_uk:
            return False
        return list_uk
    
    # Program start
    def gethtml():           #  crawl the netdisk share feeds
        tu = getShareUser()  #  fetch the uk IDs of the accounts I follow
        if not tu:           #  bail out if the follow list could not be fetched
            return
        for uk in list_uk:  # loop over the followed uks
            start = 0       # reset paging for each uk
            for n in range(2):  # page through the feed
                url = "https://pan.baidu.com/pcloud/feed/getdynamiclist?auth_type=1&filter_types=11000&query_uk=%s&category=0&limit=25&start=%s&bdstoken=29b0093f2c23b7afd5f41c39f57be34e&channel=chunlei&clienttype=0&web=1" % (uk, start)
                filelist_json = requests.get(url, headers=header).json()
                if filelist_json['errno'] != 0:
                    break
                list_records = filelist_json['records']  # all resources returned by this request
                for data_vaule in list_records:  # walk every dict in the resource list
                    if data_vaule['category'] == 3:           # skip images
                        continue
                    if gjc_gl(data_vaule['title']) == False:  # keyword filter
                        continue
                    #print(data_vaule['title'])
                    print(data_vaule)
                    #mysql_into(data_vaule)          # write to the database
                    # fields: category (folder=6, video=1, image=3), link: shorturl, title: title, time: feed_time
                start = start + 25
                time.sleep(random.randint(10, 25))
    
    
    if __name__ == '__main__':
        while True:
            try:
                gethtml()                                 #  netdisk feed crawler
                t1 = threading.Thread(target=bdsl)        #  dead-link checker for saved shares
                #t2 = threading.Thread(target=aidy)       #  crawler for the '爱电影' movie site
                #t3 = threading.Thread(target=main_ys)
                t1.start()
                #t2.start()
                #t3.start()


                time.sleep(10800)  # run once every 3 hours
            except Exception:
                continue

    Functions for writing data to the database and for detecting and removing dead Baidu Netdisk links. Pass a crawled record into the insert function and it goes into the database after a duplicate check. Links going dead is routine, so a second function sweeps every link in the database and deletes the ones that no longer work.

    # Database connection and inserts

    import pymysql,time
    import requests,re
    import random
    def pysql():
        try:

            mysql = pymysql.connect(host='127.0.0.1', user='bdwp', password='xDnwLnjSEXLbGJYa', db='bdwp', charset="utf8")
            #mysql = pymysql.connect(host='127.0.0.1', user='root', password='root', db='bdwp', charset="utf8")
            return mysql
        except Exception:
            print("Database connection failed!")
            exit()
    
    
    def mysql_into(data_vaule):          #  insert a netdisk record into the database
        mysql = pysql()
        db = mysql.cursor()

        # parameterized query, so titles containing quotes don't break the SQL
        sqlcx = "select title from data_zy WHERE title=%s"
        db.execute(sqlcx, (data_vaule['title'],))
        data = db.fetchall()

        if not data:         # only insert when the title is not already present
            sqlcxid = "select max(id) from data_zy"
            db.execute(sqlcxid)
            dataid = db.fetchall()
            ids = int(dataid[0][0] or 0) + 1  # next id after the last inserted row (1 if the table is empty)
            time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # insertion time
            timeStamp = data_vaule['feed_time']                 #  convert the share time
            timeStamp = float(timeStamp / 1000)                 #  feed_time is in milliseconds
            timeArray = time.localtime(timeStamp)
            otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

            try:
                sqltj = "insert into data_zy (id,category,shorturl,title,feed_time,rk_time) VALUES (%s,%s,%s,%s,%s,%s)"
                db.execute(sqltj, (ids, data_vaule['category'], data_vaule['shorturl'], data_vaule['title'], otherStyleTime, time_time))
                mysql.commit()
            except Exception:
                pass

        else:
            mysql.close()
            return False   # the record already exists

        mysql.close()
    
    
    # Dead-link checker for Baidu share links
    def bdsl():
        header = {

            "Host": "pan.baidu.com",
            "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        mysql = pysql()
        db = mysql.cursor()
        sqlcx = "select id,shorturl from data_zy"
        db.execute(sqlcx)
        data = db.fetchall()
        # query done; now check every link
        for r in data:
            url = "https://pan.baidu.com/s/" + r[1]
            id = r[0]

            # re-decode: the page is utf-8 but requests guesses iso-8859-1
            html = (requests.get(url, headers=header).text).encode('iso-8859-1').decode('utf-8')

            # message Baidu shows when a share has been taken down
            srt = "此链接分享内容可能因为涉及侵权、色情、反动、低俗等信息,无法访问!"
            if srt in html:
                sqlde = "DELETE FROM data_zy WHERE id = %s"
                db.execute(sqlde, (id,))
                mysql.commit()  # without a commit the delete is never applied

            time.sleep(random.randint(10, 25))

    One more small helper: if a crawled resource title contains a sensitive keyword, it is not written to the database. This is mainly for filtering out ads.

    from mysql_db import pysql
    
    def gjc_gl(title):
        mysql = pysql()
        db = mysql.cursor()
        sql = "select * from gjc_gl WHERE id=1"
        db.execute(sql)
        data = db.fetchall()[0][1]  # comma-separated keyword list kept in a single row
        mysql.close()
        for trs in data.split(','):
            if trs in title:
                return False  # title contains a blocked keyword
        return True

    # import os
    # import binascii
    # cats = {
    #     u'video': u'视频',
    #     u'image': u'图片',
    #     u'document': u'书籍',
    #     u'music': u'音乐',
    #     u'package': u'压缩',
    #     u'software': u'软件',
    # }
    #
    # def get_label(name):
    #     if name in cats:
    #         return cats[name]
    #     return u'其它'
    #
    # #   Purpose: classify a file by its file-name extension
    # def get_category(ext):
    #     ext = ext + '.'
    #     cats = {
    #         u'video': '.avi.mp4.rmvb.m2ts.wmv.mkv.flv.qmv.rm.mov.vob.asf.3gp.mpg.mpeg.m4v.f4v.',
    #         u'image': '.jpg.bmp.jpeg.png.gif.tiff.',
    #         u'document': '.pdf.isz.chm.txt.epub.bc!.doc.docx.xlsx.xls.pptx.ppt.',
    #         u'music': '.mp3.wma.ape.wav.dts.mdf.flac.',
    #         u'package': '.zip.rar.7z.tar.gz.iso.dmg.pkg.',
    #         u'software': '.exe.app.msi.apk.',
    #         u'torrent': '.torrent.'
    #     }
    #     for k, v in cats.items():
    #         if ext in v:
    #             return get_label(k)     # map the category key to its label
    #     return '其他'

    Finally, an extension function that crawls other sites. The share feed alone may not yield enough resources; crawling additional sources in parallel makes it possible to build a much more complete Baidu Netdisk resource search.

    import requests,re,time
    import random
    import pymysql
    from mysql_db import pysql
    
    
    def aidy():
        for i in range(11, 24):  # 1000
            for r in range(1, 6):
                try:
                    url = "http://520.58801hn.com/%d/page/%d" % (i, r)

                    header = {
                        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Mobile Safari/537.36"}

                    html = requests.get(url, headers=header).text
                    re_url = re.findall('<div class="entry-meta">.*?<a href="(.*?)" rel="bookmark">', html, re.S)
                    times = re.findall('<div class="entry-meta">.*?itemprop="datePublished">(.*?)</time></a>', html, re.S)

                    # enumerate keeps each post aligned with its publish date in times
                    for t, for_url in enumerate(re_url):
                        html_wp = requests.get(for_url, headers=header).text
                        re_wp = re.findall('<p>.*?href="https://pan.baidu.com/s/(.*?)">百度云盘</a>.*?:(.*?)</p>', html_wp,
                                           re.S)
                        if re_wp:
                            h1 = re.findall('<h1 class="entry-title" itemprop="name headline">(.*?)</h1>', html_wp, re.S)

                            # connect to the database
                            mysql = pysql()
                            db = mysql.cursor()
                            # check whether the title already exists
                            sqlcx = "select title from data_zy WHERE title=%s"
                            db.execute(sqlcx, (h1[0],))
                            data = db.fetchall()

                            # insert only when the title is new
                            if not data:
                                sqlcxid = "select max(id) from data_zy"
                                db.execute(sqlcxid)
                                dataid = db.fetchall()
                                ids = int(dataid[0][0] or 0) + 1  # next id after the last inserted row
                                time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # insertion time

                                try:
                                    # category 6 = folder; wpmm is the extraction code scraped alongside the link
                                    sqltj = "insert into data_zy (id,category,shorturl,title,feed_time,rk_time,wpmm) VALUES (%s,%s,%s,%s,%s,%s,%s)"
                                    db.execute(sqltj, (ids, 6, re_wp[0][0], h1[0], times[t], time_time, re_wp[0][1]))
                                    mysql.commit()
                                except Exception:
                                    pass

                            mysql.close()
                        time.sleep(random.randint(2, 10))

                    time.sleep(random.randint(2, 10))

                except Exception:
                    time.sleep(60)
                    continue
    
    
    
    
    if __name__ == '__main__':
        while True:
            try:
                aidy()
                time.sleep(10800)  # run once every 3 hours
            except Exception:
                continue

    The database design is fairly simple: just two tables. The insert function above shows the columns involved; a sketch of the schema follows.
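
    For reference, here is a minimal sketch of what those two tables could look like, reconstructed from the queries in the code above. The column types and the keyword column's name (words) are assumptions, not the original schema.

    # Schema sketch reconstructed from the queries above; the column types and
    # the "words" column name are assumptions, not the original definitions.
    from mysql_db import pysql

    DDL_DATA_ZY = """
    CREATE TABLE IF NOT EXISTS data_zy (
        id        INT PRIMARY KEY,    -- assigned manually as max(id)+1 on insert
        category  INT,                -- 6 = folder, 1 = video, 3 = image
        shorturl  VARCHAR(64),        -- suffix of https://pan.baidu.com/s/<shorturl>
        title     VARCHAR(255),       -- resource title, deduplicated on insert
        feed_time DATETIME,           -- when the share was posted
        rk_time   DATETIME,           -- when the row was inserted
        wpmm      VARCHAR(64)         -- extraction code, filled by the aidy crawler
    ) DEFAULT CHARSET=utf8
    """

    DDL_GJC_GL = """
    CREATE TABLE IF NOT EXISTS gjc_gl (
        id    INT PRIMARY KEY,        -- the filter reads the single row with id=1
        words TEXT                    -- comma-separated list of blocked keywords
    ) DEFAULT CHARSET=utf8
    """

    mysql = pysql()
    db = mysql.cursor()
    db.execute(DDL_DATA_ZY)
    db.execute(DDL_GJC_GL)
    mysql.commit()
    mysql.close()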
