zoukankan      html  css  js  c++  java
  • 转载——最近百度云盘不提供搜索,闲来无事,玩玩python爬虫,爬一下百度云盘的资源

    最近百度云盘不知道为啥不提供资源检索,正好最近看了一下python,正好来练练手,写个爬虫爬一下百度云盘的资源。

        分析了一下百度云盘的网页源码和js文件,里面有大量ajax的东西,利用json传输数据,前端显示。话说,这样数据爬取就方便多了,也不要用scrapy啥的,直接解析json数据就好。

        分析js文件提炼了下面三个链接:

       

    1
    2
    3
    4
    URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}'
    URL_FOLLOW = 'http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}'
    #http://yun.baidu.com/pcloud/friend/getfanslist?query_uk=1327787586&limit=25&start=0
    URL_FANS = 'http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}'

    整个数据爬取流程起到很重要的作用。

      爬虫分三步,一个是urlids 保存要爬取的网址,一个是user存放用户uk,另一个是share存放user分享的数据,包含任何你想要的数据。

      下面提供三个核心函数代码:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    #演示站http://wwww.yunsou.me        
    def response_worker():
        global news,totals
        dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
        dbcurr = dbconn.cursor()
        dbcurr.execute('SET NAMES utf8')
        dbcurr.execute('set global wait_timeout=60000')
        while True:
            print "function response_worker",hc_r.qsize()
            # if hc_r.qsize()==0:
            #     print "continue"
            #     continue
            metadata, effective_url = hc_r.get()
            print "response_worker:", effective_url
            try:
                tnow = datetime.datetime.utcnow()
                date = (tnow + datetime.timedelta(hours=8))
                date = datetime.datetime(date.year, date.month, date.day)
                if news>=100:
                    try:
                        dbcurr.execute('INSERT INTO spider_statusreport(date,new_hashes,total_requests)  VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE ' +'total_requests=total_requests+%s,new_hashes=new_hashes+%s',
                            (date, news,totals,totals,news))
                    except Exception as ex:
                        print "E10"str(ex)
                    news=0
                id = re_urlid.findall(effective_url)[0]
                start = re_start.findall(effective_url)[0]
                if True:
                    if 'getfollowlist' in effective_url: #type = 1
                        follows = json.loads(metadata)
                        print "-------------------------------------follows------------------------------- "
                        uid = re_uid.findall(effective_url)[0]
                        if "total_count" in follows.keys() and follows["total_count"]>0 and str(start) == "0":
                            for in range((follows["total_count"]-1)/ONEPAGE):
                                try:
                                    dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 1, 0)' % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE)))
                                except Exception as ex:
                                    print "E1"str(ex)
                                    pass
                         
                        if "follow_list" in follows.keys():
                            for item in follows["follow_list"]:
                    if item['pubshare_count']==0:
                        print "---------------------count ==0------------------------------------------- "
                        #continue
         
                                = dbcurr.execute('SELECT id FROM user WHERE userid=%s', (item['follow_uk'],))
                                = dbcurr.fetchone()
                                print "user uk",item['follow_uk']
                                if not y:
                                    try:
                                        dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess,avatar_url,fans_count,follow_count,album_count) VALUES(%s, "%s", %s, 0, 0, "%s","%s",%s,%s,%s)' % (item['follow_uk'], item['follow_uname'],item['pubshare_count'],tnow,item['avatar_url'],item['fans_count'],item['follow_count'],item['album_count']))
                                    except Exception as ex:
                                        print "E13"str(ex)
                                        pass
                                else:
                                    print "-----------------userid exists--------------------------------- "
                        else:
                            print "delete 1", uid, start
                            dbcurr.execute('delete from urlids where uk=%s and type=1 and start>%s' % (uid, start))
                    elif 'getfanslist' in effective_url: #type = 2
                        fans = json.loads(metadata)
                        print "----------------------------------------fans---------------------------------- "
                        uid = re_uid.findall(effective_url)[0]
                        if "total_count" in fans.keys() and fans["total_count"]>0 and str(start) == "0":
                            for in range((fans["total_count"]-1)/ONEPAGE):
                                try:
                                    dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 2, 0)' % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE)))
                                except Exception as ex:
                                    print "E2"str(ex)
                                    pass
                         
                        if "fans_list" in fans.keys():
                            for item in fans["fans_list"]:
                                if item['pubshare_count']==0:
                    print "---------------------count ==0------------------------------------------- "
                                    #continue
                                = dbcurr.execute('SELECT id FROM user WHERE userid=%s', (item['fans_uk'],))
                                = dbcurr.fetchone()
                                print "user uk",item['fans_uk']
                                if not y:
                                    try:
                                        dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess,avatar_url,fans_count,follow_count,album_count) VALUES(%s, "%s", %s, 0, 0, "%s","%s",%s,%s,%s)' % (item['fans_uk'], item['fans_uname'],item['pubshare_count'],tnow,item['avatar_url'],item['fans_count'],item['follow_count'],item['album_count']))
                                    except Exception as ex:
                                        print "E23"str(ex)
                                        pass
                                else:
                                    print "-----------------userid exists--------------------------------- "
                                 
                        else:
                            print "delete 2", uid, start
                            dbcurr.execute('delete from urlids where uk=%s and type=2 and start>%s' % (uid, start))
                    else:
                        shares = json.loads(metadata)
                        print "shares"
                        uid = re_uid.findall(effective_url)[0]
                        totals+=1
                        if "total_count" in shares.keys() and shares["total_count"]>0 and str(start) == "0":
                            for in range((shares["total_count"]-1)/ONESHAREPAGE):
                                try:
                                    dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 0, 0)' % (uid, str(ONESHAREPAGE*(i+1)), str(ONESHAREPAGE)))
                                except Exception as ex:
                                    print "E3"str(ex)
                                    pass
                        if "records" in shares.keys():
                            for item in shares["records"]:
                                print "-------------------------------------filename------------------ ",item['title']
                                print "--------------------------------------------------------------- "
                                try:
                                    stamp_t=int(item["feed_time"])/1000
                                    t= time.localtime(int(stamp_t))
                                    share_time=time.strftime("%Y-%m-%d %H:%M:%S",t)
                                    urls=""
                                    if "shorturl" in item.keys():
                                        urls=item['shorturl']
                                    news+=1
                                    length=""
                                    if "filelist" in item.keys():
                                        length=str(item['filelist'][0]['size'])
                                    dbcurr.execute('INSERT INTO share(fid,userid, filename, shareid, status,filetype,share_time,create_time,urls,down,length) VALUES("%s",%s, "%s", %s, 0,"%s","%s","%s","%s",0,"%s")' % (sid(int(item['shareid'])),uid, item['title'], item['shareid'],get_category(get_ext(item['title'])),share_time,tnow,urls,length))
                                    # time.sleep(10)
                                     
                                except Exception as ex:
                                    print " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>E33 "str(ex)
                                    print "item --------------------------------------------- "
                                    # time.sleep(10)
                                    pass
                        else:
                            print "delete 0", uid, start
                            dbcurr.execute('delete from urlids where uk=%s and type=0 and start>%s' % (uid, str(start)))
                         
                    dbcurr.execute('delete from urlids where id=%s' % (id, ))
                    dbconn.commit()
            except Exception as ex:
                print "E5"str(ex), id
             
            pid = re_pptt.findall(effective_url)
             
            if pid:
                print "pid>>>", pid
                ppid = int(pid[0])
                PROXY_LIST[ppid][6-= 1
        dbcurr.close()
        dbconn.close()
    #演示站http://wwww.yunsou.me        
    def worker(k):
        global success, failed
        dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
        dbcurr = dbconn.cursor()
        dbcurr.execute('SET NAMES utf8')
        dbcurr.execute('set global wait_timeout=60000')
        while True:
            #dbcurr.execute('select * from urlids where status=0 order by type limit 1')
            dbcurr.execute('select * from urlids where status=0 limit %s,1'%(str(k),))
            = dbcurr.fetchall()
            #print d
            if d:
                id = d[0][0]
                uk = d[0][1]
                start = d[0][2]
                limit = d[0][3]
                type = d[0][4]
                dbcurr.execute('update urlids set status=1 where id=%s' % (str(id),))
                url = ""
                if type == 0:
                    url = URL_SHARE.format(uk=uk, start=start, id=id).encode('utf-8')
                elif  type == 1:
                    url = URL_FOLLOW.format(uk=uk, start=start, id=id).encode('utf-8')
                elif type == 2:
                    url = URL_FANS.format(uk=uk, start=start, id=id).encode('utf-8')
                if url:
                    hc_q.put((type, url))
            if len(d)==0:
                print " data user uk  "
                dbcurr.execute('select * from user where status=0 limit %s,100'%(str(k*100),))
                print "user "
                = dbcurr.fetchall()
                #print "uk",d
                if d:
                    for item in d:
                        try:
                            print "update user",item[1]
                            dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 0, 0)' % (item[1], str(ONESHAREPAGE)))
                            dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 1, 0)' % (item[1], str(ONEPAGE)))
                            dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 2, 0)' % (item[1], str(ONEPAGE)))
                            dbcurr.execute('update user set status=1 where userid=%s and id=%s' % (item[1],item[6]))
                        except Exception as ex:
                            print "E6"str(ex)
                else:
                    time.sleep(1)
            dbconn.commit()
        dbcurr.close()
        dbconn.close()
    #演示站http://wwww.yunsou.me 
    def req_worker(inx):
        = requests.Session()
        while True:
            time.sleep(1)
            req_item = hc_q.get()
             
            req_type = req_item[0]
            url = req_item[1]
            try:
                = s.get(url)
                hc_r.put((r.text, url))
            except:
                pass
    for item in range(3):    
        = threading.Thread(target = req_worker, args = (item,))
        t.setDaemon(True)
        t.start()
    for item in range(2):     
        = threading.Thread(target = worker, args = (item,))
        s.setDaemon(True)
        s.start()
    for item in range(2):    
        = threading.Thread(target = response_worker, args = ())
        t.setDaemon(True)
        t.start()
    while 1:
        pass

       ok,完工,想看的可以来看下百度云搜

    本文出自 “Cocos2D-X” 博客,请务必保留此出处http://lonag.blog.51cto.com/3340984/1716517

  • 相关阅读:
    修改Tarsphp节点线程数避免请求阻塞
    Docker删除所有容器
    清理mysql数据库binlog日志
    查看centos磁盘情况,查找大文件路径
    winform窗体的生命周期和事件加载顺序是什么?
    数据库自增ID用完了会怎么样?
    MPFIT for python
    Plplot中line width 问题
    剪切Postscript图片中的多余边框
    嵌入式下的深度学习 Sparkfun Edge with TensorFlow(一)Hello World
  • 原文地址:https://www.cnblogs.com/lyf83/p/5069002.html
Copyright © 2011-2022 走看看