  • Repost: Baidu Cloud Disk recently stopped offering search, so with some spare time I played around with a Python crawler to scrape Baidu Cloud share resources

    For some reason Baidu Cloud Disk recently stopped providing resource search. Since I had just been looking at Python, it was a good chance to practice, so I wrote a crawler to scrape Baidu Cloud share resources.

        I looked at Baidu Cloud's page source and JS files: they are full of AJAX calls that ship data as JSON for the front end to render. That actually makes scraping much easier, since there is no need for Scrapy or anything like it; just parse the JSON responses directly.

        From the JS files I extracted the following three API URLs:

       

    URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}'
    URL_FOLLOW = 'http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}'
    #http://yun.baidu.com/pcloud/friend/getfanslist?query_uk=1327787586&limit=25&start=0
    URL_FANS = 'http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}'

    These three endpoints (share list, follow list, fans list) drive the entire data-crawling flow.
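    To get a feel for what the share-list endpoint returns, here is a minimal standalone probe. It is a sketch, not part of the original crawler: it requests one page of shares for a single uk and prints the titles. The uk used is the one from the commented fanslist URL above, whether the endpoint still answers today is not guaranteed, and the JSON field names (total_count, records, title, shareid) are simply the ones the crawler code below relies on.

    import json
    import requests

    # Same endpoint constant as above, repeated so this snippet runs on its own.
    URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}'

    def probe_sharelist(uk, start=0):
        url = URL_SHARE.format(uk=uk, start=start, id=0)
        resp = requests.get(url, timeout=10)
        data = json.loads(resp.text)
        # The crawler below keys on exactly these fields.
        print "total_count:", data.get("total_count")
        for record in data.get("records", []):
            print record.get("title"), record.get("shareid")

    if __name__ == '__main__':
        probe_sharelist(uk=1327787586)   # uk taken from the commented fanslist URL above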

      The crawler revolves around three tables: urlids stores the URLs still to be crawled, user stores the user uk identifiers, and share stores the data each user has shared, with whatever fields you want to keep.
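      The post does not show the table definitions, so the following is only a hedged sketch of what they might look like: the column names come from the INSERT and SELECT statements in the code further down, while the types, lengths and column order are my guesses (the worker code reads SELECT * results by position, so the real order almost certainly differs).

    # Hedged reconstruction of the three MySQL tables the crawler uses; column
    # names are taken from the INSERT statements below, everything else is guessed.
    import MySQLdb as mdb

    DDL = [
        """CREATE TABLE IF NOT EXISTS urlids (
            id      INT AUTO_INCREMENT PRIMARY KEY,
            uk      BIGINT,        -- Baidu account id to crawl
            start   INT,           -- paging offset
            limited INT,           -- page size
            type    TINYINT,       -- 0 = share list, 1 = follow list, 2 = fans list
            status  TINYINT        -- 0 = pending, 1 = picked up by a worker
        ) DEFAULT CHARSET=utf8""",
        """CREATE TABLE IF NOT EXISTS user (
            id           INT AUTO_INCREMENT PRIMARY KEY,
            userid       BIGINT,
            username     VARCHAR(255),
            files        INT,           -- pubshare_count
            status       TINYINT,
            downloaded   INT,
            lastaccess   DATETIME,
            avatar_url   VARCHAR(255),
            fans_count   INT,
            follow_count INT,
            album_count  INT
        ) DEFAULT CHARSET=utf8""",
        """CREATE TABLE IF NOT EXISTS share (
            fid         VARCHAR(64),
            userid      BIGINT,
            filename    VARCHAR(255),
            shareid     BIGINT,
            status      TINYINT,
            filetype    VARCHAR(32),
            share_time  DATETIME,
            create_time DATETIME,
            urls        VARCHAR(255),   -- shorturl when present
            down        INT,
            length      VARCHAR(32)     -- size of the first file in filelist
        ) DEFAULT CHARSET=utf8""",
    ]

    def create_tables(host, user, passwd):
        conn = mdb.connect(host, user, passwd, 'baiduyun', charset='utf8')
        curr = conn.cursor()
        for stmt in DDL:
            curr.execute(stmt)
        conn.commit()
        conn.close()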

      The three core functions are given below:

    # Demo site: http://wwww.yunsou.me
    def response_worker():
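        # Consumes (response text, url) pairs from hc_r, parses the JSON and writes user, share and new urlids rows to MySQL.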
        global news,totals
        dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
        dbcurr = dbconn.cursor()
        dbcurr.execute('SET NAMES utf8')
        dbcurr.execute('set global wait_timeout=60000')
        while True:
            print "function response_worker",hc_r.qsize()
            # if hc_r.qsize()==0:
            #     print "continue"
            #     continue
            metadata, effective_url = hc_r.get()
            print "response_worker:", effective_url
            try:
                tnow = datetime.datetime.utcnow()
                date = (tnow + datetime.timedelta(hours=8))
                date = datetime.datetime(date.year, date.month, date.day)
                if news>=100:
                    try:
                        dbcurr.execute('INSERT INTO spider_statusreport(date,new_hashes,total_requests)  VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE ' +'total_requests=total_requests+%s,new_hashes=new_hashes+%s',
                            (date, news,totals,totals,news))
                    except Exception as ex:
                        print "E10", str(ex)
                    news=0
                id = re_urlid.findall(effective_url)[0]
                start = re_start.findall(effective_url)[0]
                if True:
                    if 'getfollowlist' in effective_url: #type = 1
                        follows = json.loads(metadata)
                        print "-------------------------------------follows------------------------------- "
                        uid = re_uid.findall(effective_url)[0]
                        if "total_count" in follows.keys() and follows["total_count"]>0 and str(start) == "0":
                            for i in range((follows["total_count"]-1)/ONEPAGE):
                                try:
                                    dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 1, 0)' % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE)))
                                except Exception as ex:
                                    print "E1", str(ex)
                                    pass
                         
                        if "follow_list" in follows.keys():
                            for item in follows["follow_list"]:
                                if item['pubshare_count']==0:
                                    print "---------------------count ==0------------------------------------------- "
                                    #continue

                                dbcurr.execute('SELECT id FROM user WHERE userid=%s', (item['follow_uk'],))
                                y = dbcurr.fetchone()
                                print "user uk",item['follow_uk']
                                if not y:
                                    try:
                                        dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess,avatar_url,fans_count,follow_count,album_count) VALUES(%s, "%s", %s, 0, 0, "%s","%s",%s,%s,%s)' % (item['follow_uk'], item['follow_uname'],item['pubshare_count'],tnow,item['avatar_url'],item['fans_count'],item['follow_count'],item['album_count']))
                                    except Exception as ex:
                                        print "E13", str(ex)
                                        pass
                                else:
                                    print "-----------------userid exists--------------------------------- "
                        else:
                            print "delete 1", uid, start
                            dbcurr.execute('delete from urlids where uk=%s and type=1 and start>%s' % (uid, start))
                    elif 'getfanslist' in effective_url: #type = 2
                        fans = json.loads(metadata)
                        print "----------------------------------------fans---------------------------------- "
                        uid = re_uid.findall(effective_url)[0]
                        if "total_count" in fans.keys() and fans["total_count"]>0 and str(start) == "0":
                            for i in range((fans["total_count"]-1)/ONEPAGE):
                                try:
                                    dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 2, 0)' % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE)))
                                except Exception as ex:
                                    print "E2", str(ex)
                                    pass
                         
                        if "fans_list" in fans.keys():
                            for item in fans["fans_list"]:
                                if item['pubshare_count']==0:
                                    print "---------------------count ==0------------------------------------------- "
                                    #continue
                                dbcurr.execute('SELECT id FROM user WHERE userid=%s', (item['fans_uk'],))
                                y = dbcurr.fetchone()
                                print "user uk",item['fans_uk']
                                if not y:
                                    try:
                                        dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess,avatar_url,fans_count,follow_count,album_count) VALUES(%s, "%s", %s, 0, 0, "%s","%s",%s,%s,%s)' % (item['fans_uk'], item['fans_uname'],item['pubshare_count'],tnow,item['avatar_url'],item['fans_count'],item['follow_count'],item['album_count']))
                                    except Exception as ex:
                                        print "E23", str(ex)
                                        pass
                                else:
                                    print "-----------------userid exists--------------------------------- "
                                 
                        else:
                            print "delete 2", uid, start
                            dbcurr.execute('delete from urlids where uk=%s and type=2 and start>%s' % (uid, start))
                    else:
                        shares = json.loads(metadata)
                        print "shares"
                        uid = re_uid.findall(effective_url)[0]
                        totals+=1
                        if "total_count" in shares.keys() and shares["total_count"]>0 and str(start) == "0":
                            for i in range((shares["total_count"]-1)/ONESHAREPAGE):
                                try:
                                    dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 0, 0)' % (uid, str(ONESHAREPAGE*(i+1)), str(ONESHAREPAGE)))
                                except Exception as ex:
                                    print "E3", str(ex)
                                    pass
                        if "records" in shares.keys():
                            for item in shares["records"]:
                                print "-------------------------------------filename------------------ ",item['title']
                                print "--------------------------------------------------------------- "
                                try:
                                    stamp_t=int(item["feed_time"])/1000
                                    t= time.localtime(int(stamp_t))
                                    share_time=time.strftime("%Y-%m-%d %H:%M:%S",t)
                                    urls=""
                                    if "shorturl" in item.keys():
                                        urls=item['shorturl']
                                    news+=1
                                    length=""
                                    if "filelist" in item.keys():
                                        length=str(item['filelist'][0]['size'])
                                    dbcurr.execute('INSERT INTO share(fid,userid, filename, shareid, status,filetype,share_time,create_time,urls,down,length) VALUES("%s",%s, "%s", %s, 0,"%s","%s","%s","%s",0,"%s")' % (sid(int(item['shareid'])),uid, item['title'], item['shareid'],get_category(get_ext(item['title'])),share_time,tnow,urls,length))
                                    # time.sleep(10)
                                     
                                except Exception as ex:
                                    print " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>E33 ", str(ex)
                                    print "item --------------------------------------------- "
                                    # time.sleep(10)
                                    pass
                        else:
                            print "delete 0", uid, start
                            dbcurr.execute('delete from urlids where uk=%s and type=0 and start>%s' % (uid, str(start)))
                         
                    dbcurr.execute('delete from urlids where id=%s' % (id, ))
                    dbconn.commit()
            except Exception as ex:
                print "E5", str(ex), id
             
            pid = re_pptt.findall(effective_url)
             
            if pid:
                print "pid>>>", pid
                ppid = int(pid[0])
                PROXY_LIST[ppid][6] -= 1
        dbcurr.close()
        dbconn.close()
    # Demo site: http://wwww.yunsou.me
    def worker(k):
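        # Pulls pending rows from urlids, marks them in flight and pushes the matching API URL onto hc_q; when urlids is empty, seeds new url rows from users with status=0.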
        global success, failed
        dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
        dbcurr = dbconn.cursor()
        dbcurr.execute('SET NAMES utf8')
        dbcurr.execute('set global wait_timeout=60000')
        while True:
            #dbcurr.execute('select * from urlids where status=0 order by type limit 1')
            dbcurr.execute('select * from urlids where status=0 limit %s,1'%(str(k),))
            d = dbcurr.fetchall()
            #print d
            if d:
                id = d[0][0]
                uk = d[0][1]
                start = d[0][2]
                limit = d[0][3]
                type = d[0][4]
                dbcurr.execute('update urlids set status=1 where id=%s' % (str(id),))
                url = ""
                if type == 0:
                    url = URL_SHARE.format(uk=uk, start=start, id=id).encode('utf-8')
                elif  type == 1:
                    url = URL_FOLLOW.format(uk=uk, start=start, id=id).encode('utf-8')
                elif type == 2:
                    url = URL_FANS.format(uk=uk, start=start, id=id).encode('utf-8')
                if url:
                    hc_q.put((type, url))
            if len(d)==0:
                print " data user uk  "
                dbcurr.execute('select * from user where status=0 limit %s,100'%(str(k*100),))
                print "user "
                d = dbcurr.fetchall()
                #print "uk",d
                if d:
                    for item in d:
                        try:
                            print "update user",item[1]
                            dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 0, 0)' % (item[1], str(ONESHAREPAGE)))
                            dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 1, 0)' % (item[1], str(ONEPAGE)))
                            dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 2, 0)' % (item[1], str(ONEPAGE)))
                            dbcurr.execute('update user set status=1 where userid=%s and id=%s' % (item[1],item[6]))
                        except Exception as ex:
                            print "E6", str(ex)
                else:
                    time.sleep(1)
            dbconn.commit()
        dbcurr.close()
        dbconn.close()
    # Demo site: http://wwww.yunsou.me
    def req_worker(inx):
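        # Fetches each URL queued on hc_q through a shared requests session and puts the raw response text on hc_r.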
        s = requests.Session()
        while True:
            time.sleep(1)
            req_item = hc_q.get()
             
            req_type = req_item[0]
            url = req_item[1]
            try:
                r = s.get(url)
                hc_r.put((r.text, url))
            except:
                pass
    for item in range(3):    
        t = threading.Thread(target = req_worker, args = (item,))
        t.setDaemon(True)
        t.start()
    for item in range(2):     
        s = threading.Thread(target = worker, args = (item,))
        s.setDaemon(True)
        s.start()
    for item in range(2):    
        t = threading.Thread(target = response_worker, args = ())
        t.setDaemon(True)
        t.start()
    while 1:
        time.sleep(1)   # keep the main thread alive so the daemon worker threads can run
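    The three functions above also rely on a number of names the post never shows: the two work queues, the regexes that pull ids back out of a URL, the page-size constants, the DB credentials, and the helpers sid / get_ext / get_category. The following is a hedged sketch of that missing scaffolding, reconstructed only from how the names are used above; the author's real values and implementations are unknown.

    # -*- coding: utf-8 -*-
    # Hedged sketch of the setup the three functions above assume; everything here
    # is inferred from usage in the snippet, not taken from the original post.
    import re
    import json
    import time
    import datetime
    import threading
    import Queue                  # Python 2, matching the print statements above
    import requests
    import MySQLdb as mdb

    DB_HOST, DB_USER, DB_PASS = '127.0.0.1', 'root', ''   # placeholder credentials

    ONEPAGE = 20        # follow/fans page size (the URLs use limit=20)
    ONESHAREPAGE = 20   # share-list page size, value guessed

    hc_q = Queue.Queue()   # (type, url) jobs consumed by req_worker
    hc_r = Queue.Queue()   # (response text, url) results consumed by response_worker

    news, totals = 0, 0        # counters updated by response_worker
    success, failed = 0, 0     # declared global in worker but otherwise unused here

    PROXY_LIST = []            # per-proxy bookkeeping; only touched when pptt= appears in a URL

    # regexes that pull ids back out of the request URL
    re_urlid = re.compile(r'urlid=(\d+)')
    re_start = re.compile(r'start=(\d+)')
    re_uid   = re.compile(r'query_uk=(\d+)')
    re_pptt  = re.compile(r'pptt=(\d+)')

    def sid(shareid):
        # the original sid() is not shown; a pass-through keeps the share INSERT working
        return str(shareid)

    def get_ext(title):
        # file extension of a share title, '' when there is none
        return title.rsplit('.', 1)[-1].lower() if '.' in title else ''

    def get_category(ext):
        # the original extension-to-category mapping is unknown; a rough placeholder
        categories = {'mp4': 'video', 'mkv': 'video', 'avi': 'video',
                      'mp3': 'audio', 'zip': 'archive', 'rar': 'archive',
                      'pdf': 'doc', 'txt': 'doc'}
        return categories.get(ext, 'other')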

    OK, that's it. If you want to see the result, take a look at the Baidu Cloud search demo.

    This article comes from the "Cocos2D-X" blog; please keep this attribution: http://lonag.blog.51cto.com/3340984/1716517

  • Original post: https://www.cnblogs.com/lyf83/p/5069002.html