zoukankan      html  css  js  c++  java
  • 抓取虾米歌曲信息

    # -*- coding: utf-8 -*-
    """
    Created on Fri Aug 01 18:09:21 2014
    
    @author: omom
    """
    import urllib2
    from bs4 import BeautifulSoup
    
    # Widget URL of a Xiami "multiPlayer" embed. Its path has the shape
    # "<prefix>_<comma-separated song ids>_<remaining segments>"; the trailing
    # segments are presumably player size/colour options (not used here).
    src="http://www.xiami.com/widget/40537093_376239,185689,1769863609,3351090,1769863610,3562321,183942,1769736296,3418502,1770127750,1770201852,3351083,3351088,3351082,1769496545,1769496547,1769496546,3418497,_235_346_FF8719_494949_0/multiPlayer.swf"
    # Alternative widget URLs, kept commented out for manual switching.
    #src="http://www.xiami.com/widget/40537093_1771331004,1771331002,55553,3478385,1769187978,380807,3478389,1770464110,55552,1771331001,380865,3478386,380834,380869,55670,55823,1772165872,55549,1769187987,380818,_235_346_FF8719_494949_0/multiPlayer.swf"
    #src="http://www.xiami.com/widget/40537093_380863,380832,55550,380830,380837,380861,380799,380866,380808,1770464109,55559,380860,1771512727,3478391,1771331023,55711,55556,380797,1769074612,380788,380810,3478387,380852,55705,55865,1769187981,380787,380862,1770464107,1771360882,55700,1770464108,55869,55867,3478388,380835,1769187983,3364419,1769115993,1771331005,_235_346_FF8719_494949_0/multiPlayer.swf"
    
    # Split on the first two underscores only: a is the text before the first
    # "_", b is the id list, c is everything after the second "_".
    a,b,c=src.split("_",2)
    # The id list ends with a comma; strip it so the split yields no empty id.
    b=b.rstrip(",")
    ids=b.split(",")
    
    # URL template of the per-song playlist document; %s receives the song id.
    music_base="http://www.xiami.com/song/playlist/id/%s/object_name/default/object_id"
    
    def decrypt_url(s):
        """Decode the scrambled ``location`` string of a Xiami playlist entry.

        The first character of *s* is a digit giving the number of rows. The
        remaining characters are laid out row by row (when the length is not a
        multiple of the row count, the leading rows are one character longer)
        and the plain text is obtained by reading that grid column by column.
        The result is then percent-decoded and every "^" is replaced by "0".

        Args:
            s: The scrambled string, e.g. ``"2acebdf"``.

        Returns:
            The decoded string; an empty string when *s* is empty.

        Raises:
            ValueError: If the first character is not a digit or is "0".
        """
        # Imported locally so the function works on both Python 2 (where
        # urllib2.unquote is the same function) and Python 3.
        try:
            from urllib import unquote
        except ImportError:
            from urllib.parse import unquote

        s = s.replace('^', '0')
        if not s:
            return ''

        rows_count = int(s[0])
        if rows_count <= 0:
            raise ValueError("invalid row count in scrambled url: %r" % s[0])

        body = s[1:]
        row_len, remainder = divmod(len(body), rows_count)

        # Cut the body into rows; the first `remainder` rows get one extra char.
        rows = []
        start = 0
        for index in range(rows_count):
            stop = start + row_len
            if index < remainder:
                stop += 1
            rows.append(body[start:stop])
            start = stop

        # Read the grid column by column, skipping rows that are too short.
        column_count = row_len + (1 if remainder else 0)
        chars = []
        for col in range(column_count):
            for row in rows:
                if col < len(row):
                    chars.append(row[col])

        return unquote(''.join(chars)).replace('^', '0')
        
    
    import time
    def collect(mid=376239, high_quality=False):
        """Download the playlist XML of one song and return its metadata.

        Args:
            mid: Song id substituted into ``music_base``.
            high_quality: When true, the ``?auth_key`` query part is dropped from
                the decoded URL and every "_l" in it is rewritten to "_h".

        Returns:
            A dict with the keys ``title``, ``song_id``, ``url``, ``lyric``,
            ``background``, ``album_id``, ``album_pic_s``, ``album_pic``,
            ``album_name``, ``artist_id``, ``artist`` and ``xiami`` (always
            True). An empty dict is returned when the request fails with an
            HTTP error five times or when an expected XML element is missing.
        """
        req=urllib2.Request(music_base%str(mid))
        req.add_header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
        req.add_header("Accept-Language","zh-CN,zh;q=0.8")
        req.add_header("Cache-Control","no-cache")
        req.add_header("Connection","close")
        req.add_header("Pragma","no-cache")
        req.add_header("Referer","http://www.baidu.com/")
        req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36")
        # NOTE(review): hard-coded cookie that looks like a captured browser
        # session (contains auth tokens); it is presumably stale and should not
        # live in source control - confirm and move to configuration.
        req.add_header("cookie","_unsign_token=d0e87c7230b44e116e3b8e96c48c9b62; __gads=ID=3e9c72b9e0b3e7ba:T=1407824092:S=ALNI_MYxedT1iMAiA-IXbcgEu4Ss_XiRaw; box_opened=1; bdshare_firstime=1409207591670; __utma=251084815.350459004.1409209135.1409209135.1409209135.1; __utmz=251084815.1409209135.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); member_auth=2WydGIYavmhn16fDTt9ldyYb5%2BHTT2eFyY9Yjb4ovwQnIooIY9H%2Bx6uVQg5L3yCaq2HKtwNJXYSZg3aFgGLx8Kg; user=40534293%22%E5%93%8E%E5%B0%8F%E7%AC%A8%E8%9B%8Ba%22%220%221%22%3Ca+href%3D%27%2Fwebsitehelp%23help9_3%27+%3Edo%3C%2Fa%3E%220%220%220%22ee71485989%221409217771; ahtena_is_show=false; recent_tags=%E6%BF%80%E6%83%85+%E7%97%9B%E8%8B%A6+%E4%BC%A4%E5%BF%83+%E5%BF%A7%E4%BC%A4+; user_from=1; t_sign_auth=0; __guestplay=MTc3MjEzMDMyMywxOzE3NzE0MTkwNTQsMjsxNzY5OTI0MjQ0LDE%3D; pnm_cku822=187n%2BqZ9mgNqgJnCG0WtMC8x7vAtsC0zrXQcNA%3D%7CnOiH84T3i%2FOL%2F4zwi%2FyG9VU%3D%7CneiHGXz6UeRW5k4rRCFXLkskQdt3xmHTad%2B6Gro%3D%7Cmu6b9JHlkuGd5Z3pmuad6pDjnu2c65%2Fkneef5JjhluyX7JjhmuCFJQ%3D%3D%7Cm%2B%2BT%2FGIXeAx4D2AUbwBl1mcbhfZW1n3Fv8F03GvTZte00XHR%7CmO6BH2wDdg11Gm4bbht0B2gcYBVmCX0OdQZpHWEUZwh8D3gDowM%3D%7Cme6d7oHyneiH84Twn%2BmR64TzUw%3D%3D; CNZZDATA921624=cnzz_eid%3D6125411959-1407824089-%26ntime%3D1409550640; CNZZDATA2629111=cnzz_eid%3D1781743730-1407814089-%26ntime%3D1409450640; _xiamitoken=7cec7fe673a3672812c4b714a31d6687; isg=67257CF91c74F3297A603C00A816D262; sec=5401410089735bee8e0075e0b6825e0ba6a0a485")

        # Try the request up to five times; give up with an empty result after
        # the fifth HTTP error.
        failures = 0
        while 1:
            try:
                page = urllib2.urlopen(req)
                # Pause after a successful request before the caller moves on
                # to the next song.
                time.sleep(0.5)
                break
            except urllib2.HTTPError:
                failures += 1
                if failures == 5:
                    print('id is: %s' % (mid,))
                    return {}

        # Read the whole body, then release the connection in every case.
        try:
            payload = page.read()
        finally:
            page.close()

        dom = BeautifulSoup(payload, features="xml")
        try:
            title = dom.find("title").text
            song_id = dom.find("song_id").text
            url = decrypt_url(dom.find("location").text)
            lyric = dom.find("lyric").text
            background = dom.find("background").text

            album_id = dom.find("album_id").text
            album_pic_s = dom.find("pic").text
            album_pic = dom.find("album_pic").text
            album_name = dom.find("album_name").text

            artist_id = dom.find("artist_id").text
            artist = dom.find("artist").text
        except AttributeError:
            # dom.find() returned None for a missing element. Report the song id
            # and return the same empty result as the HTTP failure path instead
            # of continuing with unbound variables.
            print('id is: %s' % (mid,))
            return {}

        print(title)
        if high_quality:
            # Drop the auth query string and switch the "_l" marker to "_h".
            url = url.split("?auth_key")[0].replace("_l", "_h")
        return {
            "title": title, "song_id": song_id, "url": url,
            "lyric": lyric, "background": background,
            "album_id": album_id, "album_pic_s": album_pic_s,
            "album_pic": album_pic, "album_name": album_name,
            "artist_id": artist_id, "artist": artist,
            "xiami": True,
        }
        
        
        
    #from  pprint import pprint 
    #pprint(collect())
    
    def split_var(s):
        """Print code snippets generated from the assignment lines in *s*.

        Each non-blank line of *s* is expected to look like ``name=expression``;
        only the text before the first "=" (the variable name) is used. Two
        snippets are printed, surrounded by blank lines:

        * a dict literal mapping each name to the variable of the same name,
          e.g. ``{"title":title,"url":url,}``
        * one line per name that reads the value back from a dict called ``i``,
          e.g. ``title=i["title"]``

        Args:
            s: Text containing one assignment per line. Blank lines are ignored.

        Returns:
            None; the snippets are written to standard output.
        """
        names = []
        for raw_line in s.split("\n"):
            line = raw_line.strip()
            if line == "":
                continue
            # Strip so that "name = value" yields "name", not "name ".
            names.append(line.split("=")[0].strip())

        dict_literal = "{" + "".join('"' + name + '":' + name + ',' for name in names) + "}"
        print('')
        print(dict_literal)
        print('')

        assignments = "".join(name + '=i["' + name + '"]' + "\n" for name in names)
        print(assignments)
        print("")
        # Final blank line; print('') instead of a bare print statement so the
        # output is the same on Python 2 and Python 3.
        print("")
        
        
    # Sample assignment lines (they mirror the field extraction inside collect());
    # passed to split_var() further down, which only uses the name before each "=".
    s='''
        title=dom.find("title").text
        song_id=dom.find("song_id").text
        url=decrypt_url(url)
        lyric=dom.find("lyric").text
        background=dom.find("background").text
        
        album_id=dom.find("album_id").text
        album_pic_s=dom.find("pic").text
        album_pic=dom.find("album_pic").text
        album_name=dom.find("album_name").text    
        
        artist_id=dom.find("artist_id").text
        artist=dom.find("artist").text
          
        '''
    
    
    ## No longer valid. Must be fetched in real time.
    # NOTE(review): the original note does not say what expired - presumably the
    # decoded song URLs (they carry an auth_key); confirm.
    # Print the generated snippets for the assignment lines in s.
    split_var(s)
    from pprint import pprint
    
    # Fetch metadata for every song id taken from the widget URL; songs for which
    # collect() returned an empty dict are skipped.
    dst=[]
    for i in ids:
        data=collect(i)
        if data:
            dst.append(data)
        
    import json,urllib
    # Serialize the collected list to JSON ...
    data=json.dumps(dst)
    
    # ... and wrap it as a single form field named "data".
    dst={"data":data}
    
    # Disabled: POST of the form payload to a local upload endpoint.
    #print urllib.urlopen("http://localhost/music/upload",data=urllib.urlencode(dst)).read()
    
    
        
        
  • 相关阅读:
    Spring+MyBatis
    MyBatis的关于批量数据操作的测试
    mysql插入数据后返回自增ID的方法,last_insert_id(),selectkey
    Java数据持久层框架 MyBatis之API学习五(Mapper XML 文件)
    Java数据持久层框架 MyBatis之API学习三(XML 映射配置文件)
    WebView高危接口安全检测
    Android studio Debug效率提升
    Android Bitmap圆角
    Android 瘦身攻略
    过滤Emoji表情😊
  • 原文地址:https://www.cnblogs.com/Yeah-come-on/p/3957696.html
Copyright © 2011-2022 走看看