zoukankan html css js c++ java
Baidu音乐爬虫

Baidu音乐歌曲爬虫：
1、分析Baidu音乐歌曲下载接口，组装参数
2、判断是否需要登录
　　a、使用cookie
　　b、使用selenium
3、歌曲信息页面分析
4、数据表设计
歌曲类型表
歌曲表
表都无所谓，自己设计就行。
-------------------------------
# -*- coding: utf-8 -*-
'''
    ***
        _author_= "fengshaungzi"
        _time_='2018-4-10'
        _python_version_ = 'python2.7'
        _script_type_ = 'spider'
        url = 'http://music.baidu.com/tag/类型?start=0&size=20&third_type=0'
    ***
'''
from os import path
from bs4 import BeautifulSoup
import urllib,urllib2,requests,cookielib
import sys,time,datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql,shutil
import sys,os
# Python 2 only: re-import ``sys`` so that ``setdefaultencoding`` can be called,
# then make UTF-8 the process-wide default encoding. The script mixes byte
# strings and unicode strings containing Chinese text throughout.
reload(sys)
sys.setdefaultencoding('utf-8')
# Directory that contains this script; cover images are saved relative to it.
d = path.dirname(__file__)

class BadiuMusicSpider():
    """Selenium-driven crawler for Baidu Music tag pages (Python 2.7).

    Workflow: ``login`` opens Chrome and signs in, ``parse_html`` walks the
    paginated song list of one tag, and for every song the detail page is
    scraped, the audio/lyric files are downloaded through the browser and a
    row is inserted into the ``network_music`` table.

    Experimental download helpers that used to live here inside a string
    literal (urlretrieve retry, requests with raw cookies, cookielib) were
    dead code and have been removed.
    """

    # Root folder that receives one sub-folder of songs per singer.
    MUSIC_ROOT = u'G:\\www\\music2'
    # Folder from which freshly downloaded song / lyric files are picked up.
    DOWNLOAD_DIR = u'C:\\Users\\hz\\Downloads'
    # Number of songs requested per tag page.
    PAGE_SIZE = 20
    # Markers that identify the image type inside a cover URL.
    PIC_SUFFIXES = ('.jpg@', '.png@', '.jpeg@', '.JPG@', '.PNG@', '.JPEG@')

    def __init__(self):
        # Cursor of the crawl in progress; set by ``parse_html``.
        self._cursor = None

    # ------------------------------------------------------------------
    # small pure helpers
    # ------------------------------------------------------------------
    @classmethod
    def _pic_extension(cls, pic_url):
        """Return the file extension announced by ``pic_url`` or '.errorpic'."""
        for suffix in cls.PIC_SUFFIXES:
            if suffix in pic_url:
                return suffix.replace('@', '')
        return '.errorpic'

    @staticmethod
    def _has_type(types, type_id):
        """Tell whether ``type_id`` is one of the ids in a ',1,2,' style list."""
        # Match the id together with its delimiters so that 1 does not match 12.
        return ',{0},'.format(type_id) in types

    @staticmethod
    def _strip_quotes(text):
        """Remove single and double quotes from ``text``."""
        return text.replace('"', '').replace("'", '')

    @staticmethod
    def _labelled_value(soup, css_class, label):
        """Return the text of ``<p class=css_class>`` without its label.

        '未知' is returned when the paragraph is missing or holds only the label.
        """
        tag = soup.find('p', {"class": css_class})
        text = tag.text.strip() if tag is not None else label
        if text == label:
            return '未知'
        return text.replace(label, '')

    def _newest_download(self):
        """Return the name of the most recently modified file in DOWNLOAD_DIR.

        Raises IndexError when the folder is empty and OSError when it cannot
        be read.
        """
        names = os.listdir(self.DOWNLOAD_DIR)
        names.sort(key=lambda fn: os.path.getmtime(os.path.join(self.DOWNLOAD_DIR, fn)))
        return names[-1]

    # ------------------------------------------------------------------
    # crawl steps
    # ------------------------------------------------------------------
    def login(self, cursor, type_id, type_q):
        """Sign in to Baidu with Chrome and then crawl the tag ``type_q``.

        :param cursor: open DB cursor used for all queries of the crawl.
        :param type_id: id of the song type row the songs are attached to.
        :param type_q: URL-quoted tag name used in the listing URL.
        """
        # The original built headless Chrome options but never passed them to
        # the driver; they are dropped because the captcha prompt below needs
        # a browser window the operator can look at.
        driver = webdriver.Chrome()
        try:
            driver.maximize_window()
            driver.get("http://i.baidu.com/welcome/")
            time.sleep(5)
            driver.find_element_by_xpath('/html/body/header/div/div/a[2]').click()
            time.sleep(2)
            # '用户' / '密码' look like placeholders for the real credentials.
            user_box = driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__userName"]')
            user_box.clear()
            user_box.send_keys('用户')
            time.sleep(2)
            password_box = driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__password"]')
            password_box.clear()
            password_box.send_keys('密码')
            # A captcha may or may not be shown.
            time.sleep(3)
            try:
                driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__verifyCodeChange"]').click()
                captcha = raw_input(u'请输入验证码：')
                code = driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__verifyCode"]')
                code.clear()
                code.send_keys(captcha)
            except Exception:
                print(u'没有验证码。')
            driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__submit"]').submit()
            time.sleep(2)
            self.parse_html(driver, cursor, type_id, type_q)
        finally:
            # Do not leave a Chrome process behind, whatever happened above.
            driver.quit()

    def parse_html(self, driver, cursor, type_id, type_q, page=1):
        """Crawl the tag listing from ``page`` until there is no next page.

        Pages are walked iteratively (the original recursed once per page).
        The cursor and its connection are closed when the crawl ends, also
        when it ends with an exception.
        """
        self._cursor = cursor
        try:
            while self._crawl_page(driver, cursor, type_id, type_q, page):
                page += 1
                print(u'------开始获取下一页的数据----')
            print(u"-----爬虫程序即将结束-----")
        finally:
            # Fetch the connection first: closing the cursor may detach it.
            connection = cursor.connection
            cursor.close()
            if connection is not None:
                connection.close()

    def _crawl_page(self, driver, cursor, type_id, type_q, page):
        """Process every song of one listing page; return True if more pages exist."""
        start = (page - 1) * self.PAGE_SIZE
        print(u'---开始获取第{0}页的数据----'.format(page))
        url = 'http://music.baidu.com/tag/{0}?start={1}&size={2}&third_type=0'.format(
            type_q, start, self.PAGE_SIZE)
        driver.get(url)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Must be decided now: the driver leaves this page while songs are processed.
        try:
            driver.find_element_by_class_name('page-navigator-next')
            has_next = True
        except Exception:
            has_next = False
        for span in soup.find_all('span', {"class": "song-title"}):
            link = span.find('a')
            if link is None or not link.get('href'):
                continue
            self._crawl_song(driver, cursor, type_id, link['href'])
        return has_next

    def _crawl_song(self, driver, cursor, type_id, href):
        """Scrape, download and store the song behind the relative link ``href``."""
        song_id = href.replace('/song/', '')
        m_url = 'http://music.baidu.com{0}'.format(href)
        data = self.save_music_info(m_url, type_id, cursor)
        # 'check' marks a song that is already stored.
        if 'check' in data:
            print(u'---该歌曲已经存在---')
            return
        singer_path = os.path.join(self.MUSIC_ROOT, data['singer'])
        music_lrc = self.save_music_lrc(driver, song_id, singer_path)
        # 'lrc_name' is absent both when there are no lyrics and when the
        # lyric file could not be identified; store an empty value then.
        lrc_name = music_lrc.get('lrc_name')
        if lrc_name:
            print(u"歌词：" + lrc_name)
            data['words'] = u'music2/LRC/' + lrc_name
        else:
            data['words'] = ''
        data['filepath'] = u'music2/{0}/{1}.mp3'.format(data['singer'], data['name'])
        # The id column is assigned by hand: highest numeric id + 1.
        cursor.execute('select  id from network_music order by cast(id as SIGNED INTEGER) desc limit 0,1')
        newest = cursor.fetchone()
        id_n = str(int(newest[0]) + 1) if newest else str(1)
        row = (id_n, data['name'], data['singer'], data['album'], data['publishtime'],
               data['publishcompany'], data['composer'], data['lyrics'], data['filesize'],
               data['filetime'], data['userhead'], data['types'], data['status'],
               data['words'], data['filepath'])
        self.save_db(cursor, [row])

    def save_music_info(self, m_url, type_id, cursor=None):
        """Scrape one song page and return its column values as a dict.

        The singer folder is created if needed and the cover image is saved.
        If a row with the same name and singer already exists, ``type_id`` is
        appended to its TYPES list when missing and the returned dict carries
        the extra key ``'check'``.

        :param cursor: DB cursor; defaults to the cursor of the running crawl.
        """
        if cursor is None:
            cursor = getattr(self, '_cursor', None)
        if cursor is None:
            # Legacy fallback: the original read a module-level ``cursor``.
            cursor = globals()['cursor']

        response = urllib2.urlopen(m_url)
        try:
            html = response.read()
        finally:
            response.close()
        soup = BeautifulSoup(html, 'html.parser')

        # Quotes are stripped from the values that also become file names.
        name = self._strip_quotes(soup.find('span', {"class": "name"}).text.strip())
        singer = self._strip_quotes(
            soup.find('span', {"class": "artist"}).find('a').text.strip())
        singer_dir = os.path.join(self.MUSIC_ROOT, singer)
        if not os.path.exists(singer_dir):
            os.mkdir(singer_dir)
        else:
            print(u'歌手文件夹已经存在！')
        album = soup.find('p', {"class": "album"}).find('a').text.strip()
        publishtime = self._labelled_value(soup, 'publish', u'发行时间：')
        publishcompany = self._labelled_value(soup, 'company', u'发行公司：')

        # Cover image; stays '' when the page has none.
        img = soup.find('img', {"class": "music-song-ing"})
        pic_url = img.get('src') if img is not None else None
        pic_path = self.save_pic(pic_url) if pic_url else ''

        print(u"歌名：" + name)
        print(u"歌手：" + singer)
        data = {
            'name': name,
            'singer': singer,
            'album': album,
            'publishtime': publishtime,
            'publishcompany': publishcompany,
            'composer': '',
            'lyrics': '',
            'filesize': '',
            'filetime': 0,
            'userhead': pic_path if pic_path else '',
            'types': ',' + str(type_id) + ',',
            'status': 0,
        }

        # Duplicate check. Values come from a web page, so they are bound as
        # parameters instead of being formatted into the statement.
        cursor.execute('select id,TYPES from network_music where NAME=%s and SINGER=%s',
                       (name, singer))
        matches = cursor.fetchall()
        if matches:
            row_id, types = matches[0][0], matches[0][1]
            if not self._has_type(types or '', type_id):
                types = (types or ',') + str(type_id) + ','
                cursor.execute("UPDATE network_music SET TYPES=%s WHERE id=%s", (types, row_id))
                # Commit, otherwise the change is not persisted.
                cursor.connection.commit()
            data['check'] = 0
        return data

    def save_music_lrc(self, driver, song_id, singer_path):
        """Download the song and its lyric file through the browser.

        The newest file of the download folder is moved into ``singer_path``.
        Returns a dict with ``'lrc_name'`` (lyric file name) on success, with
        ``'words' == '暂无'`` when the song page offers no lyric link, or empty
        when the lyric file name could not be determined.
        """
        music_lrc = {}
        driver.get('http://music.baidu.com/data/music/file?link=&song_id={0}'.format(song_id))
        time.sleep(3)
        # The newest file in the download folder is taken to be the song.
        try:
            filename = self._newest_download()
            shutil.move(os.path.join(self.DOWNLOAD_DIR, filename), singer_path)
        except (EnvironmentError, IndexError, UnicodeError):
            print(u"移动失败，文件名字问题，手动修改")

        # Open the song page to find the lyric download link.
        driver.get('http://music.baidu.com/song/{0}'.format(song_id))
        time.sleep(2)
        try:
            l_api = driver.find_element_by_xpath('//*[@id="lyricCont"]').get_attribute('data-lrclink')
            driver.get(l_api)
            time.sleep(2)
        except Exception:
            music_lrc['words'] = '暂无'
            print(u'没有歌词')
            return music_lrc
        try:
            music_lrc['lrc_name'] = self.get_lrc_path()
        except (EnvironmentError, IndexError, UnicodeError):
            print(u'获取歌词文件名错误')
        return music_lrc

    def save_db(self, cursor, list):
        """Insert the row tuples in ``list`` into ``network_music`` and commit.

        Failures are reported on stdout and not raised, so one bad row does
        not stop the crawl. (The parameter keeps its original name ``list``.)
        """
        print(list)
        sql = ("insert into network_music(ID,NAME,SINGER,ALBUM,PUBLISHTIME,PUBLISHCOMPANY,COMPOSER,LYRICS,"
               "FILESIZE,FILETIME,USERHEAD,TYPES,STATUS,WORDS,FILEPATH)"
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ")
        try:
            cursor.executemany(sql, list)
            # Commit, otherwise the new rows are not persisted.
            cursor.connection.commit()
        except Exception as exc:
            print('Add this db fault! {0!r}'.format(exc))

    def save_pic(self, pic_url, save_path=''):
        """Download a cover image and return its path relative to the site root.

        The file is stored under ``music2/USERHEAD/`` next to this script and
        named after the current Unix timestamp. Returns ``'no'`` when the
        download fails. ``save_path`` is accepted for compatibility but, as in
        the original, not used.
        """
        endname = self._pic_extension(pic_url)
        target_dir = path.join(d, 'music2/USERHEAD/')
        pic_name = str(int(time.time()))
        try:
            urllib.urlretrieve(pic_url, target_dir + pic_name + endname)
        except Exception:
            return 'no'
        return 'music2/USERHEAD/' + pic_name + endname

    def get_lrc_path(self):
        """Return the name of the newest file in the download folder.

        Called right after a lyric download, so that file is expected to be
        the lyric file. The file itself is left where it is.
        """
        return self._newest_download()

if __name__ == "__main__":
    # Script entry point: connect to MySQL, resolve (or create) the song type
    # entered by the operator, then log in and crawl that tag.
    print(r'Starting....')
    # Small start-up animation: a growing row of '>' characters.
    for i in range(5):
        sys.stdout.write('>' * i + '\n')
        sys.stdout.flush()
        time.sleep(0.5)
    # ``conn`` and ``cursor`` stay module-level names on purpose: spider
    # methods in this file reference them as globals.
    conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
    cursor = conn.cursor()
    type_name = raw_input(r'请输入歌曲的类型： ').strip()
    # Look the type up first; the typed value is bound as a parameter rather
    # than formatted into the SQL text.
    result = cursor.execute(
        "select id from network_type where RESOURCETYPE='m' and TYPENAME=%s", (type_name,))
    if result == 0:
        print(u'-----该类型不存在添加至数据库-------')
        cursor.execute("insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s)",
                       (-1, 'm', type_name))
        type_id = int(cursor.lastrowid)
    else:
        print(u'-----该类型存在不需要添加至数据库-------')
        type_id = cursor.fetchall()[0][0]
    # Commit, otherwise a newly inserted type is not persisted.
    conn.commit()
    type_q = urllib2.quote(type_name)
    bmSpider = BadiuMusicSpider()
    bmSpider.login(cursor, type_id, type_q)
----代码的逻辑
第一步：使用selenium登录百度。（本来我打算用selenium登录之后导出cookie，再通过加载cookie来请求，但是遇到了些问题，再加上工作原因就没有用这个方案，下次有空再试。验证码方面没有做专门处理，遇到验证码就关掉重启；只要登录成功了，就可以爬很久。）
第二步：输入歌曲类型，默认从第一页开始抓取，接下来就是各种循环，入库啥的，还有文件移动。
总的来说还是比较简单的一个爬虫，不足之处大佬轻喷。
查看全文
相关阅读:
css常用属性记录
 js字符串常用方法总结
 mongoose基本操作
 本地存储API
历史相关API
自定义播放器
 HTML5全屏操作API
HTML5自定义属性操作
 HTML5类操作
 案例:3D切割轮播图
原文地址：https://www.cnblogs.com/shuangzikun/p/python_taotao_baidu_music_spider.html