zoukankan html css js c++ java

歌词爬虫

因为要做对话聊天系统，需要大量的语料，所以决定用歌词作为训练数据试试。自己写了个爬虫，爬了大概 23 万首歌曲的歌词。

将这些歌词用作问答对，然后用 LSTM-QA 模型做问答匹配；经过多次实验，达到了一个不错的效果，基本上可以跟你正常聊天。

import re
import sys
import urllib
import urlparse

from BeautifulSoup import BeautifulSoup


# Base URL of the lyrics site; hrefs scraped from its pages are appended to
# this value to build the full URLs that get fetched.
url = u'http://www.lrcgc.com/'
def find_singers():
    """Fetch the artist index page and list every singer linked from it.

    Returns:
        A list of ``[name, href]`` pairs, one per anchor on the page whose
        href looks like a song-list page (``songlist...html``).  ``name`` is
        the anchor text and ``href`` is the link target exactly as it appears
        in the page.
    """
    response = urllib.urlopen('http://www.lrcgc.com/artist-00.html')
    try:
        data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    soup = BeautifulSoup(data)
    # The dot before "html" is escaped so it only matches a literal '.'.
    links = soup.findAll('a', href=re.compile(r'songlist.*\.html'))
    return [[link.text, link['href']] for link in links]

def find_songs(singer):
    """Collect the lyric-page hrefs reachable from one singer's song lists.

    Starting from the singer's first song-list page, every song-list page
    linked from an already visited one is fetched exactly once, and all
    lyric-page links seen along the way are gathered.

    Args:
        singer: A ``[name, href]`` pair; only the href (index 1) is used as
            the first song-list page to fetch.

    Returns:
        A list of unique lyric-page hrefs, in no particular order.
    """
    # Compile once instead of on every loop iteration; the dot before
    # "html" is escaped so it only matches a literal '.'.
    songlist_re = re.compile(r'songlist.*\.html')
    lyric_re = re.compile(r'lyric-.*\.html')

    pending = [singer[1]]   # stack of song-list pages still to fetch
    seen = set(pending)     # every page ever queued (fetched or pending)
    lyric_hrefs = set()

    while pending:
        page = pending.pop()
        response = urllib.urlopen(url + page)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        soup = BeautifulSoup(data)

        for link in soup.findAll('a', href=songlist_re):
            href = link['href']
            # A set gives O(1) membership tests; the visit order is the same
            # as with the two list scans it replaces.
            if href not in seen:
                seen.add(href)
                pending.append(href)

        for link in soup.findAll('a', href=lyric_re):
            lyric_hrefs.add(link['href'])

    return list(lyric_hrefs)

# Crawl every singer and remember the lyric-page hrefs found for each one.
singers_list = find_singers()
dic = {}  # singer name -> list of lyric-page hrefs
for singer in singers_list:
    try:
        ss = find_songs(singer)
        print(singer[0].encode('utf-8') + '\t' + str(len(ss)))
        dic[singer[0]] = ss
    except Exception as exc:
        # Best-effort crawl: one failing singer must not stop the run, but
        # the failure is reported instead of vanishing silently.  Catching
        # Exception (not a bare except) lets Ctrl-C still abort the crawl.
        # repr() keeps the message ASCII-safe whatever the name contains.
        sys.stderr.write('skipping %r: %r\n' % (singer[0], exc))
        continue


def parse_song_href(singer, song_url):
    """Download the file behind a lyric page's ``J_downlrc`` link.

    The lyric page is fetched, the href of its ``<a id="J_downlrc">`` anchor
    is resolved against ``url``, and the downloaded bytes are written to a
    file in the current directory.

    Args:
        singer: Not used by this function; kept so existing callers keep
            working.
        song_url: Href of the lyric page, appended to ``url``.

    Returns:
        The download URL on success, or ``False`` when the page has no
        download anchor or the download / file write fails.  Errors while
        fetching the lyric page itself still propagate to the caller.
    """
    response = urllib.urlopen(url + song_url)
    try:
        data = response.read()
    finally:
        response.close()

    anchors = BeautifulSoup(data).findAll('a', id='J_downlrc')
    if not anchors:
        # No download link on this page: report failure the same way a
        # failed download does instead of raising IndexError.
        return False
    name = anchors[0]['href']
    download_url = url + name

    try:
        remote = urllib.urlopen(download_url.encode('utf-8'))
        try:
            content = remote.read()
        finally:
            remote.close()
        # The second '/'-separated component of the href is the local file
        # name.  Binary mode writes the downloaded bytes unmodified on every
        # platform.
        with open('./' + name.encode('utf-8').split('/')[1], 'wb') as f:
            f.write(content)
        return download_url
    except Exception:
        # Deliberate best-effort: any download or write problem yields False,
        # while KeyboardInterrupt / SystemExit are no longer swallowed.
        return False


# Download every collected lyric page and print each result
# (the download URL on success, False on failure).
for singer_name, song_urls in dic.items():
    for song_url in song_urls:
        result = parse_song_href(singer_name, song_url)
        print(result)

查看全文

相关阅读:
Weex系列-入门2
Weex系列-入门1
Android ClassLoader笔记（二）
Android ClassLoader笔记（一）
Android-DataBinding入门系列（一）基本介绍
 Fiddler简单介绍
 colorWithPatternImage导致的图片错位问题
 自定义view-滑动开关
 ios设置行间距和部分文本颜色
 办公利器-一行代码搞定http服务

原文地址：https://www.cnblogs.com/LarryGates/p/6559737.html