import requests
import re
from bs4 import BeautifulSoup
from string import punctuation
import os
# HTTP request headers sent with every request in this file: a desktop
# Chrome User-Agent string.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
def str2url(s):
    """Decode an obfuscated location string into a plain URL.

    The input is a decimal row count followed by the payload, which starts at
    the first ``'h'``.  The payload is a grid stored row by row: with
    ``cols, extra = divmod(len(payload), rows)``, the first ``extra`` rows
    hold ``cols + 1`` characters and the remaining rows hold ``cols``.
    Reading the grid column by column gives a percent-encoded string; that
    string is unquoted and every ``'^'`` is replaced by ``'0'``.

    Args:
        s: Encoded string, e.g. ``"3hp/bdt:a/t/.c"`` (decodes to
            ``"http://a.b/cd"``).

    Returns:
        The decoded string.

    Raises:
        ValueError: If ``s`` contains no ``'h'`` or the prefix before it is
            not a positive integer.
    """
    # Local import keeps this function standard-library only;
    # requests.utils.unquote is a re-export of this same function.
    from urllib.parse import unquote

    num_loc = s.find('h')
    if num_loc < 0:
        raise ValueError("encoded string contains no 'h' marker")
    rows = int(s[:num_loc])
    if rows <= 0:
        raise ValueError('row count must be a positive integer')

    payload = s[num_loc:]
    cols, right_rows = divmod(len(payload), rows)

    # Offset of each row inside the payload: every earlier row contributes
    # `cols` characters, plus one more for each of the first `right_rows` rows.
    row_starts = [r * cols + min(r, right_rows) for r in range(rows)]

    # Output position i lies in row (i % rows), column (i // rows).
    chars = [
        payload[row_starts[i % rows] + i // rows]
        for i in range(len(payload))
    ]
    return unquote(''.join(chars)).replace('^', '0')
def filtrate(G):
    """Return ``G`` without surrounding whitespace or highlight markup.

    Leading and trailing whitespace is stripped first, then every
    ``<b class="key_red">`` opening tag and every ``</b>`` closing tag is
    removed from the text.

    Args:
        G: A string captured from the search-result HTML.

    Returns:
        The cleaned string.
    """
    cleaned = G.strip()
    for tag in ('<b class="key_red">', '</b>'):
        cleaned = cleaned.replace(tag, '')
    return cleaned
# Interactive downloader: search Xiami for a song title, list the hits, and
# save the track the user picks as an .mp3 file.  Runs until interrupted.
while True:
    song_name = input('请输入歌曲名字')
    # Let requests build the query string so characters such as spaces, '&'
    # or '#' in the title are percent-encoded instead of corrupting the URL.
    song_list = requests.get(
        'http://www.xiami.com/search',
        params={'key': song_name, 'pos': '1'},
        headers=header,
    ).text

    # Groups: song name, artist, album, and the argument list of the
    # play(...) call.  The parentheses of play(...) are escaped so they match
    # literally instead of opening additional capture groups.
    search_pattern = re.compile(
        r'<td class="song_name">.*?<a.*?>(.*?)</a>.*?</td>.*?'
        r'<td class="song_artist">.*?<a.*?>(.*?)</a>.*?</td>.*?'
        r'<td class="song_album">.*?<a.*?>(.*?)</a>.*?</td>.*?'
        r'play\((.*?)\)',
        re.S,
    )
    results = search_pattern.findall(song_list)
    for n, (a, b, c, _play_args) in enumerate(results, 1):
        print(n, filtrate(a), filtrate(b), filtrate(c))

    choice = input('请选择歌曲序号')
    for m, (a, _artist, _album, d) in enumerate(results, 1):
        if choice != str(m):
            continue

        # The first play(...) argument, with its quotes removed, is used as
        # the id in the widget XML address.
        d = d.split(',')[0].replace("'", "")
        songs = 'http://www.xiami.com/widget/xml-single/uid/0/sid/' + str(d)
        print(songs)
        xml_contents = requests.get(songs, headers=header).text

        # Escaped brackets: capture the text inside every CDATA[...] section.
        fields = re.findall(r'CDATA\[(.*?)\]', xml_contents, re.S)
        if len(fields) < 5:
            # fields[4] holds the encoded location; without it there is
            # nothing to download.
            print('未能解析歌曲信息')
            break
        song_url = str2url(fields[4])

        # File name: fields[0] without ASCII punctuation, then fields[2] and
        # fields[3] appended unchanged.
        w = ''.join(ch for ch in fields[0] if ch not in punctuation)
        file_stem = w + fields[2] + fields[3]
        # Backslashes are doubled so '\15' and '\t' are not read as escape
        # sequences and the closing quote is not escaped.
        file_path = 'F:\\15\\top100\\' + file_stem + '.mp3'

        if os.path.isfile(file_path):
            print(file_stem + '已存在')
        else:
            print('正在下载' + filtrate(a) + '请等待')
            data = requests.get(song_url, headers=header).content
            with open(file_path, 'wb') as op:
                op.write(data)
            print('已下载' + filtrate(a) + '请查看')
        break
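# Two other ways to implement the intermediate decoding logic:
# The first is one I came up with later myself: compute the grid coordinates
# first, then derive each string index from its coordinate.  It is the
# shortest version so far: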
import requests
from urllib.parse import unquote

# Demo: decode a sample location string by first listing the grid
# coordinates in reading order, then mapping each one to a string index.
strs = '9hFx%265%7153y3-edc4t%i216E2456F%7%3f9bt2aF183F323a325df8cpFm6%%51478uD%E432%%mi65218%1.t15-489531.8EF7%54mh5E%78dEA2n%3245E3p_3%5bf16%8e2412E173k15E26f2.tF3%61_2%e5E-b19'
h_index = strs.find('h')
rows = int(strs[:h_index])   # the digits before the first 'h' give the row count
payload = strs[h_index:]     # grid contents, stored row by row
# Every row holds strs_num characters; the first strs_m rows hold one more.
strs_num, strs_m = divmod(len(payload), rows)

# (column, row) pairs, column by column.  In the extra last column
# (i == strs_num) only the first strs_m rows have a cell.
lists = [
    (i, j)
    for i in range(strs_num + 1)
    for j in range(rows)
    if j < strs_m or i < strs_num
]

pieces = []
for i, j in lists:
    if j <= strs_m:
        # Only long rows (strs_num + 1 characters each) come before row j.
        index = j * (strs_num + 1) + i
    else:
        # strs_m long rows, then (j - strs_m) short rows, come before row j.
        index = strs_m * (strs_num + 1) + (j - strs_m) * strs_num + i
    pieces.append(payload[index])
n = ''.join(pieces)

decoded = unquote(n).replace('^', '0')
print(decoded)
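# The second is Yue Feng's approach: pad the grid out to a full rectangle,
# read it off, then remove the padding afterwards.  It is somewhat easier to
# understand, at the cost of slightly more code: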
import requests
from urllib.parse import unquote


def decode_padded_grid(raw_url):
    """Decode a location string by padding its grid to a full rectangle.

    The text before the first ``'h'`` is the row count; the rest is the
    payload, stored row by row.  With
    ``cols, right_rows = divmod(len(payload), rows)``, the first
    ``right_rows`` rows hold ``cols + 1`` characters and the others ``cols``.
    Short rows are padded with ``'*'`` so every row has the same width, the
    grid is read column by column, and the padding is dropped again before
    the result is unquoted and every ``'^'`` is replaced by ``'0'``.

    Args:
        raw_url: Encoded string, e.g. ``"3hp/bdt:a/t/.c"``.

    Returns:
        The decoded string.
    """
    num_loc = raw_url.find('h')
    rows = int(raw_url[0:num_loc])
    new_s = raw_url[num_loc:]
    cols, right_rows = divmod(len(new_s), rows)

    # When the payload divides evenly there is no extra column, so the grid
    # is only `cols` wide; otherwise it is `cols + 1` wide.
    width = cols + 1 if right_rows else cols

    padded_rows = []
    pos = 0
    for r in range(rows):
        length = cols + 1 if r < right_rows else cols
        padded_rows.append(new_s[pos:pos + length].ljust(width, '*'))
        pos += length
    new_c = ''.join(padded_rows)

    # Output position i lies in row (i % rows), column (i // rows).
    strs = ''.join(
        new_c[(i % rows) * width + i // rows] for i in range(len(new_c))
    )
    # All padding cells sit in the last column of the short rows, which is
    # read last, so they form the tail of the output.  Cutting back to the
    # payload length removes exactly those cells and leaves any '*' that is
    # part of the payload untouched.
    strs = strs[:len(new_s)]
    return unquote(strs).replace('^', '0')


raw_url = '9hFx%265%7153y3-edc4t%i216E2456F%7%3f9bt2aF183F323a325df8cpFm6%%51478uD%E432%%mi65218%1.t15-489531.8EF7%54mh5E%78dEA2n%3245E3p_3%5bf16%8e2412E173k15E26f2.tF3%61_2%e5E-b19'
aa = decode_padded_grid(raw_url)
print(aa)