豆瓣上有图书的排行榜,所以这次写了一个豆瓣的爬虫。
首先是分析排行榜的url
根据这个可以很容易的知道不同图书的排行榜就是在网站后面加上/tag/【类别】,所以我们首先要获得图书的类别信息。
这里可以将读书首页的热门标签给爬下来。
爬取标签内容并不难,代码如下:
1 def getLabel(url): #获得热门标签 2 html = getHTMLText(url) 3 soup = BeautifulSoup(html, 'html.parser') 4 a = soup.find_all('a') 5 label_list = [] 6 for i in a: 7 try: 8 href = i.attrs['href'] 9 match = re.search(r'/tag/.*', href) 10 if match and match[0][5]!='?': 11 label_list.append(match[0]) 12 except: 13 continue 14 return label_list
接下来是进入排行榜页面进行信息爬取,
代码如下:
1 def getBookInfo(): 2 label_list = getLabel('https://book.douban.com/') 3 label = get_label(label_list) 4 name = [] 5 author = [] 6 price = [] 7 score = [] 8 number = [] 9 for page in label_list[int(label)-1:int(label)]: 10 for i in range(2): 11 html = getHTMLText('https://book.douban.com' + page + '?start=' + str(i*20) + '&type=T') 12 soup = BeautifulSoup(html, 'html.parser') 13 book_list = soup.find_all('div', attrs={'class':'info'}) #找到书籍的信息列表 14 for book in book_list: 15 a = book.find_all('a',attrs={'title':re.compile('.*')})[0] #找到包含书籍名的a标签 16 name.append(a.get('title')) #获得标题属性 17 18 pub = book.find_all('div', attrs={'class':'pub'})[0] 19 pub = pub.string.strip().replace(' ','') 20 author.append(re.findall(r'(.*?)/', pub)[0].strip()) 21 split_list = pub.split() #空格分割 22 for j in split_list: 23 match = re.search(r'd.*..*', j) #获得价格信息 24 if match: 25 price.append(match[0]) 26 27 span = book.find_all('span', attrs={'class':'pl'})[0] #获得评价人数所在标签 28 span = span.string.strip().replace(' ','') 29 number.append(re.findall(r'd+', span)[0]) #获得人数 30 31 span = book.find_all('span', attrs={'class':'rating_nums'})[0] 32 score.append(span.string) 33 34 tplt = "{:3} {:15} {:15} {:10} {:4} {:7}" #规定输出格式 35 print(tplt.format("序号", "书籍", "作者", "价格", "评分", "评价人数")) 36 l = len(name) 37 for count in range(l): 38 print(tplt.format(count+1, name[count],author[count],price[count],score[count],number[count]))
最终的总代码为:
1 import requests 2 import re 3 from bs4 import BeautifulSoup 4 5 6 def getHTMLText(url): 7 try: 8 r = requests.get(url, timeout=30) 9 r.raise_for_status() 10 r.encoding = r.apparent_encoding 11 return r.text 12 except: 13 return "" 14 15 16 def getLabel(url): #获得热门标签 17 html = getHTMLText(url) 18 soup = BeautifulSoup(html, 'html.parser') 19 a = soup.find_all('a') 20 label_list = [] 21 for i in a: 22 try: 23 href = i.attrs['href'] 24 match = re.search(r'/tag/.*', href) 25 if match and match[0][5]!='?': 26 label_list.append(match[0]) 27 except: 28 continue 29 return label_list 30 31 32 def get_label(label_list): 33 count = 1 34 for i in label_list: 35 print(str(count) + ': ' + label_list[count-1][5:] + ' ', end='') 36 count = count + 1 37 choose = input(' 请输入你想查询的图书类别:') 38 while int(choose)<=0 or int(choose)>=count: 39 choose = input(' 请输入正确的类别编号:') 40 return int(choose) 41 42 def getBookInfo(): 43 label_list = getLabel('https://book.douban.com/') 44 label = get_label(label_list) 45 name = [] 46 author = [] 47 price = [] 48 score = [] 49 number = [] 50 for page in label_list[int(label)-1:int(label)]: 51 for i in range(2): 52 html = getHTMLText('https://book.douban.com' + page + '?start=' + str(i*20) + '&type=T') 53 soup = BeautifulSoup(html, 'html.parser') 54 book_list = soup.find_all('div', attrs={'class':'info'}) #找到书籍的信息列表 55 for book in book_list: 56 a = book.find_all('a',attrs={'title':re.compile('.*')})[0] #找到包含书籍名的a标签 57 name.append(a.get('title')) #获得标题属性 58 59 pub = book.find_all('div', attrs={'class':'pub'})[0] 60 pub = pub.string.strip().replace(' ','') 61 author.append(re.findall(r'(.*?)/', pub)[0].strip()) 62 split_list = pub.split() #空格分割 63 for j in split_list: 64 match = re.search(r'd.*..*', j) #获得价格信息 65 if match: 66 price.append(match[0]) 67 68 span = book.find_all('span', attrs={'class':'pl'})[0] #获得评价人数所在标签 69 span = span.string.strip().replace(' ','') 70 number.append(re.findall(r'd+', span)[0]) #获得人数 71 72 span = book.find_all('span', attrs={'class':'rating_nums'})[0] 73 score.append(span.string) 74 75 tplt = "{:3} {:15} {:15} {:10} {:4} {:7}" #规定输出格式 76 print(tplt.format("序号", "书籍", "作者", "价格", "评分", "评价人数")) 77 l = len(name) 78 for count in range(l): 79 print(tplt.format(count+1, name[count],author[count],price[count],score[count],number[count])) 80 81 82 83 if __name__ =='__main__': 84 print('豆瓣图书综合排序查询 ') 85 getBookInfo() 86
最后的运行效果:
首先是类别表:
输入图书类别后就可以显示图书信息了:
我这里只爬取了两页的图书信息。
因为有些书的信息是不完整的,所以在爬取时可能会出现错误。我正则表达式写得也不是很好,很多地方都是会出错的,比如价格那儿。