A crawler I'd been putting off for ages
Code first; the write-up will get filled in bit by bit
update (2018-5-7): added the publisher field
update (2018-6-29): added everything, see https://github.com/general10/duangduang
First up, we want to scrape book information from Dangdang.
The listing page itself gives us the title, total comment count, price, and discount.
These fields are all easy to handle: they sit right in the HTML, so we just pull them out.
.text gets a tag's content; [] indexing gets an attribute.
```python
bookname = data.find_all('div', attrs={'class': 'name'})
bookstar = data.find_all('div', attrs={'class': 'star'})
bookprice = data.find_all('div', attrs={'class': 'price'})
bookoff = data.find_all('span', attrs={'class': 'price_s'})

bookname[i].find('a')['title']      # title (attribute)
bookprice[i].find('span').text[1:]  # price (drop the leading currency sign)
bookoff[i].text[:-1]                # discount (drop the trailing character)
bookstar[i].find('a').text[:-3]     # comment count (drop the trailing suffix)
```
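A minimal, self-contained illustration of that .text versus [] distinction (the HTML fragment here is made up, just shaped like one entry of the bestseller list):

```python
from bs4 import BeautifulSoup

# Made-up fragment in the shape of a Dangdang list item
html = '<div class="name"><a title="Some Book" href="http://product.dangdang.com/1.html">Some Book</a></div>'
soup = BeautifulSoup(html, 'lxml')

a = soup.find('div', attrs={'class': 'name'}).find('a')
print(a.text)      # .text -> tag content:  Some Book
print(a['title'])  # []    -> attribute:    Some Book
print(a['href'])   # []    -> attribute:    http://product.dangdang.com/1.html
```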
The URL behind each title leads to a page with the positive, neutral, and negative review counts, plus the positive-review rate.
(The rate is actually already recoverable in the previous step: the little star element is filled proportionally.)
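If you did want the rate straight off the list page, the idea would be to read the star element's fill width. Here is a sketch assuming the fill is an inline style="width: NN%" on a nested span; I haven't verified Dangdang's exact markup, so treat the selector as a placeholder:

```python
import re

def star_rate(star_div):
    """Approximate the positive rate from the star fill width.
    Assumes markup like <span class="level"><span style="width: 94.2%;"></span></span> (unverified)."""
    span = star_div.find('span', style=True)  # first nested span carrying an inline style
    if span is None:
        return None
    m = re.search(r'width:\s*([\d.]+)%', span['style'])
    return float(m.group(1)) if m else None
```

Under that assumption, star_rate(bookstar[i]) would give something like 94.2.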
We can see the reviews in the element inspector,
but they're nowhere in the page source,
because the review section is fetched asynchronously via AJAX.
So open DevTools with F12
and look through the network requests for the one that loads the comments;
a glance at its Preview tab is enough to confirm it's the review request.
Then check the Request URL under Headers.
This URL is fully determined by three parameters: productId, categoryPath, and mainProductId.
All three can be matched out of the page source with regular expressions.
```python
def getId(html):
    id = {}
    # note the backslashes: \d matches digits, while a bare [d] would only match the letter d
    ma = re.search(r'"productId":"(\d+)"', html)
    id['productId'] = ma.group(1)
    ma = re.search(r'"categoryPath":"([\d.]+)"', html)
    id['categoryPath'] = ma.group(1)
    ma = re.search(r'"mainProductId":"([\d.]+)"', html)
    id['mainProductId'] = ma.group(1)
    return id
```
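A quick sanity check against a made-up fragment of the inline script (the IDs below are placeholders, not a real product):

```python
# Hypothetical fragment shaped like the ids embedded in the page source
sample = '"productId":"12345678","categoryPath":"01.43.79.01.00.00","mainProductId":"12345678"'
print(getId(sample))
# -> {'productId': '12345678', 'categoryPath': '01.43.79.01.00.00', 'mainProductId': '12345678'}
```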
Then we assemble the URL and download its content.
```python
def getCommentUrl(id):
    return 'http://product.dangdang.com/index.php?r=comment%2Flist&productId={productId}&categoryPath={categoryPath}&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0'.format(
        productId=id['productId'], categoryPath=id['categoryPath'], mainProductId=id['mainProductId'])
```
The response body is JSON.
Pretty-print it and it becomes much easier to read.
Python's built-in json module handles the parsing.
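To eyeball the structure before writing any parsing code, a throwaway sketch (it reuses the getJsonText helper from the full listing below and the json_url assembled above):

```python
import json

payload = json.loads(getJsonText(json_url))  # json_url comes from getCommentUrl()
# indent=2 pretty-prints the structure; ensure_ascii=False keeps the Chinese readable
print(json.dumps(payload, indent=2, ensure_ascii=False))
```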
```python
def getCommentCount(url):
    html = urllib2.urlopen(url).read()

    # match the three ids out of the page source
    id = getId(html)

    # assemble the AJAX comment url
    json_url = getCommentUrl(id)

    # fetch and parse the JSON behind that url
    json_html = json.loads(getJsonText(json_url))

    # read the review counts out of the summary block
    summary = json_html['data']['list']['summary']
    comment = {}
    comment['好评'] = summary['total_crazy_count']        # positive reviews
    comment['中评'] = summary['total_indifferent_count']  # neutral reviews
    comment['差评'] = summary['total_detest_count']       # negative reviews
    comment['好评率'] = summary['goodRate']               # positive rate
    return comment
```
Finally, write everything into Excel.
The call is .write(row, column, content).
```python
sheet1.write(page * 20 + i + 1, 0, page * 20 + i + 1)
sheet1.write(page * 20 + i + 1, 1, bookname[i].find('a')['title'])
sheet1.write(page * 20 + i + 1, 2, bookprice[i].find('span').text[1:])
sheet1.write(page * 20 + i + 1, 3, bookoff[i].text[:-1])
sheet1.write(page * 20 + i + 1, 4, bookstar[i].find('a').text[:-3])
sheet1.write(page * 20 + i + 1, 5, data['好评'])
sheet1.write(page * 20 + i + 1, 6, data['中评'])
sheet1.write(page * 20 + i + 1, 7, data['差评'])
sheet1.write(page * 20 + i + 1, 8, data['好评率'])
wb.save('test.xls')
```
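For reference, here is the xlwt pattern in isolation; rows and columns are 0-indexed, and the legacy .xls format it produces tops out at 65536 rows:

```python
import xlwt

wb = xlwt.Workbook()
sheet = wb.add_sheet('Sheet')
sheet.write(0, 0, 'title')    # write(row, column, content): a header cell
sheet.write(1, 0, 'Example')  # first data row under that header
wb.save('demo.xls')           # nothing hits disk until save() is called
```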
Result
Full code
(this is the original version)

```python
# -*- coding: utf-8 -*-

import urllib2
import xlwt
from bs4 import BeautifulSoup
import re
import json
import requests


def getJsonText(url):
    try:
        r = requests.get(url, timeout=1)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print '获取失败'
        return ''


def getId(html):
    id = {}
    # \d matches digits; a bare [d] would only match the letter d
    ma = re.search(r'"productId":"(\d+)"', html)
    id['productId'] = ma.group(1)
    ma = re.search(r'"categoryPath":"([\d.]+)"', html)
    id['categoryPath'] = ma.group(1)
    ma = re.search(r'"mainProductId":"([\d.]+)"', html)
    id['mainProductId'] = ma.group(1)
    return id


def getCommentUrl(id):
    return 'http://product.dangdang.com/index.php?r=comment%2Flist&productId={productId}&categoryPath={categoryPath}&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0'.format(
        productId=id['productId'], categoryPath=id['categoryPath'], mainProductId=id['mainProductId'])


def getCommentCount(url):
    html = urllib2.urlopen(url).read()

    # match the three ids out of the page source
    id = getId(html)

    # assemble the AJAX comment url
    json_url = getCommentUrl(id)

    # fetch and parse the JSON behind that url
    json_html = json.loads(getJsonText(json_url))

    # read the review counts
    summary = json_html['data']['list']['summary']
    comment = {}
    comment['好评'] = summary['total_crazy_count']        # positive reviews
    comment['中评'] = summary['total_indifferent_count']  # neutral reviews
    comment['差评'] = summary['total_detest_count']       # negative reviews
    comment['好评率'] = summary['goodRate']               # positive rate
    return comment


def main():
    wb = xlwt.Workbook()
    sheet1 = wb.add_sheet("Sheet")
    sheet1.write(0, 0, unicode('序号', "utf-8"))
    sheet1.write(0, 1, unicode('书名', "utf-8"))
    sheet1.write(0, 2, unicode('价格', "utf-8"))
    sheet1.write(0, 3, unicode('折扣', "utf-8"))
    sheet1.write(0, 4, unicode('评论数', "utf-8"))
    sheet1.write(0, 5, unicode('好评', "utf-8"))
    sheet1.write(0, 6, unicode('中评', "utf-8"))
    sheet1.write(0, 7, unicode('差评', "utf-8"))
    sheet1.write(0, 8, unicode('好评率', "utf-8"))

    for page in range(25):

        url = 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%d' % (page + 1)
        get = urllib2.urlopen(url).read()
        data = BeautifulSoup(get, 'lxml')

        bookname = data.find_all('div', attrs={'class': 'name'})
        bookstar = data.find_all('div', attrs={'class': 'star'})
        bookprice = data.find_all('div', attrs={'class': 'price'})
        bookoff = data.find_all('span', attrs={'class': 'price_s'})

        for i in range(20):
            bookurl = bookname[i].find('a')['href']
            data = getCommentCount(bookurl)
            print (str(page * 20 + i + 1) + " "
                   + bookname[i].find('a')['title'] + " "      # title
                   + bookprice[i].find('span').text[1:] + " "  # price
                   + bookoff[i].text[:-1] + " "                # discount
                   + bookstar[i].find('a').text[:-3] + " "     # comment count
                   + data['好评'] + " "                         # positive reviews
                   + data['中评'] + " "                         # neutral reviews
                   + data['差评'] + " "                         # negative reviews
                   + data['好评率'] + " "                       # positive rate
                   )

            sheet1.write(page * 20 + i + 1, 0, page * 20 + i + 1)
            sheet1.write(page * 20 + i + 1, 1, bookname[i].find('a')['title'])
            sheet1.write(page * 20 + i + 1, 2, bookprice[i].find('span').text[1:])
            sheet1.write(page * 20 + i + 1, 3, bookoff[i].text[:-1])
            sheet1.write(page * 20 + i + 1, 4, bookstar[i].find('a').text[:-3])
            sheet1.write(page * 20 + i + 1, 5, data['好评'])
            sheet1.write(page * 20 + i + 1, 6, data['中评'])
            sheet1.write(page * 20 + i + 1, 7, data['差评'])
            sheet1.write(page * 20 + i + 1, 8, data['好评率'])
    wb.save('test.xls')


main()
```
On the Mac I had to swap a few things before it would run (no idea why at the time; most likely that environment was Python 3, which has no urllib2 or unicode).

```python
# -*- coding: utf-8 -*-

import xlwt
from bs4 import BeautifulSoup
import re
import json
import requests


def getJsonText(url):
    try:
        r = requests.get(url, timeout=1)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print('获取失败')
        return ''


def getId(html):
    id = {}
    # \d matches digits; a bare [d] would only match the letter d
    ma = re.search(r'"productId":"(\d+)"', html)
    id['productId'] = ma.group(1)
    ma = re.search(r'"categoryPath":"([\d.]+)"', html)
    id['categoryPath'] = ma.group(1)
    ma = re.search(r'"mainProductId":"([\d.]+)"', html)
    id['mainProductId'] = ma.group(1)
    return id


def getCommentUrl(id):
    return 'http://product.dangdang.com/index.php?r=comment%2Flist&productId={productId}&categoryPath={categoryPath}&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0'.format(
        productId=id['productId'], categoryPath=id['categoryPath'], mainProductId=id['mainProductId'])


def getCommentCount(url):
    html = requests.get(url).text

    # match the three ids out of the page source
    id = getId(html)

    # assemble the AJAX comment url
    json_url = getCommentUrl(id)

    # fetch and parse the JSON behind that url
    json_html = json.loads(getJsonText(json_url))

    # read the review counts
    summary = json_html['data']['list']['summary']
    comment = {}
    comment['好评'] = summary['total_crazy_count']        # positive reviews
    comment['中评'] = summary['total_indifferent_count']  # neutral reviews
    comment['差评'] = summary['total_detest_count']       # negative reviews
    comment['好评率'] = summary['goodRate']               # positive rate
    return comment


def main():
    wb = xlwt.Workbook()
    sheet1 = wb.add_sheet("Sheet")
    sheet1.write(0, 0, '序号')
    sheet1.write(0, 1, '书名')
    sheet1.write(0, 2, '作者')
    sheet1.write(0, 3, '出版社')
    sheet1.write(0, 4, '价格')
    sheet1.write(0, 5, '折扣')
    sheet1.write(0, 6, '评论数')
    sheet1.write(0, 7, '好评')
    sheet1.write(0, 8, '中评')
    sheet1.write(0, 9, '差评')
    sheet1.write(0, 10, '好评率')

    for page in range(2):

        url = 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%d' % (page + 1)
        get = requests.get(url).text
        data = BeautifulSoup(get, 'lxml')

        bookname = data.find_all('div', attrs={'class': 'name'})
        bookstar = data.find_all('div', attrs={'class': 'star'})
        bookpublish = data.find_all('div', attrs={'class': 'publisher_info'})
        bookprice = data.find_all('div', attrs={'class': 'price'})
        bookoff = data.find_all('span', attrs={'class': 'price_s'})

        for i in range(20):
            bookurl = bookname[i].find('a')['href']
            comments = getCommentCount(bookurl)
            print(str(page * 20 + i + 1) + " "
                  + bookname[i].find('a')['title'] + " "        # title
                  + bookpublish[i * 2].find('a').text + " "     # author
                  + bookpublish[i * 2 + 1].find('a').text + " " # publisher
                  + bookprice[i].find('span').text[1:] + " "    # price
                  + bookoff[i].text[:-1] + "折 "                 # discount
                  + bookstar[i].find('a').text[:-3] + " "       # comment count
                  + comments['好评'] + " "                       # positive reviews
                  + comments['中评'] + " "                       # neutral reviews
                  + comments['差评'] + " "                       # negative reviews
                  + comments['好评率'] + "% "                    # positive rate
                  )

            sheet1.write(page * 20 + i + 1, 0, page * 20 + i + 1)
            sheet1.write(page * 20 + i + 1, 1, bookname[i].find('a')['title'])
            sheet1.write(page * 20 + i + 1, 2, bookpublish[i * 2].find('a').text)
            sheet1.write(page * 20 + i + 1, 3, bookpublish[i * 2 + 1].find('a').text)
            sheet1.write(page * 20 + i + 1, 4, bookprice[i].find('span').text[1:])
            sheet1.write(page * 20 + i + 1, 5, bookoff[i].text[:-1] + '折')
            sheet1.write(page * 20 + i + 1, 6, bookstar[i].find('a').text[:-3])
            sheet1.write(page * 20 + i + 1, 7, comments['好评'])
            sheet1.write(page * 20 + i + 1, 8, comments['中评'])
            sheet1.write(page * 20 + i + 1, 9, comments['差评'])
            sheet1.write(page * 20 + i + 1, 10, comments['好评率'] + '%')
    wb.save('test.xls')


main()
```
(screenshot)