zoukankan html css js c++ java

爬取淘宝商品数据并保存在excel中

1. re实现

  1 import requests
  2 from requests.exceptions import RequestException
  3 import re,json
  4 import xlwt,xlrd
  5 
  6 # 数据
  7 DATA = []
  8 KEYWORD = 'python'
  9 HEADERS = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
 10                         '/63.0.3239.132 Safari/537.36'}
 11 MAX_PAGE = 10
 12 
 13 
 14 def get_target(data_list):
 15     for item in data_list:
 16          temp = {
 17         'title': item['title'],
 18         'price': item['view_price'],
 19         'sales': item['view_sales'],
 20         'isTmall': '否' if float(item['view_fee']) else '是',
 21         'area': item['item_loc'],
 22         'name': item['nick'],
 23         'url': item['detail_url']
 24          }
 25          DATA.append(temp)
 26     return True
 27 
 28 
 29 # 发送http请求，获取网页源码
 30 def get_html(url,*args):
 31     try:
 32         if not args:
 33             response = requests.get(url,headers=HEADERS)
 34             global COOKIES
 35             COOKIES = response.cookies  # 获取cookie
 36         else:
 37             response = requests.get(url,headers=HEADERS,cookies=COOKIES)
 38 
 39         response.encoding = response.apparent_encoding
 40         return response.text
 41     except RequestException:
 42         print('请求源码出错！')
 43 
 44 # 解析源码，得到目标信息
 45 def parse_html(html,*args):
 46     if not args:
 47         pattern = re.compile(r'g_page_config = (.*?)g_srp_loadCss',re.S)
 48         # 去掉末尾的';'
 49         result = re.findall(pattern, html)[0].strip()[:-1]
 50         # 格式化json，可以用json在线解析工具查看结构
 51         content = json.loads(result)
 52         data_list = content['mods']['itemlist']['data']['auctions']
 53     else:
 54         pattern = re.compile(r'{.*}',re.S)
 55         result = re.findall(pattern,html)[0]
 56         content = json.loads(result)
 57         data_list = content['API.CustomizedApi']['itemlist']['auctions']
 58 
 59     get_target(data_list)
 60 
 61 
 62 def save_to_excel():
 63     f_name = '淘宝%s数据'%KEYWORD
 64     book = xlwt.Workbook(encoding='utf-8',style_compression=0)
 65     sheet = book.add_sheet(f_name)
 66     sheet.write(0, 0, 'title')
 67     sheet.write(0, 1, 'price')
 68     sheet.write(0, 2, 'sales')
 69     sheet.write(0, 3, 'isTmall')
 70     sheet.write(0, 4, 'area')
 71     sheet.write(0, 5, 'name')
 72     sheet.write(0, 6, 'url')
 73     for i in range(len(DATA)):
 74         sheet.write(i+1, 0, DATA[i]['title'])
 75         sheet.write(i+1, 1, DATA[i]['price'])
 76         sheet.write(i+1, 2, DATA[i]['sales'])
 77         sheet.write(i+1, 3, DATA[i]['isTmall'])
 78         sheet.write(i+1, 4, DATA[i]['area'])
 79         sheet.write(i+1, 5, DATA[i]['name'])
 80         sheet.write(i+1, 6, DATA[i]['url'])
 81     book.save('淘宝%s数据.xls'%KEYWORD)
 82 
 83 
 84 
 85 def main():
 86     for offset in range(MAX_PAGE):
 87         #  首页有12条异步加载的数据　api?
 88         if offset == 0:
 89             url1 = 'https://s.taobao.com/search?q={}&s={}'.format(KEYWORD,offset*44)
 90             html = get_html(url1)
 91             contents = parse_html(html)
 92 
 93             url2 = 'https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&' 
 94                    'stats_click=search_radio_all:1&q={}'.format(KEYWORD)
 95             html = get_html(url2,2)
 96             contents = parse_html(html,2)
 97         else:
 98             url = 'https://s.taobao.com/search?q={}&s={}'.format(KEYWORD,offset*44)
 99             html = get_html(url)
100             contents = parse_html(html)
101 
102     save_to_excel()
103     print(len(DATA))
104 
105 if __name__ == '__main__':
106     main()

View Code

查看全文

相关阅读:
pexpect模块
 Python正则表达式
 telnetlib
paramiko
threadpool和Queue
logging
Python异常
 Python迭代器
 程序员工资那么高，却从不炫富？网友回复让人“笑喷了”！
小白到web前端工程师需要学习哪些知识？

原文地址：https://www.cnblogs.com/ray-mmss/p/9052977.html

热门文章
SQL语句
 数据库中的锁
 Java中的锁
 红黑树与AVL树
 死锁银行家算法
 java正则表达式
 java优先队列
 网络IO
lambda相关
 traceback模块