zoukankan      html  css  js  c++  java
  • 爬取淘宝商品数据并保存在excel中

    1.re实现
      1 import requests
      2 from requests.exceptions import RequestException
      3 import re,json
      4 import xlwt,xlrd
      5 
      6 # 数据
      7 DATA = []
      8 KEYWORD = 'python'
      9 HEADERS = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
     10                         '/63.0.3239.132 Safari/537.36'}
     11 MAX_PAGE = 10
     12 
     13 
     14 def get_target(data_list):
     15     for item in data_list:
     16          temp = {
     17         'title': item['title'],
     18         'price': item['view_price'],
     19         'sales': item['view_sales'],
     20         'isTmall': '' if float(item['view_fee']) else '',
     21         'area': item['item_loc'],
     22         'name': item['nick'],
     23         'url': item['detail_url']
     24          }
     25          DATA.append(temp)
     26     return True
     27 
     28 
     29 # 发送http请求,获取网页源码
     30 def get_html(url,*args):
     31     try:
     32         if not args:
     33             response = requests.get(url,headers=HEADERS)
     34             global COOKIES
     35             COOKIES = response.cookies  # 获取cookie
     36         else:
     37             response = requests.get(url,headers=HEADERS,cookies=COOKIES)
     38 
     39         response.encoding = response.apparent_encoding
     40         return response.text
     41     except RequestException:
     42         print('请求源码出错!')
     43 
     44 # 解析源码,得到目标信息
     45 def parse_html(html,*args):
     46     if not args:
     47         pattern = re.compile(r'g_page_config = (.*?)g_srp_loadCss',re.S)
     48         # 去掉末尾的';'
     49         result = re.findall(pattern, html)[0].strip()[:-1]
     50         # 格式化json,可以用json在线解析工具查看结构
     51         content = json.loads(result)
     52         data_list = content['mods']['itemlist']['data']['auctions']
     53     else:
     54         pattern = re.compile(r'{.*}',re.S)
     55         result = re.findall(pattern,html)[0]
     56         content = json.loads(result)
     57         data_list = content['API.CustomizedApi']['itemlist']['auctions']
     58 
     59     get_target(data_list)
     60 
     61 
     62 def save_to_excel():
     63     f_name = '淘宝%s数据'%KEYWORD
     64     book = xlwt.Workbook(encoding='utf-8',style_compression=0)
     65     sheet = book.add_sheet(f_name)
     66     sheet.write(0, 0, 'title')
     67     sheet.write(0, 1, 'price')
     68     sheet.write(0, 2, 'sales')
     69     sheet.write(0, 3, 'isTmall')
     70     sheet.write(0, 4, 'area')
     71     sheet.write(0, 5, 'name')
     72     sheet.write(0, 6, 'url')
     73     for i in range(len(DATA)):
     74         sheet.write(i+1, 0, DATA[i]['title'])
     75         sheet.write(i+1, 1, DATA[i]['price'])
     76         sheet.write(i+1, 2, DATA[i]['sales'])
     77         sheet.write(i+1, 3, DATA[i]['isTmall'])
     78         sheet.write(i+1, 4, DATA[i]['area'])
     79         sheet.write(i+1, 5, DATA[i]['name'])
     80         sheet.write(i+1, 6, DATA[i]['url'])
     81     book.save('淘宝%s数据.xls'%KEYWORD)
     82 
     83 
     84 
     85 def main():
     86     for offset in range(MAX_PAGE):
     87         #  首页有12条异步加载的数据 api?
     88         if offset == 0:
     89             url1 = 'https://s.taobao.com/search?q={}&s={}'.format(KEYWORD,offset*44)
     90             html = get_html(url1)
     91             contents = parse_html(html)
     92 
     93             url2 = 'https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&' 
     94                    'stats_click=search_radio_all:1&q={}'.format(KEYWORD)
     95             html = get_html(url2,2)
     96             contents = parse_html(html,2)
     97         else:
     98             url = 'https://s.taobao.com/search?q={}&s={}'.format(KEYWORD,offset*44)
     99             html = get_html(url)
    100             contents = parse_html(html)
    101 
    102     save_to_excel()
    103     print(len(DATA))
    104 
    105 if __name__ == '__main__':
    106     main()
    View Code
  • 相关阅读:
    python DB.fetchall()--获取数据库所有记录列表
    pybot/robot命令参数说明【dos下执行命令pybot.bat --help查看】
    win7 dos命令窗口内容显示不全解决办法--将命令执行结果输出到一个文件中
    【python cookbook】【数据结构与算法】2 从任意长度的可迭代对象中分解元素
    【python cookbook】【数据结构与算法】1将序列分解为单独的变量
    wxPython_Phoenix在线安装
    Python 进阶(五)定制类
    Python 进阶(四)类的继承
    墓型价格分析表
    用碑情况统计
  • 原文地址:https://www.cnblogs.com/ray-mmss/p/9052977.html
Copyright © 2011-2022 走看看