zoukankan      html  css  js  c++  java
  • process xlsx with pandas

    import pandas as pd
    import numpy as np
    
    !ls sales*.xlsx
    
    sales-feb-2014.xlsx
    sales-jan-2014.xlsx
    sales-mar-2014.xlsx
    
    import glob
    
    glob.glob('sales*.xlsx')
    
    ['sales-feb-2014.xlsx', 'sales-jan-2014.xlsx', 'sales-mar-2014.xlsx']
    
    all_data = pd.DataFrame()
    for f in glob.glob('sales*.xlsx'):
        df = pd.read_excel(f)
        all_data = all_data.append(df, ignore_index=True)
    
    all_data.head()
    
    account number name sku quantity unit price ext price date
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59
    1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46
    2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32
    3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48
    4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20
    all_data['date'] = pd.to_datetime(all_data['date'])
    all_data.head()
    
    account number name sku quantity unit price ext price date
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59
    1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46
    2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32
    3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48
    4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20
    status = pd.read_excel('customer-status.xlsx')
    status
    
    account number name status
    0 740150 Barton LLC gold
    1 714466 Trantow-Barrows silver
    2 218895 Kulas Inc bronze
    3 307599 Kassulke, Ondricka and Metz bronze
    4 412290 Jerde-Hilpert bronze
    5 729833 Koepp Ltd silver
    6 146832 Kiehn-Spinka silver
    7 688981 Keeling LLC silver
    8 786968 Frami, Hills and Schmidt silver
    9 239344 Stokes LLC gold
    10 672390 Kuhn-Gusikowski silver
    11 141962 Herman LLC gold
    12 424914 White-Trantow silver
    13 527099 Sanford and Sons bronze
    14 642753 Pollich LLC bronze
    15 257198 Cronin, Oberbrunner and Spencer gold
    all_data_st = pd.merge(all_data, status, how='left')
    all_data_st.head()
    
    account number name sku quantity unit price ext price date status
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
    1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46 bronze
    2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32 bronze
    3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48 bronze
    4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20 silver
    all_data_st[all_data_st['account number']==737550]
    
    account number name sku quantity unit price ext price date status
    15 737550 Fritsch, Russel and Anderson S1-47412 40 51.01 2040.40 2014-02-05 01:20:40 NaN
    25 737550 Fritsch, Russel and Anderson S1-06532 34 18.69 635.46 2014-02-07 09:22:02 NaN
    66 737550 Fritsch, Russel and Anderson S1-27722 15 70.23 1053.45 2014-02-16 18:24:42 NaN
    78 737550 Fritsch, Russel and Anderson S2-34077 26 93.35 2427.10 2014-02-20 18:45:43 NaN
    80 737550 Fritsch, Russel and Anderson S1-93683 31 10.52 326.12 2014-02-21 13:55:45 NaN
    91 737550 Fritsch, Russel and Anderson S2-82423 46 67.67 3112.82 2014-02-24 08:35:31 NaN
    117 737550 Fritsch, Russel and Anderson S2-82423 14 81.92 1146.88 2014-01-03 19:07:37 NaN
    122 737550 Fritsch, Russel and Anderson B1-53102 23 71.56 1645.88 2014-01-04 08:57:48 NaN
    134 737550 Fritsch, Russel and Anderson B1-53636 42 42.06 1766.52 2014-01-08 00:02:11 NaN
    140 737550 Fritsch, Russel and Anderson S1-27722 20 29.54 590.80 2014-01-09 13:20:40 NaN
    150 737550 Fritsch, Russel and Anderson S1-93683 22 71.68 1576.96 2014-01-11 23:47:36 NaN
    171 737550 Fritsch, Russel and Anderson S2-77896 41 38.43 1575.63 2014-01-16 22:39:11 NaN
    177 737550 Fritsch, Russel and Anderson S2-82423 13 24.98 324.74 2014-01-18 06:32:00 NaN
    209 737550 Fritsch, Russel and Anderson B1-53102 34 99.48 3382.32 2014-01-22 22:00:08 NaN
    223 737550 Fritsch, Russel and Anderson B1-65551 49 19.14 937.86 2014-01-27 01:59:54 NaN
    232 737550 Fritsch, Russel and Anderson S1-65481 47 45.71 2148.37 2014-01-28 15:49:45 NaN
    238 737550 Fritsch, Russel and Anderson S1-50961 1 34.81 34.81 2014-01-30 23:56:20 NaN
    245 737550 Fritsch, Russel and Anderson B1-50809 20 50.11 1002.20 2014-03-01 23:47:17 NaN
    258 737550 Fritsch, Russel and Anderson S2-83881 12 63.60 763.20 2014-03-04 15:26:20 NaN
    336 737550 Fritsch, Russel and Anderson S2-11481 47 49.22 2313.34 2014-03-22 23:44:18 NaN
    339 737550 Fritsch, Russel and Anderson S1-93683 22 18.19 400.18 2014-03-23 10:33:33 NaN
    379 737550 Fritsch, Russel and Anderson B1-65551 12 56.24 674.88 2014-03-31 08:43:24 NaN
    all_data_st['status'].fillna('bronze')
    all_data_st.head()
    
    account number name sku quantity unit price ext price date status
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
    1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46 bronze
    2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32 bronze
    3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48 bronze
    4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20 silver
    all_data_st['status'] = all_data_st['status'].astype('category')
    
    all_data_st.head()
    
    account number name sku quantity unit price ext price date status
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
    1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46 bronze
    2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32 bronze
    3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48 bronze
    4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20 silver
    all_data_st.dtypes
    
    account number             int64
    name                      object
    sku                       object
    quantity                   int64
    unit price               float64
    ext price                float64
    date              datetime64[ns]
    status                  category
    dtype: object
    
    all_data_st.sort_values('status').head()
    
    account number name sku quantity unit price ext price date status
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
    258 737550 Fritsch, Russel and Anderson S2-83881 12 63.60 763.20 2014-03-04 15:26:20 NaN
    261 383080 Will LLC S1-93683 28 90.86 2544.08 2014-03-05 05:11:49 NaN
    66 737550 Fritsch, Russel and Anderson S1-27722 15 70.23 1053.45 2014-02-16 18:24:42 NaN
    201 383080 Will LLC B1-53636 37 14.40 532.80 2014-01-20 22:18:21 NaN
    all_data_st['status'].cat.set_categories(['gold', 'silver', 'bronze'], inplace=True)
    
    all_data_st.sort_values('status').head()
    
    account number name sku quantity unit price ext price date status
    0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
    134 737550 Fritsch, Russel and Anderson B1-53636 42 42.06 1766.52 2014-01-08 00:02:11 NaN
    289 383080 Will LLC S1-82801 3 77.06 231.18 2014-03-11 16:38:10 NaN
    122 737550 Fritsch, Russel and Anderson B1-53102 23 71.56 1645.88 2014-01-04 08:57:48 NaN
    299 604255 Halvorson, Crona and Champlin S1-47412 27 25.69 693.63 2014-03-14 16:01:34 NaN
    all_data_st['status'].describe()
    
    count        307
    unique         3
    top       silver
    freq         140
    Name: status, dtype: object
    
    all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].mean()
    
    quantity unit price ext price
    status
    gold 24.375000 53.723889 1351.944583
    silver 22.842857 57.272714 1320.032214
    bronze 24.284211 59.649684 1452.321368
    all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].agg([np.sum, np.mean, np.std])
    
    quantity unit price ext price
    sum mean std sum mean std sum mean std
    status
    gold 1755 24.375000 14.575145 3868.12 53.723889 28.740080 97340.01 1351.944583 1182.657312
    silver 3198 22.842857 14.512843 8018.18 57.272714 26.556242 184804.51 1320.032214 1086.384051
    bronze 2307 24.284211 14.812777 5666.72 59.649684 26.530155 137970.53 1452.321368 1140.105645
    all_data_st.drop_duplicates(subset=['account number', 'name']).ix[:,[0, 1, 7]].groupby(['status'])['name'].count()
    
    status
    gold      4
    silver    7
    bronze    5
    Name: name, dtype: int64
  • 相关阅读:
    新浪JavaScript库
    Java中创建的对象多了,必然影响内存和性能
    Fidder抓包设置
    JavaScript处理股票数据
    按钮控制示例
    JavaScript更改css样式
    AppCan模拟器调试
    AppCan调试问题
    Java虚拟机JVM学习04 类的初始化
    COM组件和调用详细教程
  • 原文地址:https://www.cnblogs.com/otfsenter/p/6741656.html
Copyright © 2011-2022 走看看