zoukankan      html  css  js  c++  java
  • pandas使用

    import pandas as pd
    import numpy as np
    
    # 读取数据,并以表格的形式显示
    df1 = pd.DataFrame(pd.read_excel('a.xlsx'))
    
    # 判断是否为空,每个表格都会去判断,并返回True/False
    a1 = df1.isnull().values
    """
    [[False False False False False False False False False]
     [False False  True False False False False False False]
     [False False False False False False False False False]
     [False False False False False False False False False]
     [False False False False False  True False False False]
     [False False False False False False False False False]
     [False False False False False False False False False]]
    
    """
    
    # 判断是否为非空,每个表格都会去判断,并返回True/False,会以规则的表格形式返回
    a2 = df1.notnull()
    """
          1     2      3     4     5      6  open_acc  weight  data
    0  True  True   True  True  True   True      True    True  True
    1  True  True  False  True  True   True      True    True  True
    2  True  True   True  True  True   True      True    True  True
    3  True  True   True  True  True   True      True    True  True
    4  True  True   True  True  True  False      True    True  True
    5  True  True   True  True  True   True      True    True  True
    6  True  True   True  True  True   True      True    True  True
    """
    
    # 判断列是否重复,比对每一行,并返回True/False
    b = df1.duplicated()
    """
    0    False
    1    False
    2    False
    3    False
    4    False
    5     True
    6    False
    dtype: bool
    """
    
    # 返回去掉重复行后的表
    c = df1.drop_duplicates()
    """
         1    2    3    4    5    6  open_acc   weight        data
    0   12   12   23   34   56   67         2     12kg  2018/02/26
    1  asd  asd  NaN  aas  qwe  qwe        10   23000g   2018/02/6
    2  123   qw    q    q  qwe  qwe         7  45000mg   2018/5/26
    3    1    2    3    4    5    6        15     123t      2017/9
    4    1    2    3    4    5  NaN        20     456t      18/2/2
    6   12  123    4    3   er   32        12      34g   2015/3/01
    """
    
    # 空值以。。。。方式填充
    d1 = df1.fillna('\')
    """
         1    2   3    4    5    6  open_acc   weight        data
    0   12   12  23   34   56   67         2     12kg  2018/02/26
    1  asd  asd     aas  qwe  qwe        10   23000g   2018/02/6
    2  123   qw   q    q  qwe  qwe         7  45000mg   2018/5/26
    3    1    2   3    4    5    6        15     123t      2017/9
    4    1    2   3    4    5            20     456t      18/2/2
    5    1    2   3    4    5    6        15     123t      2017/9
    6   12  123   4    3   er   32        12      34g   2015/3/01
    """
    
    # 包含空值的行删除
    d2 = df1.dropna()
    """
         1    2   3   4    5    6  open_acc   weight        data
    0   12   12  23  34   56   67         2     12kg  2018/02/26
    2  123   qw   q   q  qwe  qwe         7  45000mg   2018/5/26
    3    1    2   3   4    5    6        15     123t      2017/9
    5    1    2   3   4    5    6        15     123t      2017/9
    6   12  123   4   3   er   32        12      34g   2015/3/01
    """
    
    # 给表更换新的列名,注意字段列表的长度一定要和列的长度相同
    column_names = ['a', 'b', 'c', 'd', 'e', 'f','g','h','i']
    df = pd.read_excel('a.xlsx', names=column_names)
    """
         a    b    c    d    e    f   g        h           i
    0   12   12   23   34   56   67   2     12kg  2018/02/26
    1  asd  asd  NaN  aas  qwe  qwe  10   23000g   2018/02/6
    2  123   qw    q    q  qwe  qwe   7  45000mg   2018/5/26
    3    1    2    3    4    5    6  15     123t      2017/9
    4    1    2    3    4    5  NaN  20     456t      18/2/2
    5    1    2    3    4    5    6  15     123t      2017/9
    6   12  123    4    3   er   32  12      34g   2015/3/01
    """
    
    # 打印第一列
    # print(df1[1])
    """
    0     12
    1    asd
    2    123
    3      1
    4      1
    5      1
    6     12
    Name: 1, dtype: object
    """
    
    # 将列中的12,1替换成'qwe'
    a = df1[1].replace([12, 1], 'qwe')
    """
    0    qwe
    1    qwe
    2    123
    3    qwe
    4    qwe
    5    qwe
    6    qwe
    Name: 1, dtype: object
    """
    
    # 打印数据的前n行,默认为5
    # print(df1.head())
    
    # 给数据分组
    """
    0-5     A  不包含0
    6-10    B
    11-15   C
    16-20   D
    """
    bins = [0, 5, 10, 15, 20]
    group_names = ['A', 'B', 'C', 'D']
    df1['categories'] = pd.cut(df1['open_acc'], bins, labels=group_names)
    """
         1    2    3    4    ...     open_acc   weight        data categories
    0   12   12   23   34    ...            2     12kg  2018/02/26          A
    1    1  asd  NaN  aas    ...           10   23000g   2018/02/6          B
    2  123   qw    q    q    ...            7  45000mg   2018/5/26          B
    3    1    2    3    4    ...           15     123t      2017/9          C
    4    1    2    3    4    ...           20     456t      18/2/2          D
    5    1    2    3    4    ...           15     123t      2017/9          C
    6   12  123    4    3    ...           12      34g   2015/3/01          C
    """
    
    # 获取到需要换算单位的行
    
    rows_with_lbs = df1['weight'].str.contains('mg').fillna(False)
    """
         1   2  3  4    5    6  open_acc   weight       data categories
    2  123  qw  q  q  qwe  qwe         7  45000mg  2018/5/26          B
    """
    
    # 换算单位
    for i, lbs_row in df1[rows_with_lbs].iterrows():
        # print(i) # 获取到值的行
        # print(lbs_row['weight'][-2:])
        weight = int(float(lbs_row['weight'][:-2]) / 1000)
        df1.at[i, 'weight'] = '{}g'.format(weight)
    # print(df1)
    """
         1    2    3    4    ...     open_acc  weight        data categories
    0   12   12   23   34    ...            2    12kg  2018/02/26          A
    1    1  asd  NaN  aas    ...           10  23000g   2018/02/6          B
    2  123   qw    q    q    ...            7     45g   2018/5/26          B
    3    1    2    3    4    ...           15    123t      2017/9          C
    4    1    2    3    4    ...           20    456t      18/2/2          D
    5    1    2    3    4    ...           15    123t      2017/9          C
    6   12  123    4    3    ...           12     34g   2015/3/01          C
    """
    
    # rows_with_lbs = df1['weight'].str.contains('t').fillna(False)
    # for i,lbs_row in df1[rows_with_lbs].iterrows():
    #     # print(i) # 获取到值的行
    #     # print(lbs_row['weight'][:-2])
    #     weight = int(float(lbs_row['weight'][:-2]) * 10000000)
    #     df1.at[i, 'weight'] = '{}g'.format(weight)
    
    # print(df1)
    
    
    # 换算单位封装
    def conversion(DataFrame, column, unit, num, new_unit='', null=False):
        """
    
        :param DataFrame: pandas对象
        :param column: 字段
        :param unit: 当前单位
        :param new_unit: 新单位
        :param num: 当前单位换算成新单位所需要乘的数
        :param null: 是否为空
        :return:pandas对象
    
             1    2    3    4    ...     open_acc     weight        data categories
        0   12   12   23   34    ...            2  1.20e+04g  2018/02/26          A
        1  asd  asd  NaN  aas    ...           10        23g   2018/02/6          B
        2  123   qw    q    q    ...            7        45g   2018/5/26          B
        3    1    2    3    4    ...           15  1.23e+08g      2017/9          C
        4    1    2    3    4    ...           20  4.56e+08g      18/2/2          D
        """
    
    
        rows_with_lbs = DataFrame[column].str.contains(unit).fillna(null)
        column_len = len(unit)
    
        for i, lbs_row in DataFrame[rows_with_lbs].iterrows():
    
            # print(lbs_row['weight'][-column_len:])
            weight = int(float(lbs_row[column][:-column_len]) * num)
    
            # 默认4位,超出长度变为科学计数法
            if len(str(weight)) > 4:
    
                DataFrame.at[i, column] = ('%0.2e' + new_unit) % weight
                """
                     1    2    3    4    ...     open_acc     weight        data categories
                0   12   12   23   34    ...            2  1.20e+04g  2018/02/26          A
                1  asd  asd  NaN  aas    ...           10  2.30e+01g   2018/02/6          B
                2  123   qw    q    q    ...            7  4.50e+01g   2018/5/26          B
                3    1    2    3    4    ...           15  1.23e+08g      2017/9          C
                4    1    2    3    4    ...           20  4.56e+08g      18/2/2          D
                """
            else:
                DataFrame.at[i, column] = str(weight) + new_unit
                """
                     1    2    3    4    ...     open_acc      weight        data categories
                0   12   12   23   34    ...            2      12000g  2018/02/26          A
                1  asd  asd  NaN  aas    ...           10         23g   2018/02/6          B
                2  123   qw    q    q    ...            7         45g   2018/5/26          B
                3    1    2    3    4    ...           15  123000000g      2017/9          C
                4    1    2    3    4    ...           20  456000000g      18/2/2          D
                """
    
        return DataFrame
    
    
    df1 = conversion(df1, 'weight', 'kg', 1000, 'g')  # kg - g
    
    df2 = conversion(df1, 'weight', 'mg', 0.001, 'g')  # mg -g
    
    df3 = conversion(df2, 'weight', 't', 1000000, 'g')  # t - g
    
    print(df3)
    
     
  • 相关阅读:
    平衡二叉树之RB树
    平衡二叉树之AVL树
    实现哈希表
    LeetCode Median of Two Sorted Arrays
    LeetCode Minimum Window Substring
    LeetCode Interleaving String
    LeetCode Regular Expression Matching
    PAT 1087 All Roads Lead to Rome
    PAT 1086 Tree Traversals Again
    LeetCode Longest Palindromic Substring
  • 原文地址:https://www.cnblogs.com/ZSMblog/p/9612465.html
Copyright © 2011-2022 走看看