zoukankan      html  css  js  c++  java
  • 2.5&2.6 numpy&pandas 笔记

    https://www.bilibili.com/video/av16378934?from=search&seid=9658415144190341605

    numpy.random模块 https://blog.csdn.net/fireflychh/article/details/73603838

    pandas && numpy

    1. 属性以及创建矩阵

    import numpy as np
    array = np.array([[1,2,3],
                      [2,3,4]]) #建立一个矩阵(注意'[]'的数量)
    print(array)
    print('number of dim', array.ndim)  #矩阵的维数
    print('shape', array.shape)    #矩阵的形状
    print('size', array.size)   #矩阵的大小,数据个数
    
    a = np.array([2,23,4], dtype=np.int) ##int32,32位int    @1
    a1 = np.zeros((3,4), dtype=np.int32)  #3行4列的全为0, 32位int类型数据
    a2 = np.ones((3,4), dtype=np.int32)  #3行4列的全为1, 32位int类型数据
    a3 = np.arange(10, 20, 2)  #生成从10到20(取不到20)的步长为2的数列
    a4 = np.arange(12).reshape((3,4))  #生成从0到11的,3行4列的矩阵
    a5 = np.linspace(1, 10, 5)    #生成线段,生成从1到10一共5段线段
    #  Out[13]: array([ 1.  ,  3.25,  5.5 ,  7.75, 10.  ])

    2. 基础运算

    import numpy as np
    a = np.array([10,20,30,40])
    b = np.arange(4)   #[0,1,2,3]
    c = a - b #减法
    d = a + b #加法
    e = a * b #乘法注意不是矩阵乘法
    f = b**2  #次方
    g = 10*np.sin(a) #sin与cos用法一样
    print(b < 3)    #[ True  True  True False]
    
    a1 = np.array([[1,1],
                   [0,1]])
    b1 = np.arange(4).reshape((2,2))
    c1 = a1 * b1 #逐个相乘    array([[0, 1],
                                   # [0, 3]])
    c1_dot = np.dot(a1, b1) ##矩阵乘法  array([[2, 4],
                                           #  [2, 3]])
    c1_dot_2 = a.dot(b) ##另一种写法 个人觉得可读性较差
    a3 =
    a2 = np.random.random((2,4))#array([[0.41322246, 0.70201529, 0.3669404 , 0.24802252],
                                   #[0.3022681 , 0.89514217, 0.63173225, 0.57511299]])
    x = np.sum(a2, axis=1)  #求和,axis=1表示以取行元素操作列, axis=0表示以列取元素操作行,Out[24]: array([1.73020067, 2.4042555 ])
    x1 = np.min(a2, axis=0) #最小值 array([0.3022681 , 0.70201529, 0.3669404 , 0.24802252])
    x2 = np.max(a2, axis=1) #最大值array([0.70201529, 0.89514217])
    import numpy as np
    a = np.arange(2, 14).reshape((3,4))
    print(np.argmin(a)) ##最小值的索引位置  0
    print(np.argmax(a))  ##最大值的索引位置  11
    print('平均值',np.mean(a)) ##平均值 或者  a.mean()
    print(np.average(a))  ##平均值
    print('中位数',np.median(a)) ##中位数 7.5
    print('累加',np.cumsum(a))  ##累加依次相邻两个数之和 [ 2  5  9 14 20 27 35 44 54 65 77 90]
    print('累差',np.diff(a))  ##相邻两个数之间的差值
    #累差 [[1 1 1]
    #[1 1 1]
    #[1 1 1]]
    print('输出非零数据的行数和列数',np.nonzero(a))
    #行数:(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64),
    #列数: array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
    print('转置',np.transpose(a)) ##矩阵转置
    print('类似二值化',np.clip(a, 5, 9)) ##类似二值化 小于5的数变成5,大于9的数变成9
    #[[5 5 5 5]
    # [6 7 8 9]
    # [9 9 9 9]]   如果数据相反就只有大于5的变5而不是小于9的变9
    
    a1 = np.array([[1,4,2,7],
                  [3,1,5,8],
                   [8,3,5,6]])
    print("排序", np.sort(a1)) ##注意逐行排序
    # [[1 2 4 7]
    # [1 3 5 8]
    # [3 5 6 8]]

    3.numpy的索引

    import numpy as np
    a = np.arange(3,15).reshape((3,4))
    ##如果是数列的话直接a[n]
    print(a[2]) ##输出第二行所以数据
    print('等同于',a[2, :])
    print(a[2][2]) ##输出第二行第二列
    print(a[2, 1:3])##第二行第一列到第二列
    ##迭代行
    for raw in a:
        print(raw)
    ##迭代列
    for column in a.T: ##a.T 相当于 np.transpose(a) a的转置
        print(column)
    ##迭代项
    print('将矩阵转换成一行的序列', a.flatten()) ## 返回序列[ 3  4  5  6  7  8  9 10 11 12 13 14]
    for item in a.flat: ##flat() 返回的是一个生成序列的迭代器
        print(item)

    4.numpy 合并

    import numpy as np
    a = np.array([1,1,1])
    b = np.array([2,2,2])
    print('上下的合并',np.vstack((a,b,c))) ## 注意需要括号将合并的数据括起来
    #[[1 1 1]
    # [2 2 2]]
    print('左右合并',np.hstack((a,b,c)))  ##[1 1 1 2 2 2]
    
    ##注意transpose无法将序列转换为列
    #需要如下操作
    a1 = np.array([1,1,1])[:,np.newaxis] #或者reshape(n,1)也可以
    a2 = np.array([2,2,2])[:,np.newaxis]
    print(np.hstack((a1,a2)))
    #[[1 2]
    # [1 2]
    # [1 2]]
    print('总的合并方法(在newaxis方法下)',np.concatenate((a1,a2),axis=1))  ##(在newaxis方法下)才能使用,1行0列

    就newaxis方法进行如下解释

    首先newaxis作用是增加维度

    详解:np.newaxis在[]中第几位,a.shape的第几维就变成1,a的原来的维度依次往后排。
    例子:若a.shape=(a ,b, c)
    a[:, np.newaxis].shape= (a, 1, b, c)
    a[:, np.newaxis, np.newaxis].shape= (a, 1, 1, b, c)
    a[np.newaxis, :].shape= (1, a, b, c)
    a[np.newaxis, np.newaxis, :].shape= (1, 1, a, b, c)
    a[np.newaxis, :, np.newaxis].shape= (1, a, 1, b, c)
    a[np.newaxis, :, np.newaxis, :].shape= (1, a, 1, b, c)

    a[:, :,np.newaxis].shape= (a, b,1, c)

    另外

    np.newaxis=None

    a[np.newaxis, :, np.newaxis, :].shape == a[None,:,None,:]

    如下是具体例子:

    import numpy as np
    a = np.array([1,1,1])
    a1 = a[:,np.newaxis]
    b = np.array([2,2,2])
    b1 = b[np.newaxis,:]
    print(a.shape, a)
    print(a1.shape)
    print(a1)
    print(b.shape, b)
    print(b1.shape)
    print(b1)

    结果

    (3,) [1 1 1] 
    (3, 1)
    [[1]
    [1]
    [1]]
    (3,) [2 2 2]
    (1, 3)
    [[2 2 2]]

    c = np.array([[1,1,1],[2,2,2]])
    c1 = c[:,np.newaxis]
    d = np.array([[2,2,2],[3,3,3]])
    d1 = d[np.newaxis,:]
    print(c.shape)
    print(c)
    print(c1.shape)
    print(c1)
    print(d.shape)
    print(d)
    print(d1.shape)
    print(d1)
    结果
    (2, 3)
    [[1 1 1]
     [2 2 2]]
    (2, 1, 3)
    [[[1 1 1]]
    
     [[2 2 2]]]
    (2, 3)
    [[2 2 2]
     [3 3 3]]
    (1, 2, 3)
    [[[2 2 2]
      [3 3 3]]]

    5.numpy的array分割

    import numpy as np
    a = np.arange(12).reshape((3,4))
    print(a)
    ##没有整除会报ValueError
    print(np.split(a, 2, axis=1))  ##将a按照列数平均分成两份,每取一行分成两半
    print(np.split(a, 3, axis=0))  ##将a按照行数平均分成3分,每取一列分成3分
    #[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
    
    ##不等量分割
    print(np.array_split(a,2,axis=0))
    #[array([[0, 1, 2, 3],
    #       [4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
    print(np.array_split(a,[1,3],axis=1))##按照第一份分0列,第二份分1,2列,第三份分第3列([]相当于切片,索引)
    print('上下分割也就是按照行分割',np.vsplit(a,3))
    print('左右分割也就是按照列数分割',np.hsplit(a,2))

    6. nump的copy以及deep_copy

    In [1]: import numpy as np
    In [2]: a = np.arange(4)
    In [3]: b = a
    In [4]: a
    Out[4]: array([0, 1, 2, 3])
    In [5]: b
    Out[5]: array([0, 1, 2, 3])
    In [6]: a[0] = 11
    In [7]: a
    Out[7]: array([11,  1,  2,  3])
    In [8]: b
    Out[8]: array([11,  1,  2,  3])
    In [9]: b is a
    Out[9]: True
    In [10]: b[1:3] = [22,33]
    In [11]: b
    Out[11]: array([11, 22, 33,  3])
    In [12]: a
    Out[12]: array([11, 22, 33,  3])
    In [13]: c = a.copy() ##deep_copy
    In [14]: a[3] = 44
    In [15]: a
    Out[15]: array([11, 22, 33, 44])
    In [16]: c
    Out[16]: array([11, 22, 33,  3])

    pandas

    1.  基本介绍

     

    import numpy as np
    import pandas as pd
    s = pd.Series([1,3,6,np.nan,44,1]) ##列表,会自动加序号
    print('产生列表',s)
    dates = pd.date_range('20200206',periods=6)
    ##产生日期,periods表示生成的个数
    #DatetimeIndex(['2020-02-06', '2020-02-07', '2020-02-08', '2020-02-09',
    #               '2020-02-10', '2020-02-11'],
    #              dtype='datetime64[ns]', freq='D')
    df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=['a','b','r','t'])
    ## 其中randn之后会讲到, index表示行的名称,column表示列的名称
    #                   a         b         r         t
    #2020-02-06 -0.661183  0.586394  0.867207  0.234598
    #2020-02-07  2.050175  0.962423  0.412010  1.479072
    #2020-02-08  1.132787  1.256786 -0.234434 -0.860920
    #2020-02-09 -0.118119  1.201684 -1.485074  0.460520
    #2020-02-10 -0.040067 -0.871771 -0.720332  0.055199
    #2020-02-11 -0.165296 -1.925641  0.002060  0.701225
    df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
    ##默认名称就是0-n-1的数字
    #   0  1   2   3
    #0  0  1   2   3
    #1  4  5   6   7
    #2  8  9  10  11
    data = pd.Series(np.random.randn(1000),index=np.arange(1000))
    data = data.cumsum()#元素累加和,也是1000个数据
    
    
    
    #字典的表示方法
    df2 = pd.DataFrame({'A':[1.,2.,3.,4.],
                        'B':pd.date_range('20200206',periods=4),
                        'C':pd.Series(np.arange(4)),
                        'D':pd.Categorical(['test','train','train','tets']),
                        'E':'foo'})
    #     A          B  C      D    E
    #0  1.0 2020-02-06  0   test  foo
    #1  2.0 2020-02-07  1  train  foo
    #2  3.0 2020-02-08  2  train  foo
    #3  4.0 2020-02-09  3   tets  foo
    print('打印数据类型',df2.dtypes)
    print('打印列序号的名字',df2.columns)#打印列序号的名字 Index(['A', 'B', 'C', 'D', 'E'], dtype='object')
    print('打印每个数据',df2.values)
    print(df2.describe())
    print('转置',df2.T)
    print('排序',df2.sort_index(axis=1, ascending=False))
    ##倒序排序,按照行取值 ascend上升
    print('按照某列数据排序', df2.sort_values(by='C', ascending=False))
    ##按照'C'列数据进行排序

     

     

     

    2.  数据选择

    import pandas as pdi
    import numpy as np
    
    dates = pd.date_range('20200206', periods=6)
    df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
    方法1
    df.A 与 df['A']一样
    df[0:3], df['20200206':'20200208]
    方法2
    select by label:loc
    df.loc['20200207']
    df.loc['20200207',['A','B']]
    方法3
    select by position: iloc
    df.iloc[1,1]
    df.iloc[[1,3,5],1:3]
    方法4
    Boolean indexing:
    df[df.A>8]

    3. 设置值

    import pandas as pd
    import numpy as np
    
    dates = pd.date_range('20200206', periods=6)
    df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
    #改变值
    df.iloc[2,2] = 111
    df.loc['20200207','B'] = 222
    df[df.A > 4] = 0
    df.B[df.A > 4] = 0
    df['E'] = np.nan ##nan无穷大
    df['F'] = pd.Series(np.arange(6), index=dates)

    4. 处理丢失数据

    import pandas as pd
    import numpy as np
    
    dates = pd.date_range('20200206', periods=6)
    df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
    df.iloc[0,1] = np.nan
    df.iloc[1,2] = np.nan
    
    df.dropna(axis=0, how='any')
    # axis=0按照行取数据删掉列
    # how={'any', 'all'}  any表示出现nan就删掉,all表示全部一行或者一列是nan删掉
    df.fillna(value=0)#将nan数据填入数据0
    df.isnull() #判断是否有丢失的数据
    #                A      B      C      D
    #2020-02-06  False   True  False  False
    #2020-02-07  False  False   True  False
    #2020-02-08  False  False  False  False
    #2020-02-09  False  False  False  False
    #2020-02-10  False  False  False  False
    #2020-02-11  False  False  False  Fals
    np.any(df.isnull()) == True ##检查是否有值丢失

    5.导入导出数据

    读取

     

    保存

     

      

    data = pd.read_csv('C:/Users/XXXX/Pictures/untitled3/student.csv')
    print(data)
    data.to_pickle('C:/Users/XXXX/Desktop/studstudent.pickle')

    6.合并

    1.concat方法

    import pandas as pd
    import numpy as np
    
    df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
    df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
    df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
    
    ##concatenation
    res = pd.concat([df1, df2, df3], axis=1, ignore_index=True)
    #0上下合并,1左右合并
    #ignore_index=True 表示忽略行列的名称
    df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
    df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
    
    res1 = pd.concat([df1,df2], join='inner',ignore_index=True)
    res2 = pd.concat([df1, df2], join='outer',ignore_index=True)
    ##默认是outer,如果名字大小不一样全用NaN填充
    #inner则不同的数据全部裁剪掉
    print(res1)
    print(res2)
    #结果
         b    c    d
    0  0.0  0.0  0.0
    1  0.0  0.0  0.0
    2  0.0  0.0  0.0
    3  1.0  1.0  1.0
    4  1.0  1.0  1.0
    5  1.0  1.0  1.0
         a    b    c    d    e
    0  0.0  0.0  0.0  0.0  NaN
    1  0.0  0.0  0.0  0.0  NaN
    2  0.0  0.0  0.0  0.0  NaN
    3  NaN  1.0  1.0  1.0  1.0
    4  NaN  1.0  1.0  1.0  1.0
    5  NaN  1.0  1.0  1.0  1.0
    res = pd.concat([df1,df2], axis=1, join_axes=[df1.index])
    #join_axes表示只考虑df1.index中行名称(删掉了4)
         a    b    c    d    b    c    d    e
    1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
    2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0

    2. append方法

    res = df1.append([df2,df3], ignore_index=True)

     3. merge方法

     

    import pandas as pd
    import numpy as np
    
    left = pd.DataFrame({'key':['K0','K1','K2','K3'],
                         'A':['A0','A1','A2','A3'],
                         'B':['B0','B1','B2','B3']})
    right = pd.DataFrame({'key':['K0','K1','K2','K3'],
                         'A':['C0','C1','C2','C3'],
                         'D':['D0','D1','D2','D3']})
    
    res = pd.merge(left, right, on='key') #基于'key'的合并
      A_x   B key A_y   D
    0  A0  B0  K0  C0  D0
    1  A1  B1  K1  C1  D1
    2  A2  B2  K2  C2  D2
    3  A3  B3  K3  C3  D3
    res = pd.merge(left, right,on='key', suffixes=['_boy', '_girl'], how='outer')
    ##suffixes在相同的index后加上后缀
      A_boy   B key A_girl   D
    0    A0  B0  K0     C0  D0
    1    A1  B1  K1     C1  D1
    2    A2  B2  K2     C2  D2
    3    A3  B3  K3     C3  D3
    
    
    left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
                         'key2':['K0','K1','K0','K1'],
                         'A':['A0','A1','A2','A3'],
                         'B':['B0','B1','B2','B3']})
    right = pd.DataFrame({'key1':['K0','K1','K1','K2'],
                          'key2':['K0','K0','K0','K0'],
                         'C':['C0','C1','C2','C3'],
                         'D':['D0','D1','D2','D3']})
    res = pd.merge(left, right, on=['key1', 'key2'],how='outer',indicator=True)
    #how=['innner', 'outer', 'left', 'right'] 默认inner
    #innner ,outer类似上述
    #left,right基于左边还是右边的key来进行合并
    #indicator显示是否两者都有,默认Flase
         A    B key1 key2    C    D      _merge
    0   A0   B0   K0   K0   C0   D0        both
    1   A1   B1   K0   K1  NaN  NaN   left_only
    2   A2   B2   K1   K0   C1   D1        both
    3   A2   B2   K1   K0   C2   D2        both
    4   A3   B3   K2   K1  NaN  NaN   left_only
    5  NaN  NaN   K2   K0   C3   D3  right_only
    
    res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
    #left_index,right_index=True表示用左边右边的index进行合并
        A   B key1_x key2_x   C   D key1_y key2_y
    0  A0  B0     K0     K0  C0  D0     K0     K0
    1  A1  B1     K0     K1  C1  D1     K1     K0
    2  A2  B2     K1     K0  C2  D2     K1     K0
    3  A3  B3     K2     K1  C3  D3     K2     K0

     

    7.画图(部分)

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    data = pd.Series(np.random.randn(1000),index=np.arange(1000))
    data = data.cumsum()
    data.plot()
    plt.show()
    
    data = pd.DataFrame(np.random.randn(1000,4), columns=list('ABCD'))
    data = data.cumsum()
    data.plot()
    plt.show()
    
    a = data.plot.scatter(x='A', y='B', color='Green')
    data.plot.scatter(x='A',y='C', color='Yellow',ax = a)#ax=a将两副图画在一起
    plt.show()

     

     

     

     

     

     

     

     

     

     

     

     

     

     

     

  • 相关阅读:
    vi命令
    linux pip国内镜像修改
    LDA数学八卦笔记(二)Beta/Dirichlet分布
    不经风雨不见彩虹(个人作业——软件工程实践总结&个人技术博客)
    Java FX前端开发与测试
    个人作业——软件评测
    福大周润发队——团队作业4:系统设计和数据库设计
    福大周润发队——团队作业3:需求分析
    福大周润发队——团队作业2:github编程实战
    结对作业二——顶会热词统计的实现
  • 原文地址:https://www.cnblogs.com/shish/p/12267186.html
Copyright © 2011-2022 走看看