zoukankan      html  css  js  c++  java
  • numpy教程 pandas教程 Python数据科学计算简介(莫烦 视频链接+代码笔记)

    # coding: utf-8
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    # --- Creating arrays and inspecting their attributes ---
    # A nested list becomes a 2-D array (a matrix).
    array = np.array([[1, 2, 3], [2, 3, 4]])
    print(array)
    print('number of dim,几行', np.ndim(array))
    print('shape,行数*列数', np.shape(array))
    print('size,有多少元素', np.size(array))

    # dtype pins the element type (np.float64 would be another choice).
    a = np.asarray([2, 23, 4], dtype=np.int64)
    print(a)  # [ 2 23  4] -- no commas, unlike a Python list
    print(a.dtype)  # int64

    # A matrix made of two identical rows.
    a = np.array([[2, 23, 4]] * 2)
    print(a)

    # 3x4 matrix filled with zeros.
    a = np.zeros(shape=(3, 4))
    print('3行4列的全部为0的矩阵', a, sep='\n')

    # 1x2 matrix filled with ones, stored as 16-bit integers.
    a = np.ones(shape=(1, 2), dtype=np.int16)
    print('1行2列的全部为1的矩阵', a, sep='\n')

    # np.empty only allocates; the values are whatever was in memory.
    a = np.empty(shape=(3, 4))
    print('3行4列的全部什么都没有(几乎接近于0的数字)的矩阵', a, sep='\n')

    # Evenly stepped range: start 10, stop before 20, step 2.
    a = np.arange(10, 20, 2)
    print(a)
    # [10 12 14 16 18]

    # 0..11 laid out as 3 rows by 4 columns.
    a = np.arange(12).reshape(3, 4)
    print(a)
    # [[ 0  1  2  3]
    #  [ 4  5  6  7]
    #  [ 8  9 10 11]]

    # Five evenly spaced samples from 1 to 10, both ends included.
    a = np.linspace(1, 10, num=5)
    print('生成1-10分成5段的序列', a, sep='\n')
    # [ 1.    3.25  5.5   7.75 10.  ]
    
    # --- Element-wise arithmetic: + - * ** and comparisons ---
    a = np.array([10, 20, 30, 40])
    b = np.arange(4)
    print(a, b)
    c = np.subtract(a, b)
    print('a-b', c)
    c = np.add(a, b)
    print('a+b', c)
    c = np.multiply(a, b)
    print('a*b', c)
    c = np.square(a)
    print('a*a', c)
    # np.cos(a) can be used the same way.
    c = 10 * np.sin(a)
    print('10*sin(a)', c)
    # Comparisons are element-wise too.
    print(np.less(b, 3))  # [ True  True  True False]

    # --- Matrix operations ---
    a = np.array([[1, 1], [0, 1]])
    b = np.arange(4).reshape(2, 2)
    # Element-by-element product (not a matrix product).
    c = np.multiply(a, b)
    print(c)
    # True matrix product, spelled two ways.
    c_dot = a @ b
    c_dot_2 = np.matmul(a, b)
    print('矩阵乘法', c_dot)
    print('矩阵乘法方法2', c_dot_2)
    
    # --- Reductions over a random 2x4 matrix with values in [0, 1) ---
    a = np.random.random(size=(2, 4))
    print(a)
    print('求和', a.sum())
    print('最小值', a.min())
    print('最大值', a.max())
    # axis=1 reduces along each row, axis=0 along each column.
    print('每一行求和', a.sum(axis=1))  # e.g. [1.31972875 1.51855042]
    print('每一列求最小值', a.min(axis=0))  # e.g. [0.01769909 0.35831739 0.27856868 0.40177896]
    print('对行求平均值', a.mean(axis=1))

    # 6. numpy basic operations, part 2
    a = np.arange(2, 14).reshape(3, 4)
    print(a)
    # argmax() is the counterpart that returns the index of the maximum.
    print('最小值的索引', a.argmin())
    # Three equivalent spellings of the overall mean.
    print('平均值', np.mean(a), a.mean(), np.average(a), sep='\n')
    print('中位数', np.median(a), sep='\n')
    # Running total over the flattened array.
    print('逐项累加', a.cumsum())
    # Difference between each pair of neighbours within a row.
    print('每邻近的两项相减', np.diff(a))
    # Tuple of (row indices, column indices) of the non-zero entries.
    print('输出非0的位置(行数和列数)', a.nonzero())

    a = np.arange(14, 2, -1).reshape(3, 4)
    print(a)
    # Sorts each row independently.
    print('逐行排序', np.sort(a))
    # Transpose of the matrix.
    print('矩阵转置', a.T)
    print('a^T*a', np.dot(a.T, a))
    # Values below 5 become 5, values above 9 become 9, the rest stay.
    print('array中所有<5的变成5,>9的变成9,中间的不变', a.clip(5, 9))
    # 7. numpy indexing
    a = np.arange(3, 15).reshape(3, 4)
    print(a)
    print(a[2, 1])       # single element: row 2, column 1
    print(a[1][1:3])     # row 1, columns 1 and 2
    print(a[1])          # the whole of row 1
    print(a.T[1])        # the whole of column 1
    # Iterating over a 2-D array yields its rows.
    for row in a:
        print(row)
    # Iterating over the transpose yields the columns.
    for column in np.transpose(a):
        print(column)
    # Flattened copy: [ 3  4  5  6  7  8  9 10 11 12 13 14]
    print("输出矩阵a里面的元素:", a.reshape(-1))
    # Visit every element one by one.
    for item in a.ravel():
        print(item)

    # 8. merging numpy arrays
    a = np.array([1, 1, 1])
    b = np.array([2, 2, 2])
    # vstack puts the inputs on top of each other (vertical merge).
    print("上下合并:", np.vstack([a, b]))
    
    # 11. pandas basics
    # A Series is a labelled 1-D array; np.nan marks a missing entry.
    s = pd.Series([1, 3, 6, np.nan, 44, 1])
    print(s)
    # Six consecutive days, used as the row index below.
    dates = pd.date_range('20160101', periods=6)
    print('设置索引', dates)
    # Building a DataFrame.
    # Way 1: wrap a numpy array; `index` gives the row labels and
    # `columns` the column labels.
    df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
    print(df)
    # Sample output (values are random):
    #                    a         b         c         d
    # 2016-01-01 -0.852069  0.454103  0.720401 -1.379524
    # 2016-01-02 -0.695040  0.045785  0.721502 -0.462416
    # 2016-01-03 -0.501414  0.215428  1.421680 -2.380329
    # 2016-01-04  0.750305  0.012037  0.774156 -0.889714
    # 2016-01-05  0.120922  1.640206 -0.058068  1.104911
    # 2016-01-06 -0.059252 -0.252355 -0.192977 -1.294317
    # Way 2: a dict that maps each column name to that column's data
    # (the data is supplied column by column, not row by row).
    df2 = pd.DataFrame({'A': 1.,
                        'B': pd.Timestamp('20130102'),
                        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                        'D': np.array([3]*4, dtype='float32'),
                        'E': pd.Categorical(['test', 'train', 'test',  'train']),
                        'F': 'foo'})
    print(df2)
    print('每个维度的类型:', df2.dtypes)
    # Fix: the two labels below were swapped in the original --
    # `index` holds the row labels and `columns` the column labels.
    print('输出所有行的标序:', df2.index)
    print('输出所有列的标序:', df2.columns)
    print('输出每一行的值:', df2.values)
    # Summary statistics of the numeric columns.
    print('描述(数值型的方差均值等):', df2.describe())
    print('矩阵转置:', df2.T)
    # axis=1 sorts by column label (axis=0 would sort by row label).
    print('列项倒排(行项axis=0):', df2.sort_index(axis=1, ascending=False))
    print('按值排序:', df2.sort_values(by='E'))
    # 12. pandas: selecting data
    datas = pd.date_range('20130101', periods=6)
    df3 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=datas, columns=['A', 'B', 'C', 'D'])
    print(df3)
    # A column can be read with [] or as an attribute.
    print('输出一列,两种方法:', df3['A'], df3.A)
    # Slicing with [] selects rows, by position or by label range.
    print(df3[0:3], df3['20130102':'20130104'])
    # .loc selects by label.
    print('根据标签来选择(.loc):', df3.loc['20130102'])
    print(df3.loc[:, ['A', 'B']])
    print(df3.loc['20130101', ['A', 'B']])
    # .iloc selects by integer position.
    print('筛选出3到5行,1到3列:', df3.iloc[3:5, 1:3])
    # DataFrame.ix was removed in pandas 1.0. Mixing positions and labels
    # is now done by turning the positions into labels for .loc.
    print('将iloc和loc结合起来筛选,同时用数字和标签混合筛选(.ix已移除,改用.loc):',
          df3.loc[df3.index[:3], ['A', 'C']])
    # Boolean mask: keep only the rows where column A is below 8.
    print('df3[df3.A<8]', df3[df3.A < 8])

    # 13. pandas: setting values
    df3.iloc[2, 2] = 111
    print(df3)
    df3.loc['20130101', 'B'] = 222
    print(df3)
    # Every row whose A exceeds 4 is overwritten with zeros.
    df3[df3.A > 4] = 0
    print(df3)
    # Assigning to a new key adds a column (E is a column, not a row).
    df3['E'] = np.nan
    print('添加一列新的空序列E:', df3)
    # A Series is aligned on the index when it is assigned as a column.
    df3['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods = 6))
    print('添加一列F:', df3)

    # 14. pandas: handling missing values
    # B and C hold integers; cast them to float first so that storing NaN
    # does not depend on implicit upcasting, which newer pandas deprecates.
    df3 = df3.astype({'B': 'float64', 'C': 'float64'})
    df3.iloc[0, 1] = np.nan
    df3.iloc[1, 2] = np.nan
    print(df3)
    # how='any': drop a column as soon as it contains one NaN.
    print('只要列上有nan就丢掉整列:', df3.dropna(axis=1, how='any'))
    # how='all': drop a column only when every value in it is NaN.
    print('只有整列都是nan的时候才丢掉整列:', df3.dropna(axis=1, how='all'))
    print(df3.fillna(value=0))
    print('检测是否有nan', df3.isnull())
    # Reduce the boolean frame to one flag; no `== True` comparison needed.
    print('数据中至少有一个nan:', df3.isnull().values.any())
    # 15. pandas import / export
    # Directory prefix used by the disabled examples below.
    path = '../data/'
    # Disabled examples: read a CSV file into a DataFrame, print it,
    # then save it as a pickle file.
    #data = pd.read_csv(path+'101_wang_feat.csv')
    #print('读取文件', data)
    #data.to_pickle(path+'sivetest.pickle')
    # 16. pandas: combining frames with concat
    # Three frames with identical columns, filled with 0, 1 and 2.
    df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
    df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
    df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
    print(df1)
    print(df2)
    print(df3)
    # axis=0 stacks the frames vertically.
    res = pd.concat([df1, df2, df3], axis=0)
    print(res)

    # join may be 'inner' or 'outer'.
    # These frames overlap only partly in both columns and index.
    df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
    df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
    df3 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
    print(df1)
    print(df2)
    res = pd.concat([df1, df2])
    print('合并df1,df2所有的项,没有的值补为NAN:', res)
    res = pd.concat([df1, df2], join='outer')
    print("join='outer'(默认形式同上)合并df1,df2所有的项,没有的值补为NAN:", res)
    res = pd.concat([df1, df2], join='inner', ignore_index=True)
    print(" join='inner'合并df1,df2都有的项, ignore_index=True,序项排序", res)
    # concat(join_axes=...) was removed in pandas 1.0. Reindexing the
    # result on df1's index keeps exactly the rows df1 has.
    res = pd.concat([df1, df2], axis=1).reindex(df1.index)
    print("reindex(df1.index),以df1的index为准", res)
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    res = pd.concat([df1, df2, df3], ignore_index=True)
    print("在df1后面追加df2:", res)
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    print(s1)
    # To add a Series as one row, turn it into a one-row frame first.
    res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
    print(res)
    # 17. pandas: combining frames with merge
    # Two frames that share one key column.
    left = pd.DataFrame({
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    })
    right = pd.DataFrame({
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    })
    print(left)
    print(right)
    res = left.merge(right, on='key')
    print('基于key合并left和right:', res)

    # Two key columns this time.
    left = pd.DataFrame({
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    })
    right = pd.DataFrame({
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    })
    # `how` may be 'left', 'right', 'inner' or 'outer'.
    res = left.merge(right, on=['key1', 'key2'])
    print('合并多列时输出满足多列值同时相同的部分(默认how=inner):')
    print(res)
    # indicator=True adds a column telling which side each row came from.
    res = left.merge(right, on=['key1', 'key2'], how='outer', indicator=True)
    print("合并多列时输出所以已有的多列值的组合,没有的补NAN(how=outer),indicator=True显示左右组合的情况。默认该列的名字是_merge,改名字: indicator='indicator_column':")
    print(res)

    # Merging on the index instead of on a column.
    left = pd.DataFrame(
        {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
        index=['K0', 'K1', 'K2'],
    )
    right = pd.DataFrame(
        {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
        index=['K0', 'K2', 'K3'],
    )
    print("left")
    print(left)
    print("right")
    print(right)
    res = left.merge(right, left_index=True, right_index=True, how='outer')
    print("merged by index")
    print(res)

    # Both frames have an 'age' column; suffixes keep the two apart.
    boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
    girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
    print(boys)
    print(girls)
    res = boys.merge(girls, on='k', suffixes=('_boy', '_girl'), how='outer')
    print(res)
    # 18. pandas plotting
    # 1000 rows of random values in four columns A..D.
    data = pd.DataFrame(
        np.random.randn(1000, 4),
        index=np.arange(1000),
        columns=['A', 'B', 'C', 'D'],
    )
    print(data.head())
    # Replace each column by its running total.
    data = data.cumsum()
    print(data.head())
    # Default plot: one line per column.
    data.plot()
    plt.show()
    # Other plot kinds:
    # 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
    ax = data.plot(kind='scatter', x='A', y='B', color='DarkBlue', label='Class 1')
    # Passing ax=ax draws the second scatter on the same axes as the first.
    data.plot(kind='scatter', x='A', y='C', color='green', label='Class 2', ax=ax)
    plt.show()
  • 相关阅读:
    【NOI D2T1】量子通信(容斥原理+卡常)
    CF1555D Say No to Palindromes(线段树)
    CF1554B Cobb
    CF1554A Cherry
    【做题笔记】UVA10162 Last Digit
    【做题记录】CF1223D Sequence Sorting
    CF39H
    UVA10763
    题解 AT2361 [AGC012A] AtCoder Group Contest
    このブログについて | About this blog
  • 原文地址:https://www.cnblogs.com/dshn/p/9109602.html
Copyright © 2011-2022 走看看