zoukankan      html  css  js  c++  java
  • Pandas读取文件

    # 1. 使用to_excel创建Excel文件
    import pandas as pd
    df = pd.DataFrame({'id':[1,2,3],'name':['zs','ls','ww']})
    # 默认会有索引,将ID列设置成索引,会返回一个新的df,如果想要在原来的df上修改需要添加参数inplace=True
    df = df.set_index('id')
    df.to_excel('./output.xlsx')
    print('end')
    # 2. 使用pandas读取文件
    import pandas as pd
    # 此处需要安装依赖库xlrd
    people = pd.read_excel('~/Desktop/People.xlsx')
    print('获取文件中的行和列:',people.shape)
    print("-"*20)
    print('获取文件中的列名:',people.columns)
    print("-"*20)
    # 默认取前五行
    print('获取文件中的前几行数据信息:',people.head())
    print("-"*20)
    print('获取文件中的后几行数据信息:',people.tail())
    print("-"*20)
    # 注意常见问题:
    # 1. 读取的时候,默认会将第一行作为列名,我们可以修改
    people = pd.read_excel('~/Desktop/People.xlsx',header = 1)
    print(people.columns)
    输出:
    获取文件中的行和列: (19972, 6)
    --------------------
    获取文件中的列名: Index(['ID', 'Type', 'Title', 'FirstName', 'MiddleName', 'LastName'], dtype='object')
    --------------------
    获取文件中的前几行数据信息:    ID      Type Title FirstName MiddleName    LastName
    0   1  Employee   NaN       Ken          J     Sánchez
    1   2  Employee   NaN     Terri        Lee       Duffy
    2   3  Employee   NaN   Roberto        NaN  Tamburello
    3   4  Employee   NaN       Rob        NaN     Walters
    4   5  Employee   Ms.      Gail          A    Erickson
    --------------------
    获取文件中的后几行数据信息:           ID                 Type Title FirstName MiddleName    LastName
    19967  20773  Individual Customer   NaN   Crystal        NaN         Guo
    19968  20774  Individual Customer   NaN  Isabella          F  Richardson
    19969  20775  Individual Customer   NaN   Crystal          S          He
    19970  20776  Individual Customer   NaN   Crystal        NaN       Zheng
    19971  20777  Individual Customer   NaN   Crystal        NaN          Hu
    --------------------
    Index([1, 'Employee', 'NULL', 'Ken', 'J', 'Sánchez'], dtype='object')
    # 2. 使用pandas读取文件
    import pandas as pd
    #2. 如果第一行或者其他行不满足我们的需求时,我们可以自定义
    # 第一种: 设置header为None,会使用默认的01234
    people = pd.read_excel('~/Desktop/People.xlsx',header = None)
    print(people.columns)
    print("-"*20)
    print(people.head())
    print("-"*20)
    # 第二种: 认为的设置默认值
    people.columns = ['ID1','Type1','Title1','FirstName1','MiddleName1','LastName1']
    print(people.columns)
    print("-"*20)
    print(people.head())
    print("-"*20)
    # 重新存储
    people.set_index('ID1',inplace = True)
    print(people.head())
    print("-"*20)
    people.to_excel('./People1.xlsx')
    print('end')
    print("-"*20)
    # 注意读取数据的时候,会将ID1右作为一列输出出来,所以可以在读取的时候用参数指定一下
    people1 = pd.read_excel('./People1.xlsx',index_col = "ID1")
    print(people1.head())
    输出:
    Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')
    --------------------
        0         1      2          3           4           5
    0  ID      Type  Title  FirstName  MiddleName    LastName
    1   1  Employee    NaN        Ken           J     Sánchez
    2   2  Employee    NaN      Terri         Lee       Duffy
    3   3  Employee    NaN    Roberto         NaN  Tamburello
    4   4  Employee    NaN        Rob         NaN     Walters
    --------------------
    Index(['ID1', 'Type1', 'Title1', 'FirstName1', 'MiddleName1', 'LastName1'], dtype='object')
    --------------------
      ID1     Type1 Title1 FirstName1 MiddleName1   LastName1
    0  ID      Type  Title  FirstName  MiddleName    LastName
    1   1  Employee    NaN        Ken           J     Sánchez
    2   2  Employee    NaN      Terri         Lee       Duffy
    3   3  Employee    NaN    Roberto         NaN  Tamburello
    4   4  Employee    NaN        Rob         NaN     Walters
    --------------------
            Type1 Title1 FirstName1 MiddleName1   LastName1
    ID1                                                    
    ID       Type  Title  FirstName  MiddleName    LastName
    1    Employee    NaN        Ken           J     Sánchez
    2    Employee    NaN      Terri         Lee       Duffy
    3    Employee    NaN    Roberto         NaN  Tamburello
    4    Employee    NaN        Rob         NaN     Walters
    --------------------
    end
    --------------------
            Type1 Title1 FirstName1 MiddleName1   LastName1
    ID1                                                    
    ID       Type  Title  FirstName  MiddleName    LastName
    1    Employee    NaN        Ken           J     Sánchez
    2    Employee    NaN      Terri         Lee       Duffy
    3    Employee    NaN    Roberto         NaN  Tamburello
    4    Employee    NaN        Rob         NaN     Walters
    import pandas as pd
    # 指定读哪个表
    sheet = pd.read_excel('~/Desktop/sheet.xlsx',sheet_name='sheet2')
    print(sheet.head())
    print("-"*20)
    # 3. 如果数据在表格中没有顶格写时
    # skiprows : 跳过几行
    # usecols: 使用那几列(C,指的就是Excel上的ABCD....)
    book = pd.read_excel('~/Desktop/Books.xlsx',skiprows=3,usecols ="C:F")
    print(book.head())
    输出:
       ID  age
    0   0   18
    1   1   19
    --------------------
       ID      Name  InStore
    0 NaN  Book_001      NaN
    1 NaN  Book_002      NaN
    2 NaN  Book_003      NaN
    3 NaN  Book_004      NaN
    4 NaN  Book_005      NaN
    import pandas as pd
    book = pd.read_excel('~/Desktop/Books.xlsx',skiprows=3,usecols ="C:F")
    print(book['ID'])
    print("-"*20)
    # 修改ID 的值
    book["ID"].at[0] = 1
    print(book['ID'])
    print("-"*20)
    ''' at和iat函数是只能选择某个位置的值,iat是按照行索引和列索引的位置来选取数据的。而at是按照行索引和列索引来选 取数据;
        loc和iloc函数的功能包含at和iat函数的功能
    '''
    # 使用for循环给ID列添加内容
    for i in book.index:
        book["ID"].at[i] = i+1
    # # 注意: 添加后ID由于默认是float类型,修改一下类型
    book["ID"]= book["ID"].astype(int)
    print(book['ID'])
    print("-"*20)
    # # 给inStore添加交替值
    book["InStore"] = book["InStore"].astype(str)
    for i in book.index:
        book["InStore"].at[i] = 'Yes' if i%2==0 else "No"
    book.set_index('ID',inplace=True)
    book.to_excel('./books1.xlsx')
    输出:
    0    NaN
    1    NaN
    2    NaN
    3    NaN
    4    NaN
    5    NaN
    6    NaN
    7    NaN
    8    NaN
    9    NaN
    10   NaN
    11   NaN
    12   NaN
    13   NaN
    14   NaN
    15   NaN
    16   NaN
    17   NaN
    18   NaN
    19   NaN
    Name: ID, dtype: float64
    --------------------
    0     1.0
    1     NaN
    2     NaN
    3     NaN
    4     NaN
    5     NaN
    6     NaN
    7     NaN
    8     NaN
    9     NaN
    10    NaN
    11    NaN
    12    NaN
    13    NaN
    14    NaN
    15    NaN
    16    NaN
    17    NaN
    18    NaN
    19    NaN
    Name: ID, dtype: float64
    --------------------
    0      1
    1      2
    2      3
    3      4
    4      5
    5      6
    6      7
    7      8
    8      9
    9     10
    10    11
    11    12
    12    13
    13    14
    14    15
    15    16
    16    17
    17    18
    18    19
    19    20
    Name: ID, dtype: int64

     原始数据:

    修改后:

    函数填充计算列

    
    
    import pandas as pd
    books = pd.read_excel('~/Desktop/04Books.xlsx',index_col = 'ID')
    print(books)
    print("-"*20)
    # 计算Price的值(这种方法是列与列之间对齐后进行计算)
    books["Price"] = books['OnePrice'] * books['Count']
    print(books)
    print("-"*20)
    # 如果只想算某一段就可以,使用循环迭代(是单元格与单元格之间的操作)
    for i in range(5,16):
    books["Price"].at[i] = books["OnePrice"].at[i] * books["Count"].at[i]
    print(books)
    # 如果想修改原文件,直接写回去就可以
    books.to_excel('./05Books.xlsx')
    输出:
    Name OnePrice Count Price
    ID
    1 Product_001 9.82 5 NaN
    2 Product_002 11.99 4 NaN
    3 Product_003 9.62 6 NaN
    4 Product_004 11.08 8 NaN
    5 Product_005 7.75 3 NaN
    6 Product_006 7.34 4 NaN
    7 Product_007 10.97 6 NaN
    8 Product_008 11.14 7 NaN
    9 Product_009 8.98 2 NaN
    10 Product_010 9.18 3 NaN
    11 Product_011 8.31 4 NaN
    12 Product_012 7.29 9 NaN
    13 Product_013 8.36 5 NaN
    14 Product_014 9.16 6 NaN
    15 Product_015 10.31 3 NaN
    16 Product_016 10.26 6 NaN
    17 Product_017 11.95 8 NaN
    18 Product_018 11.22 2 NaN
    19 Product_019 10.95 4 NaN
    20 Product_020 8.82 6 NaN
    --------------------
    Name OnePrice Count Price
    ID
    1 Product_001 9.82 5 49.10
    2 Product_002 11.99 4 47.96
    3 Product_003 9.62 6 57.72
    4 Product_004 11.08 8 88.64
    5 Product_005 7.75 3 23.25
    6 Product_006 7.34 4 29.36
    7 Product_007 10.97 6 65.82
    8 Product_008 11.14 7 77.98
    9 Product_009 8.98 2 17.96
    10 Product_010 9.18 3 27.54
    11 Product_011 8.31 4 33.24
    12 Product_012 7.29 9 65.61
    13 Product_013 8.36 5 41.80
    14 Product_014 9.16 6 54.96
    15 Product_015 10.31 3 30.93
    16 Product_016 10.26 6 61.56
    17 Product_017 11.95 8 95.60
    18 Product_018 11.22 2 22.44
    19 Product_019 10.95 4 43.80
    20 Product_020 8.82 6 52.92
    --------------------
    Name OnePrice Count Price
    ID
    1 Product_001 9.82 5 49.10
    2 Product_002 11.99 4 47.96
    3 Product_003 9.62 6 57.72
    4 Product_004 11.08 8 88.64
    5 Product_005 7.75 3 23.25
    6 Product_006 7.34 4 29.36
    7 Product_007 10.97 6 65.82
    8 Product_008 11.14 7 77.98
    9 Product_009 8.98 2 17.96
    10 Product_010 9.18 3 27.54
    11 Product_011 8.31 4 33.24
    12 Product_012 7.29 9 65.61
    13 Product_013 8.36 5 41.80
    14 Product_014 9.16 6 54.96
    15 Product_015 10.31 3 30.93
    16 Product_016 10.26 6 61.56
    17 Product_017 11.95 8 95.60
    18 Product_018 11.22 2 22.44
    19 Product_019 10.95 4 43.80
    20 Product_020 8.82 6 52.92
     

    原始数据:

    修改后:

    排序

    import pandas as pd
    books = pd.read_excel('~/Desktop/05List.xlsx')
    print(books)
    print("-"*20)
    # 将价钱进行排列
    '''
    by='Price' : 跟据哪一列进行排序
    inplace=True : 是否在原有的DataFrame上修改,
    ascending = True : 默认为升序
    '''
    books.sort_values(by='Price',inplace=True,ascending = False)
    # 将买过的书按价格高低排列
    '''
    by=['Buy','Price'] :先排Buy,在这个基础上排Price
    ascending = [True,False]: 分别指定buy和price的升降序
    '''
    books.sort_values(by=['Buy','Price'],inplace=True,ascending = [True,False])
    print(books)
    输出:
        ID         Name  Price  Buy
    0    1  Product_001   9.82  Yes
    1    2  Product_002  11.99  Yes
    2    3  Product_003   9.62   No
    3    4  Product_004  11.08  Yes
    4    5  Product_005   7.75  Yes
    5    6  Product_006   7.34   No
    6    7  Product_007  10.97  Yes
    7    8  Product_008  11.14  Yes
    8    9  Product_009   8.98   No
    9   10  Product_010   9.18  Yes
    10  11  Product_011   8.31  Yes
    11  12  Product_012   7.29   No
    12  13  Product_013   8.36  Yes
    13  14  Product_014   9.16  Yes
    14  15  Product_015  10.31   No
    15  16  Product_016  10.26  Yes
    16  17  Product_017  11.95  Yes
    17  18  Product_018  11.22   No
    18  19  Product_019  10.95  Yes
    19  20  Product_020   8.82  Yes
    --------------------
        ID         Name  Price  Buy
    17  18  Product_018  11.22   No
    14  15  Product_015  10.31   No
    2    3  Product_003   9.62   No
    8    9  Product_009   8.98   No
    5    6  Product_006   7.34   No
    11  12  Product_012   7.29   No
    1    2  Product_002  11.99  Yes
    16  17  Product_017  11.95  Yes
    7    8  Product_008  11.14  Yes
    3    4  Product_004  11.08  Yes
    6    7  Product_007  10.97  Yes
    18  19  Product_019  10.95  Yes
    15  16  Product_016  10.26  Yes
    0    1  Product_001   9.82  Yes
    9   10  Product_010   9.18  Yes
    13  14  Product_014   9.16  Yes
    19  20  Product_020   8.82  Yes
    12  13  Product_013   8.36  Yes
    10  11  Product_011   8.31  Yes
    4    5  Product_005   7.75  Yes

     数据库连接

    import pandas as pd
    import pymysql
    conn=pymysql.connect(host='localhost',user='root',passwd='123',db='tieba',port=3306,charset='utf8mb4')
    query = 'SELECT id,info,url FROM tieba_info'
    df = pd.read_sql_query(query,conn)
    print(df)

    赠人玫瑰,手有余香,如果我的文章有幸能够帮到你,麻烦帮忙点下右下角的推荐,谢谢!

    作者: imcati

    出处: https://www.cnblogs.com/imcati/>

    本文版权归作者所有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出, 原文链接

  • 相关阅读:
    【转】Senior Data Structure · 浅谈线段树(Segment Tree)
    1260:【例9.4】拦截导弹(Noip1999)
    1255:迷宫问题
    1253:抓住那头牛
    1254:走出迷宫
    POJ 3692 幼儿园做游戏 最大团 模板题
    POJ 1466 大学谈恋爱 二分匹配变形 最大独立集
    POJ 3422 矩阵取数 最小费用流拆点+负边
    POJ 2195 一人一房 最小费用流 建图 水题
    POJ 3068 运送危险化学品 最小费用流 模板题
  • 原文地址:https://www.cnblogs.com/imcati/p/11305719.html
Copyright © 2011-2022 走看看