zoukankan      html  css  js  c++  java
  • 数据加载存储和文件格式

    原文地址:

    https://github.com/AsuraDong/Blog/blob/master/Articles/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E6%95%B0%E6%8D%AE%E5%8A%A0%E8%BD%BD%E5%AD%98%E5%82%A8%E5%92%8C%E6%96%87%E4%BB%B6%E6%A0%BC%E5%BC%8F.md

    1.读取文本格式数据

    import pandas as pd
    import numpy as np
    import sys
    import pymysql
    # 图片:pandas解析函数
    df = pd.read_csv('ex1.csv')
    print(df)
       a   b   c   d message
    0  1   2   3   4   hello
    1  5   6   7   8   world
    2  9  10  11  12     foo
    
    df = pd.read_table('ex1.csv',sep=',') #可以使用read_table,但必须指定分隔符
    # sep还可以是正则表达式
    print(df)
       a   b   c   d message
    0  1   2   3   4   hello
    1  5   6   7   8   world
    2  9  10  11  12     foo
    
    df = pd.read_csv('ex2.csv',header = None)#不是每一个csv都有header
    print(df)
       0   1   2   3      4
    0  1   2   3   4  hello
    1  5   6   7   8  world
    2  9  10  11  12    foo
    
    df = pd.read_csv('ex2.csv',names=['a','b','c','d','names'])#指定名字
    print(df)
       a   b   c   d  names
    0  1   2   3   4  hello
    1  5   6   7   8  world
    2  9  10  11  12    foo
    
    names=['a','b','c','d','names']
    df = pd.read_csv('ex2.csv',names=names,index_col='names') #将names做成索引
    print(df)
    #names对应三个,abcd分别有对应的
           a   b   c   d
    names               
    hello  1   2   3   4
    world  5   6   7   8
    foo    9  10  11  12
    
    df = pd.read_csv('csv_mindex.csv')
    print('原始样子:','\n',df)
    df = pd.read_csv('csv_mindex.csv',index_col=['keys','key2'])
    #层次化索引.
    #请注意keys和key2的顺序
    print(df)
    原始样子: 
       keys key2  value1  value2
    0  one    a       1       2
    1  one    b       3       4
    2  two    a       9      10
    3  two    c      13      14
               value1  value2
    keys key2                
    one  a          1       2
         b          3       4
    two  a          9      10
         c         13      14
    
    df = pd.read_csv('ex4.csv')
    print('原始样子:','\n',df)
    #跳过文件的第几行
    print()
    df = pd.read_csv('ex4.csv',skiprows=[0,2])
    print(df)
    原始样子: 
                                                               # hey!
    a                                           b   c   d    message
    # just wanted to make things more difficult NaN NaN NaN      NaN
    1                                           2   NaN 4      hello
    
       a  b   c  d message
    0  1  2 NaN  4   hello
    
    pd.isnull(df)# 处理缺失值
    df = pd.read_csv('ex4.csv',skiprows=[0,2],na_values=['hello'])# 接收一组用于表示缺失值的字符串
    print(df)
    print(pd.isnull(df))
       a  b   c  d  message
    0  1  2 NaN  4      NaN
           a      b     c      d  message
    0  False  False  True  False     True
    
    sentinels = {'message':['foo','NA'],'d':['a','NaN']}# 用一个字典为各列指定不同的NA标记值
    df = pd.read_csv('ex4.csv',skiprows=[0,2],na_values=sentinels)
    print(df)
       a  b   c  d message
    0  1  2 NaN  4   hello
    
    # 图片:read_table/csv参数

    2.逐块读取文本文件

    # nrows参数指定只读取的数据行数(不含表头行);文件不足nrows行时返回全部数据行
    pd.read_csv('ex1.csv',nrows=4)
       a   b   c   d message
    0  1   2   3   4   hello
    1  5   6   7   8   world
    2  9  10  11  12     foo
    # chunksize 指定分块读取
    chunks = pd.read_csv('ex1.csv',chunksize=2)
    print(chunks)
    <pandas.io.parsers.TextFileReader object at 0x0000007D7E4A39B0>
    
    for chunk in chunks:
        print(chunk)
        print('='*10,)
       a  b  c  d message
    0  1  2  3  4   hello
    1  5  6  7  8   world
    ==========
       a   b   c   d message
    2  9  10  11  12     foo
    ==========
    

    3.将数据写出到文本格式

    data = pd.read_csv('ex1.csv',nrows=3)
    data.to_csv('ex1_1.csv') #to_csv写入
    data.to_csv('ex1_2.csv',sep='|')# 别的分隔符
    data.to_csv('ex1_1.csv',na_rep='NULL')# 缺失值会被替换为na_rep
    data.to_csv(sys.stdout,index=False,header=False) 
    # 行、列标签被禁止
    # 输出到控制台
    1,2,3,4,hello
    5,6,7,8,world
    9,10,11,12,foo
    
    data.to_csv(sys.stdout,index=False,columns=['a','b'])
    a,b
    1,2
    5,6
    9,10
    
    data.to_csv(sys.stdout)
    ,a,b,c,d,message
    0,1,2,3,4,hello
    1,5,6,7,8,world
    2,9,10,11,12,foo
    

    4.DataFrame

    # 可以将json格式的数据传给DataFrame
    # 也可以将数据库的rows传给DataFrame
    conn = pymysql.Connect(host='172.31.238.166',port=3306,user='luowang',passwd='root',
                          charset='UTF8',db='dyx')
    cursor=conn.cursor()
    sql='select * from access_log';
    cursor.execute(sql)
    rows= cursor.fetchall()
    print(cursor.description)
    (('aid', 3, None, 16, 16, 0, False), ('site_id', 3, None, 16, 16, 0, False), ('count', 3, None, 32, 32, 0, False))
    
    # cursor.description第一个保存了列的信息
    # pd.DataFrame(rows,columns=[i[0] for i in cursor.description])
    pd.DataFrame(rows,columns=zip(*cursor.description)[0])
    ---------------------------------------------------------------------------
    
    TypeError                                 Traceback (most recent call last)
    
    <ipython-input-74-05969a36ac33> in <module>()
          1 # cursor.description第一个保存了列的信息
          2 # pd.DataFrame(rows,columns=[i[0] for i in cursor.description])
    ----> 3 pd.DataFrame(rows,columns=zip(*cursor.description)[0])
    
    
    TypeError: 'zip' object is not subscriptable
    
    [i[0] for i in cursor.description]
    ['aid', 'site_id', 'count']
    
    pd.DataFrame(list(rows),columns=[i[0] for i in cursor.description]) #rows必须是list类型
        aid  site_id  count
    0     1        1     45
    1     2        3    100
    2     3        1    230
    3     4        2     10
    4     5        5    205
    5     6        4     13
    6     7        3    220
    7     8        5    545
    8     9        3    201
    9    10       10     10
    10   11       11     11

  • 相关阅读:
    Spring Cloud入门
    HTML常用标签
    Spring boot 入门
    数据库 基本操作
    jquery中的ajax方法参数
    反射详解
    SpringMVC框架
    Java NIO
    MQ(消息队列)的使用场景以及常见的MQ
    英文字母和中文汉字在不同字符集编码下的字节数
  • 原文地址:https://www.cnblogs.com/AsuraDong/p/7413098.html
Copyright © 2011-2022 走看看