zoukankan      html  css  js  c++  java
  • Pandas操作数据库及保存csv

    数据的保存

    import pandas as pd
    import numpy as np
    from pandas import Series
    
    # Three sample rows: a label, four values (one of them a string) and an
    # optional message that is missing (NaN) for the first two rows.
    col_db = [
        ['one', 1, 2, 3, 4, np.nan],
        ['two', 5, 6, 8, 'world', np.nan],
        ['three', 9, 10, 11, 12, 'foo'],
    ]
    column_names = ['somthing', 'a', 'b', 'c', 'd', 'message']
    data = pd.DataFrame(col_db, columns=column_names)
    data
    
        somthing	a	b	c	d	message
    0	one	1	2	3	4	NaN
    1	two	5	6	8	world	NaN
    2	three	9	10	11	12	foo
    
    # DataFrame.to_csv writes comma-separated output by default.
    data.to_csv('save.csv')
    # Use a custom field separator.
    data.to_csv('save_.csv', sep='|')
    # Missing values are written as empty strings unless na_rep is given.
    data.to_csv('save_1.csv', na_rep='NULL')
    # Drop both the header row and the index column, keeping only the values.
    # NOTE: the next call writes to the same path and replaces this output.
    data.to_csv('save_2.csv', header=False, index=False)
    # Write only the listed columns, in the listed order.
    data.to_csv('save_2.csv', index=False, columns=['a', 'b', 'c'])
    # Series.from_csv was deprecated in pandas 0.21 and removed in 1.0.
    # pd.read_csv with the first column as the index and no header row,
    # followed by selecting the first remaining column, gives the same Series.
    saved = pd.read_csv('save_2.csv', index_col=0, header=None).iloc[:, 0]
    # from_csv returned an unnamed Series with an unnamed index.
    saved.index.name = None
    saved.name = None
    saved
    
    a     b
    1     2
    5     6
    9    10
    dtype: object
    

    手工处理分隔符格式

    大部分的表格型数据都能用pd.read_table进行加载,但是由于含有畸形行的文件而使read_table出毛病的情况并不少见
    例如如下的格式文件:

    a, b, c d
    1, 2, 3
    1, 2, 3, 4
    import csv
    # pd.read_csv raises an error on a file like the one above, so parse it
    # with the csv module instead.
    # The with-block closes the file once the rows have been read, and
    # newline='' lets the csv module handle line endings itself.
    with open('save_2.csv', newline='') as f:
        # csv.reader takes the open file object and yields one list per row.
        reader = csv.reader(f)
        for line in reader:
            print(line, type(line))
    
    ['a', 'b', 'c'] <class 'list'>
    ['1', '2', '3'] <class 'list'>
    ['5', '6', '8', '10'] <class 'list'>
    
    # Collect every parsed row into a list; the with-block closes the file
    # instead of leaving the handle open.
    with open('save_2.csv', newline='') as csv_file:
        lines = list(csv.reader(csv_file))
    lines
    
    [['a', 'b', 'c'], ['1', '2', '3'], ['5', '6', '8', '10']]
    
    # The first row holds the column names; everything after it is data.
    header = lines[0]
    values = lines[1:]
    # zip(*values) transposes the rows into columns (stopping at the shortest
    # row); pairing each column with its header gives (name, column) tuples.
    columns = zip(*values)
    p = zip(header, columns)
    for pair in p:
        print(pair)
    
    ('a', ('1', '5'))
    ('b', ('2', '6'))
    ('c', ('3', '8'))
    
    # Build the header -> column mapping directly from the (name, column) pairs.
    dict(zip(header, zip(*values)))
    
    {'a': ('1', '5'), 'b': ('2', '6'), 'c': ('3', '8')}
    
    # Delimited files can be written by hand with csv.writer.
    # Mode 'w' truncates the file and writes it from scratch.
    # newline='' stops the csv module's own '\r\n' line endings from being
    # translated again, which would produce blank rows on Windows.
    with open('save_2.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('new_1', 'new_2'))
    

    JSON数据

    如何将JSON对象转为DataFrame或其他便于分析的数据结构

    
    import json
    obj = '''
    {
    "name":"wes",
    "places_lived":["United Statues","Spain","Germany"],
    "pet": null,
    "siblings":[{"name":"Scott","age":25,"pet":"Zuko"},
        {"name":"Katie","age":33,"pet":"Cisco"}]
    }
    '''
    # Parse the JSON text into plain Python objects (dicts, lists, None).
    result = json.loads(obj)
    # "siblings" is a list of flat dicts, which maps directly onto DataFrame
    # rows, so only that part of the document is converted.
    siblings = result['siblings']
    pd.DataFrame(siblings)
    
        age	name	pet
    0	25	Scott	Zuko
    1	33	Katie	Cisco
    

    使用数据库

    # 导入内置的SQLite数据库
    import sqlite3
    # DDL for the demo table: two text columns, one real and one integer.
    query = '''
    CREATE TABLE test
    (
    a VARCHAR(20),
    b VARCHAR(20),
    c REAL,
    d INT
    );
    '''

    # ':memory:' creates the database in RAM rather than in a file.
    con = sqlite3.connect(':memory:')
    # Using the connection as a context manager commits on success.
    with con:
        con.execute(query)

    # A few rows to insert, one tuple per row.
    data = [
        ('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5),
    ]
    stmt = 'INSERT INTO test VALUES(?,?,?,?)'
    with con:
        con.executemany(stmt, data)

    # Read everything back as a list of tuples.
    cursor = con.execute('select * from test')
    rows = cursor.fetchall()
    rows
    
    [('Atlanta', 'Georgia', 1.25, 6),
     ('Tallahassee', 'Florida', 2.6, 3),
     ('Sacramento', 'California', 1.7, 5)]
    
     # Column metadata of the last query: one 7-item tuple per column, with
     # the column name in the first position.
    cursor.description
    (('a', None, None, None, None, None, None),
     ('b', None, None, None, None, None, None),
     ('c', None, None, None, None, None, None),
     ('d', None, None, None, None, None, None))
    
     # zip(*seq) transposes a sequence of tuples, i.e. it undoes a plain zip.
    k = zip(*cursor.description)
    # A zip object cannot be indexed ("'zip' object is not subscriptable"),
    # so it is turned into a list before the first tuple is taken.
    transposed = list(k)
    transposed[0]
    
    ('a', 'b', 'c', 'd')
    
    # The first item of each description entry is the column name.
    sql_columns = [column[0] for column in cursor.description]
    pd.DataFrame(rows, columns=sql_columns)
    
        a	b	c	d
    0	Atlanta	Georgia	1.25	6
    1	Tallahassee	Florida	2.60	3
    2	Sacramento	California	1.70	5
    
    pandas有一个可以简化上面过程的read_sql函数,只需要传入select语句和数据库连接对象即可
    import pandas.io.sql as sql
    # read_sql runs the query on the given connection and returns a DataFrame.
    select_all = 'select * from test'
    sql.read_sql(select_all, con)
    
        a	b	c	d
    0	Atlanta	Georgia	1.25	6
    1	Tallahassee	Florida	2.60	3
    2	Sacramento	California	1.70	5
    
    # pandas链接mysql同理
    import pymysql
    
    # `password` and `database` are PyMySQL's current keyword names; `passwd`
    # and `db` are deprecated aliases kept for MySQLdb compatibility.
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='123456',
        database='taobao',
        charset='utf8',
    )
    try:
        tblive2 = sql.read_sql('select * from tblive2', conn)
    finally:
        # Close the connection even if the query fails.
        conn.close()
    tblive2
    
    存取MongoDB中的数据
    import pymongo
    
    # Create the client connection.
    con2 = pymongo.MongoClient('localhost', port=27017)
    try:
        # Select the database.
        db = con2.wechat_spider

        # Query the "posts" collection (specific to the author's machine).
        find_list = db.posts.find()
        # The cursor is materialised with list() before being handed to
        # DataFrame, and this happens while the client is still open.
        posts = pd.DataFrame(list(find_list))
    finally:
        # Close the client even if the query fails.
        con2.close()
    posts
    
  • 相关阅读:
    HttpClient-----待补充
    JDK8的新特性
    关于日期转换的知识点(SimpleDateFormat)
    mybatis中的增删改查操作
    mybatis的快速入门
    018 HDFS中,namenode与datanode的交互
    Unit的各种断言
    分组数据
    Javassist学习总结
    hibernate Validator 6.X 的学习,bean的约束(字段,get方法上的验证)
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/9996013.html
Copyright © 2011-2022 走看看