zoukankan      html  css  js  c++  java
  • pandas连接MySQL和impala

    pandas连接MySQL和impala

    SQL连接

    read_sql(sql, 
    		 con, 
    		 index_col=None, 
    		 coerce_float=True, 
    		 params=None, 
    		 parse_dates=None, 
    		 columns=None, 
    		 chunksize=None)
    
    

    参数的意义:

    • sql: 为可执行的sql语句
    • con: 数据库的连接
    • index_col: 选择某一列作为index
    • coerce_float: 将数字形式的字符串直接以float型读入
    • params: 返回传递参数的查询字符串
    • parse_dates: 将某一列日期型字符串转换为datetime型数据,与pd.to_datetime函数功能类似。可以直接提供需要转换的列名以默认的日期形式转换,也可以用字典的格式提供列名和转换的日期格式,比如{column_name: format string}(format string:"%Y:%m:%H:%M:%S")
    • colunms: 要选取的列。一般没啥用,因为在sql命令里面一般就指定要选择的列了
    • chunksize: 如果提供了一个整数值,那么就会返回一个generator,每次输出的行数就是提供的值的大小。
    import pandas as pd
    
    # 方法一:用DBAPI构建数据库链接engine
    import pymysql
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='',
                           database='database_name')
    df = pd.read_sql("select * from table_name",con=conn)
    
    
    # 方法二:用sqlalchemy构建数据库链接engine
    import sqlalchemy
    from sqlalchemy import create_engine
    #connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT, DATABASE)
    connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}'.format('root',
    													   '', 
    													   'localhost', 
    													   3306, 
    													   'nowcoderdb')
    engine=create_engine(connect_info)
    df2=pd.read_sql("select * from table_name",con=engine)
    
    

    impala连接

    # Impala 配置
    import pandas as pd
    from impala.dbapi import connect 
    import decimal
    
    
    
    def impala_data_get(isql, db_name, table_name):
        conn = connect(host=HOST, 
                        port=PORT,
                        user=USER,
                        password=PASSWOED,
                        auth_mechanism=AUTH_MECHANISM
                        )
        cur = conn.cursor()
        # cur.execute("use snap")
        # cur.execute("show tables")
        #     INVALIDATE METADATA 表名
    
        frech_isql = "refresh `" + db_name + "`.`" + table_name + "`"
        cur.execute(frech_isql)
        cur.execute(isql)
        # 获取字段名
        des = cur.description
        cols = [item[0] for item in des]
        # 获取数据
        data_list=cur.fetchall()
        # DataFrame    
        data = pd.DataFrame(data_list, columns=cols)
        return data, cols 
    

    pandas 基础设置

    详见

    pd.set_option('display.max_rows',xxx) # 最大行数
    pd.set_option('display.min_rows',xxx) # 最小显示行数
    pd.set_option('display.max_columns',xxx) # 最大显示列数
    pd.set_option ('display.max_colwidth',xxx) #最大列字符数
    pd.set_option( 'display.precision',2) # 浮点型精度
    pd.set_option('display.float_format','{:,}'.format) #逗号分隔数字
    pd.set_option('display.float_format',  '{:,.2f}'.format) #设置浮点精度
    pd.set_option('display.float_format', '{:.2f}%'.format) #百分号格式化
    pd.set_option('plotting.backend', 'altair') # 更改后端绘图方式
    pd.set_option('display.max_info_columns', 200) # info输出最大列数
    pd.set_option('display.max_info_rows', 5) # info计数null时的阈值
    pd.describe_option() #展示所有设置和描述
    pd.reset_option('all') #重置所有设置选项
    

    csv 以日期存储

    import datetime 
    import os 
    # date_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S').split(' ')[0]
    save_time = datetime.datetime.now().strftime('%Y_%m_%d').split(' ')[0]
    # print(csv_name)
    file_path = './data/' + str(save_time) +'/'
    file_path 
    
    
    outpath = ''
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    

    绘图

    # 添加转入转出指标
    def spec_advertiser_id_cbti_time_series(id_from_df, all_dfs, min_data=0, max_data=2000000):
        top25_advertiser = list(id_from_df[:25].id)
        food_drinks_datas = all_dfs
        i = 0
        fig = plt.figure(figsize=(40,30))
        # fig = plt.figure(figsize=(16,14))
        print(top25_advertiser)
        required_datas = pd.DataFrame()
        for advertiser_id in top25_advertiser:
    #         food_drinks_datas
            spec_advertiser_datas = food_drinks_datas[food_drinks_datas.id==str(advertiser_id)].sort_values(
                by='etl_date')
        #     plt.subplot(5, 5, i+1) 
    #         print(spec_advertiser_datas['cost'].max(), spec_advertiser_datas['balance'].max())
    #         display(spec_advertiser_datas)
    
            
            ax = fig.add_subplot(5, 5, i+1)
            lns1 = ax.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['cost'], color='green', label='cost')
            plt.xticks(rotation=45)
            plt.yticks(rotation=45)
    
    
    
            ax2 = ax.twinx()
            lns2 = ax2.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['balance'], color='red', label='balance')
            plt.xticks(rotation=45)
            plt.yticks(rotation=45)
        #     plt.xlabel("time")
        #     plt.ylabel("cost")    
        #     print(11, help(ax.set_xticks))
        
    #         ax3 = ax.twinx()
            lns3 = ax2.plot(spec_advertiser_datas['etl_date'], 
                               spec_advertiser_datas['transfer_in'], 
    #                            marker='+',
                               color='orangered', 
                               label='transfer_in',
                           alpha=0.3)
    #         plt.xticks(rotation=45)
    #         plt.yticks(rotation=45)
            
    #         ax4 = ax.twinx()
            lns4 = ax2.plot(spec_advertiser_datas['etl_date'], 
                            spec_advertiser_datas['transfer_out'], 
    #                         marker='v',
                            color='blue', 
                            label='transfer_out',
                           alpha=0.3)
    #         plt.xticks(rotation=45)
    #         plt.yticks(rotation=45)        
    #         ax5 = ax.twinx()
    #         lns5 = ax5.plot(spec_advertiser_datas['etl_date'], 
    #                         spec_advertiser_datas['income'], 
    #                            marker='o',
    #                         color='y', 
    #                         label='income',
    #                        alpha=0.7)
            
            ax.set_xlabel("etl_date", 
        #                   rotation=45
                         )
            ax.set_ylabel(r"cost", 
        #                   rotation=45
                         )
    #         ax2.set_ylabel(r"balance",
    #     #                    rotation=45
    #                       )
    #         ax2.set_ylabel(r"balance",
    # #                         rotation=45
    #               )
    #         ax2.set_ylabel(r"balance",
    # #                         rotation=45
    #               )
    #         ax2.set_ylabel(r"balance",
    # #                         rotation=45
    #               )
        #     ax.set_xticks(rotation=45)
        #     ax.set_yticks(rotation=45)
        #     ax2.set_yticks(rotation=45)
            lns = lns1+lns2+lns3+lns4
            labs = [l.get_label() for l in lns]
            ax.legend(lns, labs, loc=2)
            
    #         ax.set_ylim(0,1000000)
    #         ax2.set_ylim(0, 5300000)
            ax.set_ylim(min_data, max_data)
            ax2.set_ylim(min_data, max_data)
    #         ax3.set_ylim(0, food_drinks_datas.balance.max())
    #         ax4.set_ylim(0, food_drinks_datas.balance.max())
    #         ax5.set_ylim(0, food_drinks_datas.balance.max())
        #     ax.legend(loc=0)
        #     ax2.legend(loc=0)
    
            ax.set_title('id' + str(i) + ": " + str(advertiser_id))
    
    #         plt.xticks(rotation=45)
    #         plt.yticks(rotation=45) 
        #     plt.title(str(i) + ":" + str(advertiser_id), loc='right')
        #     plt.legend(['cost', 'balance'])
            ax_ = plt.gca()
            x_major_locator=MultipleLocator(15)
            ax_.xaxis.set_major_locator(x_major_locator)
    
            i += 1 
            
            required_datas = required_datas.append(spec_advertiser_datas)
    
        plt.subplots_adjust(wspace=0.4, hspace=0.9)
        plt.show()
        return required_datas
    
  • 相关阅读:
    古代汉语:含义“死”的归类
    Inesa 路由器登录信息
    调试第三方源码
    暑假作业日总结
    暑假作业日总结
    暑假作业日总结
    暑假作业日总结
    暑假作业日总结
    暑假作业:《大道至简》读后感
    暑假作业日总结
  • 原文地址:https://www.cnblogs.com/geoffreygao/p/14976902.html
Copyright © 2011-2022 走看看