zoukankan      html  css  js  c++  java
  • Pandas——比较两个dataframe之间的区别

    import pandas as pd
    import datacompy
    # datacompy reports the differences between two DataFrames.
    # Text files can be compared the same way once they are loaded as DataFrames.
    left_df = pd.read_csv("1.txt", header=None)
    right_df = pd.read_csv("22.txt", header=None)

    # With header=None the columns are labelled 0, 1, ...; rows are joined on
    # column 0.
    comparison = datacompy.Compare(left_df, right_df, join_columns=0)
    report_text = comparison.report()
    print(report_text)

    结果

    DataComPy Comparison
    --------------------

    DataFrame Summary
    -----------------

    DataFrame Columns Rows
    0 df1 1 2
    1 df2 1 6

    Column Summary
    --------------

    Number of columns in common: 1
    Number of columns in df1 but not in df2: 0
    Number of columns in df2 but not in df1: 0

    Row Summary
    -----------

    Matched on: 0
    Any duplicates on match values: No
    Absolute Tolerance: 0
    Relative Tolerance: 0
    Number of rows in common: 2
    Number of rows in df1 but not in df2: 0
    Number of rows in df2 but not in df1: 4

    Number of rows with some compared columns unequal: 0
    Number of rows with all compared columns equal: 2

    Column Comparison
    -----------------

    Number of columns compared with some values unequal: 0
    Number of columns compared with all values equal: 1
    Total number of values which compare unequal: 0

    Sample Rows Only in df2 (First 10 Columns)
    ------------------------------------------

    0
    3 vasdj
    4 顺嘿嘿
    5 顺顺
    2 afdlkaewlhg

    import pandas as pd
    import datacompy 
    import glob
    import os
    
    # Compare every .txt file in the first folder with the same-named file in
    # the second folder, print both frames and the datacompy report, and save
    # the report to "<file name>_result.txt" in the current working directory.
    #
    # NOTE(review): the directory separators were missing from the original
    # patterns (they read "C:Users15773Desktop<TAB>est1*.txt"), so they could
    # not match anything. The paths below are the presumed intent -- confirm
    # them against the real folder layout.
    all_files1 = glob.glob(r"C:\Users\15773\Desktop\test1\*.txt")
    all_files2 = glob.glob(r"C:\Users\15773\Desktop\test2\*.txt")

    # Index the second folder by file name so each lookup is a dict access
    # instead of a rescan of the whole list for every file in the first folder.
    files2_by_name = {os.path.basename(path): path for path in all_files2}

    for file1 in all_files1:
        file1_basename = os.path.basename(file1)
        file2 = files2_by_name.get(file1_basename)
        if file2 is None:
            # No counterpart with the same name; nothing to compare.
            continue

        df1 = pd.read_csv(file1, header=None)
        df2 = pd.read_csv(file2, header=None)
        # header=None labels the columns 0, 1, ...; rows are joined on column 0.
        dd = datacompy.Compare(df1, df2, join_columns=0)
        report = dd.report()
        print(df1)
        print(df2)
        print(report)

        txt_name = f"{file1_basename}_result.txt"
        # The context manager closes the file even if the write fails; UTF-8
        # keeps non-ASCII cell values in the report writable on any locale.
        with open(txt_name, "w", encoding="utf-8") as result_txt:
            result_txt.write(report)
    print("process done")
    from io import StringIO
    import pandas as pd
    import datacompy
    
    # First sample: five accounts with a date_fld column.
    data1 = """acct_id,dollar_amt,name,float_fld,date_fld
    10000001234,123.45,George Maharis,14530.1555,2017-01-01
    10000001235,0.45,Michael Bluth,1,2017-01-01
    10000001236,1345,George Bluth,,2017-01-01
    10000001237,123456,Bob Loblaw,345.12,2017-01-01
    10000001239,1.05,Lucille Bluth,,2017-01-01
    """

    # Second sample: same layout without date_fld, with some edited values.
    data2 = """acct_id,dollar_amt,name,float_fld
    10000001234,123.4,George Michael Bluth,14530.155
    10000001235,0.45,Michael Bluth,
    10000001236,1345,George Bluth,1
    10000001237,123456,Robert Loblaw,345.12
    10000001238,1.05,Loose Seal Bluth,111
    """

    original_df = pd.read_csv(StringIO(data1))
    new_df = pd.read_csv(StringIO(data2))

    compare_options = {
        "join_columns": "acct_id",  # a list of column names is also accepted
        "abs_tol": 0,  # optional, defaults to 0
        "rel_tol": 0,  # optional, defaults to 0
        "df1_name": "Original",  # optional, defaults to 'df1'
        "df2_name": "New",  # optional, defaults to 'df2'
    }
    comparison = datacompy.Compare(original_df, new_df, **compare_options)

    # Strict equality check that also counts extra columns as a mismatch.
    # The return value is not used here; for this data it is False.
    comparison.matches(ignore_extra_columns=False)

    # report() builds a human-readable summary with samples of the differences.
    summary = comparison.report()
    print(summary)

    DataComPy Comparison
    --------------------

    DataFrame Summary
    -----------------

    DataFrame Columns Rows
    0 Original 5 5
    1 New 4 5

    Column Summary
    --------------

    Number of columns in common: 4
    Number of columns in Original but not in New: 1
    Number of columns in New but not in Original: 0

    Row Summary
    -----------

    Matched on: acct_id
    Any duplicates on match values: No
    Absolute Tolerance: 0
    Relative Tolerance: 0
    Number of rows in common: 4
    Number of rows in Original but not in New: 1
    Number of rows in New but not in Original: 1

    Number of rows with some compared columns unequal: 4
    Number of rows with all compared columns equal: 0

    Column Comparison
    -----------------

    Number of columns compared with some values unequal: 3
    Number of columns compared with all values equal: 1
    Total number of values which compare unequal: 6

    Columns with Unequal Values or Types
    ------------------------------------

    Column Original dtype New dtype # Unequal Max Diff # Null Diff
    2 dollar_amt float64 float64 1 0.0500 0
    0 float_fld float64 float64 3 0.0005 2
    1 name object object 2 0.0000 0

    Sample Rows with Unequal Values
    -------------------------------

    acct_id float_fld (Original) float_fld (New)
    2 10000001236 NaN 1.000
    0 10000001234 14530.1555 14530.155
    1 10000001235 1.0000 NaN

    acct_id name (Original) name (New)
    3 10000001237 Bob Loblaw Robert Loblaw
    0 10000001234 George Maharis George Michael Bluth

    acct_id dollar_amt (Original) dollar_amt (New)
    0 10000001234 123.45 123.4

    Sample Rows Only in Original (First 10 Columns)
    -----------------------------------------------

    acct_id dollar_amt name float_fld date_fld
    4 10000001239 1.05 Lucille Bluth NaN 2017-01-01

    Sample Rows Only in New (First 10 Columns)
    ------------------------------------------

    acct_id dollar_amt name float_fld
    5 10000001238 1.05 Loose Seal Bluth 111.0

  • 相关阅读:
    让资源管理器不显示最近常用文件夹
    票房实际是屌丝血
    为什么读了很多书,还是过不好这一生?
    抱怨就像呕吐
    finally关键字小复习
    Java中菜单组件
    Java的GUI窗体出现乱码解决方法
    Java中GUI的默认窗体布局 和 常见的窗体布局方案
    适配器类(便利类)的由来:当你自己写的类中想用某个接口中个别方法的时候(注意:不是所有的方法),肿么办?
    技术管理者工作成效评估表
  • 原文地址:https://www.cnblogs.com/shunguo/p/14567902.html
Copyright © 2011-2022 走看看