zoukankan      html  css  js  c++  java
  • Pandas_VBA_数据清洗对比

    VBA处理数据与Python Pandas处理数据案例比较

    Author : Collin_PXY

    需求:

    现有一个 csv文件,包含'CNUM'和'COMPANY'两列,数据里包含空行,且有内容重复的行数据。
    要求:
    1)去掉空行;
    2)重复行数据只保留一行有效数据;
    3)修改'COMPANY'列的名称为'Company_New‘;
    4)并在其后增加六列,分别为'C_col','D_col','E_col','F_col','G_col','H_col'。

    在这里插入图片描述

    一,使用 Python Pandas来处理:

    import pandas as pd
    import numpy as np
    from pandas import DataFrame,Series
    
    def deal_with_data(filepath,newpath):
        file_obj=open(filepath)
        df=pd.read_csv(file_obj)    # 读取csv文件,创建 DataFrame
        df=df.reindex(columns=['CNUM','COMPANY','C_col','D_col','E_col','F_col','G_col','H_col'],fill_value=None)   # 重新指定列索引
        df.rename(columns={'COMPANY':'Company_New'}, inplace = True)  # 修改列名
        df=df.dropna(axis=0,how='all')                 # 去除 NAN 即文件中的空行
        df['CNUM'] = df['CNUM'].astype('int32')        # 将 CNUM 列的数据类型指定为 int32
        df = df.drop_duplicates(subset=['CNUM', 'Company_New'], keep='first')  # 去除重复行
        df.to_csv(newpath,index=False,encoding='GBK')
        file_obj.close()
       
    if __name__=='__main__':
        file_path=r'C:Users12078DesktoppythonCNUM_COMPANY.csv'
        file_save_path=r'C:Users12078DesktoppythonCNUM_COMPANY_OUTPUT.csv'
        deal_with_data(file_path,file_save_path)
    
    
    

    二,使用 VBA来处理:

    Option Base 1
    Option Explicit
    
    
    Sub main()
    	On Error GoTo error_handling
    	Dim wb                  As Workbook
    	Dim wb_out              As Workbook
    	Dim sht                 As Worksheet
    	Dim sht_out             As Worksheet
    	Dim rng                 As Range
    	Dim usedrows            As Byte
    	Dim usedrows_out        As Byte
    	Dim dict_cnum_company   As Object
    	Dim str_file_path       As String
            Dim str_new_file_path   As String
            
            'assign values to variables:
           str_file_path = "C:Users12078DesktopPythonCNUM_COMPANY.csv"
           str_new_file_path = "C:Users12078DesktopPythonCNUM_COMPANY_OUTPUT.csv"
    	
    	Set wb = checkAndAttachWorkbook(str_file_path)
    	Set sht = wb.Worksheets("CNUM_COMPANY")
    	Set wb_out = Workbooks.Add
    	wb_out.SaveAs str_new_file_path, xlCSV  'create a csv file
    	Set sht_out = wb_out.Worksheets("CNUM_COMPANY_OUTPUT")
    
    	Set dict_cnum_company = CreateObject("Scripting.Dictionary")
    	usedrows = WorksheetFunction.Max(getLastValidRow(sht, "A"), getLastValidRow(sht, "B"))
    
    	'rename the header 'COMPANY' to 'Company_New',remove blank & duplicate lines/rows.
    	Dim cnum_company As String
    	cnum_company = ""
    	For Each rng In sht.Range("A1", "A" & usedrows)
    	    If VBA.Trim(rng.Offset(0, 1).Value) = "COMPANY" Then
    	        rng.Offset(0, 1).Value = "Company_New"
    	    End If
    	    cnum_company = rng.Value & "-" & rng.Offset(0, 1).Value
    	    If VBA.Trim(cnum_company) <> "-" And Not dict_cnum_company.Exists(rng.Value & "-" & rng.Offset(0, 1).Value) Then
    	        dict_cnum_company.Add rng.Value & "-" & rng.Offset(0, 1).Value, ""
    	    End If
    	Next rng
    	
    	'loop the keys of dict split the keyes by '-' into cnum array and company array.
    	Dim index_dict As Byte
    	Dim arr_cnum()
    	Dim arr_Company()
    	For index_dict = 0 To UBound(dict_cnum_company.keys)
    	    ReDim Preserve arr_cnum(1 To UBound(dict_cnum_company.keys) + 1)
    	    ReDim Preserve arr_Company(1 To UBound(dict_cnum_company.keys) + 1)
    	    arr_cnum(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(0)
    	    arr_Company(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(1)
    	    Debug.Print index_dict
    	Next
    
    	'assigns the value of the arrays to the celles.
    	sht_out.Range("A1", "A" & UBound(arr_cnum)) = Application.WorksheetFunction.Transpose(arr_cnum)
    	sht_out.Range("B1", "B" & UBound(arr_Company)) = Application.WorksheetFunction.Transpose(arr_Company)
    
    	'add 6 columns to output csv file:
    	Dim arr_columns() As Variant
    	arr_columns = Array("C_col", "D_col", "E_col", "F_col", "G_col", "H_col")   '
    	sht_out.Range("C1:H1") = arr_columns
    	Call checkAndCloseWorkbook(str_file_path, False)
    	Call checkAndCloseWorkbook(str_new_file_path, True)
    
    Exit Sub
    error_handling:
       Call checkAndCloseWorkbook(str_file_path, False)
       Call checkAndCloseWorkbook(str_new_file_path, False)
    End Sub
    
    ' 辅助函数:
    'Get last row of Column N in a Worksheet
    Function getLastValidRow(in_ws As Worksheet, in_col As String)
        getLastValidRow = in_ws.Cells(in_ws.Rows.count, in_col).End(xlUp).Row
    End Function
    
    Function checkAndAttachWorkbook(in_wb_path As String) As Workbook
        Dim wb As Workbook
        Dim mywb As String
        mywb = in_wb_path
       
       For Each wb In Workbooks
            If LCase(wb.FullName) = LCase(mywb) Then
                Set checkAndAttachWorkbook = wb
                Exit Function
            End If
        Next
       
       Set wb = Workbooks.Open(in_wb_path, UpdateLinks:=0)
       Set checkAndAttachWorkbook = wb
    
    End Function
     
    Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean)
        Dim wb As Workbook
        Dim mywb As String
        mywb = in_wb_path
        For Each wb In Workbooks
            If LCase(wb.FullName) = LCase(mywb) Then
                wb.Close savechanges:=in_saved
                Exit Function
            End If
        Next
    End Function
    
    
    

    三,输出结果:

    两种方法输出结果相同:

    在这里插入图片描述

    四,比较总结:

    Python pandas 内置了大量处理数据的方法,我们不需要重复造轮子,用起来很方便,代码简洁的多。
    Excel VBA 处理这个需求,使用了 数组,字典等数据结构(实际需求中,数据量往往很大,所以一些地方没有直接使用遍历单元格的方法),以及处理字符串,数组和字典的很多方法,对文件的操作也很复杂,一旦出错,调试起来比python也较困难,代码已经尽量优化,但还是远比 Python要多。

  • 相关阅读:
    luogu P1353 [USACO08JAN]跑步Running
    bzoj 2002: [Hnoi2010]Bounce 弹飞绵羊
    [USACO3.2]Sweet Butter
    [SDOI2009]Elaxia的路线
    [USACO5.4]Telecowmunication
    [洛谷1681]最大正方形II
    [清华集训2014]奇数国
    [洛谷2814]家谱
    [洛谷1868]饥饿的奶牛
    [HNOI2003]激光炸弹
  • 原文地址:https://www.cnblogs.com/Collin-pxy/p/13038448.html
Copyright © 2011-2022 走看看