zoukankan      html  css  js  c++  java
  • 数据预处理--数据选择

    筛选空值

    # pandas represents a missing value as NaN ("not a number").
    # Series.isnull() yields a boolean Series: True wherever the value is missing.
    age = titanic_survival["Age"]
    age_is_null = age.isnull()

    # Boolean-mask indexing keeps only the entries flagged True, i.e. the NaN ones.
    age_null_true = age.loc[age_is_null]

    # Number of missing "Age" values.
    age_null_count = age_null_true.shape[0]
    print(age_null_count)

    求均值

    # Method 1: filter out the missing values by hand before averaging.
    # age_is_null is the boolean mask of missing "Age" values; ~ inverts it.
    good_ages = titanic_survival["Age"][~age_is_null]
    correct_mean_age = sum(good_ages) / len(good_ages)
    print(correct_mean_age)

    # Method 2: missing data is so common that Series.mean() skips NaN by
    # default, so no manual filtering is needed.
    correct_mean_age = titanic_survival["Age"].mean()
    print(correct_mean_age)

    # Wrong approach, kept as a counter-example: summing the unfiltered column
    # gives nan, because any arithmetic involving a null value is itself null.
    mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
    print(mean_age)

    数据透视表

    # One row per "Embarked" value, with "Fare" and "Survived" summed per group.
    # The string "sum" selects pandas' own sum aggregation; passing np.sum is
    # deprecated in recent pandas releases and emits a FutureWarning.
    port_stats = titanic_survival.pivot_table(
        index="Embarked",
        values=["Fare", "Survived"],
        aggfunc="sum",
    )

    排序

    # Sort the rows by "Age", largest first; the result is bound to a new name
    # and titanic_survival itself is not reassigned.
    new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)

    列名处理

    # Collect every column whose name ends in "(g)" and show the first
    # three rows of that sub-frame.
    col_names = food_info.columns.tolist()
    gram_columns = [name for name in col_names if name.endswith("(g)")]

    gram_df = food_info[gram_columns]
    print(gram_df.head(3))

    按列类型过滤

    # Categorical features: the columns stored with the generic "object" dtype.
    cat_features = list(train.select_dtypes(include=['object']).columns)
    # print() with a single pre-formatted string works on both Python 2 and 3.
    print("Categorical: {} features".format(len(cat_features)))

    # Continuous features: float64/int64 columns, excluding 'loss' and 'id'.
    cont_features = [
        cont
        for cont in train.select_dtypes(include=['float64', 'int64']).columns
        if cont not in ('loss', 'id')
    ]
    print("Continuous: {} features".format(len(cont_features)))

    查看类型变量类别个数

    # Number of distinct values in each categorical column. Series.unique()
    # reports NaN as a value of its own, so missing data counts as one category.
    cat_uniques = [len(train[cat].unique()) for cat in cat_features]

    # DataFrame.from_items was removed in pandas 1.0. The plain constructor
    # builds the same two-column frame, and columns= pins the column order
    # regardless of dict ordering.
    uniq_values_in_categories = pd.DataFrame(
        {'cat_name': cat_features, 'unique_values': cat_uniques},
        columns=['cat_name', 'unique_values'],
    )

    类型转换

    # Cast the 'id' column to 64-bit integers.
    data['id'] = data['id'].astype('int64')

    数据筛选

    # Anti-join: keep the df1 rows whose cust_no does not occur in df2.
    df2_cust_nos = df2['cust_no'].tolist()
    df1 = df1.loc[~df1['cust_no'].isin(df2_cust_nos)]
    
    # Convert the 'date' column to datetime objects.
    # Assumes values look like 20180131 (YYYYMMDD) -- TODO confirm against the data.
    # strptime directives are case-sensitive: %m is the month and %d the day of
    # month, whereas %M means minutes and %D is not a valid directive at all
    # (it raises ValueError).
    data['date'] = data['date'].astype("str").apply(
        lambda x: datetime.strptime(x, '%Y%m%d')
    )

    多个dataframe合并处理

    # reduce is not a builtin on Python 3; importing it from functools works on
    # both Python 2.6+ and Python 3.
    from functools import reduce

    # Load the three CSV files that share the "cust_no" key.
    input1, input2, input3 = (
        pd.read_csv(path + folder + file_name, encoding="utf-8")
        for file_name in ("01.csv", "02.csv", "03.csv")
    )
    inputs = [input1, input2, input3]

    # Fold the frames left to right with inner joins on "cust_no", so only
    # customers present in every file remain.
    df_all = reduce(
        lambda left, right: pd.merge(left, right, on="cust_no", how="inner"),
        inputs,
    )

    多个列合并

    # One-hot encode two categorical columns and append the indicator columns
    # to data. drop_first=True omits the first credit_type level; tran_branch
    # keeps an indicator for every level.
    credit_type = pd.get_dummies(data["credit_type"], prefix="credit_type", drop_first=True)
    tran_branch = pd.get_dummies(data["tran_branch"], prefix="branch")
    data = pd.concat([data, credit_type, tran_branch], axis="columns")
  • 相关阅读:
    Altium Designer如何导出SMT贴片机用的坐标文件
    STM8S003设计注意事项
    Keil4打开KEIL5未响应卡死的问题
    STM32 adc 多通道采集相互串扰问题解决
    STM32 RS485 和串口 只能接收不能发送问题解决
    AD中元器件报警的处理——器件高度报警
    QT乱码解决办法《转》
    STM32下载失败,st-link v2 在线下载sw模式检测不到
    docker部署普罗米修斯监控
    进程管理常用命令
  • 原文地址:https://www.cnblogs.com/itbuyixiaogong/p/9850541.html
Copyright © 2011-2022 走看看