06_pandas_useful_code

    # data is a DataFrame type
    
    data.sample(nums)     # randomly sample nums rows from the DataFrame
    
    data.col.unique()        # return all distinct values taken by col
    
    # For any variable (continuous, discrete, or categorical), plot a histogram of the values in col
    fig = data.loan_amnt.hist(bins=50)   # loan_amnt is a col in data
    fig.set_title('loan Amount hist')
    fig.set_xlabel('loan Amount')
    fig.set_ylabel('Number of Loans')

    data.open_acc.dropna().unique() # drop the missing values of the column, then list its distinct values
    
    np.where(binary_array, 1, 0) # replace True entries of binary_array with 1 and everything else with 0
    
    data['defaulted'] = np.where(data.loan_status.isin(['Default']), 1, 0)
    
    data.loan_status.isin(['Default']) # check whether loan_status equals 'Default'; returns a boolean Series
    data['col'].value_counts() # count how often each value appears in col
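    A minimal, self-contained sketch of the isin / np.where flagging pattern above (the column values here are invented for illustration):

    import numpy as np
    import pandas as pd
    
    demo = pd.DataFrame({'loan_status': ['Fully Paid', 'Default', 'Current', 'Default']})
    # boolean Series from isin -> 0/1 flag via np.where
    demo['defaulted'] = np.where(demo.loan_status.isin(['Default']), 1, 0)
    print(demo.defaulted.tolist())   # [0, 1, 0, 1]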
    
    # for each category of home ownership
    
    fig = data['home_ownership'].value_counts().plot.bar()
    fig.set_title('Home Ownership')
    fig.set_ylabel('Number of customers')

    data.groupby([col])['y'].sum()  # sum of y within each category of col

    Missing Values

    data.isnull().sum() # number of nulls in each column
    
    data.isnull().mean() # fraction of nulls in each column
    
    data['cabin_null'] = np.where(data.Cabin.isnull(), 1, 0) # flag rows where Cabin is missing
    
    data.groupby(['Survived'])['cabin_null'].mean() # share of missing cabins per target class
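    The groupby above checks whether missingness in Cabin is related to the target: the mean of a 0/1 flag is a proportion. A toy sketch with invented data:

    demo = pd.DataFrame({
        'Survived': [0, 0, 1, 1],
        'Cabin': [None, 'C85', 'E46', None],
    })
    demo['cabin_null'] = np.where(demo.Cabin.isnull(), 1, 0)
    # proportion of missing cabins per target class
    print(demo.groupby(['Survived'])['cabin_null'].mean())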
    
    # length_dict (defined elsewhere) maps the raw emp_length strings to coarser groups
    data['emp_length_redefined'] = data.emp_length.map(length_dict)
    
    data.emp_length_redefined.unique()
    
    # among rows with missing emp_title, count each redefined group and normalise
    # (`value` is assumed to hold the relevant total, defined elsewhere)
    data[data.emp_title.isnull()].groupby(['emp_length_redefined'])['emp_length'].count().sort_values() / value
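    For illustration, a hypothetical length_dict could collapse the raw emp_length labels into coarser groups (the exact keys depend on the dataset):

    length_dict = {
        '< 1 year': '0-10 years',
        '1 year': '0-10 years',
        '5 years': '0-10 years',
        '10+ years': '10+ years',
        'n/a': 'n/a',
    }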

    Outliers

    import seaborn as sns
    # note: distplot is deprecated since seaborn 0.11; sns.histplot(data.Age, kde=True) is the modern equivalent
    sns.distplot(data.Age)

    # another way of visualising outliers is using boxplots and whiskers,
    # which provides the quantiles (box) and inter-quantile range (whiskers),
    # with the outliers sitting outside the error bars (whiskers).
    
    # All the dots in the plot below are outliers according to the quantiles + 1.5 IQR rule
    
    fig = data.boxplot(column='Fare')  # pass by='Survived' as well to get one box per class, matching the x-label below
    fig.set_title('')
    fig.set_xlabel('Survived')
    fig.set_ylabel('Fare')

    # let's look at the values of the quantiles so we can
    # calculate the upper and lower boundaries for the outliers
    
    # 25%, 50% and 75% in the output below indicate the
    # 25th quantile, median and 75th quantile respectively
    
    data.Fare.describe()
    # Let's calculate the upper and lower boundaries
    # to identify outliers according
    # to interquantile proximity rule
    
    IQR = data.Fare.quantile(0.75) - data.Fare.quantile(0.25)
    
    Lower_fence = data.Fare.quantile(0.25) - (IQR * 1.5)
    Upper_fence = data.Fare.quantile(0.75) + (IQR * 1.5)
    
    Upper_fence, Lower_fence, IQR
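    The next snippet uses a high_fare_df that is not defined in these notes; presumably it holds the rows above the upper fence, e.g.:

    # assumption: the outlier rows, i.e. fares beyond the upper fence
    high_fare_df = data[data.Fare > Upper_fence]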
    multiple_tickets = pd.concat(
        [
            high_fare_df.groupby('Ticket')['Fare'].count(),
            high_fare_df.groupby('Ticket')['Fare'].mean()
        ],
        axis=1)
    
    # first column: number of passengers sharing the ticket; second: their mean fare
    multiple_tickets.columns = ['n_passengers', 'mean_fare']
    multiple_tickets.head(10)

    Rare Values

    # Let's make a combined plot of the label frequency and
    # the time to pass testing.
    # This will help us visualise the relationship between the
    # target and the labels of X3
    
    fig, ax = plt.subplots(figsize=(8, 4))
    plt.xticks(temp_df.index, temp_df['X3'], rotation=0)
    
    ax2 = ax.twinx()
    ax.bar(temp_df.index, temp_df["X3_perc_cars"], color='lightgrey')
    ax2.plot(temp_df.index, temp_df["y"], color='green', label='Seconds')
    ax.set_ylabel('percentage of cars per category')
    ax2.set_ylabel('Seconds')

    # let's automate the above process for all the categorical variables
    
    # cols_to_use is assumed to be a list of categorical columns; total_cars, the number of cars (rows) in data
    for col in cols_to_use:
        # calculate the frequency of the different labels in the variable
        temp_df = pd.Series(data[col].value_counts() / total_cars).reset_index()
    
        # rename the columns
        temp_df.columns = [col, col + '_perc_cars']
    
        # merge onto the mean time to pass the test
        temp_df = temp_df.merge(
            data.groupby([col])['y'].mean().reset_index(), on=col, how='left')
    
        # plot the figure as shown above
        fig, ax = plt.subplots(figsize=(8, 4))
        plt.xticks(temp_df.index, temp_df[col], rotation=0)
        ax2 = ax.twinx()
    
        ax.bar(
            temp_df.index,
            temp_df[col + '_perc_cars'],
            color='lightgrey',
            label=col)
    
        ax2.plot(
            temp_df.index,
            temp_df["y"],
            color='green',
        )
    
        ax.set_ylabel('percentage of cars per category')
        ax2.set_ylabel('Seconds')
        ax.legend()
        plt.show()
    # let's automate the replacement of infrequent categories
    # by the label 'rare' in the remaining categorical variables
    
    # I start from 1 because I already replaced the first variable in
    # the list
    for col in cols_to_use[1:]:
        
        # calculate the % of cars in each category
        temp_df = pd.Series(data[col].value_counts() / total_cars)
    
        # create a dictionary to replace the rare labels with the
        # string 'rare'
        grouping_dict = {
            k: ('rare' if k not in temp_df[temp_df >= 0.1].index else k)
            for k in temp_df.index
        }
        
        # replace the rare labels
        data[col + '_grouped'] = data[col].map(grouping_dict)
    
    data.head()
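    A self-contained toy run of the rare-grouping loop above (made-up labels, same 10% frequency cutoff):

    s = pd.Series(['a'] * 16 + ['b'] * 3 + ['c'])   # 'c' appears in only 5% of rows
    freq = s.value_counts() / len(s)
    grouping_dict = {k: (k if freq[k] >= 0.1 else 'rare') for k in freq.index}
    print(s.map(grouping_dict).value_counts())      # 'c' now shows up as 'rare'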
    # In order to use these variables to build machine learning models with
    # sklearn, we first need to replace the labels by numbers.
    
    # The correct way to do this is to first separate the data into training
    # and test sets, then create a replacement dictionary using the train set,
    # and replace the strings in both train and test with that dictionary.
    
    # This will introduce missing values / NaN in the test set for those
    # labels that are not present in the train set;
    # we saw this effect in the previous lecture.
    
    # In the section dedicated to rare values later in the course, I will
    # show you how to avoid this problem.
    
    # For now, to speed up the demonstration, I will replace the labels
    # by numbers in the entire dataset, and then divide into train and test.
    # But remember: THIS IS NOT GOOD PRACTICE!
    
    # original variables
    for col in cols_to_use:
        # create the dict and replace the strings in one line
        data.loc[:, col] = data.loc[:, col].map(
            {k: i
             for i, k in enumerate(data[col].unique(), 0)})
    
    # variables with grouped categories
    for col in ['X1_grouped', 'X6_grouped', 'X3_grouped', 'X2_grouped']:
        # create the dict and replace the strings in one line
        data.loc[:, col] = data.loc[:, col].map(
            {k: i
             for i, k in enumerate(data[col].unique(), 0)})
    # let's capture the first letter
    data['Cabin_reduced'] = data['Cabin'].astype(str).str[0]
    # Now I will replace the letters in the reduced cabin variable
    
    # create the replacement dictionary from the train set only, so the test set stays unseen
    cabin_dict = {k: i for i, k in enumerate(X_train['Cabin_reduced'].unique(), 0)}
    
    # replace labels by numbers with dictionary
    X_train.loc[:, 'Cabin_reduced'] = X_train.loc[:, 'Cabin_reduced'].map(cabin_dict)
    X_test.loc[:, 'Cabin_reduced'] = X_test.loc[:, 'Cabin_reduced'].map(cabin_dict)
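    Because cabin_dict was built from the train set only, any cabin letter that appears only in the test set maps to NaN. A quick sanity check (sketch):

    # rows in the test set whose cabin letter was never seen during training
    X_test['Cabin_reduced'].isnull().sum()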