zoukankan      html  css  js  c++  java
  • Python数据分析与机器学习-Pandas_3

    import pandas as pd
    import numpy as np
    titanic_survival = pd.read_csv("titanic_train.csv")
    titanic_survival.head()
    print(titanic_survival.shape)
    
    (891, 12)
    
    # The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
    # We can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
    age = titanic_survival["Age"]
    print(age.loc[0:10])
    age_is_null = pd.isnull(age)
    print(age_is_null)
    age_null_true = age[age_is_null]
    print(age_null_true)
    print(len(age_null_true))
    
    0     22.0
    1     38.0
    2     26.0
    3     35.0
    4     35.0
    5      NaN
    6     54.0
    7      2.0
    8     27.0
    9     14.0
    10     4.0
    Name: Age, dtype: float64
    0      False
    1      False
    2      False
    3      False
    4      False
    5       True
    6      False
    7      False
    8      False
    9      False
    10     False
    11     False
    12     False
    13     False
    14     False
    15     False
    16     False
    17      True
    18     False
    19      True
    20     False
    21     False
    22     False
    23     False
    24     False
    25     False
    26      True
    27     False
    28      True
    29      True
           ...  
    861    False
    862    False
    863     True
    864    False
    865    False
    866    False
    867    False
    868     True
    869    False
    870    False
    871    False
    872    False
    873    False
    874    False
    875    False
    876    False
    877    False
    878     True
    879    False
    880    False
    881    False
    882    False
    883    False
    884    False
    885    False
    886    False
    887    False
    888     True
    889    False
    890    False
    Name: Age, Length: 891, dtype: bool
    5     NaN
    17    NaN
    19    NaN
    26    NaN
    28    NaN
    29    NaN
    31    NaN
    32    NaN
    36    NaN
    42    NaN
    45    NaN
    46    NaN
    47    NaN
    48    NaN
    55    NaN
    64    NaN
    65    NaN
    76    NaN
    77    NaN
    82    NaN
    87    NaN
    95    NaN
    101   NaN
    107   NaN
    109   NaN
    121   NaN
    126   NaN
    128   NaN
    140   NaN
    154   NaN
           ..
    718   NaN
    727   NaN
    732   NaN
    738   NaN
    739   NaN
    740   NaN
    760   NaN
    766   NaN
    768   NaN
    773   NaN
    776   NaN
    778   NaN
    783   NaN
    790   NaN
    792   NaN
    793   NaN
    815   NaN
    825   NaN
    826   NaN
    828   NaN
    832   NaN
    837   NaN
    839   NaN
    846   NaN
    849   NaN
    859   NaN
    863   NaN
    868   NaN
    878   NaN
    888   NaN
    Name: Age, Length: 177, dtype: float64
    177
    
    # The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
    mean_age = sum(titanic_survival["Age"])/len(titanic_survival["Age"])
    print(mean_age)
    
    nan
    
    # We have to filter out the missing values before we calculate the mean.
    good_ages = titanic_survival["Age"][age_is_null==False]
    correct_mean_age = sum(good_ages)/len(good_ages)
    print(correct_mean_age)
    
    29.69911764705882
    
    # Missing data is so common that many pandas methods automatically filter for it
    correct_mean_age = titanic_survival["Age"].mean()
    print(correct_mean_age)
    
    29.69911764705882
    
    # Mean fare for each class
    passenger_classes = [1,2,3]
    fares_by_class = {}
    for this_class in passenger_classes:
        pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
        pclass_fares = pclass_rows["Fare"]
        fare_for_class = pclass_fares.mean()
        fares_by_class[this_class] = fare_for_class
    print(fares_by_class)
    
    {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
    
    # index tells the method which column to group by values is the column that we want to apply 
    # the calculation to aggrunc specifies the calculation we want to perform
    passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived", aggfunc=np.mean)
    print(passenger_survival)
    
            Survived
    Pclass          
    1       0.629630
    2       0.472826
    3       0.242363
    
    passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
    print(passenger_age)
    
                  Age
    Pclass           
    1       38.233441
    2       29.877630
    3       25.140620
    
    port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
    print(port_stats)
    
                    Fare  Survived
    Embarked                      
    C         10072.2962        93
    Q          1022.2543        30
    S         17439.3988       217
    
    #specifying axis=1 or axis='columns' will drop any columns that have null values
    print(titanic_survival.shape)
    drop_na_columns = titanic_survival.dropna(axis=1)
    print(drop_na_columns.shape)
    new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
    print(new_titanic_survival)
    
    (891, 12)
    (891, 9)
         PassengerId  Survived  Pclass  
    0              1         0       3   
    1              2         1       1   
    2              3         1       3   
    3              4         1       1   
    4              5         0       3   
    6              7         0       1   
    7              8         0       3   
    8              9         1       3   
    9             10         1       2   
    10            11         1       3   
    11            12         1       1   
    12            13         0       3   
    13            14         0       3   
    14            15         0       3   
    15            16         1       2   
    16            17         0       3   
    18            19         0       3   
    20            21         0       2   
    21            22         1       2   
    22            23         1       3   
    23            24         1       1   
    24            25         0       3   
    25            26         1       3   
    27            28         0       1   
    30            31         0       1   
    33            34         0       2   
    34            35         0       1   
    35            36         0       1   
    37            38         0       3   
    38            39         0       3   
    ..           ...       ...     ...   
    856          857         1       1   
    857          858         1       1   
    858          859         1       3   
    860          861         0       3   
    861          862         0       2   
    862          863         1       1   
    864          865         0       2   
    865          866         1       2   
    866          867         1       2   
    867          868         0       1   
    869          870         1       3   
    870          871         0       3   
    871          872         1       1   
    872          873         0       1   
    873          874         0       3   
    874          875         1       2   
    875          876         1       3   
    876          877         0       3   
    877          878         0       3   
    879          880         1       1   
    880          881         1       2   
    881          882         0       3   
    882          883         0       3   
    883          884         0       2   
    884          885         0       3   
    885          886         0       3   
    886          887         0       2   
    887          888         1       1   
    889          890         1       1   
    890          891         0       3   
    
                                                      Name     Sex   Age  SibSp  
    0                              Braund, Mr. Owen Harris    male  22.0      1   
    1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
    2                               Heikkinen, Miss. Laina  female  26.0      0   
    3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
    4                             Allen, Mr. William Henry    male  35.0      0   
    6                              McCarthy, Mr. Timothy J    male  54.0      0   
    7                       Palsson, Master. Gosta Leonard    male   2.0      3   
    8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
    9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
    10                     Sandstrom, Miss. Marguerite Rut  female   4.0      1   
    11                            Bonnell, Miss. Elizabeth  female  58.0      0   
    12                      Saundercock, Mr. William Henry    male  20.0      0   
    13                         Andersson, Mr. Anders Johan    male  39.0      1   
    14                Vestrom, Miss. Hulda Amanda Adolfina  female  14.0      0   
    15                    Hewlett, Mrs. (Mary D Kingcome)   female  55.0      0   
    16                                Rice, Master. Eugene    male   2.0      4   
    18   Vander Planke, Mrs. Julius (Emelia Maria Vande...  female  31.0      1   
    20                                Fynney, Mr. Joseph J    male  35.0      0   
    21                               Beesley, Mr. Lawrence    male  34.0      0   
    22                         McGowan, Miss. Anna "Annie"  female  15.0      0   
    23                        Sloper, Mr. William Thompson    male  28.0      0   
    24                       Palsson, Miss. Torborg Danira  female   8.0      3   
    25   Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...  female  38.0      1   
    27                      Fortune, Mr. Charles Alexander    male  19.0      3   
    30                            Uruchurtu, Don. Manuel E    male  40.0      0   
    33                               Wheadon, Mr. Edward H    male  66.0      0   
    34                             Meyer, Mr. Edgar Joseph    male  28.0      1   
    35                      Holverson, Mr. Alexander Oskar    male  42.0      1   
    37                            Cann, Mr. Ernest Charles    male  21.0      0   
    38                  Vander Planke, Miss. Augusta Maria  female  18.0      2   
    ..                                                 ...     ...   ...    ...   
    856         Wick, Mrs. George Dennick (Mary Hitchcock)  female  45.0      1   
    857                             Daly, Mr. Peter Denis     male  51.0      0   
    858              Baclini, Mrs. Solomon (Latifa Qurban)  female  24.0      0   
    860                            Hansen, Mr. Claus Peter    male  41.0      2   
    861                        Giles, Mr. Frederick Edward    male  21.0      1   
    862  Swift, Mrs. Frederick Joel (Margaret Welles Ba...  female  48.0      0   
    864                             Gill, Mr. John William    male  24.0      0   
    865                           Bystrom, Mrs. (Karolina)  female  42.0      0   
    866                       Duran y More, Miss. Asuncion  female  27.0      1   
    867               Roebling, Mr. Washington Augustus II    male  31.0      0   
    869                    Johnson, Master. Harold Theodor    male   4.0      1   
    870                                  Balkic, Mr. Cerin    male  26.0      0   
    871   Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female  47.0      1   
    872                           Carlsson, Mr. Frans Olof    male  33.0      0   
    873                        Vander Cruyssen, Mr. Victor    male  47.0      0   
    874              Abelson, Mrs. Samuel (Hannah Wizosky)  female  28.0      1   
    875                   Najib, Miss. Adele Kiamie "Jane"  female  15.0      0   
    876                      Gustafsson, Mr. Alfred Ossian    male  20.0      0   
    877                               Petroff, Mr. Nedelio    male  19.0      0   
    879      Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)  female  56.0      0   
    880       Shelley, Mrs. William (Imanita Parrish Hall)  female  25.0      0   
    881                                 Markun, Mr. Johann    male  33.0      0   
    882                       Dahlberg, Miss. Gerda Ulrika  female  22.0      0   
    883                      Banfield, Mr. Frederick James    male  28.0      0   
    884                             Sutehall, Mr. Henry Jr    male  25.0      0   
    885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
    886                              Montvila, Rev. Juozas    male  27.0      0   
    887                       Graham, Miss. Margaret Edith  female  19.0      0   
    889                              Behr, Mr. Karl Howell    male  26.0      0   
    890                                Dooley, Mr. Patrick    male  32.0      0   
    
         Parch            Ticket      Fare        Cabin Embarked  
    0        0         A/5 21171    7.2500          NaN        S  
    1        0          PC 17599   71.2833          C85        C  
    2        0  STON/O2. 3101282    7.9250          NaN        S  
    3        0            113803   53.1000         C123        S  
    4        0            373450    8.0500          NaN        S  
    6        0             17463   51.8625          E46        S  
    7        1            349909   21.0750          NaN        S  
    8        2            347742   11.1333          NaN        S  
    9        0            237736   30.0708          NaN        C  
    10       1           PP 9549   16.7000           G6        S  
    11       0            113783   26.5500         C103        S  
    12       0         A/5. 2151    8.0500          NaN        S  
    13       5            347082   31.2750          NaN        S  
    14       0            350406    7.8542          NaN        S  
    15       0            248706   16.0000          NaN        S  
    16       1            382652   29.1250          NaN        Q  
    18       0            345763   18.0000          NaN        S  
    20       0            239865   26.0000          NaN        S  
    21       0            248698   13.0000          D56        S  
    22       0            330923    8.0292          NaN        Q  
    23       0            113788   35.5000           A6        S  
    24       1            349909   21.0750          NaN        S  
    25       5            347077   31.3875          NaN        S  
    27       2             19950  263.0000  C23 C25 C27        S  
    30       0          PC 17601   27.7208          NaN        C  
    33       0        C.A. 24579   10.5000          NaN        S  
    34       0          PC 17604   82.1708          NaN        C  
    35       0            113789   52.0000          NaN        S  
    37       0        A./5. 2152    8.0500          NaN        S  
    38       0            345764   18.0000          NaN        S  
    ..     ...               ...       ...          ...      ...  
    856      1             36928  164.8667          NaN        S  
    857      0            113055   26.5500          E17        S  
    858      3              2666   19.2583          NaN        C  
    860      0            350026   14.1083          NaN        S  
    861      0             28134   11.5000          NaN        S  
    862      0             17466   25.9292          D17        S  
    864      0            233866   13.0000          NaN        S  
    865      0            236852   13.0000          NaN        S  
    866      0     SC/PARIS 2149   13.8583          NaN        C  
    867      0          PC 17590   50.4958          A24        S  
    869      1            347742   11.1333          NaN        S  
    870      0            349248    7.8958          NaN        S  
    871      1             11751   52.5542          D35        S  
    872      0               695    5.0000  B51 B53 B55        S  
    873      0            345765    9.0000          NaN        S  
    874      0         P/PP 3381   24.0000          NaN        C  
    875      0              2667    7.2250          NaN        C  
    876      0              7534    9.8458          NaN        S  
    877      0            349212    7.8958          NaN        S  
    879      1             11767   83.1583          C50        C  
    880      1            230433   26.0000          NaN        S  
    881      0            349257    7.8958          NaN        S  
    882      0              7552   10.5167          NaN        S  
    883      0  C.A./SOTON 34068   10.5000          NaN        S  
    884      0   SOTON/OQ 392076    7.0500          NaN        S  
    885      5            382652   29.1250          NaN        Q  
    886      0            211536   13.0000          NaN        S  
    887      0            112053   30.0000          B42        S  
    889      0            111369   30.0000         C148        C  
    890      0            370376    7.7500          NaN        Q  
    
    [714 rows x 12 columns]
    
    row_index_83_age = titanic_survival.loc[83,"Age"]
    row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
    print(row_index_83_age)
    print(row_index_1000_pclass)
    print(titanic_survival.loc[83])
    
    28.0
    1
    PassengerId                         84
    Survived                             0
    Pclass                               1
    Name           Carrau, Mr. Francisco M
    Sex                               male
    Age                                 28
    SibSp                                0
    Parch                                0
    Ticket                          113059
    Fare                              47.1
    Cabin                              NaN
    Embarked                             S
    Name: 83, dtype: object
    
    new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
    print(new_titanic_survival[0:10])
    titanic_reindexed = new_titanic_survival.reset_index(drop=True)
    print(titanic_reindexed.loc[0:10])
    
         PassengerId  Survived  Pclass                                  Name  
    630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
    851          852         0       3                   Svensson, Mr. Johan   
    493          494         0       1               Artagaveytia, Mr. Ramon   
    96            97         0       1             Goldschmidt, Mr. George B   
    116          117         0       3                  Connors, Mr. Patrick   
    672          673         0       2           Mitchell, Mr. Henry Michael   
    745          746         0       1          Crosby, Capt. Edward Gifford   
    33            34         0       2                 Wheadon, Mr. Edward H   
    54            55         0       1        Ostby, Mr. Engelhart Cornelius   
    280          281         0       3                      Duane, Mr. Frank   
    
          Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
    630  male  80.0      0      0       27042  30.0000   A23        S  
    851  male  74.0      0      0      347060   7.7750   NaN        S  
    493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
    96   male  71.0      0      0    PC 17754  34.6542    A5        C  
    116  male  70.5      0      0      370369   7.7500   NaN        Q  
    672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
    745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
    33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
    54   male  65.0      0      1      113509  61.9792   B30        C  
    280  male  65.0      0      0      336439   7.7500   NaN        Q  
        PassengerId  Survived  Pclass                                  Name   Sex  
    0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
    1           852         0       3                   Svensson, Mr. Johan  male   
    2           494         0       1               Artagaveytia, Mr. Ramon  male   
    3            97         0       1             Goldschmidt, Mr. George B  male   
    4           117         0       3                  Connors, Mr. Patrick  male   
    5           673         0       2           Mitchell, Mr. Henry Michael  male   
    6           746         0       1          Crosby, Capt. Edward Gifford  male   
    7            34         0       2                 Wheadon, Mr. Edward H  male   
    8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
    9           281         0       3                      Duane, Mr. Frank  male   
    10          457         0       1             Millet, Mr. Francis Davis  male   
    
         Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
    0   80.0      0      0       27042  30.0000   A23        S  
    1   74.0      0      0      347060   7.7750   NaN        S  
    2   71.0      0      0    PC 17609  49.5042   NaN        C  
    3   71.0      0      0    PC 17754  34.6542    A5        C  
    4   70.5      0      0      370369   7.7500   NaN        Q  
    5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
    6   70.0      1      1   WE/P 5735  71.0000   B22        S  
    7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
    8   65.0      0      1      113509  61.9792   B30        C  
    9   65.0      0      0      336439   7.7500   NaN        Q  
    10  65.0      0      0       13509  26.5500   E38        S  
    
    # This function returns the hundredth item from a series
    def hundredth_row(column):
        # Extract the hundredth item
        hundreth_item = column.iloc[99]
        return hundreth_item
    # Return the hundredth item from each column
    hundreth_row = titanic_survival.apply(hundredth_row)
    print(hundreth_row)
    
    PassengerId                  100
    Survived                       0
    Pclass                         2
    Name           Kantor, Mr. Sinai
    Sex                         male
    Age                           34
    SibSp                          1
    Parch                          0
    Ticket                    244367
    Fare                          26
    Cabin                        NaN
    Embarked                       S
    dtype: object
    
    def not_null_count(column):
        column_null = pd.isnull(column)
        null = column[column_null]
        return len(null)
    column_null_count = titanic_survival.apply(not_null_count)
    print(column_null_count)
    
    PassengerId      0
    Survived         0
    Pclass           0
    Name             0
    Sex              0
    Age            177
    SibSp            0
    Parch            0
    Ticket           0
    Fare             0
    Cabin          687
    Embarked         2
    dtype: int64
    
    # By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
    def which_class(row):
        pclass = row['Pclass']
        if pd.isnull(pclass):
            return "Unknown"
        elif pclass == 1:
            return "First Class"
        elif pclass == 2:
            return "Second Class"
        elif pclass == 3:
            return "Third Class"
    classes = titanic_survival.apply(which_class,axis=1)
    print(classes)
    
    0       Third Class
    1       First Class
    2       Third Class
    3       First Class
    4       Third Class
    5       Third Class
    6       First Class
    7       Third Class
    8       Third Class
    9      Second Class
    10      Third Class
    11      First Class
    12      Third Class
    13      Third Class
    14      Third Class
    15     Second Class
    16      Third Class
    17     Second Class
    18      Third Class
    19      Third Class
    20     Second Class
    21     Second Class
    22      Third Class
    23      First Class
    24      Third Class
    25      Third Class
    26      Third Class
    27      First Class
    28      Third Class
    29      Third Class
               ...     
    861    Second Class
    862     First Class
    863     Third Class
    864    Second Class
    865    Second Class
    866    Second Class
    867     First Class
    868     Third Class
    869     Third Class
    870     Third Class
    871     First Class
    872     First Class
    873     Third Class
    874    Second Class
    875     Third Class
    876     Third Class
    877     Third Class
    878     Third Class
    879     First Class
    880    Second Class
    881     Third Class
    882     Third Class
    883    Second Class
    884     Third Class
    885     Third Class
    886    Second Class
    887     First Class
    888     Third Class
    889     First Class
    890     Third Class
    Length: 891, dtype: object
    
    def is_minor(row):
        if row["Age"]<18:
            return True
        else:
            return False
    minors = titanic_survival.apply(is_minor,axis=1)
    print(minors)
    
    def generate_age_label(row):
        age = row["Age"]
        if pd.isnull(age):
            return "unknown"
        elif age<18:
            return "minor"
        else:
            return "adult"
    age_labels = titanic_survival.apply(generate_age_label,axis=1)
    print(age_labels)
    
    0      False
    1      False
    2      False
    3      False
    4      False
    5      False
    6      False
    7       True
    8      False
    9       True
    10      True
    11     False
    12     False
    13     False
    14      True
    15     False
    16      True
    17     False
    18     False
    19     False
    20     False
    21     False
    22      True
    23     False
    24      True
    25     False
    26     False
    27     False
    28     False
    29     False
           ...  
    861    False
    862    False
    863    False
    864    False
    865    False
    866    False
    867    False
    868    False
    869     True
    870    False
    871    False
    872    False
    873    False
    874    False
    875     True
    876    False
    877    False
    878    False
    879    False
    880    False
    881    False
    882    False
    883    False
    884    False
    885    False
    886    False
    887    False
    888    False
    889    False
    890    False
    Length: 891, dtype: bool
    0        adult
    1        adult
    2        adult
    3        adult
    4        adult
    5      unknown
    6        adult
    7        minor
    8        adult
    9        minor
    10       minor
    11       adult
    12       adult
    13       adult
    14       minor
    15       adult
    16       minor
    17     unknown
    18       adult
    19     unknown
    20       adult
    21       adult
    22       minor
    23       adult
    24       minor
    25       adult
    26     unknown
    27       adult
    28     unknown
    29     unknown
            ...   
    861      adult
    862      adult
    863    unknown
    864      adult
    865      adult
    866      adult
    867      adult
    868    unknown
    869      minor
    870      adult
    871      adult
    872      adult
    873      adult
    874      adult
    875      minor
    876      adult
    877      adult
    878    unknown
    879      adult
    880      adult
    881      adult
    882      adult
    883      adult
    884      adult
    885      adult
    886      adult
    887      adult
    888    unknown
    889      adult
    890      adult
    Length: 891, dtype: object
    
    titanic_survival['age_labels'] = age_labels
    age_group_survival = titanic_survival.pivot_table(index="age_labels",values="Survived")
    print(age_group_survival)
    
                Survived
    age_labels          
    adult       0.381032
    minor       0.539823
    unknown     0.293785
  • 相关阅读:
    [转]使用@Test 也可以从spring容器中获取依赖注入
    idea/ecipse中使用maven集成springmvc相关jar包时候,出错:java.lang.ClassNotFoundException: org.springframework.web.servlet.DispatcherServlet
    mongodb 权限设置--用户名、密码、端口
    java中import static和import的区别【转】
    python 数字的四舍五入的问题
    数据库——索引(面试、笔试必会)
    Python 中的那些坑总结——持续更新
    python2和python3的区别——持续更新
    常用的排序算法的时间复杂度和空间复杂度
    Libpacp 深度剖析
  • 原文地址:https://www.cnblogs.com/SweetZxl/p/11124203.html
Copyright © 2011-2022 走看看