# Lao Tang's data analysis and machine learning notes (pandas)
# pandas_1
import pandas
# Load the food nutrition table; the path is relative to the working directory.
food_info = pandas.read_csv(filepath_or_buffer="food_info.csv")
# Show the dtype pandas inferred for each column (type(food_info) would show the container type).
print(food_info.dtypes)
'''
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
'''
#first_rows = food_info.head()
#print first_rows
#print(food_info.head(3))
#print food_info.columns
#print food_info.shape
#pandas uses zero-indexing
#Series object representing the row at index 0.
#print food_info.loc[0]
# Series object representing the seventh row.
#food_info.loc[6]
# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
#food_info.loc[8620]
#The object dtype is equivalent to a string in Python
#object - For string values
#int - For integer values
#float - For float values
#datetime - For time values
#bool - For Boolean values
#print(food_info.dtypes)
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
#food_info.loc[3:6]
# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
#two_five_ten = [2,5,10]
#food_info.loc[two_five_ten]
# Method 2
#food_info.loc[[2,5,10]]
# Series object representing the "NDB_No" column.
#ndb_col = food_info["NDB_No"]
#print ndb_col
# Alternatively, you can access a column by passing in a string variable.
#col_name = "NDB_No"
#ndb_col = food_info[col_name]
#columns = ["Zinc_(mg)", "Copper_(mg)"]
#zinc_copper = food_info[columns]
#print zinc_copper
#print zinc_copper
# Skipping the assignment.
#zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
#print(food_info.columns)
#print(food_info.head(2))
# Pick out every column whose label ends in "(g)" and preview the first
# three rows of the resulting sub-table.
col_names = food_info.columns.tolist()
# A comprehension replaces the append loop, whose body had lost its indentation.
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
'''
Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g)
0 15.87 0.85 81.11 2.11 0.06
1 15.87 0.85 81.11 2.11 0.06
2 0.24 0.28 99.48 0.00 0.00
Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g)
0 0.0 0.06 51.368 21.021 3.043
1 0.0 0.06 50.489 23.426 3.012
2 0.0 0.00 61.924 28.732 3.694
'''
# pandas_2
import pandas
# Reload the nutrition table from disk.
food_info = pandas.read_csv(filepath_or_buffer="food_info.csv")
# Column labels as a plain Python list, followed by a three-row preview.
col_names = list(food_info.columns)
print(col_names)
print(food_info.head(n=3))
'''
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g)
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ...
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
2 99.48 0.00 0.00 0.0 0.00 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg)
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
2 3069.0 840.0 2.80 1.8 73.0 8.6
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
2 61.924 28.732 3.694 256.0
[3 rows x 36 columns]
'''
#print food_info["Iron_(mg)"]
#div_1000 = food_info["Iron_(mg)"] / 1000
#print div_1000
# Adds 100 to each value in the column and returns a Series object.
#add_100 = food_info["Iron_(mg)"] + 100
# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100
# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"]*2
#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
# Arithmetic between two columns is applied row by row (first with first,
# second with second, ...). The original computed this product twice in a row;
# once is enough.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Convert iron from milligrams to grams and store it as a new column.
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
# Score = 2 * Protein_(g) - 0.75 * Lipid_Tot_(g)
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
# The "Vit_A_IU" column ranges from 0 to 100000, while "Fiber_TD_(g)" ranges
# from 0 to 79. In some calculations a large-scale column can dominate the
# result, so columns are normalized by dividing by their largest value.
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
# sort_values() sorts ascending and returns a new DataFrame by default;
# inplace=True reorders food_info itself instead.
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
# Same sort, but in descending order.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])
'''
760 0.0
758 0.0
405 0.0
761 0.0
2269 0.0
...
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
276 38758.0
5814 27360.0
6192 26050.0
1242 26000.0
1245 24000.0
...
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
'''
# pandas_3
import pandas as pd
import numpy as np
# Load the Titanic passenger table from the working directory.
titanic_survival = pd.read_csv(filepath_or_buffer="titanic_train.csv")
titanic_survival.head()  # bare preview; only displayed in an interactive session
# pandas marks a missing value with NaN ("not a number").
# isnull() yields a boolean Series that is True wherever a value is missing.
age = titanic_survival["Age"]
age_is_null = age.isnull()
# Boolean indexing keeps only the entries whose age is missing.
age_null_true = age.loc[age_is_null]
print(age_null_true)
# Number of passengers without a recorded age.
age_null_count = age_null_true.shape[0]
print(age_null_count)
'''
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
..
859 NaN
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
'''
# Any arithmetic that touches a null value yields a null value, so a
# hand-rolled mean over the raw column comes out as nan.
mean_age = sum(titanic_survival["Age"]) / titanic_survival["Age"].shape[0]
print(mean_age)
'''
nan
'''
# Filter the missing entries out first, then average what is left.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / good_ages.shape[0]
print(correct_mean_age)
'''
29.69911764705882
'''
# Missing data is common enough that Series.mean() skips it on its own.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
'''
29.69911764705882
'''
# Mean fare for each passenger class, collected into {class: mean fare}.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to the current class, then the average of their fares.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# Printed once, after the loop (the recorded output shows a single dict).
print(fares_by_class)
'''
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
'''
# index   - the column to group by
# values  - the column the calculation is applied to
# aggfunc - the calculation to perform
# The string "mean" is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed here, and the result is the same.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
'''
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
'''
# Average age per passenger class; "mean" is pivot_table's default aggregation,
# spelled out here for clarity.
passenger_age = titanic_survival.pivot_table(values="Age", index="Pclass", aggfunc="mean")
print(passenger_age)
'''
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
'''
# Total fare collected and number of survivors per embarkation port.
# "sum" replaces np.sum: recent pandas versions emit a FutureWarning for NumPy
# callables passed as aggfunc, and the result is unchanged.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
'''
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
'''
# axis=1 (spelled 'columns' here) drops every column that contains a null value.
drop_na_columns = titanic_survival.dropna(axis="columns")
# Row-wise variant: drop the rows that are missing "Age" or "Sex".
new_titanic_survival = titanic_survival.dropna(subset=["Age", "Sex"], axis="index")
print(drop_na_columns.shape)
'''
(891, 9)
'''
# Single-cell lookups by (row label, column label).
row_index_83_age = titanic_survival.at[83, "Age"]
# NOTE(review): despite its name, this reads row label 766, not 1000.
row_index_1000_pclass = titanic_survival.at[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
'''
28.0
1
'''
# Oldest passengers first; the rows keep their original index labels.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival.iloc[0:10])
# drop=True discards the old index and generates a fresh one.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.head(10))
'''
PassengerId Survived Pclass Name
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
672 673 0 2 Mitchell, Mr. Henry Michael
745 746 0 1 Crosby, Capt. Edward Gifford
33 34 0 2 Wheadon, Mr. Edward H
54 55 0 1 Ostby, Mr. Engelhart Cornelius
280 281 0 3 Duane, Mr. Frank
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
672 male 70.0 0 0 C.A. 24580 10.5000 NaN S
745 male 70.0 1 1 WE/P 5735 71.0000 B22 S
33 male 66.0 0 0 C.A. 24579 10.5000 NaN S
54 male 65.0 0 1 113509 61.9792 B30 C
280 male 65.0 0 0 336439 7.7500 NaN Q
PassengerId Survived Pclass Name Sex
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
5 673 0 2 Mitchell, Mr. Henry Michael male
6 746 0 1 Crosby, Capt. Edward Gifford male
7 34 0 2 Wheadon, Mr. Edward H male
8 55 0 1 Ostby, Mr. Engelhart Cornelius male
9 281 0 3 Duane, Mr. Frank male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
5 70.0 0 0 C.A. 24580 10.5000 NaN S
6 70.0 1 1 WE/P 5735 71.0000 B22 S
7 66.0 0 0 C.A. 24579 10.5000 NaN S
8 65.0 0 1 113509 61.9792 B30 C
9 65.0 0 0 336439 7.7500 NaN Q
'''
# Positional lookup helper, intended for use with DataFrame.apply.
def hundredth_row(column, position=99):
    """Return the item at a fixed position of ``column`` (the hundredth by default).

    Args:
        column: A pandas Series (or any object exposing ``.iloc``).
        position: Zero-based position to read. The default, 99, selects the
            hundredth item, which is what the function originally hard-coded.

    Returns:
        The value stored at ``position``.

    Raises:
        IndexError: If ``column`` has no item at ``position``.
    """
    return column.iloc[position]
# Return the hundredth item from each column.
# The result is stored under its own name: assigning it to `hundredth_row`
# would overwrite the function and make this statement fail if run again.
hundredth_items = titanic_survival.apply(hundredth_row)
print(hundredth_items)
'''
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
'''
# Count the missing values in a column.
def not_null_count(column):
    """Return how many entries of ``column`` are null.

    NOTE: despite the name, this counts the *missing* values, not the
    non-missing ones. The name is kept so existing callers keep working.

    Args:
        column: A pandas Series (or other array-like accepted by ``pd.isnull``).

    Returns:
        int: Number of null entries; 0 for an empty column.
    """
    # Summing the boolean mask counts the True flags without building a
    # filtered copy of the column.
    return int(pd.isnull(column).sum())
# Run the counter over every column of the table.
column_null_count = titanic_survival.apply(func=not_null_count)
print(column_null_count)
'''
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
age_labels 0
dtype: int64
'''
# len(titanic_survival[pd.isnull(titanic_survival)])
# titanic_survival
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    """Translate a row's numeric "Pclass" into a readable class label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "Pclass" entry.

    Returns:
        "First Class", "Second Class" or "Third Class" for 1, 2 or 3.
        "Unknown" when the value is missing or is not one of those three;
        previously an unrecognised value fell off the end of the if/elif
        chain and silently produced None.
    """
    labels = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    return labels.get(pclass, "Unknown")
# axis="columns" (same as axis=1) hands each row, rather than each column, to the function.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
'''
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
...
886 Second Class
887 First Class
888 Third Class
889 First Class
890 Third Class
Length: 891, dtype: object
'''
def is_minor(row, adult_age=18):
    """Tell whether the passenger in ``row`` is younger than ``adult_age``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an "Age" entry.
        adult_age: Ages strictly below this value count as minor. Defaults
            to 18, the threshold the function originally hard-coded.

    Returns:
        bool: True when the age is below ``adult_age``. A missing (NaN) age
        compares as False, so passengers of unknown age are reported as not
        minor.
    """
    # The comparison already yields the answer; bool() normalises NumPy
    # booleans to a plain Python bool.
    return bool(row["Age"] < adult_age)
# One boolean per passenger row: True where the age is below 18.
minors = titanic_survival.apply(is_minor, axis="columns")
# (print(minors) to inspect the flags)
# Discretization: bucket a continuous age into a categorical label.
def generate_age_label(row, adult_age=18):
    """Label a passenger row as "unknown", "minor" or "adult" based on its age.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an "Age" entry.
        adult_age: Ages strictly below this value are labelled "minor".
            Defaults to 18, the threshold the function originally hard-coded.

    Returns:
        "unknown" when the age is missing, "minor" when it is below
        ``adult_age``, otherwise "adult".
    """
    age = row["Age"]
    # The missing-value check comes first: NaN compares False against any
    # number and would otherwise be labelled "adult".
    if pd.isnull(age):
        return "unknown"
    if age < adult_age:
        return "minor"
    return "adult"
# Label every passenger row with its age group.
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
'''
0 adult
1 adult
2 adult
3 adult
4 adult
...
886 adult
887 adult
888 unknown
889 adult
890 adult
Length: 891, dtype: object
'''
# Attach the labels as a new column, then average "Survived" within each age group
# ("mean" is pivot_table's default aggregation, spelled out here).
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(values="Survived", index="age_labels", aggfunc="mean")
print(age_group_survival)
'''
Survived
age_labels
adult 0.381032
minor 0.539823
unknown 0.293785
'''
# pandas_4
#Series (collection of values)
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects; Panel has since been removed from pandas)
#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values
#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score
import pandas as pd
# Load the score-comparison table from the working directory.
fandango = pd.read_csv(filepath_or_buffer='fandango_score_comparison.csv')
# Selecting one column yields a Series.
series_film = fandango['FILM']
print(type(series_film))
print('=========================')
# First five film names.
print(series_film.iloc[0:5])
print('=========================')
# First five Rotten Tomatoes critic scores.
series_rt = fandango['RottenTomatoes']
print(series_rt.iloc[0:5])
'''
<class 'pandas.core.series.Series'>
=========================
0 Avengers: Age of Ultron (2015)
1 Cinderella (2015)
2 Ant-Man (2015)
3 Do You Believe? (2015)
4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
=========================
0 74
1 85
2 80
3 18
4 14
Name: RottenTomatoes, dtype: int64
'''
# Preview the first rows of the table. As a bare expression this only shows
# output in an interactive session (notebook/REPL); a plain script run discards it.
fandango.head()
'''
FILM RottenTomatoes RottenTomatoes_User Metacritic Metacritic_User IMDB Fandango_Stars Fandango_Ratingvalue RT_norm RT_user_norm ... IMDB_norm RT_norm_round RT_user_norm_round Metacritic_norm_round Metacritic_user_norm_round IMDB_norm_round Metacritic_user_vote_count IMDB_user_vote_count Fandango_votes Fandango_Difference
0 Avengers: Age of Ultron (2015) 74 86 66 7.1 7.8 5.0 4.5 3.70 4.3 ... 3.90 3.5 4.5 3.5 3.5 4.0 1330 271107 14846 0.5
1 Cinderella (2015) 85 80 67 7.5 7.1 5.0 4.5 4.25 4.0 ... 3.55 4.5 4.0 3.5 4.0 3.5 249 65709 12640 0.5
2 Ant-Man (2015) 80 90 64 8.1 7.8 5.0 4.5 4.00 4.5 ... 3.90 4.0 4.5 3.0 4.0 4.0 627 103660 12055 0.5
3 Do You Believe? (2015) 18 84 22 4.7 5.4 5.0 4.5 0.90 4.2 ... 2.70 1.0 4.0 1.0 2.5 2.5 31 3136 1793 0.5
4 Hot Tub Time Machine 2 (2015) 14 28 29 3.4 5.1 3.5 3.0 0.70 1.4 ... 2.55 0.5 1.5 1.5 1.5 2.5 88 19560 1021 0.5
5 rows × 22 columns
'''
# fandango.loc[[0,1],['FILM','RottenTomatoes']]
# fandango.FILM[0]
# Positional lookup: row at position 1, column at position 2.
# Bare expression: displayed only in an interactive session.
fandango.iloc[1,2]
'''
80
'''
# Import the Series object from pandas
from pandas import Series
# Raw NumPy arrays behind the two Series.
film_names = series_film.to_numpy()
print(type(film_names))
rt_scores = series_rt.to_numpy()
# Build a Series of critic scores that is labelled by film name.
series_custom = Series(data=rt_scores, index=film_names)
# Label-based selection of two films (bare expression; shown only interactively).
series_custom.loc[['Minions (2015)', 'Leviathan (2014)']]
'''
<class 'numpy.ndarray'>
Minions (2015) 54
Leviathan (2014) 99
dtype: int64
'''
# Integer (positional) indexing is also available on a label-indexed Series.
series_custom = Series(data=rt_scores, index=film_names)
print(series_custom.loc[['Minions (2015)', 'Leviathan (2014)']])
# Positions 5 through 9.
fiveten = series_custom.iloc[5:10]
print(fiveten)
'''
Minions (2015) 54
Leviathan (2014) 99
dtype: int64
The Water Diviner (2015) 63
Irrational Man (2015) 42
Top Five (2014) 86
Shaun the Sheep Movie (2015) 99
Love & Mercy (2015) 89
dtype: int64
'''
# Reorder the Series so its film-name labels are in sorted order.
original_index = list(series_custom.index)
sorted_index = list(original_index)
sorted_index.sort()
sorted_by_index = series_custom.reindex(index=sorted_index)
print(sorted_by_index)
'''
'71 (2015) 97
5 Flights Up (2015) 52
A Little Chaos (2015) 40
A Most Violent Year (2014) 90
About Elly (2015) 97
..
What We Do in the Shadows (2015) 96
When Marnie Was There (2015) 89
While We're Young (2015) 83
Wild Tales (2014) 96
Woman in Gold (2015) 52
Length: 146, dtype: int64
'''
# sc2 is ordered by label, sc3 by score (both ascending).
sc2 = series_custom.sort_index(ascending=True)
sc3 = series_custom.sort_values(ascending=True)
# The ten lowest-scored films (use sc2.iloc[:10] for the first ten labels).
print(sc3.iloc[:10])
'''
Paul Blart: Mall Cop 2 (2015) 5
Hitman: Agent 47 (2015) 7
Hot Pursuit (2015) 8
Fantastic Four (2015) 9
Taken 3 (2015) 9
The Boy Next Door (2015) 10
The Loft (2015) 11
Unfinished Business (2015) 11
Mortdecai (2015) 12
Seventh Son (2015) 12
dtype: int64
'''
#The values in a Series object are treated as an ndarray, the core data type in NumPy
import numpy as np
# Element-wise sum of the Series with itself (every score doubled); the result is printed.
print (np.add(series_custom, series_custom))
# Apply the sine function to each value.
# Bare expression: its result is discarded unless run in an interactive session.
np.sin(series_custom)
# Return the highest value (a single value, not a Series).
# Bare expression: displayed only in an interactive session.
np.max(series_custom)
'''
Avengers: Age of Ultron (2015) 148
Cinderella (2015) 170
Ant-Man (2015) 160
Do You Believe? (2015) 36
Hot Tub Time Machine 2 (2015) 28
...
Mr. Holmes (2015) 174
'71 (2015) 194
Two Days, One Night (2014) 194
Gett: The Trial of Viviane Amsalem (2015) 200
Kumiko, The Treasure Hunter (2015) 174
Length: 146, dtype: int64
100
'''
# A comparison on a Series produces a boolean Series, one flag per film
# (bare expression; shown only interactively).
series_custom.gt(50)
# Films scoring above 50.
series_greater_than_50 = series_custom.loc[series_custom.gt(50)]
# Films scoring above 50 and below 75: combine the two masks with &.
criteria_one = series_custom.gt(50)
criteria_two = series_custom.lt(75)
both_criteria = series_custom.loc[criteria_one & criteria_two]
print(both_criteria)
'''
Avengers: Age of Ultron (2015) 74
The Water Diviner (2015) 63
Unbroken (2014) 51
Southpaw (2015) 59
Insidious: Chapter 3 (2015) 59
The Man From U.N.C.L.E. (2015) 68
Run All Night (2015) 60
5 Flights Up (2015) 52
Welcome to Me (2015) 71
Saint Laurent (2015) 51
Maps to the Stars (2015) 60
Pitch Perfect 2 (2015) 67
The Age of Adaline (2015) 54
The DUFF (2015) 71
Ricki and the Flash (2015) 64
Unfriended (2015) 60
American Sniper (2015) 72
The Hobbit: The Battle of the Five Armies (2014) 61
Paper Towns (2015) 55
Big Eyes (2014) 72
Maggie (2015) 54
Focus (2015) 57
The Second Best Exotic Marigold Hotel (2015) 62
The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015) 67
Escobar: Paradise Lost (2015) 52
Into the Woods (2014) 71
Inherent Vice (2014) 73
Magic Mike XXL (2015) 62
Woman in Gold (2015) 52
The Last Five Years (2015) 60
Jurassic World (2015) 71
Minions (2015) 54
Spare Parts (2015) 52
dtype: int64
'''
# Data alignment: both Series share the FILM index, so arithmetic pairs up
# the scores that belong to the same film.
rt_critics = Series(data=fandango['RottenTomatoes'].to_numpy(), index=fandango['FILM'])
rt_users = Series(data=fandango['RottenTomatoes_User'].to_numpy(), index=fandango['FILM'])
# Average of the critic and user score per film.
rt_mean = rt_critics.add(rt_users).div(2)
print(rt_mean)
'''
FILM
Avengers: Age of Ultron (2015) 80.0
Cinderella (2015) 82.5
Ant-Man (2015) 85.0
Do You Believe? (2015) 51.0
Hot Tub Time Machine 2 (2015) 21.0
...
Mr. Holmes (2015) 82.5
'71 (2015) 89.5
Two Days, One Night (2014) 87.5
Gett: The Trial of Viviane Amsalem (2015) 90.5
Kumiko, The Treasure Hunter (2015) 75.0
Length: 146, dtype: float64
'''
# pandas_5
import pandas as pd
#set_index() returns a new DataFrame that is indexed by the values in the specified column.
#By default it also drops that column from the DataFrame;
#passing drop=False (as below) keeps the FILM column as well.
# Load the score-comparison table and confirm it is a DataFrame.
fandango = pd.read_csv(filepath_or_buffer='fandango_score_comparison.csv')
print(type(fandango))
# Index the rows by film name while keeping FILM as a regular column too.
fandango_films = fandango.set_index(keys='FILM', drop=False)
#print(fandango_films.index)
'''
<class 'pandas.core.frame.DataFrame'>
'''
# Slice a range of rows by index label, using either bracket notation or loc[].
# These are bare expressions: their results are shown only in an interactive
# session and are discarded when the file runs as a script.
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
# Select one specific movie by its label.
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']
# Select several movies by passing a list of labels.
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]
# When selecting multiple rows, a DataFrame is returned,
# but when selecting an individual row, a Series object is returned instead.
'''
FILM RottenTomatoes RottenTomatoes_User Metacritic Metacritic_User IMDB Fandango_Stars Fandango_Ratingvalue RT_norm RT_user_norm ... IMDB_norm RT_norm_round RT_user_norm_round Metacritic_norm_round Metacritic_user_norm_round IMDB_norm_round Metacritic_user_vote_count IMDB_user_vote_count Fandango_votes Fandango_Difference
FILM
Kumiko, The Treasure Hunter (2015) Kumiko, The Treasure Hunter (2015) 87 63 68 6.4 6.7 3.5 3.5 4.35 3.15 ... 3.35 4.5 3.0 3.5 3.0 3.5 19 5289 41 0.0
Do You Believe? (2015) Do You Believe? (2015) 18 84 22 4.7 5.4 5.0 4.5 0.90 4.20 ... 2.70 1.0 4.0 1.0 2.5 2.5 31 3136 1793 0.5
Ant-Man (2015) Ant-Man (2015) 80 90 64 8.1 7.8 5.0 4.5 4.00 4.50 ... 3.90 4.0 4.5 3.0 4.0 4.0 627 103660 12055 0.5
3 rows × 22 columns
'''
#The apply() method in Pandas allows us to specify Python logic
#The apply() method requires you to pass in a vectorized operation
#that can be applied over each Series object.
import numpy as np
# Per-column dtypes of the FILM-indexed table, as a Series keyed by column name.
types = fandango_films.dtypes
# Keep only the labels of the float64 columns.
float_columns = types.index[types.values == 'float64']
# Restrict the table to those columns.
float_df = fandango_films.loc[:, float_columns]
# Standard deviation of each column; `col` is the column's Series.
# The lambda is kept on purpose so np.std itself is what gets called.
deviations = float_df.apply(lambda col: np.std(col))
print(deviations)
'''
Metacritic_User 1.505529
IMDB 0.955447
Fandango_Stars 0.538532
Fandango_Ratingvalue 0.501106
RT_norm 1.503265
RT_user_norm 0.997787
Metacritic_norm 0.972522
Metacritic_user_nom 0.752765
IMDB_norm 0.477723
RT_norm_round 1.509404
RT_user_norm_round 1.003559
Metacritic_norm_round 0.987561
Metacritic_user_norm_round 0.785412
IMDB_norm_round 0.501043
Fandango_Difference 0.152141
dtype: float64
'''
# The two normalized user-score columns ('Metacritic_user_nom' is the label as
# it appears in the data).
rt_mt_user = float_df.loc[:, ['RT_user_norm', 'Metacritic_user_nom']]
# Row-wise standard deviation across those two scores
# (bare expression; shown only in an interactive session).
rt_mt_user.apply(lambda film_scores: np.std(film_scores), axis="columns")
'''
FILM
Avengers: Age of Ultron (2015) 0.375
Cinderella (2015) 0.125
Ant-Man (2015) 0.225
Do You Believe? (2015) 0.925
Hot Tub Time Machine 2 (2015) 0.150
...
Mr. Holmes (2015) 0.025
'71 (2015) 0.175
Two Days, One Night (2014) 0.250
Gett: The Trial of Viviane Amsalem (2015) 0.200
Kumiko, The Treasure Hunter (2015) 0.025
Length: 146, dtype: float64
'''