import pandas as pd import numpy as np
Step 1.加载数据集
# header=0以第一行作为列名 tip = pd.read_csv("lianx.csv",sep=',',header=0) tip.head()
Step 2.删除第 1,4,7,9,11,13,14列,保存修改
a = list(tip.columns) print(a) b = [] c = 0 for i in a: c= c+1 if c in [1,4,7,9,11,13,14]: b.append(i) # print(b) # 删除列 tip = tip.drop(b,axis=1) tip.head()
step 3.重命名列列索引依次为
1) alcohol
2) malic_acid
3) alcalinity_of_ash
4) magnesium
5) flavanoids
6) proanthocyanins
7) hue
c = ['alcohol','malic_acid','alcalinity_of_ash','magnesium','flavanoids','proanthocyanins','hue'] b = list(tip.columns[:7]) b2 = list(tip.columns) print(b) print(b2) d = dict(zip(b,c)) print(d) tip.rename(columns=d,inplace=True) tip.head()
step 4.将alcohol 这一列的前三行改为NaN
#tip.iloc[:3,0]=np.nan tip.iloc[:3,0]=np.nan tip.head()
step 6. 将 alcohol 和 magnesium列的缺失值分别用10和100进行填充
tip['alcohol'] = tip['alcohol'].fillna(10) tip['magnesium'] = tip['magnesium'].fillna(100) tip.head()
step 7.创建10以内的10个随机整数
import random seven = np.random.randint(0,10,10) seven
step 8.根据上面的随机数,作为行索引,选取alcohol列,赋值为NaN
tip.iloc[seven,0]=np.nan
tip.head()
step 9.统计缺失值得个数
tip.isnull().sum()
Step 10.删除包含缺失值得行
tip.dropna()
Step 11. 让索引重新从0开始
a = list(tip.index) b = list(range(len(a))) c = dict(zip(a,b)) tip.rename(index=c)# 映射操作