python基础库Pandas

zoukankan html css js c++ java

python基础库Pandas
一、Series

import numpy as np

import pandas as pd

#下面是创建Series的三种方法

s1 = pd.Series([1,2,3,4]) # 方法1：通过列表创建

s2 = pd.Series(np.arange(10)) # 方法2：通过numpy.arange创建

s3 = pd.Series({'1':1,'2':2,'3':3}) # 方法3：通过字典创建

s1.values # 查看值

s1.index # 查看索引

s4 = pd.Series([1,2,3,4],index=['A','B','C','D']) # 设置索引

s4.to_dict() # 转化成字典

pd.isnull(s4) #判断其中元素是否为NaN，pd.notnull()同理

二、DataFrame

from pandas import Series,DataFrame

#通过粘贴板导入dataframe

df = pd.read_clipboard() # 在此之前需要你copy一个表

df.columns # 输出列名

df['列名'] # 输出列的数值(是一个Series)；列名是合法标识符时也可以写成 df.列名

df_new = DataFrame(df,columns=['列名1','列名2'])

s1 = pd.Series(df['列名']) # 输出这一列，dataframe的每一列是一个series

s1.index、s1.values 即对series操作，也可以通过s1['索引值']访问对应的元素
- df1.iterrows() #返回一个生成器，可以用for循环来访问
  
  eg: for row in df1.iterrows():
  
  print(row) #返回的数据为一个tuple
- s1,s2,s3为3个Series，用其组成一个dataframe：
  
  df_new = pd.DataFrame([s1,s2,s3],index=['A','B','C'])
  
  # index是每个Series的名称
  
  # 初始是按横向拼接成的dataframe
  
  df_new = df_new.T #转置，转置之后就和直接用dataframe生成的一样了
  
  三、IO操作：
  
  1、从粘贴板读取
  
  1 df1 = pd.read_clipboard()
  2 df1.to_clipboard() # 写入粘贴板
  
  2、CSV文件
  
  1 df1.to_csv('名字.csv',index=False) # false则表示不添加索引号
  2 df2 = pd.read_csv('df1.csv') # 读取CSV文件
  
  3、json
  
  1 df1.to_json() # 不传路径时返回json字符串；传入路径(如'df1.json')则写入json文件
  2 pd.read_json(df1.to_json()) # 从json字符串(或json文件路径)读取为dataframe
  
  4、html
  
  1 df1.to_html('df1_html') # 转换成HTML文件
  
  5、excel
  
  1 df1.to_excel('df1.xlsx') # 生成Excel文件
  
  四、Selecting and Indexing
  
  1 df.head() # 返回前五行
  2 df.tail() # 返回后五行
  3 # 返回更多的内容则在括号中写出来，不写则默认为五行
  4 df.iloc[:,:] #索引切片，定位，基于整数位置(从0开始)，与索引名无关
  5 df.loc[:,:] # 根据索引名来，label来过滤
  
  五、NaN
  
  n = np.nan
  
  type(n) 是个浮点数float
  
  与nan的运算结果均是nan
  
  nan in series：
  
  s1.isnull() / s1.notnull() 判断是否为nan
  
  s1.dropna() # 删除掉value为NaN的行
  
  nan in dataframe：
  
  判断同series
  
  1 df.dropna(axis=0,how='any') # axis用0、1分别表示行和列；how为any时表示有NaN就删掉，为all时表示全为NaN时才删掉；另有thresh参数(新版本中不能与how同时指定)，表示一行/列中至少要有thresh个非NaN值才会被保留，否则被删掉
  2 df.fillna(value=1) # 表示所有为nan的地方填充为1
  3 df.fillna(value={0:0,1:1,2:2,3:3}) # 字典的键是列名，值是填充值：列0的nan填充为0，列1的填充为1，后面同理
  
  七、mapping and replace
  
  当想在一个dataframe中加一列(columns)，可以直接加df['列名']=Series([数据])
  
  也可以通过map：创建一个字典，字典中的键是dataframe中的columns：
  
  df1['新列名'] = df1['字典中的键那一列'].map(那个字典) 这个可以固定对应位置，方便改值，可以指定index来改值
  
  replace in series:
  
  1 s1.replace({1:np.nan}) # 通过字典来改值：把值1替换成NaN
  2 s1.replace([1,2,3],[10,20,30]) # 把值1、2、3分别替换成10、20、30(替换的是值，不是索引)
  
  来自 <https://www.cnblogs.com/yudanqu/p/python_numpy_pandas.html>
  
  八、应用：
  
  import pandas
  
  food_info = pandas.read_csv('food_info.csv') #导入数据
  
  print(type(food_info))
  
  #print(food_info.dtypes) #显示数据类型
  
  print(food_info.head(3)) #默认显示前5行，这里显示3行
  
  print(food_info.tail(3)) #默认显示后5行，这里显示后3行
  
  print(food_info.columns) #列表头
  
  print(food_info.shape) #矩阵大小
  
  结果：
  
  <class 'pandas.core.frame.DataFrame'>
  
  NDB_No Shrt_Desc ... FA_Poly_(g) Cholestrl_(mg)
  
  0 1001 BUTTER WITH SALT ... 3.043 215.0
  
  1 1002 BUTTER WHIPPED WITH SALT ... 3.012 219.0
  
  2 1003 BUTTER OIL ANHYDROUS ... 3.694 256.0
  
  [3 rows x 36 columns]
  
  NDB_No Shrt_Desc ... FA_Poly_(g) Cholestrl_(mg)
  
  8615 90480 SYRUP CANE ... 0.000 0.0
  
  8616 90560 SNAIL RAW ... 0.252 50.0
  
  8617 93600 TURTLE GREEN RAW ... 0.170 50.0
  
  [3 rows x 36 columns]
  
  Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
  
  'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
  
  'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
  
  'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
  
  'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
  
  'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
  
  'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
  
  'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
  
  'Cholestrl_(mg)'],
  
  dtype='object')
  
  (8618, 36)
  
  取片操作：
  
  import pandas
  
  food_info = pandas.read_csv('food_info.csv')
  
  print(food_info.loc[3:5]) #支持切片操作，3到5行
  
  #取列的操作
  
  columns = ['Shrt_Desc','Energ_Kcal']
  
  print(food_info[columns])
  
  示例：
  
  import pandas
  
  food_info = pandas.read_csv('food_info.csv')
  
  col_names = food_info.columns.tolist() #取列名，并转成列表
  
  print(col_names)
  
  # Collect the column names that end with '(mg)', i.e. the columns
  # measured in milligrams; col_names is the list of all column names.
  gram_columns = [c for c in col_names if c.endswith('(mg)')]
  
  gram_df = food_info[gram_columns]
  
  print(gram_df.head(3))
  
  结果：得到一个包含mg的数据表
  
  升序和降序操作：
  
  import pandas
  
  food_info = pandas.read_csv('food_info.csv')
  
  food_info.sort_values('Sodium_(mg)',inplace=True,ascending=False) #默认是升序排列
  
  print(food_info['Sodium_(mg)'])
  
  #年龄分段：
  
  import pandas as pd
  
  import numpy as np
  
  titanic_survival = pd.read_csv("titanic_train.csv")
  
  #index tells the method which column to group by
  
  #values is the column that we want to apply the calculation to
  
  #aggfunc specifies the calculation we want to perform
  
  passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
  
  print (passenger_survival)
  
  #按照年龄统计
  
  def generate_age_label(row):
      """Return an age-group label for one row.

      Intended to be passed to ``DataFrame.apply(..., axis=1)`` so that
      each row is handed in one at a time.

      Args:
          row: A mapping-like object (e.g. a pandas Series) that can be
              indexed with the key ``"Age"``.

      Returns:
          ``"unknown"`` if the age is missing according to ``pd.isnull``,
          ``"minor"`` if the age is below 18, otherwise ``"adult"``.
      """
      age = row["Age"]
      # Check for a missing value first: comparing NaN with 18 is always
      # False, which would otherwise classify unknown ages as "adult".
      if pd.isnull(age):
          return "unknown"
      if age < 18:
          return "minor"
      return "adult"
  
  age_labels = titanic_survival.apply(generate_age_label, axis=1)
  
  print (age_labels)
  
  titanic_survival['age_labels'] = age_labels #列表增加一列
  
  age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
  
  print(age_group_survival)
  
  pandas.pivot_table()
  
  几个重要的参数
  
  data：DataFrame对象
  
  values：源数据中的一列，数据透视表中用于观察分析的数据值，类似Excel中的值字段
  
  index：源数据中的一列，数据透视表用于行索引的数据值，类似Excel中的行字段
  
  columns：源数据中的一列，数据透视表用于列索引的数据值，类似Excel中列字段
  
  aggfunc：根据当前的行、列索引生成的数据透视表中有多个数据需要进行聚合时，对这多个数据需要进行的操作，默认为np.mean()
  
  apply函数
  
  apply函数是`pandas`里面所有函数中自由度最高的函数。该函数如下：
  
  DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwds)（旧版本签名中的broadcast、reduce参数已被移除，由result_type取代）
  
  该函数最有用的是第一个参数，这个参数是函数，相当于C/C++的函数指针。
  
  这个函数需要自己实现，函数的传入参数根据axis来定，比如axis = 1，就会把一行数据作为Series的数据
  
  结构传入给自己实现的函数中，我们在函数中实现对Series不同属性之间的计算，返回一个结果，则apply函数
  
  会自动遍历每一行DataFrame的数据，最后将所有结果组合成一个Series数据结构并返回。
  
  原文链接：https://blog.csdn.net/qq_19528953/article/details/79348929
查看全文

相关阅读:
什么是shell
Jenkins+python+selenium持续继承自动化测试
 selenium+python自动化
 产品和项目的概念
 继承与派生：赋值兼容规则（转）
继承与派生：虚基类及其派生类的构造函数（转）
重载函数与函数模板(转)
继承与派生：作用域分辨符(转)
作用域和可见性(转)
继承与派生：派生类的析构函数(转)

原文地址：https://www.cnblogs.com/yifanrensheng/p/11393704.html