# -!- coding:utf-8 -!-
import pandas as pd
from datetime import date, timedelta
# 1,创建excel表格,简单放入数据
# Build a three-row sample table; with no data passed in, only an empty
# workbook would be created.  ID is made the index (without set_index pandas
# keeps its default integer index).
df = pd.DataFrame(
    {'ID': [1, 2, 3], 'name': ['AA', 'BB', 'CC']},
).set_index('ID')
df.to_excel('E:/test/test.xlsx')
print('done')
# 2,给表格加上表头,并另存为。。。
# header=None: no row of the sheet is treated as column headers, so the
# column names are assigned by hand afterwards.
people = pd.read_excel('E:/test/test.xlsx', header=None)
people.columns = ['ID', 'Type', 'FirstName', 'MiddleName', 'LastName']
# Rebind the result of set_index; this matches set_index(..., inplace=True).
people = people.set_index('ID')
print(people.columns)
people.to_excel('E:/test/test1.xlsx')
# 3,如果已知哪列为索引,读取的时候使用index_col设置
# Let read_excel use the ID column as the index right away.
people = pd.read_excel(
    'E:/test/test1.xlsx',
    index_col='ID',
)
# 4,简单的填充表格
# Three Series sharing the index 1..3; each Series' name becomes the label of
# its column in the resulting frame.
s1 = pd.Series([1, 2, 3], index=[1, 2, 3], name='A')
s2 = pd.Series([10, 20, 30], index=[1, 2, 3], name='B')
s3 = pd.Series([100, 200, 300], index=[1, 2, 3], name='C')
df = pd.DataFrame({series.name: series for series in (s1, s2, s3)})
print(df)
# 5,前面带有空行,空列的表格,填充序列,时间,另存为。。。
def add_month(d, md):
    """Return the date that lies ``md`` calendar months after ``d``.

    Args:
        d: Starting ``datetime.date``.
        md: Number of months to add; zero and negative values are accepted.

    Returns:
        A ``date`` in the target month carrying the same day number as ``d``.
        When the target month is shorter than ``d.day`` the day is clamped to
        that month's last day (Jan 31 + 1 month -> Feb 28/29) instead of
        raising ``ValueError``.
    """
    import calendar  # local import: the file header only imports date/timedelta

    # A zero-based month index lets divmod produce the year carry and the new
    # month in one step; floor division keeps this correct for negative md.
    year_carry, month_index = divmod(d.month - 1 + md, 12)
    year = d.year + year_carry
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return date(year, month, min(d.day, last_day))
# The sheet has blank rows and columns around the data: skip the three empty
# leading rows and read only columns C..F.  The ID, YESORNO and date columns
# are read as str; per the original note, ID cannot be read directly as an
# integer type here.
books = pd.read_excel(
    'E:/test/test3.xlsx',
    skiprows=3,
    usecols='C:F',
    dtype={'ID': str, 'YESORNO': str, 'date': str},
)
start = date(2018, 1, 1)
for i in books.index:
    # Write through books.at[row, column]; the chained form
    # books[column].at[row] = ... may update a temporary copy and be lost.
    books.at[i, 'ID'] = i + 1
    books.at[i, 'YESORNO'] = 'Yes' if i % 2 == 0 else 'NO'
    # Three alternative ways to fill the date column.  Each assignment
    # overwrites the previous one, so the monthly sequence is what is saved.
    books.at[i, 'date'] = start + timedelta(days=i)  # one day later per row
    # date is called with parentheses; date[...] raised a TypeError.
    books.at[i, 'date'] = date(start.year + i, start.month, start.day)  # one year later per row
    books.at[i, 'date'] = add_month(start, i)  # one month later per row
print(books)
books.set_index('ID', inplace=True)
books.to_excel('E:/test/test4.xlsx')
# 6,表格中的算法
import pandas as pd
def add_2(x):
    """Return ``x`` increased by 2."""
    return x + 2
# Given the unit price and the quantity, compute the total price.
books = pd.read_excel('E:/test/book.xlsx', index_col='ID')
# Price for every row at once.
books['Price'] = books['listprice'] * books['count']
# Price for selected rows only (index labels 5..9).
for i in range(5, 10):
    # books.at[row, column] writes into the frame itself; the chained form
    # books['Price'].at[i] = ... may update a temporary copy and be lost.
    books.at[i, 'Price'] = books.at[i, 'listprice'] * books.at[i, 'count']
# Three equivalent ways to add 2 to the unit price; because they run one after
# another, the price ends up 6 higher.
books['listprice'] = books['listprice'] + 2
# apply receives the function object itself, so there are no parentheses after add_2.
books['listprice'] = books['listprice'].apply(add_2)
# The same with an anonymous function.
books['listprice'] = books['listprice'].apply(lambda x: x + 2)
print(books)
# 7,排序
books = pd.read_excel('E:/test/book.xlsx', index_col='ID')
# Sort by unit price, largest first.  `by` names the sort column; rebinding the
# returned frame replaces the original inplace=True call.
books = books.sort_values(by='listprice', ascending=False)
# Sort by unit price (descending), then by quantity (ascending).
books = books.sort_values(
    by=['listprice', 'count'],
    ascending=[False, True],
)
# 8,筛选
def age_18_to_40(a):
    """Return True when age ``a`` is at least 18 and below 40."""
    return 18 <= a < 40
def level_a(s):
    """Return True when score ``s`` lies in the inclusive range 85..100."""
    return 85 <= s <= 100
students = pd.read_excel('E:/test/student.xlsx', index_col='ID')
# Keep students aged 18 (inclusive) up to 40 (exclusive) whose score is 85..100.
students = (
    students
    .loc[students['Age'].apply(age_18_to_40)]
    .loc[students['Score'].apply(level_a)]
)
# The same filter with anonymous functions; students.Age is the attribute
# spelling of students['Age'].
students = (
    students
    .loc[students.Age.apply(lambda a: 18 <= a < 40)]
    .loc[students.Score.apply(lambda s: 85 <= s <= 100)]
)
# -- coding:utf-8 --
import pandas as pd
import matplotlib.pyplot as plt
# 9,柱状图
student = pd.read_excel('E:/test/zhuzhuangtu.xlsx', index_col='ID')
student.sort_values(by='Number', inplace=True, ascending=False)
# Alternative using pandas' own plotting:
#   student.plot.bar(x='Field', y='Number', color='orange', title='International Students by Field')
# Drawn with matplotlib directly:
plt.bar(student.Field, student.Number, color='orange')
# rotation has to be a number (or 'vertical' / 'horizontal'); the numeric
# string '90' is rejected by current matplotlib releases.
plt.xticks(student.Field, rotation=90)
plt.xlabel('Field')
plt.ylabel('Number')
plt.title('International Students by Field', fontsize=16)
plt.tight_layout()  # without this the axis tick labels get cut off
plt.show()  # display the chart
print(student)
# 效果:
# 10,两组柱状图比较 (更多细节可以参考上面的9优化)
students = pd.read_excel('E:/test/zhuzhuangtu2.xlsx')
print(students)
# One bar per listed column for every Field value.  Colours can be passed as
# well, e.g. color=['orange', 'red'].
students.plot.bar(
    x='Field',
    y=['year2020', 'year2021'],
)
plt.show()
# 11,计算长方形外接圆面积
import pandas as pd
import numpy as np
def get_circumcircle_area(l, w):
    """Return the area of the circle circumscribed about an ``l`` x ``w`` rectangle.

    Args:
        l: Length of the rectangle.
        w: Width of the rectangle.

    Returns:
        ``pi * r**2`` where ``r`` is half the rectangle's diagonal.
    """
    r = np.sqrt(l ** 2 + w ** 2) / 2  # radius = half of the diagonal
    return r ** 2 * np.pi
def wrapper(row):
    """Return the circumscribed-circle area for one row.

    Reads the row's 'lenth' and 'width' entries and forwards them to
    ``get_circumcircle_area``.  The 'lenth' spelling is the lookup key and
    presumably matches the spreadsheet's column header, so it is kept as is.
    """
    return get_circumcircle_area(row['lenth'], row['width'])
rects = pd.read_excel('E:/test/yuan.xlsx', index_col='ID')
# axis=1 hands the callable one row at a time (axis=0 would hand it columns).
# The lambda does the same job as the named wrapper() helper.
rects['ca'] = rects.apply(
    lambda row: get_circumcircle_area(row['lenth'], row['width']),
    axis=1,
)
print(rects)
# 12,饼图
# Pie chart
students = pd.read_excel('E:/test/bingtu.xlsx', index_col='ID')
print(students)
# The plain version is students['year2017'].plot.pie().  Here the slices are
# sorted ascending and the label size and start angle are adjusted.
(
    students['year2017']
    .sort_values(ascending=True)
    .plot.pie(fontsize=8, startangle=270)
)
plt.title('AAAAAA', fontsize=16, fontweight='bold')
plt.ylabel('year2017', fontsize=12, fontweight='bold')
plt.show()
# 13,折线图,区域叠加图,叠加柱状图
weeks = pd.read_excel('E:/test/week.xlsx')
print(weeks)
print(weeks.columns)
# Line chart: y lists the columns that become lines ('line' is plot()'s default kind).
weeks.plot(kind='line', y=['year2019', 'year2020', 'year2021'])
# Stacked area chart.
weeks.plot.area(y=['year2019', 'year2020', 'year2021'])
# Stacked bar chart.
weeks.plot.bar(stacked=True, y=['year2019', 'year2020', 'year2021'])
# Extra styling.  NOTE(review): the original comment says this styles the line
# chart, but pyplot calls act on the current axes, i.e. the chart created last.
plt.title('Sales Weekly Trend', fontsize=16, fontweight='bold')
plt.xticks(weeks.index, fontsize=8)
plt.ylabel('Total', fontsize=12, fontweight='bold')
plt.show()
# 14,散点图,面积分布图
# Raise the column limit so printed frames show every column instead of
# collapsing part of the data.
pd.options.display.max_columns = 777
homes = pd.read_excel(
    'E:/test/home_data.xlsx',
    index_col='Id',
)
print(homes)
# Optional charts (enable one before plt.show()):
#   scatter plot:  homes.plot.scatter(x='sqft_living', y='price')
#   histogram:     homes.sqft_living.plot.hist(bins=5)
#   density plot:  homes.sqft_living.plot.kde()
# Pairwise correlation between the columns.
print(homes.corr())
plt.show()
# 15,多表零散数据合并
# -- coding:utf-8 --
import pandas as pd
students = pd.read_excel('E:/test/student_score.xlsx', sheet_name='student')
scores = pd.read_excel('E:/test/student_score.xlsx', sheet_name='score')
# Left-merge the score sheet onto the student sheet by ID; students without a
# score row get 0.  A join-based alternative:
#   table = students.join(scores, how='left').fillna(0)
table = pd.merge(students, scores, how='left', on='ID').fillna(0)
table['Score'] = table['Score'].astype(int)
print(table)
# 16,数据校验
# -- coding:utf-8 --
import pandas as pd
def score_validation(row):
    """Print a warning line when a record's score lies outside 0..100.

    Args:
        row: One record exposing ``ID``, ``Name`` and ``Score`` attributes.

    Returns:
        None; invalid records are only reported on stdout.
    """
    # Explicit range test instead of ``assert`` inside a bare ``except``:
    # asserts are stripped under ``python -O`` (the check would silently
    # vanish) and a bare except also hides unrelated errors.
    try:
        in_range = 0 <= row.Score <= 100
    except TypeError:
        # A score that cannot be compared with numbers is reported as invalid.
        in_range = False
    if not in_range:
        # The tab keeps the messages aligned for one- and two-digit IDs.
        print(f'#{row.ID}\t student{row.Name} has an invalid score {row.Score}.')
students = pd.read_excel('E:/test/student2.xlsx')
# axis='columns' (the same as axis=1) passes the rows to the validator one at
# a time; axis=0 would pass columns instead.
students.apply(func=score_validation, axis='columns')
# 表格数据: 运行结果:
# 17,读取csv,tsv,txt中的数据
import pandas as pd
# Read data from csv, tsv and txt files; only the separator differs.
students = pd.read_csv('E:/test/student.csv', index_col='ID')
# Fields separated by tab characters.
students2 = pd.read_csv('E:/test/student.tsv', index_col='ID', sep='\t')
# Fields separated by '|'.
students3 = pd.read_csv('E:/test/student.txt', index_col='ID', sep='|')