zoukankan      html  css  js  c++  java
  • 某市近年毕业情况的一个可视化

    数据来源于:https://data.cityofnewyork.us/Education/2005-2010-Graduation-Outcomes-By-Borough/avir-tzek

    数据理解

    原数据其实有点乱。第一列Demographic可以看作分组标签:有English Language Learners和English Proficient Students,有Special Education和General Education,有Asian、Black、Hispanic、White,有Female和Male,还有Borough Total(这一项就是各区的总数;可惜我当初没发现,而是把男女人数相加来算总数,走了弯路)。

    第二列便是所谓的Borough,一共有5个镇Bronx、Brooklyn、Manhattan、Queens、Staten Island

    第三列Cohort,入学年份;第四列Total Cohort,该届学生的总人数。

    第五列Total Grads - n,该届学生中毕业的人数。

    其他的列基本上就是毕业生中再区分类别的人数及比例了。本文没用到,便不再描述了。

    数据预处理

    library(dplyr)
    library(ggplot2)
    
    # Load the graduation-outcomes table from the working directory.
    # TRUE is spelled out because the shorthand T is an ordinary, reassignable
    # variable.
    dat <- read.csv("Graduation.csv", header = TRUE)
    # Treat the cohort year as a categorical variable rather than a number.
    dat$Cohort <- as.factor(dat$Cohort)
    # Only the first five columns are used in the analysis below.
    dat <- dat[, 1:5]
    # as_tibble() is the supported replacement for tbl_df(), which dplyr
    # deprecated in 1.0.0.
    dat_df <- as_tibble(dat)

      

    每年的毕业生总人数

    # Keep only the rows that already hold the per-borough totals.
    borough_total <- dat[dat$Demographic == "Borough Total", ]
    # as_tibble() replaces the deprecated tbl_df().
    borough_total_df <- as_tibble(borough_total)
    by_Cohort <- group_by(borough_total_df, Cohort)
    # Sum the graduates over all boroughs for each cohort year.
    total <- summarise(by_Cohort, total = sum(Total.Grads...n))

    # Each line of a ggplot chain must END with `+`. A line that starts with
    # `+` is parsed as a new expression, so the layer is never added to the
    # plot and the line itself fails.
    ggplot(total, aes(x = Cohort, y = total)) +
      geom_col(aes(fill = total)) +
      theme(
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "white", colour = "grey50")
      ) +
      labs(title = "Total Graduations of 2001-2006")

     

    毕业生男女分布

    # Graduates per cohort year, split by sex.
    # The female and male subsets went through two identical pipelines built
    # on the deprecated tbl_df(); they are filtered together and grouped once.
    # The result keeps the same columns (Cohort, num, sex) and the same row
    # order (all "female" rows, then all "male" rows) as before.
    s1 <- dat %>%
      # %in% never yields NA, so rows with a missing label are dropped instead
      # of turning into all-NA rows.
      filter(Demographic %in% c("Female", "Male")) %>%
      # Lower-case labels ("female" / "male") are what the legend shows.
      mutate(sex = tolower(as.character(Demographic))) %>%
      group_by(sex, Cohort) %>%
      summarise(num = sum(Total.Grads...n)) %>%
      ungroup() %>%
      select(Cohort, num, sex)

    # Side-by-side bars: one pair (female, male) per cohort year.
    ggplot(s1, aes(x = Cohort, y = num, fill = factor(sex))) +
      geom_col(position = "dodge") +
      theme(
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "white", colour = "grey50")
      ) +
      labs(title = "female & male graduations of 2001-2006")
    

      

    不同ethnic groups所占比例

     这里计算的是Asian、Black、Hispanic、White这四种在2001-2006毕业的总人数占6年总毕业生人数的比例。

    # Total graduates per demographic label, over all cohorts and boroughs.
    by_Demographic <- group_by(dat_df, Demographic)
    Demographic <- summarise(by_Demographic, num = sum(Total.Grads...n))

    # Select the four ethnic groups by name rather than by row position
    # (previously rows 1, 2, 8 and 11), so the selection does not silently
    # pick the wrong rows if the set of labels in the data changes.
    # The comparison is case-insensitive to tolerate "white" vs "White".
    ethnic_groups <- c("asian", "black", "hispanic", "white")
    is_ethnic <- tolower(as.character(Demographic$Demographic)) %in% ethnic_groups
    bing <- Demographic[is_ethnic, ]

    # Legend label: group name plus its share of the four-group total, in %.
    bing$rat <- paste0(
      bing$Demographic, "(", round(bing$num / sum(bing$num) * 100, 2), "%)"
    )

    # Pie chart = a single stacked bar drawn in polar coordinates.
    # The title used to say "regions", but the slices are ethnic groups.
    ggplot(bing, aes(x = "", y = num, fill = Demographic)) +
      geom_bar(stat = "identity", width = 1) +
      coord_polar(theta = "y") +
      labs(x = "", y = "", title = "total graduations of different ethnic groups") +
      theme(
        axis.ticks = element_blank(),
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "white", colour = "grey50"),
        legend.title = element_blank()
      ) +
      scale_fill_discrete(breaks = bing$Demographic, labels = bing$rat)
    

      

    不同城镇的毕业人数及毕业率

    毕业人数

    # One group per (cohort year, borough) pair, taken from the total rows.
    by_year_Borough <- borough_total_df %>%
      group_by(Cohort, Borough)

    # gra = graduates, all = cohort size, rat = graduation rate (gra / all).
    year_Borough_num <- by_year_Borough %>%
      summarise(gra = sum(Total.Grads...n), all = sum(Total.Cohort)) %>%
      mutate(rat = gra / all)

    # Dodged bars: one bar per borough within each cohort year.
    borough_bar_theme <- theme(
      legend.title = element_blank(),
      plot.title = element_text(hjust = 0.5),
      panel.background = element_rect(fill = "white", colour = "grey50")
    )
    ggplot(year_Borough_num, aes(x = Cohort, y = gra, fill = factor(Borough))) +
      geom_col(position = "dodge") +
      borough_bar_theme +
      labs(title = " graduations of different boroughs of 2001-2006")

      

     

    毕业率

    # Graduation rate per cohort year, one line per borough.
    # The title used to name only Brooklyn, although every borough is drawn
    # (group = Borough, color = Borough).
    ggplot(
      year_Borough_num,
      aes(x = Cohort, y = rat, group = Borough, color = Borough)
    ) +
      geom_line() +
      geom_point(size = 4, shape = 20) +
      theme(
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "white", colour = "grey50")
      ) +
      labs(title = "graduation rate of different boroughs from 2001 to 2006")
    

      

     

    不同Education

    # Graduates and cohort size per demographic label and cohort year.
    by_Demographic_Cohort <- group_by(dat_df, Demographic, Cohort)
    Demographic_Cohort <- summarise(
      by_Demographic_Cohort,
      num = sum(Total.Grads...n),
      all = sum(Total.Cohort)
    )
    # Graduation rate = graduates / cohort size.
    Demographic_Cohort$rat <- Demographic_Cohort$num / Demographic_Cohort$all

    # Education type: "Special Education" vs "General Education".
    # These statements had been folded into a single comment line, so none of
    # them ran, and the final labs() call was missing its closing parenthesis.
    Demographic_Cohort_education <- Demographic_Cohort[
      Demographic_Cohort$Demographic %in%
        c("Special Education", "General Education"),
    ]

    # One panel per cohort year; x labels are hidden because the fill legend
    # already identifies the two education types.
    ggplot(Demographic_Cohort_education, aes(x = Demographic, y = rat)) +
      geom_col(aes(fill = Demographic)) +
      facet_grid(. ~ Cohort) +
      theme(
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "white", colour = "grey50"),
        axis.text.x = element_blank()
      ) +
      labs(title = " graduation rate of different degree of Education")

      

      

    不同English

    # English level: "English Proficient Students" vs
    # "English Language Learners".
    Demographic_Cohort_english <- Demographic_Cohort[
      Demographic_Cohort$Demographic %in%
        c("English Proficient Students", "English Language Learners"),
    ]

    # geom_col() stacks by default, which would add the six cohort rates on
    # top of each other and give a bar height that is not a rate. Dodging
    # places the cohorts side by side so each bar shows one rate.
    ggplot(Demographic_Cohort_english, aes(x = Demographic, y = rat)) +
      geom_col(aes(fill = Cohort), position = "dodge") +
      theme(
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "white", colour = "grey50")
      ) +
      labs(title = " graduation rate of different degree of English")

      

     

  • 相关阅读:
    2014--My Plan
    C++操作xls
    fail2ban[防止linux服务器被暴力破解]
    8.1向上转型
    寒假的一个月--实现自己的五部狂奏曲
    浅谈《think in java》:二 一切都是对象
    浅谈《think in java》:一 对象导论总结
    增加Myecllipse内存
    java 容器 集合 用法
    SVN用户切换
  • 原文地址:https://www.cnblogs.com/Hyacinth-Yuan/p/7995122.html
Copyright © 2011-2022 走看看