zoukankan      html  css  js  c++  java
  • R语言语法基础二

    R语言语法基础二

    重塑数据

    增加行和列

    # Build the column vectors.
    # Zip codes are identifiers, not quantities, so they are written as strings:
    # the numeric literal 06161 is parsed as 6161 and loses its leading zero.
    city <- c("Tampa", "Seattle", "Hartford", "Denver")
    state <- c("FL", "WA", "CT", "CO")
    zipcode <- c("33602", "98104", "06161", "80294")
    # cbind() on plain vectors produces a character matrix (not a data frame);
    # each vector becomes one column.
    address1 <- cbind(city, state, zipcode)
    print(address1)
    
    # A data frame with the same three columns, built directly with data.frame()
    address2 <- data.frame(
        city = c("Lowry", "Charlotte"),
        state = c("CO", "FL"),
        zipcode = c("80230", "33949")
    )
    print(address2)
    
    # rbind() stacks rows, cbind() joins columns.
    # Stacking the matrix on top of the data frame prints as a data frame.
    address3 <- rbind(address1, address2)
    print(address3)
    
    # Printed output of the three print() calls above:
    #>      city       state zipcode
    #> [1,] "Tampa"    "FL"  "33602"
    #> [2,] "Seattle"  "WA"  "98104"
    #> [3,] "Hartford" "CT"  "06161"
    #> [4,] "Denver"   "CO"  "80294"
    #>
    #>        city state zipcode
    #> 1     Lowry    CO   80230
    #> 2 Charlotte    FL   33949
    #>
    #>        city state zipcode
    #> 1     Tampa    FL   33602
    #> 2   Seattle    WA   98104
    #> 3  Hartford    CT   06161
    #> 4    Denver    CO   80294
    #> 5     Lowry    CO   80230
    #> 6 Charlotte    FL   33949
    

    合并两个dataFrame

    # Join two data frames (x and y) on the columns bp, bmi and type.
    # The key columns carry the same names in both inputs, so a single `by`
    # vector is used for both sides.
    # The MASS:: prefix can be dropped once the MASS package has been attached.
    join_keys <- c("bp", "bmi", "type")
    merged.Pima <- merge(x = MASS::Pima.te, y = MASS::Pima.tr, by = join_keys)
    
    # Row and column counts of each input, then of the merged result
    nrow(MASS::Pima.te)
    ncol(MASS::Pima.te)
    nrow(MASS::Pima.tr)
    ncol(MASS::Pima.tr)
    nrow(merged.Pima)
    ncol(merged.Pima)
    [1] 332
    [1] 8
    [1] 200
    [1] 8
    [1] 10
    [1] 13 # 8 + 8 - 3 = 13
    

    分片

    MASS::ships[1:5,c("type","year")]
    
       type year
    1    A   60
    2    A   60
    3    A   65
    4    A   65
    5    A   70
    # head(ships)和tail(ships)查看前后6条
    

    函数

    内置函数举例

    seq(5, 9, by = 0.4) #默认by为1
    mean(1:5)
    sum(1:5)
    
    [1] 5.0 5.4 5.8 6.2 6.6 7.0 7.4 7.8 8.2 8.6 9.0
    [1] 3
    [1] 15
    

    自定义函数

    # Demonstrates default arguments: `a` falls back to 2 when omitted, while
    # `b` has no default and must be supplied by the caller.
    # Prints the sum a + b as a side effect and returns the product a * b.
    myfunc <- function(a = 2, b) {
        total <- a + b
        print(total)
        # A function hands back a single object; here it is the product.
        a * b
    }
    product = myfunc(b = 3)
    print(product)
    
    [1] 5
    [1] 6
    

    字符串

    在R语言中,不区分单引号和双引号,但要求成对出现

    # 字符串拼接
    a = "Hello"
    b = "How"
    c = "are you?"
    # R语言非数值对象无法直接运算,字符串拼接要用paste
    #sep(separator,分隔符)参数默认为空格
    paste(a,b,c, sep = "-")
    
    [1] "Hello-How-are you?"
    
    #格式化输出
    format(23.123456, digits = 5) #digits表示显示5位有效数字
    format(3.14159, nsmall = 8) #nsmall表示小数点后至少8位
    format(23.123456, scientific = TRUE) #科学计数法
    format(23.123456, width = 10, justify = "right") #位宽为10,右对齐
    
    [1] "23.123"
    [1] "3.14159000"
    [1] "2.312346e+01"
    [1] "  23.12346"
    
    #统计字数
    nchar("hello world")
    [1] 11
    
    #大小写
    toupper("Hello World!")
    tolower("Hello World!")
    
    #字符串截取
    substring("Extract", 5, 7)
    [1] "act"
    

    向量

    向量是R语言中最基本的原子性数据对象,内部数据类型相同。即使只有一个值,也当做长度为1的向量

    #索引
    t = c("Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat")
    t[c(1, 3, 5)]
    
    [1] "Sun"   "Tue"   "Thurs"
    
    #排序
    t = c("Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat")
    sort(t, decreasing = TRUE) #默认是递增
    

    列表

    列表是比向量还要高级的数据对象,可以包含不同类型的元素,如数字、字符串、向量、其他列表等,使用list函数创建

    创建

    #创建列表对象
    #本质上是6个对象
    list("Red", "Green", 21:25, TRUE, 51.23, 119.1)
    
    [[1]]
    [1] "Red"
    
    [[2]]
    [1] "Green"
    
    [[3]]
    [1] 21 22 23 24 25
    
    [[4]]
    [1] TRUE
    
    [[5]]
    [1] 51.23
    
    [[6]]
    [1] 119.1
    

    索引

    # 可以命名列元素
    list_data = list(
        c("Jan","Feb","Mar"),
        matrix(1:6, nrow = 2),
        list("green", 12.3)
    )
    # 用names函数进行命名
    names(list_data) = c("item1", "item2", "item3")
    print(list_data)
    
    # 索引访问
    print(list_data[1])
    # 符号(命名访问)访问
    print(list_data[["item3"]][1])
    

    添加&删除

    # 添加一列
    list_data[["new"]] = c("A","B","C")
    print(list_data)
    # 删除一列只要把那一列设为NULL即可
    

    合并&把list转化成vector

    #合并
    l1 = list(1,2,3)
    l2 = list("Sun","Mon","Tue")
    c(l1,l2)
    [[1]]
    [1] 1
    
    [[2]]
    [1] 2
    
    [[3]]
    [1] 3
    
    [[4]]
    [1] "Sun"
    
    [[5]]
    [1] "Mon"
    
    [[6]]
    [1] "Tue"
    
    #list转为向量
    r = unlist(l1)
    r
    [1] 1 2 3
    

    矩阵

    矩阵是其中元素以二维矩形布局布置的R对象。 它们包含相同原子类型的元素。

    创建

    # 用向量生成矩阵,4行,默认byrow=FALSE,即数据按列生成
    # 可以添加dimnames给维度命名
    M = matrix(1:12, nrow = 4, byrow = FALSE,
               dimnames = list(
                   c("row1","row2","row3","row4"),
                   c("col1","col2","col3")
               ))
    M
    
         col1 col2 col3
    row1    1    5    9
    row2    2    6   10
    row3    3    7   11
    row4    4    8   12
    

    索引

    M[c("row1","row3"),1]
    
    # 行列索引,跟MATLAB一样,可以使用符号索引或者数字索引
    row1 row3 
       1    3 
    

    基本运算

    m1 = matrix(c(3, 9, -1, 4, 2, -6),nrow = 2)
    m2 = matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)
    
    print(m1)
    print(m2)
    # 这里的直接+-*./都是一一对应的运算
    print(m1*m2)
    # %*%是矩阵乘法,t()函数是转置
    print(m1%*%t(m2))
    
         [,1] [,2] [,3]
    [1,]    3   -1    2
    [2,]    9    4   -6
    
         [,1] [,2] [,3]
    [1,]    5    0    3
    [2,]    2    9    4
    
         [,1] [,2] [,3]
    [1,]   15    0    6
    [2,]   18   36  -24
    
         [,1] [,2]
    [1,]   21    5
    [2,]   27   30
    

    数组

    数组可以储存2维以上的R数据对象

    创建

    # Source data: 3 + 6 = 9 values in total
    v1 <- c(5, 9, 3)
    v2 <- 10:15
    # Labels for the three dimensions
    column.names <- paste0("COL", 1:3)
    row.names <- paste0("ROW", 1:3)
    matrix.names <- paste0("Matrix", 1:2)
    # A 3 x 3 x 2 array with named dimensions (rows, columns, matrices)
    result <- array(
        data = c(v1, v2),
        dim = c(3, 3, 2),
        dimnames = list(row.names, column.names, matrix.names)
    )
    # Only 9 values fill 18 cells, so the data is recycled:
    # both 3 x 3 slices hold the same numbers.
    print(result)
    
    , , Matrix1
    
         COL1 COL2 COL3
    ROW1    5   10   13
    ROW2    9   11   14
    ROW3    3   12   15
    
    , , Matrix2
    
         COL1 COL2 COL3
    ROW1    5   10   13
    ROW2    9   11   14
    ROW3    3   12   15
    

    索引

    result[c("ROW1","ROW2"),c(2,3),"Matrix2"]
    
         COL2 COL3
    ROW1   10   13
    ROW2   11   14
    

    apply

    v3 = array(c(v1,v2),dim = c(2,3))
    print(v3)
    # apply传入的是数组,第二个参数是维度(此例中为列)即运算方向
    # 第三个参数为函数,也可以是自己写的
    apply(v3,c(2),sum)
    
         [,1] [,2] [,3]
    [1,]    5    3   11
    [2,]    9   10   12
    [1] 14 13 23
    

    因子

    因子是用于对数据进行分类并将其存储为级别的数据对象

    是一种离散的数据类型

    data = c("East","West","East","North","North",
             "East","West","West","West","East","North")
    f = factor(data)
    f
    
     [1] East  West  East  North North East  West  West  West  East  North
    Levels: East North West
    

    数据帧

    统计中最重要的数据集合类型,类似于表格的形式

    创建

    # Build a five-row employee data frame with four columns:
    # id (integer), name (character), salary (numeric), start_date (Date).
    # Every argument must be separated by a comma, including the one after the
    # as.Date(...) call; leaving it out is a syntax error.
    data <- data.frame(
        id = 1:5,
        name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
        salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
        start_date = as.Date(c("2012-01-01", "2013-09-23",
                               "2014-11-15", "2014-05-11",
                               "2015-03-27")),
        # Keep `name` as character instead of converting it to a factor
        stringsAsFactors = FALSE
    )
    data
    
      id     name salary start_date
    1  1     Rick 623.30 2012-01-01
    2  2      Dan 515.20 2013-09-23
    3  3 Michelle 611.00 2014-11-15
    4  4     Ryan 729.00 2014-05-11
    5  5     Gary 843.25 2015-03-27
    
    str(data) #structure查看数据帧结构
    'data.frame':	5 obs. of  4 variables:
     $ id        : int  1 2 3 4 5
     $ name      : chr  "Rick" "Dan" "Michelle" "Ryan" ...
     $ salary    : num  623 515 611 729 843
     $ start_date: Date, format: "2012-01-01" "2013-09-23" ...
    

    查看摘要信息

    summary(data)
    
           id        name               salary        start_date        
     Min.   :1   Length:5           Min.   :515.2   Min.   :2012-01-01  
     1st Qu.:2   Class :character   1st Qu.:611.0   1st Qu.:2013-09-23  
     Median :3   Mode  :character   Median :623.3   Median :2014-05-11  
     Mean   :3                      Mean   :664.4   Mean   :2014-01-14  
     3rd Qu.:4                      3rd Qu.:729.0   3rd Qu.:2014-11-15  
     Max.   :5                      Max.   :843.2   Max.   :2015-03-27  
    

    索引

    data[1:3,c("name","salary")]
    
          name salary
    1     Rick  623.3
    2      Dan  515.2
    3 Michelle  611.0
    

    扩展数据帧

    # 添加列
    data["dept"] = c("IT","Operations","IT","HR","Finance")
    data
    
      id     name salary start_date       dept
    1  1     Rick 623.30 2012-01-01         IT
    2  2      Dan 515.20 2013-09-23 Operations
    3  3 Michelle 611.00 2014-11-15         IT
    4  4     Ryan 729.00 2014-05-11         HR
    5  5     Gary 843.25 2015-03-27    Finance
    
    # Adding rows: build a second data frame with the same columns as `data`
    # (id, name, salary, start_date, dept), then stack it underneath.
    emp.newdata <- data.frame(
        id = 6:8,
        name = c("Rasmi", "Pranab", "Tusar"),
        salary = c(578.0, 722.5, 632.8),
        start_date = as.Date(c("2013-05-21", "2013-07-30", "2014-06-17")),
        # Spelled "Finance" so it matches the department value used in `data`
        dept = c("IT", "Operations", "Finance"),
        stringsAsFactors = FALSE
    )
    
    # Show both inputs, then append the new rows with rbind()
    data
    emp.newdata
    rbind(data, emp.newdata)
    
    # Output of the final rbind() call:
    #>   id     name salary start_date       dept
    #> 1  1     Rick 623.30 2012-01-01         IT
    #> 2  2      Dan 515.20 2013-09-23 Operations
    #> 3  3 Michelle 611.00 2014-11-15         IT
    #> 4  4     Ryan 729.00 2014-05-11         HR
    #> 5  5     Gary 843.25 2015-03-27    Finance
    #> 6  6    Rasmi 578.00 2013-05-21         IT
    #> 7  7   Pranab 722.50 2013-07-30 Operations
    #> 8  8    Tusar 632.80 2014-06-17    Finance
    
  • 相关阅读:
    学习进度条05
    构建之法阅读笔记03
    子数组和最大值算法
    学习进度条04
    学习进度条03
    定制小学四则运算
    单元测试示例
    构建之法阅读笔记02
    学习进度条02
    decimal扩展方法(转换为字符串,去掉末尾的0)
  • 原文地址:https://www.cnblogs.com/fanghao/p/7617534.html
Copyright © 2011-2022 走看看