zoukankan      html  css  js  c++  java
  • R语言学习笔记之八

    摘要: 仅用于记录R语言学习过程:

    内容提要:

    字符串的处理、正则表达式、stringi包和stringr包

    正文:

      字符串的处理

    n  导读:

    nchar(x)函数:返回每个字符串中字符的个数:

    > x <- c('fudan','jiaoda')

    > nchar(x)

    [1] 5 6   #返回每个字符串的字符个数

    length()函数:返回元素的个数

    > length(x)

    [1] 2

    u  toupper()函数:小写转大写

    > toupper('abc')

    [1] "ABC"

    u  tolower()函数:大写转小写

    > tolower('ABKC')

    [1] "abkc"

    u  paste()函数:(sep参数和collapse参数)粘贴功能

    > stringa <- LETTERS[1:5]

    > STRINGB <- 1:5

    > paste(stringa,STRINGB)

    [1] "A 1" "B 2" "C 3" "D 4" "E 5"

    > paste(stringa,STRINGB,sep = '-')  #sep分隔符(注意参数名是sep,写成seq会被当作又一个待拼接的字符串)

    [1] "A-1" "B-2" "C-3" "D-4" "E-5"

    > paste(stringa,STRINGB,collapse = '-')   # collapse分隔符

    [1] "A 1-B 2-C 3-D 4-E 5"

    u  paste0()函数:去掉了A和1之间的空格(相当于sep = '');paste0()没有sep参数,下面的seq = '-'只是被当作又一个待拼接的字符串;collapse的用法与paste()相同

    > paste0(stringa,STRINGB)

    [1] "A1" "B2" "C3" "D4" "E5"

    > paste0(stringa,STRINGB,seq = '-')

    [1] "A1-" "B2-" "C3-" "D4-" "E5-"

    > paste0(stringa,STRINGB,collapse = '-')

    [1] "A1-B2-C3-D4-E5"

    u  strsplit()函数:字符串拆分功能

    > stringC <- paste(stringa, STRINGB, sep = '/')

    > strsplit(stringC,split = '/')   #根据/ 进行拆分

    [[1]]

    [1] "A" "1"

    [[2]]

    [1] "B" "2"

    [[3]]

    [1] "C" "3"

    [[4]]

    [1] "D" "4"

    [[5]]

    [1] "E" "5"

    u  substr()函数:字符串截取函数;同时具有赋值功能

    > stringd <- c('python','java','ruby','php','linux')

    > sub_str <- substr(stringd,start = 2,stop = 4) #截取2-4位的字符,如果不够,就有几个返回几个

    > sub_str

    [1] "yth" "ava" "uby" "hp"  "inu"

    #实现赋值的功能

    > substr(stringd,start = 2,stop = 4) <- 'aaa'

    > stringd

    [1] "paaaon" "jaaa"   "raaa"   "paa"    "laaax"

    grep()函数:用于提取字符串中指定的字符,可返回位置,也可返回具体的值。

    > seq_names <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',

    +                'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',

    +                'NA_USA03_C2_S2007','NA USA04 A3 2004',

    +                'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')

    > fra_seq <- grep(pattern = 'FRA|fra',x =seq_names)

    > fra_seq

    [1]  1  5 11

    > seq_names[fra_seq]

    [1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" 

    [3] "eu_fra_a2_s98"   

    > fra_seq <- grep(pattern = 'FRA|fra',x =seq_names,value = TRUE)

    > fra_seq

    [1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" 

    [3] "eu_fra_a2_s98"

    u  grepl()函数:返回的是逻辑值。没有value参数。ignore.case参数表示是否忽略大小写,TRUE为忽略。

    > grepl(pattern = 'FRA|fra',x =seq_names)

     [1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

    [10] FALSE  TRUE FALSE

    > grepl(pattern = 'fra',x = seq_names,ignore.case = TRUE)  #或:用ignore.case = TRUE忽略大小写,结果同上(grepl()没有value参数,传入value会报错)

    u  正则表达式:提取元素

    > spe_seq <- seq_names[!grepl(pattern = '[s|S][0-9]{2,4}\\b',seq_names)]  #匹配右边界

    > spe_seq

    [1] "AF_COM12_80_20014" "AS_CHN11_C3_2004"

    [3] "NAUSA02E02005"     "AS_CHN12_N0_05"  

    [5] "NA USA04 A3 2004"  "EU_UK01_A0_2009" 

    [7] "SA/BRA08/00/1996"

    找到以ab开头的

    my_string <- c('above','about','abrotion','cab')

    grep(pattern = '\\bab',x = my_string,value = T) #匹配左边界

    u  gsub()函数:会把找到的所有匹配字符都替换掉(例如去掉$之后,再用as.numeric()把字符串变成数值)

    money <- c('$1888','$2888','$3888')

    money <- gsub('\\$',replacement = '',money)   #需要把结果重新赋值给money

    as.numeric(money)

    u  sub()函数:只会替换掉找到的第一个字符

    > money <- c('$1888 $2888 $3888')

    > sub('\\$',replacement = '',money)

    [1] "1888 $2888 $3888"

    > gsub('\\$',replacement = '',money)

    [1] "1888 2888 3888"

    regexpr()函数

    > test_string <- c('happy','apple','application','apolitic')

    > regexpr('pp',test_string)

    [1]  3  2  2 -1   #返回pp出现的位置,-1表示没有

    attr(,"match.length")

    [1]  2  2  2 -1

    attr(,"useBytes")

    [1] TRUE

    > test_string[regexpr('pp',test_string)>0]  #提取含pp的字符串

    [1] "happy"       "apple"       "application"

    gregexpr()函数:与regexpr()函数类似,但以列表形式返回每个字符串中所有匹配的位置(regexpr()只返回第一个匹配的位置)

    regexec()函数:与regexpr()函数类似,返回列表,除第一个匹配的位置外,还给出正则表达式中括号子表达式的匹配位置

    u  agrep()函数:可以匹配英美单词不同写法

    > string1 <- c('I need a favour','my favorite sport','you made an error')

    > agrep('favor',string1)

    [1] 1 2

      正则表达式

    n  原义表达式:只代表自己

    > mystring1 <- c('apple','orange')

    > grep('p',mystring1)

    [1] 1

    n  转义表达式:代表其他含义

    > # .所有字符

    > mystring2 <- c('shudo','.dfs','-dsfd')

    > grep('.',mystring2)

    [1] 1 2 3

    >

    > mystring3 <- c('9anv','fss7','1000','ss7')

    > grep('[7-9]',mystring3)

    [1] 1 2 4

    >

    > # ^a,匹配a开头的

    > mystring4 <- c('apple','application','abb')

    > grep('^ap',mystring4)

    [1] 1 2

    > # [^]表示不是0-1

    > mystring5 <- c('9anv','fss7','1000','ss7')

    > grep('[^0-1]',mystring5)

    [1] 1 2 4

    > #{}代表重复的次数,{1,}表示至少重复1次,{2,3}表示重复2到3次

    > mystring6 <- c('1220','2289','2228','10002')

    > grep('2{2,3}',mystring6)

    [1] 1 2 3

    > # + 表示其最靠近的字符重复多次,()表示把括号内的内容看成一个整体

    > mystring7 <- c('food','foot','foul','fans')

    > grep ('fo+',mystring7)

    [1] 1 2 3

    > grep('fo{1,}',mystring7)

    [1] 1 2 3

    > grep('(fo){1,}',mystring7)

    [1] 1 2 3

    >

    > #* 匹配0次或以上

    > #| 管道符  或,满足其中之一就可被返回

    >

    > mystring8 <- c('kobe','messi','neymar')

    > grep('^k|^m',mystring8)

    [1] 1 2

    > # $表示匹配字符串末尾

    > mystring9 <- c('active','positive','negative','iention')

    > grep('ive$',mystring9)  #匹配字符串末尾

    [1] 1 2 3

    > grep('ive\\b',mystring9)

    [1] 1 2 3

    n  保义字符:

    #

    mystring10 <- c('ac^bb','^df')

    grep('\\^',mystring10)

    [1] 1 2

    \d = [0-9]  匹配数字0-9

    \D = [^0-9] 匹配非数字

    \s   匹配空白字符,空格,制表符,换行符

    \S  匹配非空白字符

    \w  匹配字母、数字和下划线   =[a-zA-Z0-9_]

    \W  匹配字母、数字和下划线以外的字符  =[^a-zA-Z0-9_]

    \b   匹配字符的边界

    \B   匹配字符的非边界

    \<   匹配单词的开头(词首边界)  如‘ string’中string的开头

    \>   匹配单词的结尾(词尾边界)  如‘string ’中string的结尾

    示例:

    > mystring11 <- c('2013','abcd','13sg')

    > grep('\\d',mystring11)

    [1] 1 3

    > grep('\\D',mystring11)

    [1] 2 3

    > mystring12 <- c('foo t','    able','   moth  er','happy')

    > grep('\\s',mystring12)

    [1] 1 2 3

    > grep('\\S',mystring12)

    [1] 1 2 3 4

    > mystring13 <- c('theory','the republic','they')

    > grep('\\<the\\>',mystring13)   #以the作为边界的字符串,the为一个单独的单词

    [1] 2

      stringr与stringi包

    n  stringi包更加依赖正则表达式

    stringr中的常用函数

    str_c()函数:类似paste()函数

    > str_c('a','b')

    [1] "ab"

    > str_c('a','b',sep = '-')

    [1] "a-b"

    str_length()函数:用于字符串计数

    > str_length('abdc')

    [1] 4

    str_sub()函数:用于字符串提取,类似substr()函数,有三个参数:数据名,开始位置,结束位置(可以接受向量),可以接受赋值

    > yxf <- 'yi xue fang'

    > str_sub(yxf,c(1,4,8),c(2,6,11))

    [1] "yi"   "xue"  "fang"

    >

    > str_sub(yxf,1,1) <- 'Y'     #可以接受赋值

    > yxf

    [1] "Yi xue fang"

    str_dup()函数:用于复制

    > fruit <- c('apple','pear','banana')

    > str_dup(fruit,2)

    [1] "appleapple"   "pearpear"     "bananabanana"

    > fruit <- c('apple','pear','banana')

    > str_dup(fruit,2:4)

    [1] "appleapple"               "pearpearpear"           

    [3] "bananabananabananabanana"

    str_trim()函数:去掉字符串首尾的空格,也可以设置成right和left,分别去掉右边和左边的空格

    > string <- ' Eternal love for YanQ '

    > str_trim(string,side = 'both')

    [1] "Eternal love for YanQ"

    str_extract()函数:用于提取

    phones <- c('219 733 8965','329-293-8753','banana','595 794 7569',

                '387 287 6718','apple','233.398.9187','482 952 3315',

                '239 923 8115 and 842 566 4692','Work: 579-499-7527','$1000',

                'Home:543.355.3679')

    str_extract(phones,'([0-9]{3})[- .]([0-9]{3})[- .]([0-9]{4})\\b')

    [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718"

     [6] NA             "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"

    [11] NA             "543.355.3679"

    或写成:str_extract(phones,'([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})')

    str_replace()函数:用于字符串替换,只替换找到的第一个

    > fruits <- c('one apple','two pears','three bananas')

    > str_replace(fruits,'[aeiou]','-')  #[被替换的对象] ,‘拟替换成的对象’

    [1] "-ne apple"     "tw- pears"     "thr-e bananas"

    str_replace_all()函数:替换所有

    > fruits <- c('one apple','two pears','three bananas')

    > str_replace_all(fruits,'[aeiou]','-')

    [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

    n  stringi中的常用函数

    u  stri_join()函数:

    > stri_join(1:7,letters[1:7],sep = '-')

    [1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g"

    > stri_join(1:7,letters[1:7],collapse = '-')

    [1] "1a-2b-3c-4d-5e-6f-7g"

    u  stri_cmp_eq() & stri_cmp_neq()函数:

    > stri_cmp_eq('ab','ab')

    [1] TRUE

    > stri_cmp_neq('ab','ab')

    [1] FALSE

    u  stri_cmp_lt() & stri_cmp_gt()函数:用于字符串比大小,lt 前者小于后者,gt前者大于后者

    > stri_cmp_lt('121','221')

    [1] TRUE

    > stri_cmp_lt('a121','b221')

    [1] TRUE

    > stri_cmp_gt('121','221')

    [1] FALSE

    u  stri_count()函数:用于计数

    > language <- c('python','R','PHP','Ruby','Java',

    +               'JavaScript','C','Oracle','C++','C#','Spark',

    +               'Go','Room','Good','Pathon','ScriptJava','R2R','C+','C*')

    > stri_count(language,fixed = 'R')

     [1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0

    > stri_count(language,regex = '^J')

          [1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0

    u  stri_count_boundaries()函数:字符串元素个数的计数

    > test <- 'The\u00a0above-mentioned     features are very useful.

    + Warm thanks to their developers. Tomorrow is a ,new$% day###'

    > stri_count_boundaries(test,type = 'word')

    [1] 45

    > stri_count_boundaries(test,type = 'sentence')

    [1] 3

    > stri_count_boundaries(test,type = 'character')

    [1] 110

    u  stri_duplicated()函数:识别重复的字符串

    > stri_duplicated(c('a','b','a',NA,'a',NA))

    [1] FALSE FALSE  TRUE FALSE  TRUE  TRUE

    > stri_duplicated(c('a','b','a',NA,'a',NA),fromLast = T)  #从最后开始看

    [1]  TRUE FALSE  TRUE  TRUE FALSE FALSE

    > stri_duplicated_any(c('a','b','a',NA,'a',NA))

    [1] 3

    u  stri_dup()函数:重复

    > stri_dup(c('abc','parst'),c(4,2))

    [1] "abcabcabcabc" "parstparst" 

    u  stri_detect_fixed()函数:发现匹配函数

    > stri_detect_fixed(c('stringi R','REXAMINE','123'),c('i','R','0'))

    [1]  TRUE  TRUE FALSE

    u  stri_detect_regex()函数:

    > stri_detect_regex(c('above','abort','about','abnormal','abandon'),'^ab')

    [1] TRUE TRUE TRUE TRUE TRUE

    > stri_detect_regex(c('above','abort','about','abnormal','abandon'),'t\\b')

    [1] FALSE  TRUE  TRUE FALSE FALSE

    > stri_detect_regex(c('ABOUT','abort','AboVE'),'^ab',case_insensitive = TRUE)  #忽略大小写

    [1] TRUE TRUE TRUE

    u  stri_startswith_fixed()函数:

    > stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a')

    [1]  TRUE  TRUE FALSE  TRUE FALSE

    >

    > stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a1')

    [1]  TRUE FALSE FALSE FALSE FALSE

    >

    > stri_startswith_fixed(c('abaDc','aabadc','ababa'),'ba',from = 2)  #从哪个字符开始匹配,从第二个字符开始匹配

    [1]  TRUE FALSE  TRUE

    u  stri_endswith_fixed()函数:

    > stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba')

    [1] FALSE FALSE  TRUE

    > stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba', to = 3)  #匹配到第几位,匹配到第三位

    [1]  TRUE FALSE  TRUE

    u  stri_extract_all()函数:提取

    > tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008',

    +                'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05',

    +                'NA_USA03_C2_S2007','NA USA04 A3 2004',

    +                'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996')

    >

    > # Generate a strings composed by several sequence names.

    >

    > stri_extract_all(tEmp_text,regex = '[0-9]{2,4}\\b')

    [[1]]

    [1] "2008"

    [[2]]

    [1] "0014"

    [[3]]

    [1] "2008"

    [[4]]

    [1] "2004"

    [[5]]

    [1] "2007"

    [[6]]

    [1] "2005"

    [[7]]

    [1] "05"

    [[8]]

    [1] "2007"

    [[9]]

    [1] "04"   "2004"

    [[10]]

    [1] "2009"

    [[11]]

    [1] "98"

    [[12]]

    [1] "08"   "00"   "1996"

    u  stri_extract_all_fixed()函数:

    > stri_extract_all_fixed('abaBAba','Aba',case_insensitive = T, overlap =T)

    [[1]]   #可交叉

    [1] "aba" "aBA" "Aba"

    u  stri_extract_all_boundaries()函数:提取字符串的边界

    > stri_extract_all_boundaries('stringi: THE string processing package 123.48...')

    [[1]]

    [1] "stringi: "   "THE "        "string "     "processing " "package "  

    [6] "123.48..."   #但是带出来单词后面的空格

    u  stri_extract_all_words()函数:提取字符串的边界,去掉空格

    > stri_extract_all_words('stringi: THE string processing package 123.48...')

    [[1]]

    [1] "stringi"    "THE"        "string"     "processing" "package"    "123.48"

    u  stri_isempty()函数:字符串内是否为空

    > stri_isempty(c(',','','abc','123','\u0105\u0104',' '))

    [1] FALSE  TRUE FALSE FALSE FALSE FALSE

    u  stri_locate_all()函数:定位函数

    > stri_locate_all('I want to learn R to promote my statistical skills',fixed = 'to')

    [[1]]

         start end

    [1,]     8   9

    [2,]    19  20  #返回的是位置,起始和结束,可用于提取

  • 相关阅读:
    leetcode 279. Perfect Squares
    leetcode 546. Remove Boxes
    leetcode 312. Burst Balloons
    leetcode 160. Intersection of Two Linked Lists
    leetcode 55. Jump Game
    剑指offer 滑动窗口的最大值
    剑指offer 剪绳子
    剑指offer 字符流中第一个不重复的字符
    leetcode 673. Number of Longest Increasing Subsequence
    leetcode 75. Sort Colors (荷兰三色旗问题)
  • 原文地址:https://www.cnblogs.com/ppjs/p/9439159.html
Copyright © 2011-2022 走看看