R Lesson #7 - Searching characters
This lesson describes how to search for a pattern inside a character vector. R has several functions that accept search patterns known as regular expressions. The different search functions are summarized below.
> # FUNCTION OUTPUT_TYPE DESCRIPTION
> # strsplit list explode string at matches
> # gsub character replace matches
> # grep integer return index of matches
> # grepl logical TRUE if matched
> # gregexpr list position/length of match(es)
>
> x <- "The quick brown fox jumps over the lazy dog."
> y <- strsplit(x, " ")
> y
[[1]]
[1] "The" "quick" "brown" "fox" "jumps" "over"
[7] "the" "lazy" "dog."
> y <- y[[1]]
> grep("u", y) # find "u"
[1] 2 5
> gsub("o", "OOOO", y) # replace "o" with "OOOO"
[1] "The" "quick" "brOOOOwn" "fOOOOx"
[5] "jumps" "OOOOver" "the" "lazy"
[9] "dOOOOg."
> grepl("u", y) # find "u"
[1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
[9] FALSE
> which(grepl("u", y)) # equivalent to grep
[1] 2 5
> strsplit(y, "u") # returns a list
[[1]]
[1] "The"
[[2]]
[1] "q" "ick"
[[3]]
[1] "brown"
[[4]]
[1] "fox"
[[5]]
[1] "j" "mps"
[[6]]
[1] "over"
[[7]]
[1] "the"
[[8]]
[1] "lazy"
[[9]]
[1] "dog."
>
> # to figure out where the match occurred,
> # we can use gregexpr, which returns a
> # somewhat complicated list structure
> g <- gregexpr("u", y) # returns a list
> str(g)
List of 9
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] 2
..- attr(*, "match.length")= int 1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] 2
..- attr(*, "match.length")= int 1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
$ : atomic [1:1] -1
..- attr(*, "match.length")= int -1
..- attr(*, "useBytes")= logi TRUE
> g # position of matches or -1 (no match)
[[1]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 2
attr(,"match.length")
[1] 1
attr(,"useBytes")
[1] TRUE
[[3]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[5]]
[1] 2
attr(,"match.length")
[1] 1
attr(,"useBytes")
[1] TRUE
[[6]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[7]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[8]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[9]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
> attributes(g[[2]])$match.length
[1] 1
> regmatches(y, g) # get back the matches
[[1]]
character(0)
[[2]]
[1] "u"
[[3]]
character(0)
[[4]]
character(0)
[[5]]
[1] "u"
[[6]]
character(0)
[[7]]
character(0)
[[8]]
character(0)
[[9]]
character(0)
> test1 <- c("abc", "Abc", "ABC", "ABCD")
> grep("abc", test1) # with the default arguments
[1] 1
> grep("abc", test1, value=TRUE)
[1] "abc"
> # with only letters the behavior is as expected,
> # special characters will be interpreted as
> # regular expressions (unless fixed=TRUE)
> grep("abc", test1, fixed=TRUE) # search exactly (fast)
[1] 1
> grep("abc", test1, ignore.case=TRUE)
[1] 1 2 3 4
> grep("abc", test1, invert=TRUE)
[1] 2 3 4
> ?regex # help page describing regular expression syntax
> # these can get very advanced, for example:
> # "/^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/"
> # can be used to match most common email addresses
> # (in an R string, drop the "/" delimiters and double each backslash)
>
> # Repetition quantifiers:
> # "." matches any single character
> # "+" suffix means one or more times
> # "*" suffix means zero or more times
> # "?" suffix means zero or one time
> test2 <- c("m+n", "man", "moon", "mn", "mooon", "mon")
> # zero or more 'o' characters between m and n
> grep("mo*n", test2, value=TRUE)
[1] "moon" "mn" "mooon" "mon"
> # one or more 'o' characters between m and n
> grep("mo+n", test2, value=TRUE)
[1] "moon" "mooon" "mon"
> # anything between m and n once
> grep("m.n", test2, value=TRUE)
[1] "m+n" "man" "mon"
> # zero or more characters between m and n
> grep("m.*n", test2, value=TRUE)
[1] "m+n" "man" "moon" "mn" "mooon" "mon"
> # zero or one 'o' characters between m and n
> grep("mo?n", test2, value=TRUE)
[1] "mn" "mon"
>
> # special cases:
> # search for a special character (+) as is
> grep("m+n", test2, value=TRUE)
[1] "mn"
> grep("m\\+n", test2, value=TRUE)
[1] "m+n"
> grep("m+n", test2, fixed=TRUE, value=TRUE)
[1] "m+n"
> # 2, 3 or 4 'o' characters between m and n
> grep("mo{2,4}n", test2, value=TRUE)
[1] "moon" "mooon"
>
> # Anchoring: beginning and ending
> # "^" means must start at the beginning
> # "$" means must finish at the end
> test3 <- c("abc", "abc d", "abcd", "fabc")
> grep("abc", test3, value=TRUE)
[1] "abc" "abc d" "abcd" "fabc"
> grep("^abc", test3, value=TRUE)
[1] "abc" "abc d" "abcd"
> grep("abc$", test3, value=TRUE)
[1] "abc" "fabc"
> grep("^abc$", test3, value=TRUE)
[1] "abc"
>
> # Alternation constructs - either/or
> test4 <- c("men", "man", "women", "moon", "maan", "mn")
> # the pipe "|" means OR
> grep("e|a", test4, value=TRUE)
[1] "men" "man" "women" "maan"
> # combining multiple special characters together
> # use parentheses to group part of the expression
> grep("m(e|a)+n", test4, value=TRUE)
[1] "men" "man" "women" "maan"
> # parentheses can also group multiple characters
> grep("(en|an)", test4, value=TRUE)
[1] "men" "man" "women" "maan"
>
> # Character classes - matching in a group
> test5 <- c("moon", "maan", "mn", "myn", "m_n", "mDn")
> # square brackets "[]" define a "character class":
> # only vowels located between m and n
> grep("m[aeiou]*n", test5, value=TRUE)
[1] "moon" "maan" "mn"
> # the "^" in a character class means NOT
> # anything but a vowel located between m and n
> grep("m[^aeiou]*n", test5, value=TRUE)
[1] "mn" "myn" "m_n" "mDn"
> # the "-" in a character class defines a range
> # only lowercase consonants located between m and n
> grep("m[b-df-hj-np-tv-z]*n", test5, value=TRUE)
[1] "mn" "myn"
> # Built-in classes: \d, \w
> sentence <- "pi is approximately equal to 3.14. Also 22/7."
> # match numbers without escaping "."
> nlist_matches1 <- gregexpr("\\d+.\\d+", sentence)
> nlist1 <- regmatches(sentence, nlist_matches1)
> nlist1
[[1]]
[1] "3.14" "22/7"
> # match numbers while escaping "."
> nlist_matches2 <- gregexpr("\\d+\\.\\d+", sentence)
> nlist2 <- regmatches(sentence, nlist_matches2)
> nlist2
[[1]]
[1] "3.14"
> # list of all words
> wlist_matches1 <- gregexpr("\\w+", sentence)
> wlist1 <- regmatches(sentence, wlist_matches1)
> wlist1
[[1]]
[1] "pi" "is" "approximately"
[4] "equal" "to" "3"
[7] "14" "Also" "22"
[10] "7"
> # words with only letters
> wlist_matches2 <- gregexpr("[[:alpha:]]+", sentence)
> wlist2 <- regmatches(sentence, wlist_matches2)
> wlist2
[[1]]
[1] "pi" "is" "approximately"
[4] "equal" "to" "Also"
> # words starting with an uppercase letter
> wlist_matches3 <- gregexpr("[[:upper:]][[:alpha:]]+",
+ sentence)
> wlist3 <- regmatches(sentence, wlist_matches3)
> wlist3
[[1]]
[1] "Also"
>
> # Word boundaries
> # words starting with an "a"
> wlist_matches4 <- gregexpr("\\<(a|A)[[:alpha:]]+",
+ sentence)
> wlist4 <- regmatches(sentence, wlist_matches4)
> wlist4
[[1]]
[1] "approximately" "Also"
> # words ending in a vowel
> wlist_matches5 <- gregexpr("[[:alpha:]]+[aeiou]\\>",
+ sentence)
> wlist5 <- regmatches(sentence, wlist_matches5)
> wlist5
[[1]]
[1] "pi" "to" "Also"