DECIPHER - R Lesson #7 - Searching characters

R Lesson #7 - Searching characters

This lesson describes how to search for a pattern inside of a character vector. R has several functions that accept search patterns known as regular expressions. The different search functions are detailed below.

Hide output

# FUNCTION  OUTPUT_TYPE  DESCRIPTION
> # strsplit  list         explode string at matches
> # gsub      character    replace matches
> # grep      integer      return index of matches
> # grepl     logical      TRUE if matched
> # gregexpr  list         position/length of match(es)
> 
> x <- "The quick brown fox jumps over the lazy dog."
> y <- strsplit(x, " ")
> y
[[1]]
[1] "The"   "quick" "brown" "fox"   "jumps" "over" 
[7] "the"   "lazy"  "dog." 


> y <- y[[1]]
> grep("u", y) # find "u"
[1] 2 5
> gsub("o", "OOOO", y) # replace "o" with "OOOO"
[1] "The"      "quick"    "brOOOOwn" "fOOOOx"  
[5] "jumps"    "OOOOver"  "the"      "lazy"    
[9] "dOOOOg." 
> grepl("u", y) # find "u"
[1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
[9] FALSE
> which(grepl("u", y)) # equivalent to grep
[1] 2 5
> strsplit(y, "u") # returns a list
[[1]]
[1] "The"


[[2]]
[1] "q"   "ick"


[[3]]
[1] "brown"


[[4]]
[1] "fox"


[[5]]
[1] "j"   "mps"


[[6]]
[1] "over"


[[7]]
[1] "the"


[[8]]
[1] "lazy"


[[9]]
[1] "dog."


> 
> # to figure out where the match occurred,
> # we can use gregexpr, which returns a
> # somewhat complicated list structure
> g <- gregexpr("u", y) # returns a list
> str(g)
List of 9
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] 2
  ..- attr(*, "match.length")= int 1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] 2
  ..- attr(*, "match.length")= int 1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
 $ : atomic [1:1] -1
  ..- attr(*, "match.length")= int -1
  ..- attr(*, "useBytes")= logi TRUE
> g # position of matches or -1 (no match)
[[1]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


[[2]]
[1] 2
attr(,"match.length")
[1] 1
attr(,"useBytes")
[1] TRUE


[[3]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


[[5]]
[1] 2
attr(,"match.length")
[1] 1
attr(,"useBytes")
[1] TRUE


[[6]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


[[7]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


[[8]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


[[9]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE


> attributes(g[[2]])$match.length
[1] 1
> regmatches(y, g) # get back the matches
[[1]]
character(0)


[[2]]
[1] "u"


[[3]]
character(0)


[[4]]
character(0)


[[5]]
[1] "u"


[[6]]
character(0)


[[7]]
character(0)


[[8]]
character(0)


[[9]]
character(0)

Finding an exact pattern, also known as literal matching, is very fast. Depending on its arguments, the grep function can return the indices of the matching elements (the default), the indices of the non-matching elements (invert=TRUE), or the matching elements themselves (value=TRUE).

test1 <- c("abc", "Abc", "ABC", "ABCD")
> grep("abc", test1) # with the default arguments
[1] 1
> grep("abc", test1, value=TRUE)
[1] "abc"
> # with only letters the behavior is as expected,
> # special characters will be interpreted as
> # regular expressions (unless fixed=TRUE)
> grep("abc", test1, fixed=TRUE) # search exactly (fast)
[1] 1
> grep("abc", test1, ignore.case=TRUE)
[1] 1 2 3 4
> grep("abc", test1, invert=TRUE)
[1] 2 3 4

A regular expression is a grammar specifying the syntax for flexible pattern matching. Only the basics of regular expressions are described here.

?regex
> # these can get very advanced, for example:
> # "/^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/"
> # (written without the surrounding slashes in R) can be used
> # to match most common email addresses
> 
> # Wildcard and repetition quantifiers:
> # "." matches any single character
> # "+" suffix means one or more times
> # "*" suffix means zero or more times
> # "?" suffix means zero or one time
> test2 <- c("m+n", "man", "moon", "mn", "mooon", "mon")
> # zero or more 'o' characters between m and n
> grep("mo*n", test2, value=TRUE)
[1] "moon"  "mn"    "mooon" "mon"  
> # one or more 'o' characters between m and n
> grep("mo+n", test2, value=TRUE)
[1] "moon"  "mooon" "mon"  
> # anything between m and n once
> grep("m.n", test2, value=TRUE)
[1] "m+n" "man" "mon"
> # zero or more characters between m and n
> grep("m.*n", test2, value=TRUE)
[1] "m+n"   "man"   "moon"  "mn"    "mooon" "mon"  
> # zero or one 'o' characters between m and n
> grep("mo?n", test2, value=TRUE)
[1] "mn"  "mon"
> 
> # special cases:
> # search for a special character (+) as is
> grep("m+n", test2, value=TRUE)
[1] "mn"
> grep("m\\+n", test2, value=TRUE)
[1] "m+n"
> grep("m+n", test2, fixed=TRUE, value=TRUE)
[1] "m+n"
> # 2, 3 or 4 'o' characters between m and n
> grep("mo{2,4}n", test2, value=TRUE)
[1] "moon"  "mooon"
> 
> # Anchoring:  beginning and ending
> # "^" means must start at the beginning
> # "$" means must finish at the end
> test3 <- c("abc", "abc d", "abcd", "fabc")
> grep("abc", test3, value=TRUE)
[1] "abc"   "abc d" "abcd"  "fabc" 
> grep("^abc", test3, value=TRUE)
[1] "abc"   "abc d" "abcd" 
> grep("abc$", test3, value=TRUE)
[1] "abc"  "fabc"
> grep("^abc$", test3, value=TRUE)
[1] "abc"
> 
> # Alternation constructs - either/or
> test4 <- c("men", "man", "women", "moon", "maan", "mn")
> # the pipe "|" means OR
> grep("e|a", test4, value=TRUE)
[1] "men"   "man"   "women" "maan" 
> # combining multiple special characters together
> # use parentheses to group part of the expression
> grep("m(e|a)+n", test4, value=TRUE)
[1] "men"   "man"   "women" "maan" 
> # parentheses can also group multiple characters
> grep("(en|an)", test4, value=TRUE)
[1] "men"   "man"   "women" "maan" 
> 
> # Character classes - matching in a group
> test5 <- c("moon", "maan", "mn", "myn", "m_n", "mDn")
> # square brackets "[]" define a "character class":
> # only vowels located between m and n
> grep("m[aeiou]*n", test5, value=TRUE)
[1] "moon" "maan" "mn"  
> # the "^" in a character class means NOT
> # anything but a vowel located between m and n
> grep("m[^aeiou]*n", test5, value=TRUE)
[1] "mn"  "myn" "m_n" "mDn"
> # the "-" in a character class defines a range
> # only lowercase consonants located between m and n
> grep("m[b-df-hj-np-tv-z]*n", test5, value=TRUE)
[1] "mn"  "myn"

There are also built-in character classes that match digits, word characters, or other pre-defined sets of characters. Combined with a repetition quantifier such as "+", they match whole numbers or words.

# Built-in classes: \d, \w
> sentence <- "pi is approximately equal to 3.14. Also 22/7."
> # match numbers without escaping "."
> nlist_matches1 <- gregexpr("\\d+.\\d+", sentence)
> nlist1 <- regmatches(sentence, nlist_matches1)
> nlist1
[[1]]
[1] "3.14" "22/7"


> # match numbers while escaping "."
> nlist_matches2 <- gregexpr("\\d+\\.\\d+", sentence)
> nlist2 <- regmatches(sentence, nlist_matches2)
> nlist2
[[1]]
[1] "3.14"


> # list of all words
> wlist_matches1 <- gregexpr("\\w+", sentence)
> wlist1 <- regmatches(sentence, wlist_matches1)
> wlist1
[[1]]
 [1] "pi"            "is"            "approximately"
 [4] "equal"         "to"            "3"            
 [7] "14"            "Also"          "22"           
[10] "7"            


> # words with only letters
> wlist_matches2 <- gregexpr("[[:alpha:]]+", sentence)
> wlist2 <- regmatches(sentence, wlist_matches2)
> wlist2
[[1]]
[1] "pi"            "is"            "approximately"
[4] "equal"         "to"            "Also"         


> # words starting with an Upper case letter
> wlist_matches3 <- gregexpr("[[:upper:]][[:alpha:]]+",
+    sentence)
> wlist3 <- regmatches(sentence, wlist_matches3)
> wlist3
[[1]]
[1] "Also"


> 
> # Word boundaries
> # words starting with an "a"
> wlist_matches4 <- gregexpr("\\<(a|A)[[:alpha:]]+",
+    sentence)
> wlist4 <- regmatches(sentence, wlist_matches4)
> wlist4
[[1]]
[1] "approximately" "Also"         


> # words ending in a vowel
> wlist_matches5 <- gregexpr("[[:alpha:]]+[aeiou]\\>",
+    sentence)
> wlist5 <- regmatches(sentence, wlist_matches5)
> wlist5
[[1]]
[1] "pi"   "to"   "Also"

< Previous Lesson Next Lesson >