DECIPHER - R Lesson #5 - Character basics

R Lesson #5 - Character basics

Despite its name, the character type can be used to store single letters, words, sentences, paragraphs, and numbers as text. In this sense, 'character' is short for 'character string', which is commonly called a 'string' in many other programming languages. This lesson describes basic character manipulation, printing to the screen, and named indexing.

Hide output

> # named indexing - indexing by character names
> x <- 1:26
> names(x) # NULL is a special object (undefined, length 0)
NULL
> names(x) <- letters
> x
 a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r 
 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 
 s  t  u  v  w  x  y  z 
19 20 21 22 23 24 25 26 
> x["a"] # equivalent to x[1]
a 
1 
> x[c("a", "d")] # x[c(1, 4)]
a d 
1 4 
> y <- sample(letters, 100, replace=TRUE)
> x[y] # index with another variable
 y  p  v  i  w  l  p  q  c  m  n  h  n  y  d  w  n  h 
25 16 22  9 23 12 16 17  3 13 14  8 14 25  4 23 14  8 
 q  k  v  h  h  f  a  d  p  j  f  n  x  h  i  k  g  b 
17 11 22  8  8  6  1  4 16 10  6 14 24  8  9 11  7  2 
 p  w  q  w  i  i  r  p  i  z  m  r  v  i  y  f  f  w 
16 23 17 23  9  9 18 16  9 26 13 18 22  9 25  6  6 23 
 f  x  s  i  p  h  d  k  i  n  k  d  b  q  a  o  a  p 
 6 24 19  9 16  8  4 11  9 14 11  4  2 17  1 15  1 16 
 f  e  a  p  n  p  f  e  y  c  l  x  w  f  r  c  u  u 
 6  5  1 16 14 16  6  5 25  3 12 24 23  6 18  3 21 21 
 r  l  p  m  x  l  w  f  p  q 
18 12 16 13 24 12 23  6 16 17 
> 
> z <- list(A=1:10, B=list(C=11:20, D="Choose me!"))
> # function arguments are always set with `=` not `<-`
> z
$A
 [1]  1  2  3  4  5  6  7  8  9 10


$B
$B$C
 [1] 11 12 13 14 15 16 17 18 19 20


$B$D
[1] "Choose me!"




> names(z)
[1] "A" "B"
> z[[1]]
 [1]  1  2  3  4  5  6  7  8  9 10
> z$A # indexing by name
 [1]  1  2  3  4  5  6  7  8  9 10
> z$B
$C
 [1] 11 12 13 14 15 16 17 18 19 20


$D
[1] "Choose me!"


> z$B$D
[1] "Choose me!"
> z["A"]
$A
 [1]  1  2  3  4  5  6  7  8  9 10


> z[["A"]]
 [1]  1  2  3  4  5  6  7  8  9 10
> 
> # familiar classes handle character data too:
> m <- matrix(letters[1:25], nrow=5)
> m
     [,1] [,2] [,3] [,4] [,5]
[1,] "a"  "f"  "k"  "p"  "u" 
[2,] "b"  "g"  "l"  "q"  "v" 
[3,] "c"  "h"  "m"  "r"  "w" 
[4,] "d"  "i"  "n"  "s"  "x" 
[5,] "e"  "j"  "o"  "t"  "y" 
> colnames(m) <- c("c1", "c2", "c3", "c4", "c5")
> rownames(m) <- c("r1", "r2", "r3", "r4", "r5")
> m
   c1  c2  c3  c4  c5 
r1 "a" "f" "k" "p" "u"
r2 "b" "g" "l" "q" "v"
r3 "c" "h" "m" "r" "w"
r4 "d" "i" "n" "s" "x"
r5 "e" "j" "o" "t" "y"
> a <- array(LETTERS[1:25], dim=c(5, 5))
> class(a) # a 2-D array is a matrix (R >= 4.0.0 also reports "array")
[1] "matrix" "array" 
> toupper(m)==a # upper-case
     c1   c2   c3   c4   c5
r1 TRUE TRUE TRUE TRUE TRUE
r2 TRUE TRUE TRUE TRUE TRUE
r3 TRUE TRUE TRUE TRUE TRUE
r4 TRUE TRUE TRUE TRUE TRUE
r5 TRUE TRUE TRUE TRUE TRUE
> tolower(a)==m # lower-case
     c1   c2   c3   c4   c5
r1 TRUE TRUE TRUE TRUE TRUE
r2 TRUE TRUE TRUE TRUE TRUE
r3 TRUE TRUE TRUE TRUE TRUE
r4 TRUE TRUE TRUE TRUE TRUE
r5 TRUE TRUE TRUE TRUE TRUE

The paste function joins its inputs element by element, placing a separator between them, and, if desired, collapses the resulting vector into a single element.

> dna1 <- c("A", "T", "G", "C")
> dna2 <- c("a", "t", "g", "c")
> # given multiple character vectors, by default
> # paste will merge them with spaces between:
> paste(dna1, dna2)
[1] "A a" "T t" "G g" "C c"
> # the default behavior can be changed:
> paste(dna1, dna2, sep="")
[1] "Aa" "Tt" "Gg" "Cc"
> paste0(dna1, dna2) # paste0 defaults to sep=""
[1] "Aa" "Tt" "Gg" "Cc"
> # we can specify whatever separator we desire:
> paste(dna1, dna2, sep="*")
[1] "A*a" "T*t" "G*g" "C*c"
> paste(dna1, dna2, sep="+++")
[1] "A+++a" "T+++t" "G+++g" "C+++c"
> # we can also "collapse" the output to length 1
> paste(dna1, dna2, sep="*", collapse="_")
[1] "A*a_T*t_G*g_C*c"
> # collapse works on single vectors too:
> paste(dna1, collapse="-")
[1] "A-T-G-C"
> # whereas the `sep` argument does not apply
> # to single vector inputs
> paste(dna1, sep="-")
[1] "A" "T" "G" "C"
> 
> # many functions accept character inputs
> rep(2, times=10)
 [1] 2 2 2 2 2 2 2 2 2 2
> rep(c("a", "B"), times=5)
 [1] "a" "B" "a" "B" "a" "B" "a" "B" "a" "B"
> rep(c("a", "B"), times=c(5, 2))
[1] "a" "a" "a" "a" "a" "B" "B"
> rep(c("a", "B"), each=5)
 [1] "a" "a" "a" "a" "a" "B" "B" "B" "B" "B"

Here, two useful character functions are introduced: substring and strsplit. The substring function extracts the part of each character element that lies between given start and end positions. The strsplit function splits each character element into pieces at every occurrence of a split pattern. Note that substring returns a character vector, whereas strsplit returns a list with one element per input string.

> # common functions acting on character vectors
> a <- c(A="One", B="Two", C="Three")
> # alternatively:
> a <- setNames(c("One", "Two", "Three"), c("A", "B", "C"))
> nchar(a)
A B C 
3 3 5 
> substring(a, 1, 2) # extract substrings
   A    B    C 
"On" "Tw" "Th" 
> substring(a, 4, 5) # beyond the nchar of a[1:2]
   A    B    C 
  ""   "" "ee" 
> substring(a[3], 3, 5) <- "ird" # can also set
> a
      A       B       C 
  "One"   "Two" "Third" 
> # note: replacement is applied only once per element, so when
> # several ranges are given (as below) only the first takes effect
> substring(a[3], 3:4, 3:4)
[1] "i" "r"
> substring(a[3], 3:4, 3:4) <- c("1", "2")
> a
      A       B       C 
  "One"   "Two" "Th1rd" 
> 
> # use strsplit to "explode" a character vector
> s <- strsplit(a, "") # returns a list!
> s
$A
[1] "O" "n" "e"


$B
[1] "T" "w" "o"


$C
[1] "T" "h" "1" "r" "d"


> s$C
[1] "T" "h" "1" "r" "d"
> s <- unlist(s)
> s <- paste(s, collapse="") # put letters back together
> s # one string
[1] "OneTwoTh1rd"
> substring(s, 1:3, 3:5) # extract multiple substrings
[1] "One" "neT" "eTw"

It is common to display text on the screen with cat (concatenate) or print. The main difference between these two alternatives is that cat is unaware of the input object's class, and simply writes its contents to the screen, whereas print will change how the output looks depending on the input object's class.

> dna <- sample(dna1, 50, replace=TRUE)
> print(dna) # pretty formatting depending on input class
 [1] "T" "A" "G" "A" "G" "T" "G" "T" "C" "C" "G" "G"
[13] "G" "A" "C" "G" "G" "A" "A" "A" "T" "A" "T" "G"
[25] "A" "G" "A" "C" "G" "T" "C" "C" "A" "G" "G" "G"
[37] "G" "T" "G" "T" "G" "A" "G" "G" "G" "T" "C" "C"
[49] "G" "G"
> cat(dna) # no pretty formatting
T A G A G T G T C C G G G A C G G A A A T A T G A G A C G T C C A G G G G T G T G A G G G T C C G G
> cat(dna, sep="") # no collapse argument
TAGAGTGTCCGGGACGGAAATATGAGACGTCCAGGGGTGTGAGGGTCCGG
> paste(dna, sep="") # cat behaves differently than paste
 [1] "T" "A" "G" "A" "G" "T" "G" "T" "C" "C" "G" "G"
[13] "G" "A" "C" "G" "G" "A" "A" "A" "T" "A" "T" "G"
[25] "A" "G" "A" "C" "G" "T" "C" "C" "A" "G" "G" "G"
[37] "G" "T" "G" "T" "G" "A" "G" "G" "G" "T" "C" "C"
[49] "G" "G"
> paste(dna, collapse="")
[1] "TAGAGTGTCCGGGACGGAAATATGAGACGTCCAGGGGTGTGAGGGTCCGG"
> 
> # delimiters, escaping quotes
> cat("\tThis is an indented paragraph of text",
+    "that I wish to wrap around the screen.",
+    "\nFollowed by a newline, where I will",
+    "'single' and \"double\" quote some text.")
    This is an indented paragraph of text that I wish
to wrap around the screen. 
Followed by a newline, where I will 'single' and
"double" quote some text.
> cat('" no escape needed if wrapped in single quotes.')
" no escape needed if wrapped in single quotes.
> cat("' no escape needed if wrapped in double quotes.")
' no escape needed if wrapped in double quotes.
> cat("so that 'you'", '"can" alternate', "as \"desired'.")
so that 'you' "can" alternate as "desired'.

< Previous Lesson Next Lesson >