R Lesson #5 - Character basics
Despite its name, the character type can be used to store single letters, words, sentences, paragraphs, and numbers as text. In this sense, 'character' is short for 'character string'; such values are commonly called 'strings' in many other programming languages. This lesson describes basic character manipulation, printing to the screen, and named indexing.
# named indexing - indexing by character names
> x <- 1:26
> names(x) # NULL is a special object (undefined, length 0)
NULL
> names(x) <- letters
> x
a b c d e f g h i j k l m n o p q r
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
s t u v w x y z
19 20 21 22 23 24 25 26
> x["a"] # equivalent to x[1]
a
1
> x[c("a", "d")] # x[c(1, 4)]
a d
1 4
> y <- sample(letters, 100, replace=TRUE)
> x[y] # index with another variable
y p v i w l p q c m n h n y d w n h
25 16 22 9 23 12 16 17 3 13 14 8 14 25 4 23 14 8
q k v h h f a d p j f n x h i k g b
17 11 22 8 8 6 1 4 16 10 6 14 24 8 9 11 7 2
p w q w i i r p i z m r v i y f f w
16 23 17 23 9 9 18 16 9 26 13 18 22 9 25 6 6 23
f x s i p h d k i n k d b q a o a p
6 24 19 9 16 8 4 11 9 14 11 4 2 17 1 15 1 16
f e a p n p f e y c l x w f r c u u
6 5 1 16 14 16 6 5 25 3 12 24 23 6 18 3 21 21
r l p m x l w f p q
18 12 16 13 24 12 23 6 16 17
>
> z <- list(A=1:10, B=list(C=11:20, D="Choose me!"))
> # named arguments in a function call are set with `=`, not `<-`
> z
$A
[1] 1 2 3 4 5 6 7 8 9 10
$B
$B$C
[1] 11 12 13 14 15 16 17 18 19 20
$B$D
[1] "Choose me!"
> names(z)
[1] "A" "B"
> z[[1]]
[1] 1 2 3 4 5 6 7 8 9 10
> z$A # indexing by name
[1] 1 2 3 4 5 6 7 8 9 10
> z$B
$C
[1] 11 12 13 14 15 16 17 18 19 20
$D
[1] "Choose me!"
> z$B$D
[1] "Choose me!"
> z["A"]
$A
[1] 1 2 3 4 5 6 7 8 9 10
> z[["A"]]
[1] 1 2 3 4 5 6 7 8 9 10
>
> # familiar classes handle character data too:
> m <- matrix(letters[1:25], nrow=5)
> m
[,1] [,2] [,3] [,4] [,5]
[1,] "a" "f" "k" "p" "u"
[2,] "b" "g" "l" "q" "v"
[3,] "c" "h" "m" "r" "w"
[4,] "d" "i" "n" "s" "x"
[5,] "e" "j" "o" "t" "y"
> colnames(m) <- c("c1", "c2", "c3", "c4", "c5")
> rownames(m) <- c("r1", "r2", "r3", "r4", "r5")
> m
c1 c2 c3 c4 c5
r1 "a" "f" "k" "p" "u"
r2 "b" "g" "l" "q" "v"
r3 "c" "h" "m" "r" "w"
r4 "d" "i" "n" "s" "x"
r5 "e" "j" "o" "t" "y"
> a <- array(LETTERS[1:25], dim=c(5, 5))
> class(a) # a 2-D array is a matrix (R >= 4.0 returns "matrix" "array")
[1] "matrix"
> toupper(m)==a # upper-case
c1 c2 c3 c4 c5
r1 TRUE TRUE TRUE TRUE TRUE
r2 TRUE TRUE TRUE TRUE TRUE
r3 TRUE TRUE TRUE TRUE TRUE
r4 TRUE TRUE TRUE TRUE TRUE
r5 TRUE TRUE TRUE TRUE TRUE
> tolower(a)==m # lower-case
c1 c2 c3 c4 c5
r1 TRUE TRUE TRUE TRUE TRUE
r2 TRUE TRUE TRUE TRUE TRUE
r3 TRUE TRUE TRUE TRUE TRUE
r4 TRUE TRUE TRUE TRUE TRUE
r5 TRUE TRUE TRUE TRUE TRUE
> dna1 <- c("A", "T", "G", "C")
> dna2 <- c("a", "t", "g", "c")
> # given multiple character vectors, by default
> # paste will merge them with spaces between:
> paste(dna1, dna2)
[1] "A a" "T t" "G g" "C c"
> # the default behavior can be changed:
> paste(dna1, dna2, sep="")
[1] "Aa" "Tt" "Gg" "Cc"
> paste0(dna1, dna2) # paste0 defaults to sep=""
[1] "Aa" "Tt" "Gg" "Cc"
> # we can specify whatever separator we desire:
> paste(dna1, dna2, sep="*")
[1] "A*a" "T*t" "G*g" "C*c"
> paste(dna1, dna2, sep="+++")
[1] "A+++a" "T+++t" "G+++g" "C+++c"
> # we can also "collapse" the output to length 1
> paste(dna1, dna2, sep="*", collapse="_")
[1] "A*a_T*t_G*g_C*c"
> # collapse works on single vectors too:
> paste(dna1, collapse="-")
[1] "A-T-G-C"
> # whereas the `sep` argument does not apply
> # to single vector inputs
> paste(dna1, sep="-")
[1] "A" "T" "G" "C"
>
> # many functions accept character inputs
> rep(2, times=10)
[1] 2 2 2 2 2 2 2 2 2 2
> rep(c("a", "B"), times=5)
[1] "a" "B" "a" "B" "a" "B" "a" "B" "a" "B"
> rep(c("a", "B"), times=c(5, 2))
[1] "a" "a" "a" "a" "a" "B" "B"
> rep(c("a", "B"), each=5)
[1] "a" "a" "a" "a" "a" "B" "B" "B" "B" "B"
# common functions acting on character vectors
> a <- c(A="One", B="Two", C="Three")
> # alternatively:
> a <- setNames(c("One", "Two", "Three"), c("A", "B", "C"))
> nchar(a)
A B C
3 3 5
> substring(a, 1, 2) # extract substrings
A B C
"On" "Tw" "Th"
> substring(a, 4, 5) # beyond the nchar of a[1:2]
A B C
"" "" "ee"
> substring(a[3], 3, 5) <- "ird" # can also set
> a
A B C
"One" "Two" "Third"
> # note: replacement is applied only once per element, so when several
> # start/stop positions are given only the first pair (and value) is used:
> substring(a[3], 3:4, 3:4)
[1] "i" "r"
> substring(a[3], 3:4, 3:4) <- c("1", "2")
> a
A B C
"One" "Two" "Th1rd"
>
> # use strsplit to "explode" a character vector
> s <- strsplit(a, "") # returns a list!
> s
$A
[1] "O" "n" "e"
$B
[1] "T" "w" "o"
$C
[1] "T" "h" "1" "r" "d"
> s$C
[1] "T" "h" "1" "r" "d"
> s <- unlist(s)
> s <- paste(s, collapse="") # put letters back together
> s # one string
[1] "OneTwoTh1rd"
> substring(s, 1:3, 3:5) # extract multiple substrings
[1] "One" "neT" "eTw"
> dna <- sample(dna1, 50, replace=TRUE)
> print(dna) # pretty formatting depending on input class
[1] "T" "A" "G" "A" "G" "T" "G" "T" "C" "C" "G" "G"
[13] "G" "A" "C" "G" "G" "A" "A" "A" "T" "A" "T" "G"
[25] "A" "G" "A" "C" "G" "T" "C" "C" "A" "G" "G" "G"
[37] "G" "T" "G" "T" "G" "A" "G" "G" "G" "T" "C" "C"
[49] "G" "G"
> cat(dna) # no pretty formatting
T A G A G T G T C C G G G A C G G A A A T A T G A G A C G T C C A G G G G T G T G A G G G T C C G G
> cat(dna, sep="") # no collapse argument
TAGAGTGTCCGGGACGGAAATATGAGACGTCCAGGGGTGTGAGGGTCCGG
> paste(dna, sep="") # cat behaves differently than paste
[1] "T" "A" "G" "A" "G" "T" "G" "T" "C" "C" "G" "G"
[13] "G" "A" "C" "G" "G" "A" "A" "A" "T" "A" "T" "G"
[25] "A" "G" "A" "C" "G" "T" "C" "C" "A" "G" "G" "G"
[37] "G" "T" "G" "T" "G" "A" "G" "G" "G" "T" "C" "C"
[49] "G" "G"
> paste(dna, collapse="")
[1] "TAGAGTGTCCGGGACGGAAATATGAGACGTCCAGGGGTGTGAGGGTCCGG"
>
> # delimiters, escaping quotes
> cat("\tThis is an indented paragraph of text",
+ "that I wish to wrap around the screen.",
+ "\nFollowed by a newline, where I will",
+ "'single' and \"double\" quote some text.")
This is an indented paragraph of text that I wish
to wrap around the screen.
Followed by a newline, where I will 'single' and
"double" quote some text.
> cat('" no escape needed if wrapped in single quotes.')
" no escape needed if wrapped in single quotes.
> cat("' no escape needed if wrapped in double quotes.")
' no escape needed if wrapped in double quotes.
> cat("so that 'you'", '"can" alternate', "as \"desired'.")
so that 'you' "can" alternate as "desired'.