R Lesson #11 - Apply functions
The apply suite of functions provides a way of applying a function to subsets of different data structures. The equivalent is often called a map in other programming languages.
# Which map should I use?
> # INPUT            OUTPUT         MAP
> # vector, groups   variable       tapply
> # vector/list      list           lapply
> # vector/list      variable       sapply
> # matrix/array     vector/array   apply
> # vectors          variable       mapply
>
> # lapply - the most commonly used map
>
> l <- list(x=1, y=-1:-3, z=matrix(1:6, nrow=2))
> l
$x
[1] 1
$y
[1] -1 -2 -3
$z
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
>
> f <- function(x) {
+ return(x)
+ }
> lapply(l$y, # vector input
+ f) # unnamed list output
[[1]]
[1] -1
[[2]]
[1] -2
[[3]]
[1] -3
> lapply(l, # list input
+ f) # named list output
$x
[1] 1
$y
[1] -1 -2 -3
$z
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
>
> lapply(l, sum)
$x
[1] 1
$y
[1] -6
$z
[1] 21
> lapply(l, length)
$x
[1] 1
$y
[1] 3
$z
[1] 6
> lapply(l, range)
$x
[1] 1 1
$y
[1] -3 -1
$z
[1] 1 6
# sapply - a user-friendly wrapper for lapply
> # in many cases equivalent to unlist(lapply())
>
> sapply(l$y, # vector input
+ f) # vector output
[1] -1 -2 -3
> sapply(l, # list input
+ f) # list output (results of differing lengths cannot be simplified)
$x
[1] 1
$y
[1] -1 -2 -3
$z
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
> sapply(l, sum)
x y z
1 -6 21
> sapply(l, length)
x y z
1 3 6
> sapply(l, range)
x y z
[1,] 1 -3 1
[2,] 1 -1 6
>
> # mapply - applies a function to the corresponding elements of multiple vectors
> # essentially a multi-vector version of sapply
>
> mapply(rep, x=1:4, times=4:1) # list output
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
> mapply(rep, x=1:4, times=4) # matrix output
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 1 2 3 4
[3,] 1 2 3 4
[4,] 1 2 3 4
> mapply(seq, from=1:10, to=10:1) # list output
[[1]]
[1] 1 2 3 4 5 6 7 8 9 10
[[2]]
[1] 2 3 4 5 6 7 8 9
[[3]]
[1] 3 4 5 6 7 8
[[4]]
[1] 4 5 6 7
[[5]]
[1] 5 6
[[6]]
[1] 6 5
[[7]]
[1] 7 6 5 4
[[8]]
[1] 8 7 6 5 4 3
[[9]]
[1] 9 8 7 6 5 4 3 2
[[10]]
[1] 10 9 8 7 6 5 4 3 2 1
> mapply(seq, from=1:10, to=1:10) # vector output
[1] 1 2 3 4 5 6 7 8 9 10
> mapply(sample, x=10:3, size=3) # matrix output
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 6 8 5 1 6 1 2 1
[2,] 5 9 2 7 1 3 1 2
[3,] 1 6 4 4 3 5 4 3
# apply - rows or columns of matrices
> l$z
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
> apply(l$z, 1, sum)
[1] 9 12
> rowSums(l$z) # faster
[1] 9 12
> apply(l$z, 2, min) # base R has no colMins function
[1] 1 3 5
> apply(l$z, 2, length)
[1] 2 2 2
> apply(l$z, 1, range) # returns a matrix
[,1] [,2]
[1,] 1 2
[2,] 5 6
>
> a <- array(1:24, dim=2:4)
> a
, , 1
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
, , 2
[,1] [,2] [,3]
[1,] 7 9 11
[2,] 8 10 12
, , 3
[,1] [,2] [,3]
[1,] 13 15 17
[2,] 14 16 18
, , 4
[,1] [,2] [,3]
[1,] 19 21 23
[2,] 20 22 24
> a[1,,]
[,1] [,2] [,3] [,4]
[1,] 1 7 13 19
[2,] 3 9 15 21
[3,] 5 11 17 23
> zzz <- apply(a, 1, print)
[,1] [,2] [,3] [,4]
[1,] 1 7 13 19
[2,] 3 9 15 21
[3,] 5 11 17 23
[,1] [,2] [,3] [,4]
[1,] 2 8 14 20
[2,] 4 10 16 22
[3,] 6 12 18 24
> apply(a, 1, sum)
[1] 144 156
> a[,, 1]
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
> zzz <- apply(a, 3, print)
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
[,1] [,2] [,3]
[1,] 7 9 11
[2,] 8 10 12
[,1] [,2] [,3]
[1,] 13 15 17
[2,] 14 16 18
[,1] [,2] [,3]
[1,] 19 21 23
[2,] 20 22 24
> apply(a, 3, min)
[1] 1 7 13 19
>
> # combinations of dimensions
> a[1, 1,]
[1] 1 7 13 19
> zzz <- apply(a, c(1, 2), print)
[1] 1 7 13 19
[1] 2 8 14 20
[1] 3 9 15 21
[1] 4 10 16 22
[1] 5 11 17 23
[1] 6 12 18 24
> apply(a, c(1, 2), sum) # every combo of row & col
[,1] [,2] [,3]
[1,] 40 48 56
[2,] 44 52 60
> zzz <- apply(a, c(1, 3), print)
[1] 1 3 5
[1] 2 4 6
[1] 7 9 11
[1] 8 10 12
[1] 13 15 17
[1] 14 16 18
[1] 19 21 23
[1] 20 22 24
> apply(a, c(1, 3), range) # output is an array
, , 1
[,1] [,2]
[1,] 1 2
[2,] 5 6
, , 2
[,1] [,2]
[1,] 7 8
[2,] 11 12
, , 3
[,1] [,2]
[1,] 13 14
[2,] 17 18
, , 4
[,1] [,2]
[1,] 19 20
[2,] 23 24
r <- readLines("<<PATH TO TaxaRanks.txt>>")
> head(r)
[1] "root;Viruses;dsDNA viruses, no RNA stage;Caudovirales;Myoviridae;Peduovirinae;Hpunalikevirus;"
[2] "root;Viruses;dsDNA viruses, no RNA stage;Caudovirales;Myoviridae;Peduovirinae;P2likevirus;"
[3] "root;Viruses;dsDNA viruses, no RNA stage;Caudovirales;Siphoviridae;Spbetalikevirus;"
[4] "root;Viruses;dsDNA viruses, no RNA stage;Herpesvirales;Herpesviridae;Betaherpesvirinae;Cytomegalovirus;"
[5] "root;cellular organisms;Archaea;Crenarchaeota <phylum>;Thermoprotei;Acidilobales;Acidilobaceae;Acidilobus;"
[6] "root;cellular organisms;Archaea;Crenarchaeota <phylum>;Thermoprotei;Acidilobales;Caldisphaeraceae;Caldisphaera;"
> tail(r)
[1] "root;cellular organisms;Eukaryota;Viridiplantae;Streptophyta;Zygnemophyceae;Zygnematales;Zygnematales incertae sedis;Fottea;"
[2] "root;cellular organisms;Eukaryota;unclassified eukaryotes;Palpitomonas;"
[3] "root;cellular organisms;Eukaryota;unclassified eukaryotes;Picozoa;Picomonadea;Picomonadida;Picomonadidae;Picomonas;"
[4] "root;cellular organisms;Eukaryota;unclassified eukaryotes;Telonemida;Telonema;"
[5] "root;cellular organisms;Eukaryota;unclassified eukaryotes;Trimastix;"
[6] "root;cellular organisms;Eukaryota;unclassified eukaryotes;Tsukubamonadidae;Tsukubamonas;"
> length(r)
[1] 6917
> # search for humans in the taxonomy (via their subfamily, Homininae)
> grep("Homininae", r, fixed=TRUE, value=TRUE)
[1] "root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata ;Vertebrata ;Gnathostomata ;Teleostomi;Euteleostomi;Sarcopterygii;Dipnotetrapodomorpha;Tetrapoda;Amniota;Mammalia;Theria ;Eutheria;Boreoeutheria;Euarchontoglires;Primates;Haplorrhini;Simiiformes;Catarrhini;Hominoidea;Hominidae;Homininae;Homo;"
> s <- strsplit(r, ";", fixed=TRUE)
> head(s)
[[1]]
[1] "root"
[2] "Viruses"
[3] "dsDNA viruses, no RNA stage"
[4] "Caudovirales"
[5] "Myoviridae"
[6] "Peduovirinae"
[7] "Hpunalikevirus"
[[2]]
[1] "root"
[2] "Viruses"
[3] "dsDNA viruses, no RNA stage"
[4] "Caudovirales"
[5] "Myoviridae"
[6] "Peduovirinae"
[7] "P2likevirus"
[[3]]
[1] "root"
[2] "Viruses"
[3] "dsDNA viruses, no RNA stage"
[4] "Caudovirales"
[5] "Siphoviridae"
[6] "Spbetalikevirus"
[[4]]
[1] "root"
[2] "Viruses"
[3] "dsDNA viruses, no RNA stage"
[4] "Herpesvirales"
[5] "Herpesviridae"
[6] "Betaherpesvirinae"
[7] "Cytomegalovirus"
[[5]]
[1] "root" "cellular organisms"
[3] "Archaea" "Crenarchaeota <phylum>"
[5] "Thermoprotei" "Acidilobales"
[7] "Acidilobaceae" "Acidilobus"
[[6]]
[1] "root" "cellular organisms"
[3] "Archaea" "Crenarchaeota <phylum>"
[5] "Thermoprotei" "Acidilobales"
[7] "Caldisphaeraceae" "Caldisphaera"
>
> l <- sapply(s, length)
> all(l==lengths(s))
[1] TRUE
> hist(l)
> s[[which.min(l)]]
[1] "root"
[2] "cellular organisms"
[3] "Archaea"
[4] "Diapherotrites"
[5] "Candidatus Iainarchaeum"
> s[[which.max(l)]]
[1] "root"
[2] "cellular organisms"
[3] "Eukaryota"
[4] "Opisthokonta"
[5] "Metazoa"
[6] "Eumetazoa"
[7] "Bilateria"
[8] "Deuterostomia"
[9] "Chordata"
[10] "Craniata "
[11] "Vertebrata "
[12] "Gnathostomata "
[13] "Teleostomi"
[14] "Euteleostomi"
[15] "Actinopterygii"
[16] "Actinopteri"
[17] "Neopterygii"
[18] "Teleostei"
[19] "Elopocephala"
[20] "Clupeocephala"
[21] "Euteleostei"
[22] "Neognathi"
[23] "Neoteleostei"
[24] "Eurypterygii"
[25] "Ctenosquamata"
[26] "Acanthomorpha"
[27] "Euacanthomorpha"
[28] "Holacanthopterygii"
[29] "Acanthopterygii"
[30] "Euacanthopterygii"
[31] "Percomorpha"
[32] "Perciformes"
[33] "Labroidei"
[34] "Cichlidae"
[35] "African cichlids"
[36] "Pseudocrenilabrinae"
[37] "Oreochromini"
[38] "Oreochromis"
>
> # select one level of the ranking from each lineage
> select <- function(x, level) {
+ x[level]
+ }
> # note the extra parameter being passed
> level1 <- sapply(s, select, level=1)
> table(level1)
level1
root
6917
> level2 <- sapply(s, select, level=2)
> table(level2)
level2
cellular organisms Viruses
6913 4
> level3 <- sapply(s, select, level=3)
> table(level3)
level3
Archaea
97
Bacteria
1212
dsDNA viruses, no RNA stage
4
Eukaryota
5604
> level4 <- sapply(s, select, level=4)
> table(level4)
level4
Actinobacteria <phylum>
139
Alveolata
60
Amoebozoa
9
Apusozoa
8
Aquificae <phylum>
9
Armatimonadetes
2
Bacteroidetes/Chlorobi group
186
Breviatea
1
Caldiserica
1
Caudovirales
3
Centroheliozoa
3
Chlamydiae/Verrucomicrobia group
20
Chloroflexi <phylum>
13
Chrysiogenetes <phylum>
2
Crenarchaeota <phylum>
21
Cryptophyta
5
Cyanobacteria
60
Deferribacteres <phylum>
6
Deinococcus-Thermus
6
Diapherotrites
1
Dictyoglomi <phylum>
1
Elusimicrobia <phylum>
1
Euglenozoa
14
Euryarchaeota
67
Fibrobacteres/Acidobacteria group
9
Firmicutes
171
Fornicata
3
Fusobacteria <phylum>
8
Gemmatimonadetes <phylum>
1
Glaucocystophyceae
4
Haptophyceae
7
Herpesvirales
1
Heterolobosea
1
Jakobida
5
Katablepharidophyta
3
Korarchaeota
1
Malawimonadidae
1
Nitrospinae
1
Nitrospirae
4
Opisthokonta
3868
Parabasalia
2
Parvarchaeota
2
Planctomycetes
14
Proteobacteria
508
Rhizaria
23
Rhodophyta
239
Spirochaetes <phylum>
8
Stramenopiles
117
Synergistetes
9
Tenericutes
7
Thaumarchaeota
5
Thermodesulfobacteria <phylum>
2
Thermotogae <phylum>
7
unclassified Bacteria
17
unclassified eukaryotes
5
Viridiplantae
1226