DECIPHER - R Lesson #10

R Lesson #10 - Tabulating imported data

For this lesson, a CRAN Logs dataset is required, which is one of the comma-delimited download available from here. Each of the links contains a file with the package downloads from one day on the RStudio server. Select any day for download, then right-click the link and choose "Download Linked File" (or equivalent). Using a file connection, it is possible to read-in the file without first unzipping it, as shown below.

Hide output

# establish a gzip file connection
> g <- gzfile("<<PATH TO DOWNLOADED file.csv.gz",
+    "r") # open read-only
> r <- read.csv(g,
+    stringsAsFactors=F)
> close(g)
> object.size(r)
37146024 bytes
> dim(r)
[1] 443700     10
> head(r)
        date     time    size r_version r_arch
1 2015-11-21 04:37:01    9252     3.2.2 x86_64
2 2015-11-21 04:37:01  917418     3.2.2 x86_64
3 2015-11-21 04:36:20  398860         
4 2015-11-21 04:36:31    5061         
5 2015-11-21 04:36:53  242622         
6 2015-11-21 04:36:17 2153301         
       r_os              package  version country
1 linux-gnu               bitops    1.0-6      US
2 linux-gnu                RCurl 1.95-4.7      US
3                    permute    0.8-4      US
4                       PGM2      1.0      US
5                  phyloclim    0.9-4      US
6       PerformanceAnalytics 1.4.3541      US
  ip_id
1     1
2     1
3     2
4     2
5     2
6     2
> tail(r)
             date     time    size r_version r_arch
443695 2015-11-21 23:43:33 1776979     3.2.1 x86_64
443696 2015-11-21 23:43:31  504229     3.2.2   i386
443697 2015-11-21 23:43:36   76935     3.2.2 x86_64
443698 2015-11-21 23:43:29  290933     3.2.2 x86_64
443699 2015-11-21 23:43:33  955846     3.2.2 x86_64
443700 2015-11-21 23:43:34 1380833     3.2.2 x86_64
               r_os     package version country ip_id
443695 darwin10.8.0      Matrix   1.2-2      US   113
443696      mingw32    reshape2   1.4.1      US  8156
443697      mingw32 DataCombine   0.2.9      US  8156
443698    linux-gnu   iterators   1.0.8      US   113
443699    linux-gnu    jsonlite  0.9.17      CO 17464
443700      mingw32       rtfbs   0.3.4      US 13872

The table function can be used to tabulate data. Given an input vector, it returns the number of occurences of each unique entry.

table(c("D", "A", "B", "B", "D", "D", "C"))


A B C D 
1 2 1 3 
> t1 <- table(r$r_version)
> t1 # note that the output is sorted by name


2.11.1 2.12.0 2.12.1 2.12.2 2.13.1 2.14.0 2.14.1 
    22      1      2     18      6     23     51 
2.14.2 2.15.0 2.15.1 2.15.2 2.15.3  3.0.0  3.0.1 
   249     53    101    293    103    141    355 
 3.0.2  3.0.3  3.1.0  3.1.1  3.1.2  3.1.3  3.2.0 
  5053    571   1692   9329  13475   7643   7847 
 3.2.1  3.2.2  3.3.0 
 13383 256579   2767 
> sort(t1) # sort by increasing number


2.12.0 2.12.1 2.13.1 2.12.2 2.11.1 2.14.0 2.14.1 
     1      2      6     18     22     23     51 
2.15.0 2.15.1 2.15.3  3.0.0 2.14.2 2.15.2  3.0.1 
    53    101    103    141    249    293    355 
 3.0.3  3.1.0  3.3.0  3.0.2  3.1.3  3.2.0  3.1.1 
   571   1692   2767   5053   7643   7847   9329 
 3.2.1  3.1.2  3.2.2 
 13383  13475 256579 
> sort(t1, decreasing=TRUE)


 3.2.2  3.1.2  3.2.1  3.1.1  3.2.0  3.1.3  3.0.2 
256579  13475  13383   9329   7847   7643   5053 
 3.3.0  3.1.0  3.0.3  3.0.1 2.15.2 2.14.2  3.0.0 
  2767   1692    571    355    293    249    141 
2.15.3 2.15.1 2.15.0 2.14.1 2.14.0 2.11.1 2.12.2 
   103    101     53     51     23     22     18 
2.13.1 2.12.1 2.12.0 
     6      2      1 
> sessionInfo() # find your version of R
R version 3.2.2 (2015-08-14)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.1 (El Capitan)


locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8


attached base packages:
[1] stats4    parallel  stats     graphics  grDevices
[6] utils     datasets  methods   base     
> # what fraction have each OS
> t2 <- table(r$r_os)
> t2


                             darwin10.8.0 
                2230                 1662 
        darwin13.1.0         darwin13.3.0 
                1182                    8 
        darwin13.4.0         darwin14.0.0 
               13355                    3 
        darwin14.1.0         darwin14.3.0 
                  21                    8 
        darwin14.4.0         darwin14.5.0 
                  43                  189 
        darwin15.0.0          darwin9.8.0 
                 197                   97 
         freebsd11.0            linux-gnu 
                 382               129812 
     linux-gnueabihf              mingw32 
                  34               170534 
> t2 <- t2/sum(t2) # convert to fraction
> head(sort(t2, decreasing=TRUE))


             mingw32            linux-gnu 
         0.533323743          0.405970784 
        darwin13.4.0
         0.041766091          0.006974046 
        darwin10.8.0         darwin13.1.0 
         0.005197697          0.003696557 
> # top 20 package downloads
> t3 <- table(r$package)
> tail(sort(t3), n=20)


       rJava          DBI      foreach       gtable 
        2555         2609         2715         2771 
    labeling      munsell    dichromat        proto 
        2792         2805         2814         2856 
  colorspace RColorBrewer       scales     reshape2 
        3033         3148         3211         3278 
    magrittr       digest      stringr         curl 
        3298         3321         3457         3463 
        plyr      stringi      ggplot2         Rcpp 
        3702         3965         4013         5465 
> # what countries have the most downloads
> t4 <- table(r$country)
> tail(sort(t4), n=20) # two letter code


    RU     ZA     HK     CH     AU     TH     ES 
  3223   3519   3784   4081   4154   5052   5943 
    CA     FR     BR     JP     KR     DE     IT 
  6468   6485   6742   8288   8887  10118  10186 
    IN     GB     CZ     TW     CN     US 
 10306  12869  15823  24205  60285 196083

The function tapply requires three inputs: a vector (X), an index (INDEX), and a function (FUN). It will apply the input function to the vector in groups according to the index. The type of result depends on what the function returns: if it is always length 1 then the result will be a vector, otherwise it will be a list. This behavior can be changed to always return a list by setting the simplify argument to FALSE.

# tapply - apply a function by group
> tapply(c("A", "B", "C"), # input vector
+    c(1, 1, 2), # indices
+    c) # function
$`1`
[1] "A" "B"


$`2`
[1] "C"


> # note that the result is a list
> tapply(c("A", "B", "C"),
+    c(1, 1, 2),
+    paste,
+    collapse="") # extra parameter
   1    2 
"AB"  "C" 
> # note that the result is a vector
> tapply(c("A", "B", "C"),
+    c(2, 1, 1),
+    paste,
+    collapse="")
   1    2 
"BC"  "A" 
> tapply(c("A", "B", "C"),
+    c(2, 1, 1),
+    paste,
+    collapse="",
+    simplify=FALSE) # return a list
$`1`
[1] "BC"


$`2`
[1] "A"


> # note that the result is always sorted by name

For the next commands, it is necessary to download the file here. This tab-delimited file contains a set of TV ratings from participants in the Global Episode Opinion Survey (GEOS).

# load in the GEOS dataset
> tv <- read.table("<<PATH TO GEOS_TVSeries.txt>>",
+    header=T,
+    sep="\t",
+    stringsAsFactors=FALSE)
> head(tv)
  Series        Episode Rating StdDev Count
1  Alias  Truth Be Told   8.74   1.58   194
2  Alias   So It Begins   8.34   1.65   179
3  Alias         Parity   8.31   1.71   167
4  Alias A Broken Heart   8.09   1.82   161
5  Alias    Doppelgnger   8.30   1.74   151
6  Alias      Reckoning   8.38   1.76   156
  Episode.number
1              1
2              2
3              3
4              4
5              5
6              6
> 
> t5 <- tapply(tv$Count, # data
+    tv$Series, # index
+    sum, # function
+    na.rm=TRUE) # extra arguments
> # note that the output is sorted by name
> head(t5) # the output is tagged by name
             24           Alias    Alien Nation 
          13253           10837             264 
American Gothic       Andromeda           Angel 
            711            9738           33275 
> tail(sort(t5), n=10) # most rated TV Series
             Doctor Who (1963) 
                         27039 
Star Trek: The Original Series 
                         30920 
                         Angel 
                         33275 
                 Stargate SG-1 
                         45493 
                   The X-Files 
                         47387 
                     Babylon 5 
                         56176 
      Buffy the Vampire Slayer 
                         62218 
Star Trek: The Next Generation 
                         62323 
    Star Trek: Deep Space Nine 
                         79348 
            Star Trek: Voyager 
                         82279 
> t6 <- tapply(tv$Rating,
+    tv$Series,
+    mean,
+    na.rm=TRUE)
> tail(sort(t6), n=10) # top 10 TV series
         Wonderfalls           Twin Peaks 
            8.685385             8.737742 
                  Oz         Carnivle 
            8.739464             8.742500 
Arrested Development             Deadwood 
            8.769434             8.801111 
            The Wire      Game of Thrones 
            8.847833             8.848500 
             Firefly     Band Of Brothers 
            8.858235             9.113000 
> 
> # weighted (by Count) average Rating
> t7 <- tapply(tv$Count*tv$Rating,
+    tv$Series,
+    sum,
+    na.rm=TRUE)
> w.avg <- t7/t5
> tail(sort(w.avg), n=10) # top 10 TV series
        Carnivle                   Oz 
            8.736204             8.739646 
              Dexter           Twin Peaks 
            8.763119             8.771006 
Arrested Development             Deadwood 
            8.771099             8.802536 
     Game of Thrones             The Wire 
            8.845765             8.851126 
             Firefly     Band Of Brothers 
            8.874777             9.112297 
> head(sort(w.avg), n=10) # worst 10 TV series
Mystery Science Theater 3000 
                    5.576703 
                   Threshold 
                    5.772278 
                  Family Guy 
                    5.850557 
                     Surface 
                    5.950123 
                Knight Rider 
                    6.053208 
                      Scrubs 
                    6.149460 
                  Space 1999 
                    6.178289 
               Lost In Space 
                    6.182098 
        Desperate Housewives 
                    6.188700 
                  Highlander 
                    6.194055 
> plot(w.avg,
+    pch=NA, # plot without points
+    xaxt='n', # no x-axis
+    xlab="", # no x-axis label
+    ylab="Weighted Average Rating")
> # display weighted average ratings
> # in alphabetical order
> text(x=1:length(w.avg), y=w.avg,
+    labels=names(w.avg),
+    cex=0.5)
> 
> # table is the same as tapply with FUN=length
> x <- tv$Series
> t8.table <- table(x) # count the number of series
> tail(sort(t8.table), n=10) # longest running TV Series
x
                            24 
                           209 
  Mystery Science Theater 3000 
                           209 
                    Smallville 
                           218 
                 Stargate SG-1 
                           223 
                       Friends 
                           239 
                    Family Guy 
                           267 
                          NCIS 
                           269 
                    South Park 
                           276 
CSI: Crime Scene Investigation 
                           330 
                  The Simpsons 
                           615 
> t8.tapply <- tapply(x,
+    x,
+    length)
> all(t8.table==t8.tapply)
[1] TRUE

It is sometimes useful to make a new index by merging the data in different vectors. Here, paste is used to create a new index from two columns. Using this new index with tapply breaks the output into more groups.

# using the CRAN dataset from before:
> index <- paste(r$r_version, r$r_os)
> # there is now a new indexing variable
> t9 <- table(index) # table of the new index
> tail(sort(t9)) # sometimes the version/os is missing
index
3.2.2 darwin13.4.0      3.2.1 mingw32 
              6547               8723 
     3.1.2 mingw32    3.2.2 linux-gnu 
             10609             115141 
             NA NA      3.2.2 mingw32 
            123943             131887 
> # now sum download sizes by the new index
> t10 <- tapply(as.numeric(r$size), # coerce to number
+    index,
+    sum,
+    na.rm=TRUE)
> tail(sort(t10), n=10) # biggest downloads by index
     3.1.1 mingw32    3.1.1 linux-gnu 
        3648520410         4431702724 
     3.2.0 mingw32      3.1.3 mingw32 
        5084462032         5732757172 
3.2.2 darwin13.4.0      3.2.1 mingw32 
        6772883004         9180003054 
     3.1.2 mingw32              NA NA 
       10819789887        60369378298 
   3.2.2 linux-gnu      3.2.2 mingw32 
      111322007214       137171388391

Histograms are a visual way of depicting some of the same results obtained above.

hist(tv$Rating)
> abline(v=mean(tv$Rating, na.rm=TRUE),
+    lty=2, lwd=2)
> hist(tv$Rating, breaks=100)
> hist(tv$Count,
+    main="TV Series Episodes",
+    xlab="Number of Ratings")
> hist(tv$StdDev,
+    freq=F, # show Density
+    xlab="Standard Deviation of Ratings",
+    col="lightyellow")

< Previous Lesson Next Lesson >