R Lesson #10 - Tabulating imported data
For this lesson, a CRAN Logs dataset is required, which is one of the comma-delimited download available from here. Each of the links contains a file with the package downloads from one day on the RStudio server. Select any day for download, then right-click the link and choose "Download Linked File" (or equivalent). Using a file connection, it is possible to read-in the file without first unzipping it, as shown below.# establish a gzip file connection
> g <- gzfile("<<PATH TO DOWNLOADED file.csv.gz",
+ "r") # open read-only
> r <- read.csv(g,
+ stringsAsFactors=F)
> close(g)
> object.size(r)
37146024 bytes
> dim(r)
[1] 443700 10
> head(r)
date time size r_version r_arch
1 2015-11-21 04:37:01 9252 3.2.2 x86_64
2 2015-11-21 04:37:01 917418 3.2.2 x86_64
3 2015-11-21 04:36:20 398860
4 2015-11-21 04:36:31 5061
5 2015-11-21 04:36:53 242622
6 2015-11-21 04:36:17 2153301
r_os package version country
1 linux-gnu bitops 1.0-6 US
2 linux-gnu RCurl 1.95-4.7 US
3 permute 0.8-4 US
4 PGM2 1.0 US
5 phyloclim 0.9-4 US
6 PerformanceAnalytics 1.4.3541 US
ip_id
1 1
2 1
3 2
4 2
5 2
6 2
> tail(r)
date time size r_version r_arch
443695 2015-11-21 23:43:33 1776979 3.2.1 x86_64
443696 2015-11-21 23:43:31 504229 3.2.2 i386
443697 2015-11-21 23:43:36 76935 3.2.2 x86_64
443698 2015-11-21 23:43:29 290933 3.2.2 x86_64
443699 2015-11-21 23:43:33 955846 3.2.2 x86_64
443700 2015-11-21 23:43:34 1380833 3.2.2 x86_64
r_os package version country ip_id
443695 darwin10.8.0 Matrix 1.2-2 US 113
443696 mingw32 reshape2 1.4.1 US 8156
443697 mingw32 DataCombine 0.2.9 US 8156
443698 linux-gnu iterators 1.0.8 US 113
443699 linux-gnu jsonlite 0.9.17 CO 17464
443700 mingw32 rtfbs 0.3.4 US 13872
table(c("D", "A", "B", "B", "D", "D", "C"))
A B C D
1 2 1 3
> t1 <- table(r$r_version)
> t1 # note that the output is sorted by name
2.11.1 2.12.0 2.12.1 2.12.2 2.13.1 2.14.0 2.14.1
22 1 2 18 6 23 51
2.14.2 2.15.0 2.15.1 2.15.2 2.15.3 3.0.0 3.0.1
249 53 101 293 103 141 355
3.0.2 3.0.3 3.1.0 3.1.1 3.1.2 3.1.3 3.2.0
5053 571 1692 9329 13475 7643 7847
3.2.1 3.2.2 3.3.0
13383 256579 2767
> sort(t1) # sort by increasing number
2.12.0 2.12.1 2.13.1 2.12.2 2.11.1 2.14.0 2.14.1
1 2 6 18 22 23 51
2.15.0 2.15.1 2.15.3 3.0.0 2.14.2 2.15.2 3.0.1
53 101 103 141 249 293 355
3.0.3 3.1.0 3.3.0 3.0.2 3.1.3 3.2.0 3.1.1
571 1692 2767 5053 7643 7847 9329
3.2.1 3.1.2 3.2.2
13383 13475 256579
> sort(t1, decreasing=TRUE)
3.2.2 3.1.2 3.2.1 3.1.1 3.2.0 3.1.3 3.0.2
256579 13475 13383 9329 7847 7643 5053
3.3.0 3.1.0 3.0.3 3.0.1 2.15.2 2.14.2 3.0.0
2767 1692 571 355 293 249 141
2.15.3 2.15.1 2.15.0 2.14.1 2.14.0 2.11.1 2.12.2
103 101 53 51 23 22 18
2.13.1 2.12.1 2.12.0
6 2 1
> sessionInfo() # find your version of R
R version 3.2.2 (2015-08-14)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.1 (El Capitan)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats4 parallel stats graphics grDevices
[6] utils datasets methods base
> # what fraction have each OS
> t2 <- table(r$r_os)
> t2
darwin10.8.0
2230 1662
darwin13.1.0 darwin13.3.0
1182 8
darwin13.4.0 darwin14.0.0
13355 3
darwin14.1.0 darwin14.3.0
21 8
darwin14.4.0 darwin14.5.0
43 189
darwin15.0.0 darwin9.8.0
197 97
freebsd11.0 linux-gnu
382 129812
linux-gnueabihf mingw32
34 170534
> t2 <- t2/sum(t2) # convert to fraction
> head(sort(t2, decreasing=TRUE))
mingw32 linux-gnu
0.533323743 0.405970784
darwin13.4.0
0.041766091 0.006974046
darwin10.8.0 darwin13.1.0
0.005197697 0.003696557
> # top 20 package downloads
> t3 <- table(r$package)
> tail(sort(t3), n=20)
rJava DBI foreach gtable
2555 2609 2715 2771
labeling munsell dichromat proto
2792 2805 2814 2856
colorspace RColorBrewer scales reshape2
3033 3148 3211 3278
magrittr digest stringr curl
3298 3321 3457 3463
plyr stringi ggplot2 Rcpp
3702 3965 4013 5465
> # what countries have the most downloads
> t4 <- table(r$country)
> tail(sort(t4), n=20) # two letter code
RU ZA HK CH AU TH ES
3223 3519 3784 4081 4154 5052 5943
CA FR BR JP KR DE IT
6468 6485 6742 8288 8887 10118 10186
IN GB CZ TW CN US
10306 12869 15823 24205 60285 196083
# tapply - apply a function by group
> tapply(c("A", "B", "C"), # input vector
+ c(1, 1, 2), # indices
+ c) # function
$`1`
[1] "A" "B"
$`2`
[1] "C"
> # note that the result is a list
> tapply(c("A", "B", "C"),
+ c(1, 1, 2),
+ paste,
+ collapse="") # extra parameter
1 2
"AB" "C"
> # note that the result is a vector
> tapply(c("A", "B", "C"),
+ c(2, 1, 1),
+ paste,
+ collapse="")
1 2
"BC" "A"
> tapply(c("A", "B", "C"),
+ c(2, 1, 1),
+ paste,
+ collapse="",
+ simplify=FALSE) # return a list
$`1`
[1] "BC"
$`2`
[1] "A"
> # note that the result is always sorted by name
# load in the GEOS dataset
> tv <- read.table("<<PATH TO GEOS_TVSeries.txt>>",
+ header=T,
+ sep="\t",
+ stringsAsFactors=FALSE)
> head(tv)
Series Episode Rating StdDev Count
1 Alias Truth Be Told 8.74 1.58 194
2 Alias So It Begins 8.34 1.65 179
3 Alias Parity 8.31 1.71 167
4 Alias A Broken Heart 8.09 1.82 161
5 Alias Doppelgnger 8.30 1.74 151
6 Alias Reckoning 8.38 1.76 156
Episode.number
1 1
2 2
3 3
4 4
5 5
6 6
>
> t5 <- tapply(tv$Count, # data
+ tv$Series, # index
+ sum, # function
+ na.rm=TRUE) # extra arguments
> # note that the output is sorted by name
> head(t5) # the output is tagged by name
24 Alias Alien Nation
13253 10837 264
American Gothic Andromeda Angel
711 9738 33275
> tail(sort(t5), n=10) # most rated TV Series
Doctor Who (1963)
27039
Star Trek: The Original Series
30920
Angel
33275
Stargate SG-1
45493
The X-Files
47387
Babylon 5
56176
Buffy the Vampire Slayer
62218
Star Trek: The Next Generation
62323
Star Trek: Deep Space Nine
79348
Star Trek: Voyager
82279
> t6 <- tapply(tv$Rating,
+ tv$Series,
+ mean,
+ na.rm=TRUE)
> tail(sort(t6), n=10) # top 10 TV series
Wonderfalls Twin Peaks
8.685385 8.737742
Oz Carnivle
8.739464 8.742500
Arrested Development Deadwood
8.769434 8.801111
The Wire Game of Thrones
8.847833 8.848500
Firefly Band Of Brothers
8.858235 9.113000
>
> # weighted (by Count) average Rating
> t7 <- tapply(tv$Count*tv$Rating,
+ tv$Series,
+ sum,
+ na.rm=TRUE)
> w.avg <- t7/t5
> tail(sort(w.avg), n=10) # top 10 TV series
Carnivle Oz
8.736204 8.739646
Dexter Twin Peaks
8.763119 8.771006
Arrested Development Deadwood
8.771099 8.802536
Game of Thrones The Wire
8.845765 8.851126
Firefly Band Of Brothers
8.874777 9.112297
> head(sort(w.avg), n=10) # worst 10 TV series
Mystery Science Theater 3000
5.576703
Threshold
5.772278
Family Guy
5.850557
Surface
5.950123
Knight Rider
6.053208
Scrubs
6.149460
Space 1999
6.178289
Lost In Space
6.182098
Desperate Housewives
6.188700
Highlander
6.194055
> plot(w.avg,
+ pch=NA, # plot without points
+ xaxt='n', # no x-axis
+ xlab="", # no x-axis label
+ ylab="Weighted Average Rating")
> # display weighted average ratings
> # in alphabetical order
> text(x=1:length(w.avg), y=w.avg,
+ labels=names(w.avg),
+ cex=0.5)
>
> # table is the same as tapply with FUN=length
> x <- tv$Series
> t8.table <- table(x) # count the number of series
> tail(sort(t8.table), n=10) # longest running TV Series
x
24
209
Mystery Science Theater 3000
209
Smallville
218
Stargate SG-1
223
Friends
239
Family Guy
267
NCIS
269
South Park
276
CSI: Crime Scene Investigation
330
The Simpsons
615
> t8.tapply <- tapply(x,
+ x,
+ length)
> all(t8.table==t8.tapply)
[1] TRUE
# using the CRAN dataset from before:
> index <- paste(r$r_version, r$r_os)
> # there is now a new indexing variable
> t9 <- table(index) # table of the new index
> tail(sort(t9)) # sometimes the version/os is missing
index
3.2.2 darwin13.4.0 3.2.1 mingw32
6547 8723
3.1.2 mingw32 3.2.2 linux-gnu
10609 115141
NA NA 3.2.2 mingw32
123943 131887
> # now sum download sizes by the new index
> t10 <- tapply(as.numeric(r$size), # coerce to number
+ index,
+ sum,
+ na.rm=TRUE)
> tail(sort(t10), n=10) # biggest downloads by index
3.1.1 mingw32 3.1.1 linux-gnu
3648520410 4431702724
3.2.0 mingw32 3.1.3 mingw32
5084462032 5732757172
3.2.2 darwin13.4.0 3.2.1 mingw32
6772883004 9180003054
3.1.2 mingw32 NA NA
10819789887 60369378298
3.2.2 linux-gnu 3.2.2 mingw32
111322007214 137171388391
hist(tv$Rating)
> abline(v=mean(tv$Rating, na.rm=TRUE),
+ lty=2, lwd=2)
> hist(tv$Rating, breaks=100)
> hist(tv$Count,
+ main="TV Series Episodes",
+ xlab="Number of Ratings")
> hist(tv$StdDev,
+ freq=F, # show Density
+ xlab="Standard Deviation of Ratings",
+ col="lightyellow")