R Lesson #14 - Introduction to Biostrings
For this lesson we are going to be introduced to the Biostrings package, which is a popular way to work with biological sequences in R. First, we must ensure Biostrings is installed before we can load the library.1library(Biostrings) # load the Biostrings package
2dna <- DNAStringSet(c("ACTG", "NSYR", "CG"))
345dna[[1]]dna[1]dna[1:2]
678rna <- RNAStringSet(dna)rnadna==rna
9IUPAC_CODE_MAP
1011neditAt(dna[[1]], dna[[2]], fixed=FALSE) # 0 mismatchesneditAt(dna[[1]], dna[[2]], fixed=TRUE) # 4 mismatches
1213align <- DNAStringSet(c(".+AC-TG.", ".+ACCTG."))align
14151617length(align)unique(align)names(align) <- c("one", "two")align
181920alphabet(align)width(align) # analogous to nchar()alphabetFrequency(align)
21222324-252627-28fas1 <- system.file("extdata", "dm3_upstream2000.fa.gz", package="Biostrings")flyDNA <- readDNAStringSet(fas1) # This set contains 26,454 sequences that originated fromthe 2,000 nucleotides upstream of annotated transcriptionstart sites in the reference D. melanogaster genome. flyDNA
293031oligonucleotideFrequency(flyDNA, width=3, simplify.as="collapse")
3233s <- sample(length(flyDNA), 1000)subseq(flyDNA[s], 1, 100)
3435at1 <- IRanges(start=1, end=100)extractAt(flyDNA, at1) # DNAStringSetList
3637unlist(extractAt(flyDNA, at1)) # DNAStringSetunlist(unlist(extractAt(flyDNA, at1))) # DNAString
3839404142at2 <- sample(100, length(flyDNA), replace=TRUE)at2 <- lapply(at2, IRanges, width=1)at2 <- as(at2, "IRangesList")at2unlist(extractAt(flyDNA, at2))
434445v1 <- vmatchPattern("ACTG", flyDNA)v1unlist(extractAt(flyDNA, v1))
4647484950v2 <- vmatchPattern(pattern="AACTGNCATCG", subject=flyDNA, fixed="subject", max.mismatch=1)unlist(extractAt(flyDNA, v2))
5152replaceAt(flyDNA, v2) # deletes the matchesreplaceAt(flyDNA, v2, "+") # replaces the matches
5354555657unlist(flyDNA)pal <- findPalindromes(unlist(flyDNA), min.armlength=8, max.looplength=10, max.mismatch=1)
58596061626364656667686970717273bounds <- c(1, cumsum(width(flyDNA)))head(bounds) # boundaries of each sequencestarts <- start(pal)ends <- end(pal)cbind(head(starts), head(ends)) # pairs of positionsseq <- 1L # current sequencecounts <- integer(2000) # counts for upstream positionsfor (i in seq_along(starts)) { while (ends[i] >= bounds[seq + 1L]) { seq <- seq + 1L } if (starts[i] >= bounds[seq]) { range <- starts[i]:ends[i] - bounds[seq] + 1L counts[range] <- counts[range] + 1L }}
7475767778plot(-rev(seq_along(counts)), counts, type="l", xlab="Positions from transcription start site", ylab="Number of palindromes covering position")
7980818283fas2 <- system.file("extdata", "someorf.fa.gz", package="Biostrings")ORFs <- readDNAStringSet(fas2)ORFs
8485ORFs <- subseq(ORFs, 1001, -1001)ORFs
86aa <- translate(ORFs)
87vmatchPattern("*", aa)