R Lesson #14 - Introduction to Biostrings
For this lesson we are going to be introduced to the Biostrings package, which is a popular way to work with biological sequences in R. First, we must ensure Biostrings is installed before we can load the library.library(Biostrings) # load the Biostrings package
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall,
clusterEvalQ, clusterExport, clusterMap,
parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply,
parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame,
basename, cbind, colMeans, colnames,
colSums, dirname, do.call, duplicated,
eval, evalq, Filter, Find, get, grep,
grepl, intersect, is.unsorted, lapply,
lengths, Map, mapply, match, mget, order,
paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rowMeans,
rownames, rowSums, sapply, setdiff, sort,
table, tapply, union, unique, unsplit,
which, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Loading required package: XVector
Attaching package: ‘Biostrings’
The following object is masked from ‘package:base’:
strsplit
dna <- DNAStringSet(c("ACTG", "NSYR", "CG"))
dna[[1]]
4-letter "DNAString" instance
seq: ACTG
> dna[1]
A DNAStringSet instance of length 1
width seq
[1] 4 ACTG
> dna[1:2]
A DNAStringSet instance of length 2
width seq
[1] 4 ACTG
[2] 4 NSYR
rna <- RNAStringSet(dna)
> rna
A RNAStringSet instance of length 3
width seq
[1] 4 ACUG
[2] 4 NSYR
[3] 2 CG
> dna==rna
[1] TRUE TRUE TRUE
IUPAC_CODE_MAP
A C G T M R W
"A" "C" "G" "T" "AC" "AG" "AT"
S Y K V H D B
"CG" "CT" "GT" "ACG" "ACT" "AGT" "CGT"
N
"ACGT"
neditAt(dna[[1]], dna[[2]], fixed=FALSE) # 0 mismatches
[1] 0
> neditAt(dna[[1]], dna[[2]], fixed=TRUE) # 4 mismatches
[1] 4
align <- DNAStringSet(c(".+AC-TG.", ".+ACCTG."))
> align
A DNAStringSet instance of length 2
width seq
[1] 8 .+AC-TG.
[2] 8 .+ACCTG.
length(align)
[1] 2
> unique(align)
A DNAStringSet instance of length 2
width seq
[1] 8 .+AC-TG.
[2] 8 .+ACCTG.
> names(align) <- c("one", "two")
> align
A DNAStringSet instance of length 2
width seq names
[1] 8 .+AC-TG. one
[2] 8 .+ACCTG. two
alphabet(align)
[1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H"
[13] "D" "B" "N" "-" "+" "."
> width(align) # analogous to nchar()
[1] 8 8
> alphabetFrequency(align)
A C G T M R W S Y K V H D B N - + .
[1,] 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 2
[2,] 1 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 2
fas1 <- system.file("extdata",
+ "dm3_upstream2000.fa.gz",
+ package="Biostrings")
> flyDNA <- readDNAStringSet(fas1)
>
> # This set contains 26,454 sequences that originated from
> the 2,000 nucleotides upstream of annotated transcription
> start sites in the reference D. melanogaster genome.
>
> flyDNA
A DNAStringSet instance of length 26454
width seq names
[1] 2000 GTTGGTGG...TGCACGGT NM_078863_up_2000...
[2] 2000 TTATTTAT...TCCTCGAT NM_001201794_up_2...
[3] 2000 TTATTTAT...TCCTCGAT NM_001201795_up_2...
[4] 2000 TTATTTAT...TCCTCGAT NM_001201796_up_2...
[5] 2000 TTATTTAT...TCCTCGAT NM_001201797_up_2...
... ... ...
[26450] 2000 ATTTACAA...AATAAAAC NM_001111010_up_2...
[26451] 2000 GATATACG...TATATATT NM_001015258_up_2...
[26452] 2000 GATATACG...TATATATT NM_001110997_up_2...
[26453] 2000 GATATACG...TATATATT NM_001276245_up_2...
[26454] 2000 CGTATGTA...ACAAATTG NM_001015497_up_2...
oligonucleotideFrequency(flyDNA,
+ width=3,
+ simplify.as="collapse")
AAA AAC AAG AAT ACA ACC
1993983 905456 900748 1465752 954857 547841
ACG ACT AGA AGC AGG AGT
510627 765761 760979 791574 549221 764487
ATA ATC ATG ATT CAA CAC
1171226 790838 880302 1466466 1114071 753589
CAG CAT CCA CCC CCG CCT
790741 875637 855883 479831 515840 561712
CGA CGC CGG CGT CTA CTC
691226 617419 517643 516995 531742 653339
CTG CTT GAA GAC GAG GAT
793467 915254 993671 513881 637293 797460
GCA GCC GCG GCT GGA GGC
915283 693226 618808 798138 677551 698698
GGG GGT GTA GTC GTG GTT
486510 543056 615815 525247 742186 902574
TAA TAC TAG TAT TCA TCC
1163197 605994 537799 1169575 807727 692488
TCG TCT TGA TGC TGG TGT
698648 768187 811903 918120 852311 961474
TTA TTC TTG TTT
1157169 999049 1127636 1986916
s <- sample(length(flyDNA), 1000)
> subseq(flyDNA[s], 1, 100)
A DNAStringSet instance of length 1000
width seq names
[1] 100 AAAAGAGCT...AATCGTGT NM_001273770_up_2...
[2] 100 ATTTGTGCC...CTGTCTTC NM_001170066_up_2...
[3] 100 TTTAAAATG...ATATTGAC NM_001275166_up_2...
[4] 100 CTTTCAAAA...GTTTTGCC NM_001169963_up_2...
[5] 100 CGAGGGCAC...ACAGATTG NM_143785_up_2000...
... ... ...
[996] 100 GAAGCAGGT...GAGTCTAC NM_143123_up_2000...
[997] 100 ATTGTAGTT...CCGGGTGG NM_136715_up_2000...
[998] 100 TCATTTTTG...CTCTCTTA NM_078996_up_2000...
[999] 100 ACGACTTCT...TCGAACTG NM_140033_up_2000...
[1000] 100 CACCGATGG...TTGAAACG NM_001260247_up_2...
at1 <- IRanges(start=1, end=100)
> extractAt(flyDNA, at1) # DNAStringSetList
DNAStringSetList of length 26454
[["NM_078863_up_2000_chr2L_16764737_f chr2L:16764737-16766736"]]
[["NM_001201794_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_001201795_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_001201796_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_001201797_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_164812_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_164814_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_164815_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_205935_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
[["NM_205936_up_2000_chr2L_8382455_f chr2L:8382455-8384454"]]
...
<26444 more elements>
unlist(extractAt(flyDNA, at1)) # DNAStringSet
A DNAStringSet instance of length 26454
width seq names
[1] 100 GTTGGTGG...CGAAAATG NM_078863_up_2000...
[2] 100 TTATTTAT...TTGTTAGG NM_001201794_up_2...
[3] 100 TTATTTAT...TTGTTAGG NM_001201795_up_2...
[4] 100 TTATTTAT...TTGTTAGG NM_001201796_up_2...
[5] 100 TTATTTAT...TTGTTAGG NM_001201797_up_2...
... ... ...
[26450] 100 ATTTACAA...CATGGATC NM_001111010_up_2...
[26451] 100 GATATACG...GGTGCTGG NM_001015258_up_2...
[26452] 100 GATATACG...GGTGCTGG NM_001110997_up_2...
[26453] 100 GATATACG...GGTGCTGG NM_001276245_up_2...
[26454] 100 CGTATGTA...GAAGATTA NM_001015497_up_2...
> unlist(unlist(extractAt(flyDNA, at1))) # DNAString
2645400-letter "DNAString" instance
seq: GTTGGTGGCCCACCAGTGCCAAA...GATTACTGTAGTTAGGAAGATTA
at2 <- sample(100, length(flyDNA), replace=TRUE)
> at2 <- lapply(at2, IRanges, width=1)
> at2 <- as(at2, "IRangesList")
> at2
IRangesList of length 26454
[[1]]
IRanges object with 1 range and 0 metadata columns:
start end width
[1] 67 67 1
[[2]]
IRanges object with 1 range and 0 metadata columns:
start end width
[1] 70 70 1
[[3]]
IRanges object with 1 range and 0 metadata columns:
start end width
[1] 86 86 1
...
<26451 more elements>
> unlist(extractAt(flyDNA, at2))
A DNAStringSet instance of length 26454
width seq names
[1] 1 A NM_078863_up_2000...
[2] 1 C NM_001201794_up_2...
[3] 1 T NM_001201795_up_2...
[4] 1 C NM_001201796_up_2...
[5] 1 C NM_001201797_up_2...
... ... ...
[26450] 1 A NM_001111010_up_2...
[26451] 1 G NM_001015258_up_2...
[26452] 1 A NM_001110997_up_2...
[26453] 1 G NM_001276245_up_2...
[26454] 1 T NM_001015497_up_2...
v1 <- vmatchPattern("ACTG", flyDNA)
> v1
MIndex object of length 26454
$`NM_078863_up_2000_chr2L_16764737_f chr2L:16764737-16766736`
IRanges object with 3 ranges and 0 metadata columns:
start end width
[1] 539 542 4
[2] 1265 1268 4
[3] 1339 1342 4
$`NM_001201794_up_2000_chr2L_8382455_f chr2L:8382455-8384454`
IRanges object with 6 ranges and 0 metadata columns:
start end width
[1] 576 579 4
[2] 843 846 4
[3] 1350 1353 4
[4] 1438 1441 4
[5] 1688 1691 4
[6] 1947 1950 4
$`NM_001201795_up_2000_chr2L_8382455_f chr2L:8382455-8384454`
IRanges object with 6 ranges and 0 metadata columns:
start end width
[1] 576 579 4
[2] 843 846 4
[3] 1350 1353 4
[4] 1438 1441 4
[5] 1688 1691 4
[6] 1947 1950 4
...
<26451 more elements>
> unlist(extractAt(flyDNA, v1))
A DNAStringSet instance of length 188577
width seq names
[1] 4 ACTG NM_078863_up_2000...
[2] 4 ACTG NM_078863_up_2000...
[3] 4 ACTG NM_078863_up_2000...
[4] 4 ACTG NM_001201794_up_2...
[5] 4 ACTG NM_001201794_up_2...
... ... ...
[188573] 4 ACTG NM_001015497_up_2...
[188574] 4 ACTG NM_001015497_up_2...
[188575] 4 ACTG NM_001015497_up_2...
[188576] 4 ACTG NM_001015497_up_2...
[188577] 4 ACTG NM_001015497_up_2...
Warning message:
In .normarg_at2(at, x) : 'at' names were ignored
v2 <- vmatchPattern(pattern="AACTGNCATCG",
+ subject=flyDNA,
+ fixed="subject",
+ max.mismatch=1)
> unlist(extractAt(flyDNA, v2))
A DNAStringSet instance of length 1675
width seq names
[1] 11 AGCTGTCATCG NM_165124_up_2000...
[2] 11 AGCTGTCATCG NM_165123_up_2000...
[3] 11 AGCTGTCATCG NM_001259108_up_2...
[4] 11 AACTGCCATTG NM_205903_up_2000...
[5] 11 AACTGACATCC NM_135746_up_2000...
... ... ...
[1671] 11 AACTGCAATCG NM_001272822_up_2...
[1672] 11 AACTGGCAGCG NM_001042822_up_2...
[1673] 11 AACTGGCAGCG NM_001144752_up_2...
[1674] 11 AACTCCCATCG NM_001170391_up_2...
[1675] 11 AACTGCCATCG NM_001015255_up_2...
Warning message:
In .normarg_at2(at, x) : 'at' names were ignored
replaceAt(flyDNA, v2) # deletes the matches
A DNAStringSet instance of length 26454
width seq names
[1] 2000 GTTGGTGG...TGCACGGT NM_078863_up_2000...
[2] 2000 TTATTTAT...TCCTCGAT NM_001201794_up_2...
[3] 2000 TTATTTAT...TCCTCGAT NM_001201795_up_2...
[4] 2000 TTATTTAT...TCCTCGAT NM_001201796_up_2...
[5] 2000 TTATTTAT...TCCTCGAT NM_001201797_up_2...
... ... ...
[26450] 2000 ATTTACAA...AATAAAAC NM_001111010_up_2...
[26451] 2000 GATATACG...TATATATT NM_001015258_up_2...
[26452] 2000 GATATACG...TATATATT NM_001110997_up_2...
[26453] 2000 GATATACG...TATATATT NM_001276245_up_2...
[26454] 2000 CGTATGTA...ACAAATTG NM_001015497_up_2...
Warning message:
In .normarg_at2(at, x) : 'at' names were ignored
> replaceAt(flyDNA, v2, "+") # replaces the matches
A DNAStringSet instance of length 26454
width seq names
[1] 2000 GTTGGTGG...TGCACGGT NM_078863_up_2000...
[2] 2000 TTATTTAT...TCCTCGAT NM_001201794_up_2...
[3] 2000 TTATTTAT...TCCTCGAT NM_001201795_up_2...
[4] 2000 TTATTTAT...TCCTCGAT NM_001201796_up_2...
[5] 2000 TTATTTAT...TCCTCGAT NM_001201797_up_2...
... ... ...
[26450] 2000 ATTTACAA...AATAAAAC NM_001111010_up_2...
[26451] 2000 GATATACG...TATATATT NM_001015258_up_2...
[26452] 2000 GATATACG...TATATATT NM_001110997_up_2...
[26453] 2000 GATATACG...TATATATT NM_001276245_up_2...
[26454] 2000 CGTATGTA...ACAAATTG NM_001015497_up_2...
Warning message:
In .normarg_at2(at, x) : 'at' names were ignored
unlist(flyDNA)
52904706-letter "DNAString" instance
seq: GTTGGTGGCCCACCAGTGCCAAA...ATTCAAAGTGTAAGAACAAATTG
> pal <- findPalindromes(unlist(flyDNA),
+ min.armlength=8,
+ max.looplength=10,
+ max.mismatch=1)
bounds <- c(1, cumsum(width(flyDNA)))
> head(bounds) # boundaries of each sequence
[1] 1 2000 4000 6000 8000 10000
> starts <- start(pal)
> ends <- end(pal)
> cbind(head(starts), head(ends)) # pairs of positions
[,1] [,2]
[1,] 73 91
[2,] 238 253
[3,] 446 462
[4,] 654 672
[5,] 659 674
[6,] 791 806
> seq <- 1L # current sequence
> counts <- integer(2000) # counts for upstream positions
> for (i in seq_along(starts)) {
+ while (ends[i] >= bounds[seq + 1L]) {
+ seq <- seq + 1L
+ }
+ if (starts[i] >= bounds[seq]) {
+ range <- starts[i]:ends[i] - bounds[seq] + 1L
+ counts[range] <- counts[range] + 1L
+ }
+ }
plot(-rev(seq_along(counts)),
+ counts,
+ type="l",
+ xlab="Positions from transcription start site",
+ ylab="Number of palindromes covering position")
fas2 <- system.file("extdata",
+ "someorf.fa.gz",
+ package="Biostrings")
> ORFs <- readDNAStringSet(fas2)
> ORFs
A DNAStringSet instance of length 7
width seq names
[1] 5573 ACTTGTAAAT...TTGTTGATAT YAL001C TFC3 SGDI...
[2] 5825 TTCCAAGGCC...CTATTCTCTT YAL002W VPS8 SGDI...
[3] 2987 CTTCATGTCA...GCTGCCTCAT YAL003W EFB1 SGDI...
[4] 3929 CACTCATATC...GAAAAAGTAC YAL005C SSA1 SGDI...
[5] 2648 AGAGAAAGAG...GTGAACATAG YAL007C ERP2 SGDI...
[6] 2597 GTGTCCGGGC...ATGTACTTTT YAL008W FUN14 SGD...
[7] 2780 CAAGATAATG...AAAAAATCAC YAL009W SPO7 SGDI...
ORFs <- subseq(ORFs, 1001, -1001)
> ORFs
A DNAStringSet instance of length 7
width seq names
[1] 3573 ATGGTACTGA...ATCTACATAA YAL001C TFC3 SGDI...
[2] 3825 ATGGAGCAAA...AATAGTATAA YAL002W VPS8 SGDI...
[3] 987 ATGGCATCCA...AAAATTATAA YAL003W EFB1 SGDI...
[4] 1929 ATGTCAAAAG...AGTTGATTAA YAL005C SSA1 SGDI...
[5] 648 ATGATCAAAT...TTACGTTTAA YAL007C ERP2 SGDI...
[6] 597 ATGACTTTGG...TAACAAATGA YAL008W FUN14 SGD...
[7] 780 ATGGAGCCAG...ATCAGAATGA YAL009W SPO7 SGDI...
aa <- translate(ORFs)
vmatchPattern("*", aa)
MIndex object of length 7
$`YAL001C TFC3 SGDID:S0000001, Chr I from 152168-146596, reverse complement, Verified ORF`
IRanges object with 3 ranges and 0 metadata columns:
start end width
[1] 44 44 1
[2] 49 49 1
[3] 1191 1191 1
$`YAL002W VPS8 SGDID:S0000002, Chr I from 142709-148533, Verified ORF`
IRanges object with 1 range and 0 metadata columns:
start end width
[1] 1275 1275 1
$`YAL003W EFB1 SGDID:S0000003, Chr I from 141176-144162, Verified ORF`
IRanges object with 10 ranges and 0 metadata columns:
start end width
[1] 32 32 1
[2] 36 36 1
[3] 55 55 1
[4] 67 67 1
[5] 68 68 1
[6] 70 70 1
[7] 77 77 1
[8] 85 85 1
[9] 147 147 1
[10] 329 329 1
...
<4 more elements>