Managing big biological sequence data - Workflow part #2
Part #2 of the workflow uses sequences of the COI gene that were obtained from several locations in Pennsylvania. These DNA sequences are stored in FASTQ format along with their corresponding quality scores. After importing, the first step is to trim the sequences so that only the high-quality center region remains. The subset of sequences that might belong to Pythium species will be identified by the presence of a conserved region common to all Pythium. This analysis will be performed in batches so that all of the sequences do not need to fit in memory simultaneously.
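This part assumes the objects created in part #1 of the workflow are still available: the DECIPHER package is loaded, data_dir holds the path to the example data, and dbConn is an open connection to the sequence database. A minimal sketch of that setup, with hypothetical paths:

> library(DECIPHER)
> data_dir <- "~/BigBioSeqData" # hypothetical path to the example data
> dbConn <- dbConnect(SQLite(), "~/BigBioSeqData/seqs.sqlite") # hypothetical database file
>
> # Import from the compressed FASTQ sequence files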
> path <- paste(data_dir,
+               "/FASTQ/",
+               sep="")
> files <- list.files(path)
> samples <- substring(files,
+                      first=1,
+                      last=nchar(files) - 6)
> for (i in seq_along(files)) {
+     cat(samples[i], ":\n", sep="")
+     Seqs2DB(paste(path, files[i], sep=""),
+             type="FASTQ",
+             dbFile=dbConn,
+             identifier=samples[i],
+             tblName="Reads")
+ }
BerksFarmA:
Reading FASTQ file chunk 5
Added 38999 new sequences to table Reads.
38999 total sequences in table Reads.
Time difference of 2.72 secs
BerksFarmB:
Reading FASTQ file chunk 2
Added 10659 new sequences to table Reads.
49658 total sequences in table Reads.
Time difference of 0.76 secs
DauphinFarm:
Reading FASTQ file chunk 1
Added 3475 new sequences to table Reads.
53133 total sequences in table Reads.
Time difference of 0.23 secs
LancasterFarmA:
Reading FASTQ file chunk 7
Added 51099 new sequences to table Reads.
104232 total sequences in table Reads.
Time difference of 3.24 secs
LebanonFarmA:
Reading FASTQ file chunk 2
Added 9892 new sequences to table Reads.
114124 total sequences in table Reads.
Time difference of 1.27 secs
LebanonFarmB:
Reading FASTQ file chunk 5
Added 26667 new sequences to table Reads.
140791 total sequences in table Reads.
Time difference of 2.2 secs
LititzRunCreek:
Reading FASTQ file chunk 2
Added 8876 new sequences to table Reads.
149667 total sequences in table Reads.
Time difference of 0.99 secs
YorkFarm:
Reading FASTQ file chunk 9
Added 68726 new sequences to table Reads.
218393 total sequences in table Reads.
Time difference of 5.15 secs
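As an optional check, the number of reads imported per sample can be tallied with a SQL query through the DBI interface underlying the database connection (a sketch; output not shown):

> dbGetQuery(dbConn,
+            "select identifier, count(*)
+             from Reads
+             group by identifier")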
>
> # note the primers as they might appear in the sequences
> forward <- "CCATCTCATCCCTGCGTGTCTCCGACTCAGNNNNNNNNNNTCAWCWMGATGGCTTTTTTCAAC"
> reverse <- "RRHWACKTGACTDATRATACCAAACCTATCCCCTGTGTGCCTTGGCAGTCTCAG"
> reverse <- as.character(reverseComplement(DNAStringSet(reverse)))
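The loop below also requires pattern, a length-one DNAStringSet holding the conserved region common to all Pythium that was described above. The actual motif accompanies the workflow's data; the sequence shown here is only a placeholder:

> # conserved region expected in all Pythium sequences
> pattern <- DNAStringSet("ACGTACGTACGTACGTACGT") # placeholder, not the real motif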
>
> # Trim the sequences by quality and identify
> # the subset belonging to the Pythium genus
> nSeqs <- SearchDB(dbConn,
+                   tblName="Reads",
+                   countOnly=TRUE,
+                   verbose=FALSE)
> offset <- 0
> ends <- starts <- counts <- integer(nSeqs)
> pBar <- txtProgressBar(max=nSeqs, style=3)
|                                            |   0%
> while (offset < nSeqs) {
+     # Select the next batch of up to 10,000 sequences
+     dna <- SearchDB(dbConn,
+                     tblName="Reads",
+                     type="QualityScaledXStringSet",
+                     limit=paste(offset, 1e4, sep=","), # "offset,limit"
+                     verbose=FALSE)
+
+     # Trim low-quality ends and flanking primer sequences
+     bounds <- TrimDNA(DNAStringSet(dna),
+                       leftPatterns=forward,
+                       rightPatterns=reverse,
+                       quality=quality(dna),
+                       verbose=FALSE)
+
+     # Store the results for later use
+     index <- (offset + 1):(offset + length(dna))
+     starts[index] <- start(bounds)
+     ends[index] <- end(bounds)
+
+     # Count the pattern expected in Pythium sequences
+     counts[index] <- vcountPattern(pattern[[1]],
+                                    subject=dna,
+                                    max.mismatch=4,
+                                    with.indels=TRUE,
+                                    fixed="subject") # allow ambiguities in the pattern
+
+     offset <- offset + 1e4
+     setTxtProgressBar(pBar,
+                       ifelse(offset > nSeqs, nSeqs, offset))
+ }
|============================================| 100%
>
> # Add the results to new columns in the database
> results <- data.frame(start=starts,
+                       end=ends,
+                       count=counts)
> Add2DB(results,
+        dbFile=dbConn,
+        tblName="Reads",
+        verbose=FALSE)
> BrowseDB(dbConn,
+          tblName="Reads",
+          limit=1000)
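Before clustering, it can be useful to check how many reads passed each filter; a quick look using the vectors computed above:

> # number of reads containing the Pythium pattern
> sum(counts > 0)
> # distribution of trimmed read lengths
> summary(ends - starts + 1)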
>
> # Cluster the reads in each sample by percent identity
> for (i in seq_along(samples)) {
+     cat(samples[i])
+
+     # Select moderately long sequences matching the pattern
+     dna <- SearchDB(dbConn,
+                     tblName="Reads",
+                     identifier=samples[i],
+                     clause="count > 0 and
+                             (end - start + 1) >= 100",
+                     verbose=FALSE)
+
+     cat(":", length(dna), "sequences")
+
+     # Trim the sequences to the high-quality region
+     index <- as.numeric(names(dna))
+     dna <- subseq(dna,
+                   start=starts[index],
+                   end=ends[index])
+
+     # Cluster the sequences without a distance matrix
+     clusters <- IdClusters(myXStringSet=dna,
+                            method="inexact",
+                            cutoff=0.03, # >= 97% identity
+                            verbose=FALSE)
+
+     # Add the cluster numbers to the database
+     Add2DB(clusters,
+            dbFile=dbConn,
+            tblName="Reads",
+            verbose=FALSE)
+
+     cat(",",
+         length(unique(clusters[, 1])),
+         "clusters\n")
+ }
BerksFarmA: 567 sequences, 56 clusters
BerksFarmB: 137 sequences, 25 clusters
DauphinFarm: 58 sequences, 13 clusters
LancasterFarmA: 1162 sequences, 62 clusters
LebanonFarmA: 85 sequences, 21 clusters
LebanonFarmB: 3516 sequences, 115 clusters
LititzRunCreek: 3440 sequences, 261 clusters
YorkFarm: 2068 sequences, 28 clusters
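With the cluster numbers stored, the reads belonging to any single cluster can be retrieved for closer inspection, for example:

> # e.g., select the reads assigned to cluster 1 of one sample
> dna <- SearchDB(dbConn,
+                 tblName="Reads",
+                 identifier="LititzRunCreek",
+                 clause="cluster = 1",
+                 verbose=FALSE)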
>
> # Now the database contains a column of clusters
> BrowseDB(dbConn,
+          tblName="Reads",
+          limit=1000,
+          clause="cluster is not NULL")