DECIPHER - Big Biological Data Part #3

Managing big biological sequence data - Workflow part #3
In part #3 of the workflow, representatives from each sequence cluster are compared to known Pythium species. The goal of this analysis is to identify which organisms present in each sample are similar to known species. The known species are separated into two groups: those that are used as biocontrol agents (good strains) and those that are known to be plant pathogens (bad strains).
Hide output
ids <- IdentifyByRank(dbConn,
+    add2tbl=TRUE)


Updating column: "identifier"...
Formed 167 distinct groups.
Added to table Seqs: "identifier".
Time difference of 0.09 secs


> lens <- IdLengths(dbConn,
+    add2tbl=TRUE)


Lengths counted for 488 sequences.
Added to Seqs:  "bases", "nonbases", and "width".


Time difference of 0.05 secs


> BrowseDB(dbConn)
> 
> # separate Pythium strains into good and bad groups
> biocontrol <- c('Pythium oligandrum',
+    'Pythium nunn',
+    'Pythium periplocum')
> pathogen <- c('Pythium acanthicum', # strawberries:
+    'Pythium rostratum',
+    'Pythium middletonii',
+    'Pythium aristosporum', # grasses/cereals:
+    'Pythium graminicola',
+    'Pythium okanoganense',
+    'Pythium paddicum',
+    'Pythium volutum',
+    'Pythium arrhenomanes',
+    'Pythium buismaniae', # flowers:
+    'Pythium spinosum',
+    'Pythium mastophorum',
+    'Pythium splendens',
+    'Pythium violae', # carrots:
+    'Pythium paroecandrum',
+    'Pythium sulcatum',
+    'Pythium dissotocum', # potatoes:
+    'Pythium scleroteichum',
+    'Pythium myriotylum',
+    'Pythium heterothallicum', # lettuce:
+    'Pythium tracheiphilum',
+    'Pythium ultimum', # multiple plants:
+    'Pythium irregulare',
+    'Pythium aphanidermatum',
+    'Pythium debaryanum',
+    'Pythium sylvaticum')
> 
> # Select the longest sequence from each species
> species <- SearchDB(dbConn,
+    nameBy="identifier",
+    clause=paste("identifier in (",
+       paste("'",
+          c(biocontrol,
+             pathogen),
+          "'",
+          sep="",
+          collapse=", "),
+       ") group by identifier
+       having max(bases)",
+       sep=""))
Search Expression:
select identifier, _Seqs.sequence from Seqs join
_Seqs on Seqs.row_names = _Seqs.row_names where
_Seqs.row_names in (select row_names from Seqs where
identifier in ('Pythium oligandrum', 'Pythium nunn',
'Pythium periplocum', 'Pythium acanthicum', 'Pythium
rostratum', 'Pythium middletonii', 'Pythium
aristosporum', 'Pythium graminicola', 'Pythium
okanoganense', 'Pythium paddicum', 'Pythium
volutum', 'Pythium arrhenomanes', 'Pythium
buismaniae', 'Pythium spinosum', 'Pythium
mastophorum', 'Pythium splendens', 'Pythium violae',
'Pythium paroecandrum', 'Pythium sulcatum', 'Pythium
dissotocum', 'Pythium scleroteichum', 'Pythium
myriotylum', 'Pythium heterothallicum', 'Pythium
tracheiphilum', 'Pythium ultimum', 'Pythium
irregulare', 'Pythium aphanidermatum', 'Pythium
debaryanum', 'Pythium sylvaticum') group by
identifier having max(bases))


DNAStringSet of length: 29
Time difference of 0.01 secs


> 
> # Select the longest sequence in each cluster
> dna <- SearchDB(dbConn,
+    identifier="DauphinFarm", # choose a sample
+    tblName="Reads",
+    clause="cluster is not null
+       group by cluster
+       having max(end - start)")
Search Expression:
select row_names, sequence from _Reads where
row_names in (select row_names from Reads where
identifier is "DauphinFarm" and cluster is not null
group by cluster having max(end - start))


DNAStringSet of length: 13
Time difference of 0.06 secs


> 
> # Trim to the high quality central region
> index <- as.numeric(names(dna))
> dna <- subseq(dna,
+    start=starts[index],
+    end=ends[index])
> 
> # Create a tree with known and unknown species
> combined <- AlignSeqs(c(dna, species))
Determining distance matrix based on shared 8-mers:
  |============================================| 100%


Time difference of 0.1 secs


Clustering into groups by similarity:
  |============================================| 100%


Time difference of 0.07 secs


Aligning Sequences:
  |============================================| 100%


Time difference of 0.52 secs


Determining distance matrix based on alignment:
  |============================================| 100%


Time difference of 0.06 secs


Reclustering into groups by similarity:
  |============================================| 100%


Time difference of 0.07 secs


Realigning Sequences:
  |============================================| 100%


Time difference of 0.41 secs


Refining the alignment:
  |============================================| 100%


Time difference of 0.17 secs


> dists <- DistanceMatrix(combined,
+       verbose=FALSE,
+       correction="Jukes-Cantor")
> tree <- IdClusters(dists,
+    method="NJ", # Neighbor joining
+    type="dendrogram",
+    verbose=FALSE)
> plot(tree,
+    nodePar=list(lab.cex=0.5, pch=NA))
> 
> # Color known species based on their pathogenicity
> tree_colored <- dendrapply(tree,
+    function(x) {
+       if (is.leaf(x)) {
+          if (attr(x, "label") %in% pathogen) {
+             attr(x, "edgePar") <- list(col="red")
+          } else if (attr(x, "label") %in% biocontrol) {
+             attr(x, "edgePar") <- list(col="green")
+          }
+          
+          # remove the label
+          attr(x, "label") <- ""
+       }
+       return(x)
+    })
> plot(tree_colored)
> 
> # Disconnect from the sequence database
> dbDisconnect(dbConn)
[1] TRUE
> # permanently delete the database
> unlink("./COIgenes.sqlite") # optional!
< Previous Part